marcosegura34 committed on
Commit 408f96b · verified · 1 Parent(s): da3c5f0

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +3 -0
  2. .github/ISSUE_TEMPLATE/bug_report.yml +50 -0
  3. .github/ISSUE_TEMPLATE/feature_request.yml +62 -0
  4. .github/ISSUE_TEMPLATE/help_wanted.yml +50 -0
  5. .github/ISSUE_TEMPLATE/question.yml +26 -0
  6. .github/workflows/pre-commit.yaml +14 -0
  7. .github/workflows/publish-docker-image.yaml +60 -0
  8. .github/workflows/sync-hf.yaml +18 -0
  9. .gitignore +173 -0
  10. .gitmodules +3 -0
  11. .gradio/certificate.pem +31 -0
  12. .pre-commit-config.yaml +14 -0
  13. Dockerfile +24 -0
  14. LICENSE +21 -0
  15. README.md +159 -13
  16. Spanish_F5.ipynb +433 -0
  17. app.py +7 -0
  18. pyproject.toml +62 -0
  19. ref_audio.wav +3 -0
  20. ruff.toml +10 -0
  21. src/f5_tts.egg-info/PKG-INFO +198 -0
  22. src/f5_tts.egg-info/SOURCES.txt +67 -0
  23. src/f5_tts.egg-info/dependency_links.txt +1 -0
  24. src/f5_tts.egg-info/entry_points.txt +5 -0
  25. src/f5_tts.egg-info/requires.txt +34 -0
  26. src/f5_tts.egg-info/top_level.txt +2 -0
  27. src/f5_tts/api.py +151 -0
  28. src/f5_tts/eval/README.md +49 -0
  29. src/f5_tts/eval/ecapa_tdnn.py +330 -0
  30. src/f5_tts/eval/eval_infer_batch.py +207 -0
  31. src/f5_tts/eval/eval_infer_batch.sh +13 -0
  32. src/f5_tts/eval/eval_librispeech_test_clean.py +73 -0
  33. src/f5_tts/eval/eval_seedtts_testset.py +75 -0
  34. src/f5_tts/eval/utils_eval.py +405 -0
  35. src/f5_tts/infer/README.md +189 -0
  36. src/f5_tts/infer/examples/basic/basic.toml +10 -0
  37. src/f5_tts/infer/examples/basic/basic_ref_en.wav +0 -0
  38. src/f5_tts/infer/examples/basic/basic_ref_zh.wav +0 -0
  39. src/f5_tts/infer/examples/multi/country.flac +0 -0
  40. src/f5_tts/infer/examples/multi/main.flac +0 -0
  41. src/f5_tts/infer/examples/multi/story.toml +19 -0
  42. src/f5_tts/infer/examples/multi/story.txt +1 -0
  43. src/f5_tts/infer/examples/multi/town.flac +0 -0
  44. src/f5_tts/infer/examples/vocab.txt +2545 -0
  45. src/f5_tts/infer/infer_cli.py +220 -0
  46. src/f5_tts/infer/infer_gradio.py +744 -0
  47. src/f5_tts/infer/speech_edit.py +191 -0
  48. src/f5_tts/infer/utils_infer.py +511 -0
  49. src/f5_tts/model/__init__.py +10 -0
  50. src/f5_tts/model/backbones/README.md +20 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ ref_audio.wav filter=lfs diff=lfs merge=lfs -text
+ src/third_party/BigVGAN/demo/examples/megalovania_24k.wav filter=lfs diff=lfs merge=lfs -text
+ src/third_party/BigVGAN/filelists/LibriTTS/train-full.txt filter=lfs diff=lfs merge=lfs -text
.github/ISSUE_TEMPLATE/bug_report.yml ADDED
@@ -0,0 +1,50 @@
+ name: "Bug Report"
+ description: |
+   Please provide as many details as possible to help address the issue, including logs and screenshots.
+ labels:
+   - bug
+ body:
+   - type: checkboxes
+     attributes:
+       label: Checks
+       description: "To ensure timely help, please confirm the following:"
+       options:
+         - label: This template is only for bug reports; usage problems go with 'Help Wanted'.
+           required: true
+         - label: I have thoroughly reviewed the project documentation but couldn't find information to solve my problem.
+           required: true
+         - label: I have searched for existing issues, including closed ones, and couldn't find a solution.
+           required: true
+         - label: I confirm that I am using English to submit this report in order to facilitate communication.
+           required: true
+   - type: textarea
+     attributes:
+       label: Environment Details
+       description: "Provide details such as OS, Python version, and any relevant software or dependencies."
+       placeholder: e.g., CentOS Linux 7, RTX 3090, Python 3.10, torch==2.3.0, cuda 11.8
+     validations:
+       required: true
+   - type: textarea
+     attributes:
+       label: Steps to Reproduce
+       description: |
+         Include detailed steps, screenshots, and logs. Use the correct markdown syntax for code blocks.
+       placeholder: |
+         1. Create a new conda environment.
+         2. Clone the repository, install as local editable and properly set up.
+         3. Run the command: `accelerate launch src/f5_tts/train/train.py`.
+         4. Get the following error message... (attach logs).
+     validations:
+       required: true
+   - type: textarea
+     attributes:
+       label: ✔️ Expected Behavior
+       placeholder: Describe what you expected to happen.
+     validations:
+       required: false
+   - type: textarea
+     attributes:
+       label: ❌ Actual Behavior
+       placeholder: Describe what actually happened.
+     validations:
+       required: false
.github/ISSUE_TEMPLATE/feature_request.yml ADDED
@@ -0,0 +1,62 @@
+ name: "Feature Request"
+ description: |
+   Constructive suggestions and new ideas regarding the current repo.
+ labels:
+   - enhancement
+ body:
+   - type: checkboxes
+     attributes:
+       label: Checks
+       description: "To help us grasp quickly, please confirm the following:"
+       options:
+         - label: This template is only for feature requests.
+           required: true
+         - label: I have thoroughly reviewed the project documentation but couldn't find any relevant information that meets my needs.
+           required: true
+         - label: I have searched for existing issues, including closed ones, and found no discussion yet.
+           required: true
+         - label: I confirm that I am using English to submit this report in order to facilitate communication.
+           required: true
+   - type: textarea
+     attributes:
+       label: 1. Is this request related to a challenge you're experiencing? Tell us your story.
+       description: |
+         Describe the specific problem or scenario you're facing in detail. For example:
+         *"I was trying to use [feature] for [specific task], but encountered [issue]. This was frustrating because..."*
+       placeholder: Please describe the situation in as much detail as possible.
+     validations:
+       required: true
+
+   - type: textarea
+     attributes:
+       label: 2. What is your suggested solution?
+       description: |
+         Provide a clear description of the feature or enhancement you'd like to propose.
+         How would this feature solve your issue or improve the project?
+       placeholder: Describe your idea or proposed solution here.
+     validations:
+       required: true
+
+   - type: textarea
+     attributes:
+       label: 3. Additional context or comments
+       description: |
+         Any other relevant information, links, documents, or screenshots that provide clarity.
+         Use this section for anything not covered above.
+       placeholder: Add any extra details here.
+     validations:
+       required: false
+
+   - type: checkboxes
+     attributes:
+       label: 4. Can you help us with this feature?
+       description: |
+         Let us know if you're interested in contributing. This is not a commitment but a way to express interest in collaboration.
+       options:
+         - label: I am interested in contributing to this feature.
+           required: false
+
+   - type: markdown
+     attributes:
+       value: |
+         **Note:** Please submit only one request per issue to keep discussions focused and manageable.
.github/ISSUE_TEMPLATE/help_wanted.yml ADDED
@@ -0,0 +1,50 @@
+ name: "Help Wanted"
+ description: |
+   Please provide as many details as possible to help address the issue, including logs and screenshots.
+ labels:
+   - help wanted
+ body:
+   - type: checkboxes
+     attributes:
+       label: Checks
+       description: "To ensure timely help, please confirm the following:"
+       options:
+         - label: This template is only for usage issues encountered.
+           required: true
+         - label: I have thoroughly reviewed the project documentation but couldn't find information to solve my problem.
+           required: true
+         - label: I have searched for existing issues, including closed ones, and couldn't find a solution.
+           required: true
+         - label: I confirm that I am using English to submit this report in order to facilitate communication.
+           required: true
+   - type: textarea
+     attributes:
+       label: Environment Details
+       description: "Provide details such as OS, Python version, and any relevant software or dependencies."
+       placeholder: e.g., macOS 13.5, Python 3.10, torch==2.3.0, Gradio 4.44.1
+     validations:
+       required: true
+   - type: textarea
+     attributes:
+       label: Steps to Reproduce
+       description: |
+         Include detailed steps, screenshots, and logs. Use the correct markdown syntax for code blocks.
+       placeholder: |
+         1. Create a new conda environment.
+         2. Clone the repository and install as a pip package.
+         3. Run the command: `f5-tts_infer-gradio` with no ref_text provided.
+         4. Stuck there with the following message... (attach logs and also the error msg, e.g. after ctrl-c).
+     validations:
+       required: true
+   - type: textarea
+     attributes:
+       label: ✔️ Expected Behavior
+       placeholder: Describe what you expected to happen, e.g. output a generated audio.
+     validations:
+       required: false
+   - type: textarea
+     attributes:
+       label: ❌ Actual Behavior
+       placeholder: Describe what actually happened, failure messages, etc.
+     validations:
+       required: false
.github/ISSUE_TEMPLATE/question.yml ADDED
@@ -0,0 +1,26 @@
+ name: "Question"
+ description: |
+   Pure question or inquiry about the project.
+ labels:
+   - question
+ body:
+   - type: checkboxes
+     attributes:
+       label: Checks
+       description: "To help us grasp quickly, please confirm the following:"
+       options:
+         - label: This template is only for questions, not feature requests or bug reports.
+           required: true
+         - label: I have thoroughly reviewed the project documentation and read the related paper(s).
+           required: true
+         - label: I have searched for existing issues, including closed ones, and found no similar questions.
+           required: true
+         - label: I confirm that I am using English to submit this report in order to facilitate communication.
+           required: true
+   - type: textarea
+     attributes:
+       label: Question details
+       description: |
+         Question details, clearly stated using proper markdown syntax.
+     validations:
+       required: true
.github/workflows/pre-commit.yaml ADDED
@@ -0,0 +1,14 @@
+ name: pre-commit
+
+ on:
+   pull_request:
+   push:
+     branches: [main]
+
+ jobs:
+   pre-commit:
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v3
+       - uses: actions/setup-python@v3
+       - uses: pre-commit/action@v3.0.1
.github/workflows/publish-docker-image.yaml ADDED
@@ -0,0 +1,60 @@
+ name: Create and publish a Docker image
+
+ # Configures this workflow to run every time a change is pushed to the branch called `main`.
+ on:
+   push:
+     branches: ['main']
+
+ # Defines two custom environment variables for the workflow. These are used for the Container registry domain, and a name for the Docker image that this workflow builds.
+ env:
+   REGISTRY: ghcr.io
+   IMAGE_NAME: ${{ github.repository }}
+
+ # There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu.
+ jobs:
+   build-and-push-image:
+     runs-on: ubuntu-latest
+     # Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job.
+     permissions:
+       contents: read
+       packages: write
+     #
+     steps:
+       - name: Checkout repository
+         uses: actions/checkout@v4
+       - name: Free Up GitHub Actions Ubuntu Runner Disk Space 🔧
+         uses: jlumbroso/free-disk-space@main
+         with:
+           # This might remove tools that are actually needed (if set to "true"), but frees about 6 GB.
+           tool-cache: false
+
+           # All of these default to true, but feel free to set them to "false" if necessary for your workflow.
+           android: true
+           dotnet: true
+           haskell: true
+           large-packages: false
+           swap-storage: false
+           docker-images: false
+       # Uses the `docker/login-action` action to log in to the Container registry using the account and password that will publish the packages. Once published, the packages are scoped to the account defined here.
+       - name: Log in to the Container registry
+         uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1
+         with:
+           registry: ${{ env.REGISTRY }}
+           username: ${{ github.actor }}
+           password: ${{ secrets.GITHUB_TOKEN }}
+       # This step uses [docker/metadata-action](https://github.com/docker/metadata-action#about) to extract tags and labels that will be applied to the specified image. The `id` "meta" allows the output of this step to be referenced in a subsequent step. The `images` value provides the base name for the tags and labels.
+       - name: Extract metadata (tags, labels) for Docker
+         id: meta
+         uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
+         with:
+           images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+       # This step uses the `docker/build-push-action` action to build the image, based on your repository's `Dockerfile`. If the build succeeds, it pushes the image to GitHub Packages.
+       # It uses the `context` parameter to define the build's context as the set of files located in the specified path. For more information, see "[Usage](https://github.com/docker/build-push-action#usage)" in the README of the `docker/build-push-action` repository.
+       # It uses the `tags` and `labels` parameters to tag and label the image with the output from the "meta" step.
+       - name: Build and push Docker image
+         uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4
+         with:
+           context: .
+           push: true
+           tags: ${{ steps.meta.outputs.tags }}
+           labels: ${{ steps.meta.outputs.labels }}
.github/workflows/sync-hf.yaml ADDED
@@ -0,0 +1,18 @@
+ name: Sync to HF Space
+
+ on:
+   push:
+     branches:
+       - main
+
+ jobs:
+   trigger_curl:
+     runs-on: ubuntu-latest
+
+     steps:
+       - name: Send cURL POST request
+         run: |
+           curl -X POST https://mrfakename-sync-f5.hf.space/gradio_api/call/refresh \
+             -s \
+             -H "Content-Type: application/json" \
+             -d "{\"data\": [\"${{ secrets.REFRESH_PASSWORD }}\"]}"
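For local testing, the same refresh request can be sketched in Python; the endpoint and payload shape mirror the curl command above, and reading the password from a `REFRESH_PASSWORD` environment variable is an assumption:

```python
import os

import requests

# POST the same JSON payload the workflow sends; the endpoint is taken
# verbatim from the workflow above. Supplying the password via an
# environment variable is an assumption for local testing.
resp = requests.post(
    "https://mrfakename-sync-f5.hf.space/gradio_api/call/refresh",
    json={"data": [os.environ["REFRESH_PASSWORD"]]},
    timeout=30,
)
resp.raise_for_status()
print(resp.json())
```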
.gitignore ADDED
@@ -0,0 +1,173 @@
+ # Custom
+ .vscode/
+ tests/
+ runs/
+ data/
+ ckpts/
+ wandb/
+ results/
+
+
+
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ #poetry.lock
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ #pdm.lock
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+ # in version control.
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+ .pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ #.idea/
.gitmodules ADDED
@@ -0,0 +1,3 @@
+ [submodule "src/third_party/BigVGAN"]
+ 	path = src/third_party/BigVGAN
+ 	url = https://github.com/NVIDIA/BigVGAN.git
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
+ -----BEGIN CERTIFICATE-----
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+ -----END CERTIFICATE-----
.pre-commit-config.yaml ADDED
@@ -0,0 +1,14 @@
+ repos:
+   - repo: https://github.com/astral-sh/ruff-pre-commit
+     # Ruff version.
+     rev: v0.7.0
+     hooks:
+       # Run the linter.
+       - id: ruff
+         args: [--fix]
+       # Run the formatter.
+       - id: ruff-format
+   - repo: https://github.com/pre-commit/pre-commit-hooks
+     rev: v2.3.0
+     hooks:
+       - id: check-yaml
Dockerfile ADDED
@@ -0,0 +1,24 @@
+ FROM pytorch/pytorch:2.4.0-cuda12.4-cudnn9-devel
+
+ USER root
+
+ ARG DEBIAN_FRONTEND=noninteractive
+
+ LABEL github_repo="https://github.com/SWivid/F5-TTS"
+
+ RUN set -x \
+     && apt-get update \
+     && apt-get -y install wget curl man git less openssl libssl-dev unzip unar build-essential aria2 tmux vim \
+     && apt-get install -y openssh-server sox libsox-fmt-all libsox-fmt-mp3 libsndfile1-dev ffmpeg \
+     && rm -rf /var/lib/apt/lists/* \
+     && apt-get clean
+
+ WORKDIR /workspace
+
+ RUN git clone https://github.com/SWivid/F5-TTS.git \
+     && cd F5-TTS \
+     && pip install -e .[eval]
+
+ ENV SHELL=/bin/bash
+
+ WORKDIR /workspace/F5-TTS
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 Yushen CHEN
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,13 +1,159 @@
- ---
- title: Spanish F5
- emoji: 👀
- colorFrom: indigo
- colorTo: pink
- sdk: gradio
- sdk_version: 5.9.1
- app_file: app.py
- pinned: false
- short_description: Spanish-F5
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ---
+ title: Spanish-F5
+ app_file: app.py
+ sdk: gradio
+ sdk_version: 5.9.1
+ ---
+ # F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching
+
+ **F5-TTS**: Diffusion Transformer with ConvNeXt V2, faster training and inference.
+
+ **E2 TTS**: Flat-UNet Transformer, closest reproduction of the [paper](https://arxiv.org/abs/2406.18009).
+
+ **Sway Sampling**: Inference-time flow step sampling strategy that greatly improves performance.
+
+ ### Thanks to all the contributors!
+
+ ## News
+ Spanish model: https://huggingface.co/jpgallegoar/F5-Spanish/
+
+ ## Installation
+
+ ```bash
+ # Create a Python 3.10 conda env (you could also use virtualenv)
+ conda create -n f5-tts python=3.10
+ conda activate f5-tts
+
+ # Install PyTorch matching your CUDA version, e.g.
+ pip install torch==2.3.0+cu118 torchaudio==2.3.0+cu118 --extra-index-url https://download.pytorch.org/whl/cu118
+ ```
+
+ Then you can choose from a few options below:
+
+ ### 1. As a pip package (if just for inference)
+
+ ```bash
+ pip install git+https://github.com/jpgallegoar/Spanish-F5.git
+ ```
+
+ ### 2. Local editable (if also doing training or finetuning)
+
+ ```bash
+ git clone https://github.com/jpgallegoar/Spanish-F5.git
+ cd Spanish-F5
+ # git submodule update --init --recursive # (optional, if you need bigvgan)
+ pip install -e .
+ ```
+ If you initialize the submodule, you should add the following code at the beginning of `src/third_party/BigVGAN/bigvgan.py`:
+ ```python
+ import os
+ import sys
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+ ```
+
+ ## Inference
+
+ ### 1. Gradio App
+
+ Currently supported features:
+
+ - Basic TTS with Chunk Inference
+ - Multi-Style / Multi-Speaker Generation
+ - Voice Chat powered by Qwen2.5-3B-Instruct
+
+ ```bash
+ # Launch a Gradio app (web interface)
+ f5-tts_infer-gradio
+
+ # Specify the port/host
+ f5-tts_infer-gradio --port 7860 --host 0.0.0.0
+
+ # Launch a share link
+ f5-tts_infer-gradio --share
+ ```
+
+ ### 2. CLI Inference
+
+ ```bash
+ # Run with flags
+ # Leaving --ref_text "" will have an ASR model transcribe the reference audio (extra GPU memory usage)
+ f5-tts_infer-cli \
+ --model "F5-TTS" \
+ --ref_audio "ref_audio.wav" \
+ --ref_text "The content, subtitle or transcription of reference audio." \
+ --gen_text "Some text you want the TTS model to generate for you."
+
+ # Run with the default settings in src/f5_tts/infer/examples/basic/basic.toml
+ f5-tts_infer-cli
+ # Or with your own .toml file
+ f5-tts_infer-cli -c custom.toml
+
+ # Multi voice. See src/f5_tts/infer/README.md
+ f5-tts_infer-cli -c src/f5_tts/infer/examples/multi/story.toml
+ ```
+
+ ### 3. More instructions
+
+ - To get better generation results, take a moment to read the [detailed guidance](src/f5_tts/infer).
+ - The [Issues](https://github.com/SWivid/F5-TTS/issues?q=is%3Aissue) are very useful; please try to find a solution by searching for keywords of the problem you encountered. If no answer is found, feel free to open an issue.
+
+
+ ## Training
+
+ ### 1. Gradio App
+
+ Read the [training & finetuning guidance](src/f5_tts/train) for more instructions.
+
+ ```bash
+ # Quick start with the Gradio web interface
+ f5-tts_finetune-gradio
+ ```
+
+
+ ## [Evaluation](src/f5_tts/eval)
+
+
+ ## Development
+
+ Use pre-commit to ensure code quality (it will run linters and formatters automatically):
+
+ ```bash
+ pip install pre-commit
+ pre-commit install
+ ```
+
+ When making a pull request, before each commit, run:
+
+ ```bash
+ pre-commit run --all-files
+ ```
+
+ Note: Some model components have linting exceptions for E722 to accommodate tensor notation.
+
+
+ ## Acknowledgements
+
+ - [E2-TTS](https://arxiv.org/abs/2406.18009) brilliant work, simple and effective
+ - [Emilia](https://arxiv.org/abs/2407.05361), [WenetSpeech4TTS](https://arxiv.org/abs/2406.05763) valuable datasets
+ - [lucidrains](https://github.com/lucidrains) initial CFM structure, with also [bfs18](https://github.com/bfs18) for discussion
+ - [SD3](https://arxiv.org/abs/2403.03206) & [Hugging Face diffusers](https://github.com/huggingface/diffusers) DiT and MMDiT code structure
+ - [torchdiffeq](https://github.com/rtqichen/torchdiffeq) as ODE solver, [Vocos](https://huggingface.co/charactr/vocos-mel-24khz) as vocoder
+ - [FunASR](https://github.com/modelscope/FunASR), [faster-whisper](https://github.com/SYSTRAN/faster-whisper), [UniSpeech](https://github.com/microsoft/UniSpeech) for evaluation tools
+ - [ctc-forced-aligner](https://github.com/MahmoudAshraf97/ctc-forced-aligner) for speech edit test
+ - [mrfakename](https://x.com/realmrfakename) huggingface space demo ~
+ - [f5-tts-mlx](https://github.com/lucasnewman/f5-tts-mlx/tree/main) implementation with the MLX framework by [Lucas Newman](https://github.com/lucasnewman)
+ - [F5-TTS-ONNX](https://github.com/DakeQQ/F5-TTS-ONNX) ONNX Runtime version by [DakeQQ](https://github.com/DakeQQ)
+
+ ## Citation
+ If our work and codebase are useful to you, please cite as:
+ ```
+ @article{chen-etal-2024-f5tts,
+ title={F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching},
+ author={Yushen Chen and Zhikang Niu and Ziyang Ma and Keqi Deng and Chunhui Wang and Jian Zhao and Kai Yu and Xie Chen},
+ journal={arXiv preprint arXiv:2410.06885},
+ year={2024},
+ }
+ ```
+ ## License
+
+ Our code is released under the MIT License. The pre-trained models are licensed under the CC-BY-NC license due to the training data Emilia, which is an in-the-wild dataset. Sorry for any inconvenience this may cause.
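Beyond the CLI and Gradio entry points shown in the README above, the commit also adds a Python API (`src/f5_tts/api.py` in the file list). A minimal usage sketch, assuming it exposes the `F5TTS` class and `infer` signature of upstream F5-TTS; the exact argument names are an assumption, so check `api.py`:

```python
from f5_tts.api import F5TTS

# Assumes the upstream F5-TTS api.py interface; argument names may differ.
f5tts = F5TTS()  # loads the default model checkpoint and vocoder
wav, sr, spect = f5tts.infer(
    ref_file="ref_audio.wav",  # reference voice (this file is added in the commit)
    ref_text="The content, subtitle or transcription of reference audio.",
    gen_text="Some text you want the TTS model to generate for you.",
    file_wave="out.wav",       # optionally also write the waveform to disk
)
```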
Spanish_F5.ipynb ADDED
@@ -0,0 +1,433 @@
+ {
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "provenance": []
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "0-duLUr_Iema",
+ "outputId": "615d30a6-2a04-4859-ad9d-79edd6a3ffc1"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Collecting git+https://github.com/jpgallegoar/Spanish-F5.git\n",
+ " Cloning https://github.com/jpgallegoar/Spanish-F5.git to /tmp/pip-req-build-mupnta95\n",
+ " Running command git clone --filter=blob:none --quiet https://github.com/jpgallegoar/Spanish-F5.git /tmp/pip-req-build-mupnta95\n",
+ " Resolved https://github.com/jpgallegoar/Spanish-F5.git to commit 840d95016e3dd838ee3e7bbe30c64fdced73a7ef\n",
+ " Running command git submodule update --init --recursive -q\n",
+ " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
+ " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
+ " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
+ "Requirement already satisfied: accelerate>=0.33.0 in /usr/local/lib/python3.10/dist-packages (from f5-tts==0.0.0) (0.34.2)\n",
+ "Collecting bitsandbytes>0.37.0 (from f5-tts==0.0.0)\n",
+ " Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)\n",
+ "Collecting cached-path (from f5-tts==0.0.0)\n",
+ " Downloading cached_path-1.6.3-py3-none-any.whl.metadata (19 kB)\n",
+ "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from f5-tts==0.0.0) (8.1.7)\n",
+ "Collecting datasets (from f5-tts==0.0.0)\n",
+ " Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)\n",
+ "Collecting ema-pytorch>=0.5.2 (from f5-tts==0.0.0)\n",
+ " Downloading ema_pytorch-0.7.3-py3-none-any.whl.metadata (691 bytes)\n",
+ "Collecting gradio>=3.45.2 (from f5-tts==0.0.0)\n",
+ " Downloading gradio-5.4.0-py3-none-any.whl.metadata (16 kB)\n",
+ "Requirement already satisfied: jieba in /usr/local/lib/python3.10/dist-packages (from f5-tts==0.0.0) (0.42.1)\n",
+ "Requirement already satisfied: librosa in /usr/local/lib/python3.10/dist-packages (from f5-tts==0.0.0) (0.10.2.post1)\n",
+ "Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from f5-tts==0.0.0) (3.8.0)\n",
+ "Requirement already satisfied: numpy<=1.26.4 in /usr/local/lib/python3.10/dist-packages (from f5-tts==0.0.0) (1.26.4)\n",
+ "Collecting pydub (from f5-tts==0.0.0)\n",
+ " Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)\n",
+ "Collecting pypinyin (from f5-tts==0.0.0)\n",
+ " Downloading pypinyin-0.53.0-py2.py3-none-any.whl.metadata (12 kB)\n",
+ "Requirement already satisfied: safetensors in /usr/local/lib/python3.10/dist-packages (from f5-tts==0.0.0) (0.4.5)\n",
+ "Requirement already satisfied: soundfile in /usr/local/lib/python3.10/dist-packages (from f5-tts==0.0.0) (0.12.1)\n",
+ "Requirement already satisfied: tomli in /usr/local/lib/python3.10/dist-packages (from f5-tts==0.0.0) (2.0.2)\n",
+ "Requirement already satisfied: torch>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from f5-tts==0.0.0) (2.5.0+cu121)\n",
+ "Requirement already satisfied: torchaudio>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from f5-tts==0.0.0) (2.5.0+cu121)\n",
+ "Collecting torchdiffeq (from f5-tts==0.0.0)\n",
+ " Downloading torchdiffeq-0.2.4-py3-none-any.whl.metadata (440 bytes)\n",
+ "Requirement already satisfied: tqdm>=4.65.0 in /usr/local/lib/python3.10/dist-packages (from f5-tts==0.0.0) (4.66.6)\n",
+ "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (from f5-tts==0.0.0) (4.44.2)\n",
+ "Collecting transformers-stream-generator (from f5-tts==0.0.0)\n",
+ " Downloading transformers-stream-generator-0.0.5.tar.gz (13 kB)\n",
+ " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ "Collecting vocos (from f5-tts==0.0.0)\n",
+ " Downloading vocos-0.1.0-py3-none-any.whl.metadata (4.8 kB)\n",
+ "Requirement already satisfied: wandb in /usr/local/lib/python3.10/dist-packages (from f5-tts==0.0.0) (0.18.5)\n",
+ "Collecting x-transformers>=1.31.14 (from f5-tts==0.0.0)\n",
+ " Downloading x_transformers-1.42.5-py3-none-any.whl.metadata (689 bytes)\n",
+ "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.33.0->f5-tts==0.0.0) (24.1)\n",
+ "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.33.0->f5-tts==0.0.0) (5.9.5)\n",
+ "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.33.0->f5-tts==0.0.0) (6.0.2)\n",
+ "Requirement already satisfied: huggingface-hub>=0.21.0 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.33.0->f5-tts==0.0.0) (0.24.7)\n",
+ "Collecting aiofiles<24.0,>=22.0 (from gradio>=3.45.2->f5-tts==0.0.0)\n",
+ " Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)\n",
+ "Requirement already satisfied: anyio<5.0,>=3.0 in /usr/local/lib/python3.10/dist-packages (from gradio>=3.45.2->f5-tts==0.0.0) (3.7.1)\n",
+ "Collecting fastapi<1.0,>=0.115.2 (from gradio>=3.45.2->f5-tts==0.0.0)\n",
+ " Downloading fastapi-0.115.4-py3-none-any.whl.metadata (27 kB)\n",
+ "Collecting ffmpy (from gradio>=3.45.2->f5-tts==0.0.0)\n",
+ " Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)\n",
+ "Collecting gradio-client==1.4.2 (from gradio>=3.45.2->f5-tts==0.0.0)\n",
+ " Downloading gradio_client-1.4.2-py3-none-any.whl.metadata (7.1 kB)\n",
+ "Requirement already satisfied: httpx>=0.24.1 in /usr/local/lib/python3.10/dist-packages (from gradio>=3.45.2->f5-tts==0.0.0) (0.27.2)\n",
+ "Collecting huggingface-hub>=0.21.0 (from accelerate>=0.33.0->f5-tts==0.0.0)\n",
+ " Downloading huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)\n",
+ "Requirement already satisfied: jinja2<4.0 in /usr/local/lib/python3.10/dist-packages (from gradio>=3.45.2->f5-tts==0.0.0) (3.1.4)\n",
+ "Collecting markupsafe~=2.0 (from gradio>=3.45.2->f5-tts==0.0.0)\n",
+ " Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)\n",
+ "Requirement already satisfied: orjson~=3.0 in /usr/local/lib/python3.10/dist-packages (from gradio>=3.45.2->f5-tts==0.0.0) (3.10.10)\n",
+ "Requirement already satisfied: pandas<3.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from gradio>=3.45.2->f5-tts==0.0.0) (2.2.2)\n",
+ "Requirement already satisfied: pillow<12.0,>=8.0 in /usr/local/lib/python3.10/dist-packages (from gradio>=3.45.2->f5-tts==0.0.0) (10.4.0)\n",
+ "Requirement already satisfied: pydantic>=2.0 in /usr/local/lib/python3.10/dist-packages (from gradio>=3.45.2->f5-tts==0.0.0) (2.9.2)\n",
+ "Collecting python-multipart==0.0.12 (from gradio>=3.45.2->f5-tts==0.0.0)\n",
+ " Downloading python_multipart-0.0.12-py3-none-any.whl.metadata (1.9 kB)\n",
+ "Collecting ruff>=0.2.2 (from gradio>=3.45.2->f5-tts==0.0.0)\n",
+ " Downloading ruff-0.7.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)\n",
+ "Collecting safehttpx<1.0,>=0.1.1 (from gradio>=3.45.2->f5-tts==0.0.0)\n",
+ " Downloading safehttpx-0.1.1-py3-none-any.whl.metadata (4.1 kB)\n",
+ "Collecting semantic-version~=2.0 (from gradio>=3.45.2->f5-tts==0.0.0)\n",
+ " Downloading semantic_version-2.10.0-py2.py3-none-any.whl.metadata (9.7 kB)\n",
+ "Collecting starlette<1.0,>=0.40.0 (from gradio>=3.45.2->f5-tts==0.0.0)\n",
+ " Downloading starlette-0.41.2-py3-none-any.whl.metadata (6.0 kB)\n",
+ "Collecting tomlkit==0.12.0 (from gradio>=3.45.2->f5-tts==0.0.0)\n",
+ " Downloading tomlkit-0.12.0-py3-none-any.whl.metadata (2.7 kB)\n",
+ "Requirement already satisfied: typer<1.0,>=0.12 in /usr/local/lib/python3.10/dist-packages (from gradio>=3.45.2->f5-tts==0.0.0) (0.12.5)\n",
+ "Requirement already satisfied: typing-extensions~=4.0 in /usr/local/lib/python3.10/dist-packages (from gradio>=3.45.2->f5-tts==0.0.0) (4.12.2)\n",
+ "Collecting uvicorn>=0.14.0 (from gradio>=3.45.2->f5-tts==0.0.0)\n",
+ " Downloading uvicorn-0.32.0-py3-none-any.whl.metadata (6.6 kB)\n",
+ "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from gradio-client==1.4.2->gradio>=3.45.2->f5-tts==0.0.0) (2024.10.0)\n",
+ "Collecting websockets<13.0,>=10.0 (from gradio-client==1.4.2->gradio>=3.45.2->f5-tts==0.0.0)\n",
+ " Downloading websockets-12.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)\n",
+ "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch>=2.0.0->f5-tts==0.0.0) (3.16.1)\n",
+ "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=2.0.0->f5-tts==0.0.0) (3.4.2)\n",
+ "Requirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.10/dist-packages (from torch>=2.0.0->f5-tts==0.0.0) (1.13.1)\n",
+ "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy==1.13.1->torch>=2.0.0->f5-tts==0.0.0) (1.3.0)\n",
+ "Collecting einx>=0.3.0 (from x-transformers>=1.31.14->f5-tts==0.0.0)\n",
+ " Downloading einx-0.3.0-py3-none-any.whl.metadata (6.9 kB)\n",
+ "Requirement already satisfied: einops>=0.8.0 in /usr/local/lib/python3.10/dist-packages (from x-transformers>=1.31.14->f5-tts==0.0.0) (0.8.0)\n",
+ "Requirement already satisfied: requests<3.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from cached-path->f5-tts==0.0.0) (2.32.3)\n",
+ "Requirement already satisfied: rich<14.0,>=12.1 in /usr/local/lib/python3.10/dist-packages (from cached-path->f5-tts==0.0.0) (13.9.3)\n",
+ "Collecting filelock (from torch>=2.0.0->f5-tts==0.0.0)\n",
+ " Downloading filelock-3.13.4-py3-none-any.whl.metadata (2.8 kB)\n",
+ "Collecting boto3<2.0,>=1.0 (from cached-path->f5-tts==0.0.0)\n",
+ " Downloading boto3-1.35.54-py3-none-any.whl.metadata (6.7 kB)\n",
+ "Requirement already satisfied: google-cloud-storage<3.0,>=1.32.0 in /usr/local/lib/python3.10/dist-packages (from cached-path->f5-tts==0.0.0) (2.8.0)\n",
+ "INFO: pip is looking at multiple versions of cached-path to determine which version is compatible with other requirements. This could take a while.\n",
+ "Collecting cached-path (from f5-tts==0.0.0)\n",
+ " Downloading cached_path-1.6.2-py3-none-any.whl.metadata (19 kB)\n",
+ " Downloading cached_path-1.6.0-py3-none-any.whl.metadata (19 kB)\n",
+ " Downloading cached_path-1.5.1-py3-none-any.whl.metadata (19 kB)\n",
+ "Collecting filelock (from torch>=2.0.0->f5-tts==0.0.0)\n",
+ " Downloading filelock-3.12.4-py3-none-any.whl.metadata (2.8 kB)\n",
+ "Collecting cached-path (from f5-tts==0.0.0)\n",
+ " Downloading cached_path-1.5.0-py3-none-any.whl.metadata (19 kB)\n",
+ " Downloading cached_path-1.4.0-py3-none-any.whl.metadata (6.3 kB)\n",
+ " Downloading cached_path-1.3.5-py3-none-any.whl.metadata (6.3 kB)\n",
+ " Downloading cached_path-1.3.4-py3-none-any.whl.metadata (6.3 kB)\n",
+ "Collecting filelock (from torch>=2.0.0->f5-tts==0.0.0)\n",
+ " Downloading filelock-3.9.1-py3-none-any.whl.metadata (2.4 kB)\n",
+ "INFO: pip is still looking at multiple versions of cached-path to determine which version is compatible with other requirements. This could take a while.\n",
+ "Collecting cached-path (from f5-tts==0.0.0)\n",
+ " Downloading cached_path-1.3.3-py3-none-any.whl.metadata (6.3 kB)\n",
+ " Downloading cached_path-1.3.2-py3-none-any.whl.metadata (6.3 kB)\n",
+ " Downloading cached_path-1.3.1-py3-none-any.whl.metadata (6.3 kB)\n",
+ " Downloading cached_path-1.3.0-py3-none-any.whl.metadata (6.3 kB)\n",
+ " Downloading cached_path-1.2.0-py3-none-any.whl.metadata (6.0 kB)\n",
+ "INFO: This is taking longer than usual. You might need to provide the dependency resolver with stricter constraints to reduce runtime. See https://pip.pypa.io/warnings/backtracking for guidance. If you want to abort this run, press Ctrl + C.\n",
+ " Downloading cached_path-1.1.6-py3-none-any.whl.metadata (6.0 kB)\n",
+ "Collecting rich<13.0,>=12.1 (from cached-path->f5-tts==0.0.0)\n",
+ " Downloading rich-12.6.0-py3-none-any.whl.metadata (18 kB)\n",
+ "Collecting filelock (from torch>=2.0.0->f5-tts==0.0.0)\n",
+ " Downloading filelock-3.8.2-py3-none-any.whl.metadata (2.3 kB)\n",
+ "Collecting cached-path (from f5-tts==0.0.0)\n",
+ " Downloading cached_path-1.1.5-py3-none-any.whl.metadata (6.0 kB)\n",
+ "Collecting filelock (from torch>=2.0.0->f5-tts==0.0.0)\n",
+ " Downloading filelock-3.7.1-py3-none-any.whl.metadata (2.5 kB)\n",
+ "Collecting cached-path (from f5-tts==0.0.0)\n",
+ " Downloading cached_path-1.1.4-py3-none-any.whl.metadata (6.0 kB)\n",
+ " Downloading cached_path-1.1.3-py3-none-any.whl.metadata (6.0 kB)\n",
+ " Downloading cached_path-1.1.2-py3-none-any.whl.metadata (6.0 kB)\n",
+ " Downloading cached_path-1.1.1-py3-none-any.whl.metadata (6.0 kB)\n",
+ " Downloading cached_path-1.1.0-py3-none-any.whl.metadata (6.0 kB)\n",
+ " Downloading cached_path-1.0.2-py3-none-any.whl.metadata (6.0 kB)\n",
+ " Downloading cached_path-1.0.1-py3-none-any.whl.metadata (5.9 kB)\n",
+ " Downloading cached_path-1.0.0-py3-none-any.whl.metadata (5.9 kB)\n",
+ " Downloading cached_path-0.3.4-py3-none-any.whl.metadata (6.0 kB)\n",
+ " Downloading cached_path-0.3.3-py3-none-any.whl.metadata (6.0 kB)\n",
+ " Downloading cached_path-0.3.2-py3-none-any.whl.metadata (6.0 kB)\n",
+ " Downloading cached_path-0.3.1-py3-none-any.whl.metadata (5.1 kB)\n",
+ " Downloading cached_path-0.3.0-py3-none-any.whl.metadata (4.1 kB)\n",
+ "Collecting google-cloud-storage<2.0,>=1.0 (from cached-path->f5-tts==0.0.0)\n",
+ " Downloading google_cloud_storage-1.44.0-py2.py3-none-any.whl.metadata (5.9 kB)\n",
+ "Collecting overrides<6.2,>=3.1 (from cached-path->f5-tts==0.0.0)\n",
+ " Downloading overrides-6.1.0-py3-none-any.whl.metadata (5.0 kB)\n",
+ "Collecting cached-path (from f5-tts==0.0.0)\n",
+ " Downloading cached_path-0.2.0-py3-none-any.whl.metadata (4.1 kB)\n",
+ " Downloading cached_path-0.1.0-py3-none-any.whl.metadata (3.4 kB)\n",
+ "Collecting overrides==3.1.0 (from cached-path->f5-tts==0.0.0)\n",
+ " Downloading overrides-3.1.0.tar.gz (11 kB)\n",
+ " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ "Collecting gradio>=3.45.2 (from f5-tts==0.0.0)\n",
+ " Downloading gradio-5.3.0-py3-none-any.whl.metadata (15 kB)\n",
+ "Collecting python-multipart>=0.0.9 (from gradio>=3.45.2->f5-tts==0.0.0)\n",
+ " Downloading python_multipart-0.0.17-py3-none-any.whl.metadata (1.8 kB)\n",
+ "Collecting gradio>=3.45.2 (from f5-tts==0.0.0)\n",
+ " Downloading gradio-5.1.0-py3-none-any.whl.metadata (15 kB)\n",
+ "Collecting gradio-client==1.4.0 (from gradio>=3.45.2->f5-tts==0.0.0)\n",
+ " Downloading gradio_client-1.4.0-py3-none-any.whl.metadata (7.1 kB)\n",
+ "Collecting gradio>=3.45.2 (from f5-tts==0.0.0)\n",
+ " Downloading gradio-5.0.2-py3-none-any.whl.metadata (15 kB)\n",
+ " Downloading gradio-5.0.1-py3-none-any.whl.metadata (15 kB)\n",
+ " Downloading gradio-5.0.0-py3-none-any.whl.metadata (15 kB)\n",
+ " Downloading gradio-4.44.1-py3-none-any.whl.metadata (15 kB)\n",
+ "Collecting gradio-client==1.3.0 (from gradio>=3.45.2->f5-tts==0.0.0)\n",
+ " Downloading gradio_client-1.3.0-py3-none-any.whl.metadata (7.1 kB)\n",
+ "Requirement already satisfied: importlib-resources<7.0,>=1.3 in /usr/local/lib/python3.10/dist-packages (from gradio>=3.45.2->f5-tts==0.0.0) (6.4.5)\n",
+ "Requirement already satisfied: urllib3~=2.0 in /usr/local/lib/python3.10/dist-packages (from gradio>=3.45.2->f5-tts==0.0.0) (2.2.3)\n",
+ "Collecting huggingface-hub>=0.21.0 (from accelerate>=0.33.0->f5-tts==0.0.0)\n",
+ " Downloading huggingface_hub-0.23.5-py3-none-any.whl.metadata (12 kB)\n",
+ "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->f5-tts==0.0.0) (1.3.0)\n",
+ "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->f5-tts==0.0.0) (0.12.1)\n",
+ "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->f5-tts==0.0.0) (4.54.1)\n",
+ "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->f5-tts==0.0.0) (1.4.7)\n",
+ "Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->f5-tts==0.0.0) (3.2.0)\n",
+ "Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->f5-tts==0.0.0) (2.8.2)\n",
+ "Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets->f5-tts==0.0.0) (17.0.0)\n",
+ "Collecting dill<0.3.9,>=0.3.0 (from datasets->f5-tts==0.0.0)\n",
+ " Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)\n",
+ "Collecting xxhash (from datasets->f5-tts==0.0.0)\n",
+ " Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n",
+ "Collecting multiprocess<0.70.17 (from datasets->f5-tts==0.0.0)\n",
+ " Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)\n",
+ "Collecting fsspec (from gradio-client==1.3.0->gradio>=3.45.2->f5-tts==0.0.0)\n",
+ " Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)\n",
+ "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets->f5-tts==0.0.0) (3.10.10)\n",
+ "Requirement already satisfied: audioread>=2.1.9 in /usr/local/lib/python3.10/dist-packages (from librosa->f5-tts==0.0.0) (3.0.1)\n",
+ "Requirement already satisfied: scipy>=1.2.0 in /usr/local/lib/python3.10/dist-packages (from librosa->f5-tts==0.0.0) (1.13.1)\n",
+ "Requirement already satisfied: scikit-learn>=0.20.0 in /usr/local/lib/python3.10/dist-packages (from librosa->f5-tts==0.0.0) (1.5.2)\n",
+ "Requirement already satisfied: joblib>=0.14 in /usr/local/lib/python3.10/dist-packages (from librosa->f5-tts==0.0.0) (1.4.2)\n",
+ "Requirement already satisfied: decorator>=4.3.0 in /usr/local/lib/python3.10/dist-packages (from librosa->f5-tts==0.0.0) (4.4.2)\n",
+ "Requirement already satisfied: numba>=0.51.0 in /usr/local/lib/python3.10/dist-packages (from librosa->f5-tts==0.0.0) (0.60.0)\n",
+ "Requirement already satisfied: pooch>=1.1 in /usr/local/lib/python3.10/dist-packages (from librosa->f5-tts==0.0.0) (1.8.2)\n",
+ "Requirement already satisfied: soxr>=0.3.2 in /usr/local/lib/python3.10/dist-packages (from librosa->f5-tts==0.0.0) (0.5.0.post1)\n",
+ "Requirement already satisfied: lazy-loader>=0.1 in /usr/local/lib/python3.10/dist-packages (from librosa->f5-tts==0.0.0) (0.4)\n",
+ "Requirement already satisfied: msgpack>=1.0 in /usr/local/lib/python3.10/dist-packages (from librosa->f5-tts==0.0.0) (1.1.0)\n",
+ "Requirement already satisfied: cffi>=1.0 in /usr/local/lib/python3.10/dist-packages (from soundfile->f5-tts==0.0.0) (1.17.1)\n",
+ "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers->f5-tts==0.0.0) (2024.9.11)\n",
+ "Requirement already satisfied: tokenizers<0.20,>=0.19 in /usr/local/lib/python3.10/dist-packages (from transformers->f5-tts==0.0.0) (0.19.1)\n",
+ "Collecting encodec==0.1.1 (from vocos->f5-tts==0.0.0)\n",
+ " Downloading encodec-0.1.1.tar.gz (3.7 MB)\n",
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.7/3.7 MB\u001b[0m \u001b[31m32.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+ "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
+ "Requirement already satisfied: docker-pycreds>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from wandb->f5-tts==0.0.0) (0.4.0)\n",
+ "Requirement already satisfied: gitpython!=3.1.29,>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from wandb->f5-tts==0.0.0) (3.1.43)\n",
+ "Requirement already satisfied: platformdirs in /usr/local/lib/python3.10/dist-packages (from wandb->f5-tts==0.0.0) (4.3.6)\n",
+ "Requirement already satisfied: protobuf!=4.21.0,!=5.28.0,<6,>=3.19.0 in /usr/local/lib/python3.10/dist-packages (from wandb->f5-tts==0.0.0) (3.20.3)\n",
+ "Requirement already satisfied: sentry-sdk>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from wandb->f5-tts==0.0.0) (2.17.0)\n",
+ "Requirement already satisfied: setproctitle in /usr/local/lib/python3.10/dist-packages (from wandb->f5-tts==0.0.0) (1.3.3)\n",
+ "Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from wandb->f5-tts==0.0.0) (75.1.0)\n",
+ "Requirement already satisfied: idna>=2.8 in /usr/local/lib/python3.10/dist-packages (from anyio<5.0,>=3.0->gradio>=3.45.2->f5-tts==0.0.0) (3.10)\n",
+ "Requirement already satisfied: sniffio>=1.1 in /usr/local/lib/python3.10/dist-packages (from anyio<5.0,>=3.0->gradio>=3.45.2->f5-tts==0.0.0) (1.3.1)\n",
+ "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<5.0,>=3.0->gradio>=3.45.2->f5-tts==0.0.0) (1.2.2)\n",
+ "Collecting botocore<1.36.0,>=1.35.54 (from boto3<2.0,>=1.0->cached-path->f5-tts==0.0.0)\n",
+ " Downloading botocore-1.35.54-py3-none-any.whl.metadata (5.7 kB)\n",
+ "Collecting jmespath<2.0.0,>=0.7.1 (from boto3<2.0,>=1.0->cached-path->f5-tts==0.0.0)\n",
+ " Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)\n",
+ "Collecting s3transfer<0.11.0,>=0.10.0 (from boto3<2.0,>=1.0->cached-path->f5-tts==0.0.0)\n",
+ " Downloading s3transfer-0.10.3-py3-none-any.whl.metadata (1.7 kB)\n",
+ "Requirement already satisfied: pycparser in /usr/local/lib/python3.10/dist-packages (from cffi>=1.0->soundfile->f5-tts==0.0.0) (2.22)\n",
+ "Requirement already satisfied: six>=1.4.0 in /usr/local/lib/python3.10/dist-packages (from docker-pycreds>=0.4.0->wandb->f5-tts==0.0.0) (1.16.0)\n",
+ "Requirement already satisfied: frozendict in /usr/local/lib/python3.10/dist-packages (from einx>=0.3.0->x-transformers>=1.31.14->f5-tts==0.0.0) (2.4.6)\n",
+ "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->f5-tts==0.0.0) (2.4.3)\n",
+ "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->f5-tts==0.0.0) (1.3.1)\n",
+ "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->f5-tts==0.0.0) (24.2.0)\n",
+ "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->f5-tts==0.0.0) (1.5.0)\n",
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->f5-tts==0.0.0) (6.1.0)\n",
+ "Requirement already satisfied: yarl<2.0,>=1.12.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->f5-tts==0.0.0) (1.17.0)\n",
+ "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->f5-tts==0.0.0) (4.0.3)\n",
+ "Requirement already satisfied: gitdb<5,>=4.0.1 in /usr/local/lib/python3.10/dist-packages (from gitpython!=3.1.29,>=1.0.0->wandb->f5-tts==0.0.0) (4.0.11)\n",
+ "Requirement already satisfied: google-auth<3.0dev,>=1.25.0 in /usr/local/lib/python3.10/dist-packages (from google-cloud-storage<3.0,>=1.32.0->cached-path->f5-tts==0.0.0) (2.27.0)\n",
+ "Requirement already satisfied: google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5 in /usr/local/lib/python3.10/dist-packages (from google-cloud-storage<3.0,>=1.32.0->cached-path->f5-tts==0.0.0) (2.19.2)\n",
+ "Requirement already satisfied: google-cloud-core<3.0dev,>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from google-cloud-storage<3.0,>=1.32.0->cached-path->f5-tts==0.0.0) (2.4.1)\n",
+ "Requirement already satisfied: google-resumable-media>=2.3.2 in /usr/local/lib/python3.10/dist-packages (from google-cloud-storage<3.0,>=1.32.0->cached-path->f5-tts==0.0.0) (2.7.2)\n",
+ "Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from httpx>=0.24.1->gradio>=3.45.2->f5-tts==0.0.0) (2024.8.30)\n",
+ "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.10/dist-packages (from httpx>=0.24.1->gradio>=3.45.2->f5-tts==0.0.0) (1.0.6)\n",
+ "Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.10/dist-packages (from httpcore==1.*->httpx>=0.24.1->gradio>=3.45.2->f5-tts==0.0.0) (0.14.0)\n",
+ "Requirement already satisfied: llvmlite<0.44,>=0.43.0dev0 in /usr/local/lib/python3.10/dist-packages (from numba>=0.51.0->librosa->f5-tts==0.0.0) (0.43.0)\n",
+ "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas<3.0,>=1.0->gradio>=3.45.2->f5-tts==0.0.0) (2024.2)\n",
+ "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas<3.0,>=1.0->gradio>=3.45.2->f5-tts==0.0.0) (2024.2)\n",
+ "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.10/dist-packages (from pydantic>=2.0->gradio>=3.45.2->f5-tts==0.0.0) (0.7.0)\n",
+ "Requirement already satisfied: pydantic-core==2.23.4 in /usr/local/lib/python3.10/dist-packages (from pydantic>=2.0->gradio>=3.45.2->f5-tts==0.0.0) (2.23.4)\n",
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3.0,>=2.0->cached-path->f5-tts==0.0.0) (3.4.0)\n",
+ "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich<14.0,>=12.1->cached-path->f5-tts==0.0.0) (3.0.0)\n",
+ "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich<14.0,>=12.1->cached-path->f5-tts==0.0.0) (2.18.0)\n",
+ "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.20.0->librosa->f5-tts==0.0.0) (3.5.0)\n",
+ "Requirement already satisfied: shellingham>=1.3.0 in /usr/local/lib/python3.10/dist-packages (from typer<1.0,>=0.12->gradio>=3.45.2->f5-tts==0.0.0) (1.5.4)\n",
+ "Requirement already satisfied: smmap<6,>=3.0.1 in /usr/local/lib/python3.10/dist-packages (from gitdb<5,>=4.0.1->gitpython!=3.1.29,>=1.0.0->wandb->f5-tts==0.0.0) (5.0.1)\n",
+ "Requirement already satisfied: googleapis-common-protos<2.0.dev0,>=1.56.2 in /usr/local/lib/python3.10/dist-packages (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5->google-cloud-storage<3.0,>=1.32.0->cached-path->f5-tts==0.0.0) (1.65.0)\n",
+ "Requirement already satisfied: proto-plus<2.0.0dev,>=1.22.3 in /usr/local/lib/python3.10/dist-packages (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5->google-cloud-storage<3.0,>=1.32.0->cached-path->f5-tts==0.0.0) (1.25.0)\n",
+ "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from google-auth<3.0dev,>=1.25.0->google-cloud-storage<3.0,>=1.32.0->cached-path->f5-tts==0.0.0) (5.5.0)\n",
+ "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.10/dist-packages (from google-auth<3.0dev,>=1.25.0->google-cloud-storage<3.0,>=1.32.0->cached-path->f5-tts==0.0.0) (0.4.1)\n",
+ "Requirement already satisfied: rsa<5,>=3.1.4 in /usr/local/lib/python3.10/dist-packages (from google-auth<3.0dev,>=1.25.0->google-cloud-storage<3.0,>=1.32.0->cached-path->f5-tts==0.0.0) (4.9)\n",
+ "Requirement already satisfied: google-crc32c<2.0dev,>=1.0 in /usr/local/lib/python3.10/dist-packages (from google-resumable-media>=2.3.2->google-cloud-storage<3.0,>=1.32.0->cached-path->f5-tts==0.0.0) (1.6.0)\n",
291
+ "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py>=2.2.0->rich<14.0,>=12.1->cached-path->f5-tts==0.0.0) (0.1.2)\n",
292
+ "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from yarl<2.0,>=1.12.0->aiohttp->datasets->f5-tts==0.0.0) (0.2.0)\n",
293
+ "Requirement already satisfied: pyasn1<0.7.0,>=0.4.6 in /usr/local/lib/python3.10/dist-packages (from pyasn1-modules>=0.2.1->google-auth<3.0dev,>=1.25.0->google-cloud-storage<3.0,>=1.32.0->cached-path->f5-tts==0.0.0) (0.6.1)\n",
294
+ "Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)\n",
295
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m122.4/122.4 MB\u001b[0m \u001b[31m6.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
296
+ "\u001b[?25hDownloading ema_pytorch-0.7.3-py3-none-any.whl (9.6 kB)\n",
297
+ "Downloading gradio-4.44.1-py3-none-any.whl (18.1 MB)\n",
298
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m18.1/18.1 MB\u001b[0m \u001b[31m53.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
299
+ "\u001b[?25hDownloading gradio_client-1.3.0-py3-none-any.whl (318 kB)\n",
300
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m318.7/318.7 kB\u001b[0m \u001b[31m17.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
301
+ "\u001b[?25hDownloading tomlkit-0.12.0-py3-none-any.whl (37 kB)\n",
302
+ "Downloading cached_path-1.6.3-py3-none-any.whl (35 kB)\n",
303
+ "Downloading huggingface_hub-0.23.5-py3-none-any.whl (402 kB)\n",
304
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m402.8/402.8 kB\u001b[0m \u001b[31m20.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
305
+ "\u001b[?25hDownloading x_transformers-1.42.5-py3-none-any.whl (47 kB)\n",
306
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m47.5/47.5 kB\u001b[0m \u001b[31m3.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
307
+ "\u001b[?25hDownloading datasets-3.1.0-py3-none-any.whl (480 kB)\n",
308
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m480.6/480.6 kB\u001b[0m \u001b[31m23.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
309
+ "\u001b[?25hDownloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)\n",
310
+ "Downloading pypinyin-0.53.0-py2.py3-none-any.whl (834 kB)\n",
311
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m834.7/834.7 kB\u001b[0m \u001b[31m32.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
312
+ "\u001b[?25hDownloading torchdiffeq-0.2.4-py3-none-any.whl (32 kB)\n",
313
+ "Downloading vocos-0.1.0-py3-none-any.whl (24 kB)\n",
314
+ "Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)\n",
315
+ "Downloading boto3-1.35.54-py3-none-any.whl (139 kB)\n",
316
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m139.2/139.2 kB\u001b[0m \u001b[31m9.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
317
+ "\u001b[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)\n",
318
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m7.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
319
+ "\u001b[?25hDownloading einx-0.3.0-py3-none-any.whl (102 kB)\n",
320
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m103.0/103.0 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
321
+ "\u001b[?25hDownloading fastapi-0.115.4-py3-none-any.whl (94 kB)\n",
322
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m94.7/94.7 kB\u001b[0m \u001b[31m4.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
323
+ "\u001b[?25hDownloading filelock-3.13.4-py3-none-any.whl (11 kB)\n",
324
+ "Downloading fsspec-2024.9.0-py3-none-any.whl (179 kB)\n",
325
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m179.3/179.3 kB\u001b[0m \u001b[31m9.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
326
+ "\u001b[?25hDownloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (25 kB)\n",
327
+ "Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)\n",
328
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m9.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
329
+ "\u001b[?25hDownloading python_multipart-0.0.17-py3-none-any.whl (24 kB)\n",
330
+ "Downloading ruff-0.7.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.0 MB)\n",
331
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m11.0/11.0 MB\u001b[0m \u001b[31m90.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
332
+ "\u001b[?25hDownloading semantic_version-2.10.0-py2.py3-none-any.whl (15 kB)\n",
333
+ "Downloading uvicorn-0.32.0-py3-none-any.whl (63 kB)\n",
334
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m63.7/63.7 kB\u001b[0m \u001b[31m4.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
335
+ "\u001b[?25hDownloading ffmpy-0.4.0-py3-none-any.whl (5.8 kB)\n",
336
+ "Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n",
337
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m12.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
338
+ "\u001b[?25hDownloading botocore-1.35.54-py3-none-any.whl (12.7 MB)\n",
339
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.7/12.7 MB\u001b[0m \u001b[31m78.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
340
+ "\u001b[?25hDownloading jmespath-1.0.1-py3-none-any.whl (20 kB)\n",
341
+ "Downloading s3transfer-0.10.3-py3-none-any.whl (82 kB)\n",
342
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m82.6/82.6 kB\u001b[0m \u001b[31m5.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
343
+ "\u001b[?25hDownloading starlette-0.41.2-py3-none-any.whl (73 kB)\n",
344
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m73.3/73.3 kB\u001b[0m \u001b[31m5.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
345
+ "\u001b[?25hDownloading websockets-12.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (130 kB)\n",
346
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m130.2/130.2 kB\u001b[0m \u001b[31m6.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
347
+ "\u001b[?25hBuilding wheels for collected packages: f5-tts, transformers-stream-generator, encodec\n",
348
+ " Building wheel for f5-tts (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
349
+ " Created wheel for f5-tts: filename=f5_tts-0.0.0-py3-none-any.whl size=1280758 sha256=b900d02175b3eccb21dec0d1801d9bffc4fd30451509a67ef45a6f0653580c7c\n",
350
+ " Stored in directory: /tmp/pip-ephem-wheel-cache-3_lciyxv/wheels/63/43/c5/fb04687e74a83e9bc15bb575c33e53b6aca4bf7a2fb32982f2\n",
351
+ " Building wheel for transformers-stream-generator (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
352
+ " Created wheel for transformers-stream-generator: filename=transformers_stream_generator-0.0.5-py3-none-any.whl size=12425 sha256=0e9f6ab066ff4bcfa59962bd47464e4dc82ea27afed7086565aad6d86b0b6e5b\n",
353
+ " Stored in directory: /root/.cache/pip/wheels/95/4a/90/140f7b67d125906f6a165f38aad212ecb4a695ad0d87582437\n",
354
+ " Building wheel for encodec (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
355
+ " Created wheel for encodec: filename=encodec-0.1.1-py3-none-any.whl size=45760 sha256=bccc14b58f86cf341371dda33deedf0bd6ef90e06649662b5337eb0e9e785a92\n",
356
+ " Stored in directory: /root/.cache/pip/wheels/fc/36/cb/81af8b985a5f5e0815312d5e52b41263237af07b977e6bcbf3\n",
357
+ "Successfully built f5-tts transformers-stream-generator encodec\n",
358
+ "Installing collected packages: pydub, xxhash, websockets, uvicorn, tomlkit, semantic-version, ruff, python-multipart, pypinyin, markupsafe, jmespath, fsspec, filelock, ffmpy, dill, aiofiles, starlette, multiprocess, huggingface-hub, einx, botocore, s3transfer, gradio-client, fastapi, x-transformers, torchdiffeq, gradio, ema-pytorch, boto3, bitsandbytes, transformers-stream-generator, encodec, datasets, vocos, cached-path, f5-tts\n",
359
+ " Attempting uninstall: markupsafe\n",
360
+ " Found existing installation: MarkupSafe 3.0.2\n",
361
+ " Uninstalling MarkupSafe-3.0.2:\n",
362
+ " Successfully uninstalled MarkupSafe-3.0.2\n",
363
+ " Attempting uninstall: fsspec\n",
364
+ " Found existing installation: fsspec 2024.10.0\n",
365
+ " Uninstalling fsspec-2024.10.0:\n",
366
+ " Successfully uninstalled fsspec-2024.10.0\n",
367
+ " Attempting uninstall: filelock\n",
368
+ " Found existing installation: filelock 3.16.1\n",
369
+ " Uninstalling filelock-3.16.1:\n",
370
+ " Successfully uninstalled filelock-3.16.1\n",
371
+ " Attempting uninstall: huggingface-hub\n",
372
+ " Found existing installation: huggingface-hub 0.24.7\n",
373
+ " Uninstalling huggingface-hub-0.24.7:\n",
374
+ " Successfully uninstalled huggingface-hub-0.24.7\n",
375
+ "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
376
+ "gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.\n",
377
+ "pytensor 2.25.5 requires filelock>=3.15, but you have filelock 3.13.4 which is incompatible.\u001b[0m\u001b[31m\n",
378
+ "\u001b[0mSuccessfully installed aiofiles-23.2.1 bitsandbytes-0.44.1 boto3-1.35.54 botocore-1.35.54 cached-path-1.6.3 datasets-3.1.0 dill-0.3.8 einx-0.3.0 ema-pytorch-0.7.3 encodec-0.1.1 f5-tts-0.0.0 fastapi-0.115.4 ffmpy-0.4.0 filelock-3.13.4 fsspec-2024.9.0 gradio-4.44.1 gradio-client-1.3.0 huggingface-hub-0.23.5 jmespath-1.0.1 markupsafe-2.1.5 multiprocess-0.70.16 pydub-0.25.1 pypinyin-0.53.0 python-multipart-0.0.17 ruff-0.7.2 s3transfer-0.10.3 semantic-version-2.10.0 starlette-0.41.2 tomlkit-0.12.0 torchdiffeq-0.2.4 transformers-stream-generator-0.0.5 uvicorn-0.32.0 vocos-0.1.0 websockets-12.0 x-transformers-1.42.5 xxhash-3.5.0\n"
379
+ ]
380
+ }
381
+ ],
382
+ "source": [
383
+ "pip install git+https://github.com/jpgallegoar/Spanish-F5.git"
384
+ ]
385
+ },
386
+ {
387
+ "cell_type": "code",
388
+ "source": [
389
+ "!f5-tts_infer-gradio"
390
+ ],
391
+ "metadata": {
392
+ "colab": {
393
+ "base_uri": "https://localhost:8080/"
394
+ },
395
+ "id": "a2LYVge6IhOh",
396
+ "outputId": "be4d8d64-c020-4c97-eb57-ed0ceced7fba"
397
+ },
398
+ "execution_count": null,
399
+ "outputs": [
400
+ {
401
+ "output_type": "stream",
402
+ "name": "stdout",
403
+ "text": [
404
+ "2024-11-04 17:05:51.302161: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
405
+ "2024-11-04 17:05:51.328444: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
406
+ "2024-11-04 17:05:51.335942: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
407
+ "2024-11-04 17:05:51.354757: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
408
+ "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
409
+ "2024-11-04 17:05:52.875946: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
410
+ "Download Vocos from huggingface charactr/vocos-mel-24khz\n",
411
+ "config.yaml: 100% 461/461 [00:00<00:00, 2.61MB/s]\n",
412
+ "pytorch_model.bin: 100% 54.4M/54.4M [00:00<00:00, 139MB/s]\n",
413
+ "/usr/local/lib/python3.10/dist-packages/vocos/pretrained.py:70: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n",
414
+ " state_dict = torch.load(model_path, map_location=\"cpu\")\n",
415
+ "model_1200000.safetensors: 100% 1.35G/1.35G [02:09<00:00, 10.4MB/s]\n",
416
+ "\n",
417
+ "vocab : /usr/local/lib/python3.10/dist-packages/f5_tts/infer/examples/vocab.txt\n",
418
+ "tokenizer : custom\n",
419
+ "model : /root/.cache/huggingface/hub/models--jpgallegoar--F5-Spanish/snapshots/120ddcfa7813b928325a787b9ad2bf038d2c32df/model_1200000.safetensors \n",
420
+ "\n",
421
+ "Starting app...\n",
422
+ "Running on local URL: http://127.0.0.1:7860\n",
423
+ "Running on public URL: https://dd1c9f211495273ff6.gradio.live\n",
424
+ "\n",
425
+ "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)\n",
426
+ "config.json: 100% 1.26k/1.26k [00:00<00:00, 1.89MB/s]\n",
427
+ "model.safetensors: 21% 336M/1.62G [00:32<02:03, 10.4MB/s]"
428
+ ]
429
+ }
430
+ ]
431
+ }
432
+ ]
433
+ }
app.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
+ def greet(name):
4
+ return f"Hola {name}!"
5
+
6
+ iface = gr.Interface(fn=greet, inputs="text", outputs="text")
7
+ iface.launch()
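+
+ # Note: this is a minimal placeholder Interface. To serve the full F5-TTS demo
+ # from app.py instead, one option (assuming the entry-point target listed in
+ # src/f5_tts.egg-info/entry_points.txt stays importable) would be:
+ #
+ # from f5_tts.infer.infer_gradio import main
+ # main()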
pyproject.toml ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["setuptools >= 61.0", "setuptools-scm>=8.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "f5-tts"
7
+ dynamic = ["version"]
8
+ description = "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching"
9
+ readme = "README.md"
10
+ license = {text = "MIT License"}
11
+ classifiers = [
12
+ "License :: OSI Approved :: MIT License",
13
+ "Operating System :: OS Independent",
14
+ "Programming Language :: Python :: 3",
15
+ ]
16
+ dependencies = [
17
+ "accelerate>=0.33.0",
18
+ "bitsandbytes==0.44.1",
19
+ "cached_path",
20
+ "click",
21
+ "datasets",
22
+ "ema_pytorch>=0.5.2",
23
+ "gradio>=3.45.2",
24
+ "jieba",
25
+ "librosa",
26
+ "matplotlib",
27
+ "numpy<=1.26.4",
28
+ "pydub",
29
+ "pypinyin",
30
+ "safetensors",
31
+ "soundfile",
32
+ "tomli",
33
+ "torch>=2.0.0",
34
+ "torchaudio>=2.0.0",
35
+ "torchdiffeq",
36
+ "tqdm>=4.65.0",
37
+ "transformers",
38
+ "transformers_stream_generator",
39
+ "vocos",
40
+ "wandb",
41
+ "x_transformers>=1.31.14",
42
+ "num2words",
43
+ ]
44
+
45
+ [project.optional-dependencies]
46
+ eval = [
47
+ "faster_whisper==0.10.1",
48
+ "funasr",
49
+ "jiwer",
50
+ "modelscope",
51
+ "zhconv",
52
+ "zhon",
53
+ ]
54
+
55
+ [project.urls]
56
+ Homepage = "https://github.com/SWivid/F5-TTS"
57
+
58
+ [project.scripts]
59
+ "f5-tts_infer-cli" = "f5_tts.infer.infer_cli:main"
60
+ "f5-tts_infer-gradio" = "f5_tts.infer.infer_gradio:main"
61
+ "f5-tts_finetune-cli" = "f5_tts.train.finetune_cli:main"
62
+ "f5-tts_finetune-gradio" = "f5_tts.train.finetune_gradio:main"
ref_audio.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43beaf8e4f7097eff57943f10f166561e4dfbfb8fe34fafd188e2b123468eebc
3
+ size 1441870
ruff.toml ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ line-length = 120
2
+ target-version = "py310"
3
+
4
+ [lint]
5
+ # Only ignore variables with names starting with "_".
6
+ dummy-variable-rgx = "^_.*$"
7
+
8
+ [lint.isort]
9
+ force-single-line = true
10
+ lines-after-imports = 2
src/f5_tts.egg-info/PKG-INFO ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.1
2
+ Name: f5-tts
3
+ Version: 0.0.0
4
+ Summary: F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching
5
+ License: MIT License
6
+ Project-URL: Homepage, https://github.com/SWivid/F5-TTS
7
+ Classifier: License :: OSI Approved :: MIT License
8
+ Classifier: Operating System :: OS Independent
9
+ Classifier: Programming Language :: Python :: 3
10
+ Description-Content-Type: text/markdown
11
+ License-File: LICENSE
12
+ Requires-Dist: accelerate>=0.33.0
13
+ Requires-Dist: bitsandbytes==0.44.1
14
+ Requires-Dist: cached_path
15
+ Requires-Dist: click
16
+ Requires-Dist: datasets
17
+ Requires-Dist: ema_pytorch>=0.5.2
18
+ Requires-Dist: gradio>=3.45.2
19
+ Requires-Dist: jieba
20
+ Requires-Dist: librosa
21
+ Requires-Dist: matplotlib
22
+ Requires-Dist: numpy<=1.26.4
23
+ Requires-Dist: pydub
24
+ Requires-Dist: pypinyin
25
+ Requires-Dist: safetensors
26
+ Requires-Dist: soundfile
27
+ Requires-Dist: tomli
28
+ Requires-Dist: torch>=2.0.0
29
+ Requires-Dist: torchaudio>=2.0.0
30
+ Requires-Dist: torchdiffeq
31
+ Requires-Dist: tqdm>=4.65.0
32
+ Requires-Dist: transformers
33
+ Requires-Dist: transformers_stream_generator
34
+ Requires-Dist: vocos
35
+ Requires-Dist: wandb
36
+ Requires-Dist: x_transformers>=1.31.14
37
+ Requires-Dist: num2words
38
+ Provides-Extra: eval
39
+ Requires-Dist: faster_whisper==0.10.1; extra == "eval"
40
+ Requires-Dist: funasr; extra == "eval"
41
+ Requires-Dist: jiwer; extra == "eval"
42
+ Requires-Dist: modelscope; extra == "eval"
43
+ Requires-Dist: zhconv; extra == "eval"
44
+ Requires-Dist: zhon; extra == "eval"
45
+
46
+ # F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching
47
+
48
+ **F5-TTS**: Diffusion Transformer with ConvNeXt V2; faster training and inference.
49
+
50
+ **E2 TTS**: Flat-UNet Transformer, closest reproduction from [paper](https://arxiv.org/abs/2406.18009).
51
+
52
+ **Sway Sampling**: Inference-time flow step sampling strategy that greatly improves performance.
53
+
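+ A minimal sketch of the sway schedule, assuming the same warp as the upstream
+ `model/cfm.py` sampler (the function name here is illustrative):
+
+ ```python
+ import torch
+
+ def sway_sampled_steps(nfe_step: int = 32, s: float = -1.0) -> torch.Tensor:
+     # Warp a uniform flow-step grid: t <- t + s * (cos(pi/2 * t) - 1 + t).
+     # With s < 0 the schedule concentrates steps early in the flow.
+     t = torch.linspace(0, 1, nfe_step)
+     return t + s * (torch.cos(torch.pi / 2 * t) - 1 + t)
+
+ print(sway_sampled_steps(8))  # denser near t=0 for the default s=-1
+ ```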
54
+ ### Thanks to all the contributors!
55
+
56
+ ## News
57
+ Spanish model: https://huggingface.co/jpgallegoar/F5-Spanish/
58
+
59
+ ## Installation
60
+
61
+ ```bash
62
+ # Create a python 3.10 conda env (you could also use virtualenv)
63
+ conda create -n f5-tts python=3.10
64
+ conda activate f5-tts
65
+
66
+ # Install pytorch with your CUDA version, e.g.
67
+ pip install torch==2.3.0+cu118 torchaudio==2.3.0+cu118 --extra-index-url https://download.pytorch.org/whl/cu118
68
+ ```
69
+
70
+ Then you can choose from a few options below:
71
+
72
+ ### 1. As a pip package (if just for inference)
73
+
74
+ ```bash
75
+ pip install git+https://github.com/jpgallegoar/Spanish-F5.git
76
+ ```
77
+
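+ Once installed, inference can also be scripted through the Python API in
+ `src/f5_tts/api.py`. A minimal sketch; the reference clip and texts are
+ placeholders, and the Spanish checkpoint path is inferred from the notebook
+ output above:
+
+ ```python
+ from cached_path import cached_path
+
+ from f5_tts.api import F5TTS
+
+ # Point the API at the Spanish checkpoint (the default loads the English one).
+ f5tts = F5TTS(ckpt_file=str(cached_path("hf://jpgallegoar/F5-Spanish/model_1200000.safetensors")))
+
+ wav, sr, spect = f5tts.infer(
+     ref_file="ref_audio.wav",  # reference clip (placeholder)
+     ref_text="Transcripción del audio de referencia.",  # its transcript (placeholder)
+     gen_text="Hola, esto es una prueba de síntesis de voz.",
+     file_wave="out.wav",
+     seed=-1,  # -1 draws a random seed
+ )
+ ```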
78
+ ### 2. Local editable (if also doing training or finetuning)
79
+
80
+ ```bash
81
+ git clone https://github.com/jpgallegoar/Spanish-F5.git
82
+ cd Spanish-F5
83
+ # git submodule update --init --recursive # (optional, if need bigvgan)
84
+ pip install -e .
85
+ ```
86
+ If you initialize the submodule, add the following code at the beginning of `src/third_party/BigVGAN/bigvgan.py`.
87
+ ```python
88
+ import os
89
+ import sys
90
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
91
+ ```
92
+
93
+ ## Inference
94
+
95
+ ### 1. Gradio App
96
+
97
+ Currently supported features:
98
+
99
+ - Basic TTS with Chunk Inference
100
+ - Multi-Style / Multi-Speaker Generation
101
+ - Voice Chat powered by Qwen2.5-3B-Instruct
102
+
103
+ ```bash
104
+ # Launch a Gradio app (web interface)
105
+ f5-tts_infer-gradio
106
+
107
+ # Specify the port/host
108
+ f5-tts_infer-gradio --port 7860 --host 0.0.0.0
109
+
110
+ # Launch a share link
111
+ f5-tts_infer-gradio --share
112
+ ```
113
+
114
+ ### 2. CLI Inference
115
+
116
+ ```bash
117
+ # Run with flags
118
+ # Leaving --ref_text "" will have the ASR model transcribe the reference audio (extra GPU memory usage)
119
+ f5-tts_infer-cli \
120
+ --model "F5-TTS" \
121
+ --ref_audio "ref_audio.wav" \
122
+ --ref_text "The content, subtitle or transcription of reference audio." \
123
+ --gen_text "Some text you want TTS model generate for you."
124
+
125
+ # Run with the default settings from src/f5_tts/infer/examples/basic/basic.toml
126
+ f5-tts_infer-cli
127
+ # Or with your own .toml file
128
+ f5-tts_infer-cli -c custom.toml
129
+
130
+ # Multi voice. See src/f5_tts/infer/README.md
131
+ f5-tts_infer-cli -c src/f5_tts/infer/examples/multi/story.toml
132
+ ```
133
+
134
+ ### 3. More instructions
135
+
136
+ - To get better generation results, take a moment to read the [detailed guidance](src/f5_tts/infer).
137
+ - The [Issues](https://github.com/SWivid/F5-TTS/issues?q=is%3Aissue) page is very useful; search the keywords of the problem you encounter before asking, and if no answer is found, feel free to open an issue.
138
+
139
+
140
+ ## Training
141
+
142
+ ### 1. Gradio App
143
+
144
+ Read [training & finetuning guidance](src/f5_tts/train) for more instructions.
145
+
146
+ ```bash
147
+ # Quick start with Gradio web interface
148
+ f5-tts_finetune-gradio
149
+ ```
150
+
151
+
152
+ ## [Evaluation](src/f5_tts/eval)
153
+
154
+
155
+ ## Development
156
+
157
+ Use pre-commit to ensure code quality; it will run linters and formatters automatically:
158
+
159
+ ```bash
160
+ pip install pre-commit
161
+ pre-commit install
162
+ ```
163
+
164
+ When making a pull request, before each commit, run:
165
+
166
+ ```bash
167
+ pre-commit run --all-files
168
+ ```
169
+
170
+ Note: Some model components have linting exceptions for E722 to accommodate tensor notation.
171
+
172
+
173
+ ## Acknowledgements
174
+
175
+ - [E2-TTS](https://arxiv.org/abs/2406.18009) brilliant work, simple and effective
176
+ - [Emilia](https://arxiv.org/abs/2407.05361), [WenetSpeech4TTS](https://arxiv.org/abs/2406.05763) valuable datasets
177
+ - [lucidrains](https://github.com/lucidrains) initial CFM structure with also [bfs18](https://github.com/bfs18) for discussion
178
+ - [SD3](https://arxiv.org/abs/2403.03206) & [Hugging Face diffusers](https://github.com/huggingface/diffusers) DiT and MMDiT code structure
179
+ - [torchdiffeq](https://github.com/rtqichen/torchdiffeq) as ODE solver, [Vocos](https://huggingface.co/charactr/vocos-mel-24khz) as vocoder
180
+ - [FunASR](https://github.com/modelscope/FunASR), [faster-whisper](https://github.com/SYSTRAN/faster-whisper), [UniSpeech](https://github.com/microsoft/UniSpeech) for evaluation tools
181
+ - [ctc-forced-aligner](https://github.com/MahmoudAshraf97/ctc-forced-aligner) for speech edit test
182
+ - [mrfakename](https://x.com/realmrfakename) huggingface space demo ~
183
+ - [f5-tts-mlx](https://github.com/lucasnewman/f5-tts-mlx/tree/main) Implementation with MLX framework by [Lucas Newman](https://github.com/lucasnewman)
184
+ - [F5-TTS-ONNX](https://github.com/DakeQQ/F5-TTS-ONNX) ONNX Runtime version by [DakeQQ](https://github.com/DakeQQ)
185
+
186
+ ## Citation
187
+ If our work and codebase are useful for you, please cite as:
188
+ ```
189
+ @article{chen-etal-2024-f5tts,
190
+ title={F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching},
191
+ author={Yushen Chen and Zhikang Niu and Ziyang Ma and Keqi Deng and Chunhui Wang and Jian Zhao and Kai Yu and Xie Chen},
192
+ journal={arXiv preprint arXiv:2410.06885},
193
+ year={2024},
194
+ }
195
+ ```
196
+ ## License
197
+
198
+ Our code is released under the MIT License. The pre-trained models are licensed under the CC-BY-NC license because the training data, Emilia, is an in-the-wild dataset. Sorry for any inconvenience this may cause.
src/f5_tts.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .gitignore
2
+ .gitmodules
3
+ .pre-commit-config.yaml
4
+ Dockerfile
5
+ LICENSE
6
+ README.md
7
+ Spanish_F5.ipynb
8
+ pyproject.toml
9
+ ruff.toml
10
+ .github/ISSUE_TEMPLATE/bug_report.yml
11
+ .github/ISSUE_TEMPLATE/feature_request.yml
12
+ .github/ISSUE_TEMPLATE/help_wanted.yml
13
+ .github/ISSUE_TEMPLATE/question.yml
14
+ .github/workflows/pre-commit.yaml
15
+ .github/workflows/publish-docker-image.yaml
16
+ .github/workflows/sync-hf.yaml
17
+ ckpts/README.md
18
+ data/librispeech_pc_test_clean_cross_sentence.lst
19
+ data/Emilia_ZH_EN_pinyin/vocab.txt
20
+ src/f5_tts/api.py
21
+ src/f5_tts/socket_server.py
22
+ src/f5_tts.egg-info/PKG-INFO
23
+ src/f5_tts.egg-info/SOURCES.txt
24
+ src/f5_tts.egg-info/dependency_links.txt
25
+ src/f5_tts.egg-info/entry_points.txt
26
+ src/f5_tts.egg-info/requires.txt
27
+ src/f5_tts.egg-info/top_level.txt
28
+ src/f5_tts/eval/README.md
29
+ src/f5_tts/eval/ecapa_tdnn.py
30
+ src/f5_tts/eval/eval_infer_batch.py
31
+ src/f5_tts/eval/eval_infer_batch.sh
32
+ src/f5_tts/eval/eval_librispeech_test_clean.py
33
+ src/f5_tts/eval/eval_seedtts_testset.py
34
+ src/f5_tts/eval/utils_eval.py
35
+ src/f5_tts/infer/README.md
36
+ src/f5_tts/infer/infer_cli.py
37
+ src/f5_tts/infer/infer_gradio.py
38
+ src/f5_tts/infer/speech_edit.py
39
+ src/f5_tts/infer/utils_infer.py
40
+ src/f5_tts/infer/examples/vocab.txt
41
+ src/f5_tts/infer/examples/basic/basic.toml
42
+ src/f5_tts/infer/examples/basic/basic_ref_en.wav
43
+ src/f5_tts/infer/examples/basic/basic_ref_zh.wav
44
+ src/f5_tts/infer/examples/multi/country.flac
45
+ src/f5_tts/infer/examples/multi/main.flac
46
+ src/f5_tts/infer/examples/multi/story.toml
47
+ src/f5_tts/infer/examples/multi/story.txt
48
+ src/f5_tts/infer/examples/multi/town.flac
49
+ src/f5_tts/model/__init__.py
50
+ src/f5_tts/model/cfm.py
51
+ src/f5_tts/model/dataset.py
52
+ src/f5_tts/model/modules.py
53
+ src/f5_tts/model/trainer.py
54
+ src/f5_tts/model/utils.py
55
+ src/f5_tts/model/backbones/README.md
56
+ src/f5_tts/model/backbones/dit.py
57
+ src/f5_tts/model/backbones/mmdit.py
58
+ src/f5_tts/model/backbones/unett.py
59
+ src/f5_tts/scripts/count_max_epoch.py
60
+ src/f5_tts/scripts/count_params_gflops.py
61
+ src/f5_tts/train/README.md
62
+ src/f5_tts/train/finetune_cli.py
63
+ src/f5_tts/train/finetune_gradio.py
64
+ src/f5_tts/train/train.py
65
+ src/f5_tts/train/datasets/prepare_csv_wavs.py
66
+ src/f5_tts/train/datasets/prepare_emilia.py
67
+ src/f5_tts/train/datasets/prepare_wenetspeech4tts.py
src/f5_tts.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
 
 
1
+
src/f5_tts.egg-info/entry_points.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ [console_scripts]
2
+ f5-tts_finetune-cli = f5_tts.train.finetune_cli:main
3
+ f5-tts_finetune-gradio = f5_tts.train.finetune_gradio:main
4
+ f5-tts_infer-cli = f5_tts.infer.infer_cli:main
5
+ f5-tts_infer-gradio = f5_tts.infer.infer_gradio:main
src/f5_tts.egg-info/requires.txt ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accelerate>=0.33.0
2
+ bitsandbytes==0.44.1
3
+ cached_path
4
+ click
5
+ datasets
6
+ ema_pytorch>=0.5.2
7
+ gradio>=3.45.2
8
+ jieba
9
+ librosa
10
+ matplotlib
11
+ numpy<=1.26.4
12
+ pydub
13
+ pypinyin
14
+ safetensors
15
+ soundfile
16
+ tomli
17
+ torch>=2.0.0
18
+ torchaudio>=2.0.0
19
+ torchdiffeq
20
+ tqdm>=4.65.0
21
+ transformers
22
+ transformers_stream_generator
23
+ vocos
24
+ wandb
25
+ x_transformers>=1.31.14
26
+ num2words
27
+
28
+ [eval]
29
+ faster_whisper==0.10.1
30
+ funasr
31
+ jiwer
32
+ modelscope
33
+ zhconv
34
+ zhon
src/f5_tts.egg-info/top_level.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ f5_tts
2
+ third_party
src/f5_tts/api.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import sys
3
+ from importlib.resources import files
4
+
5
+ import soundfile as sf
6
+ import torch
7
+ import tqdm
8
+ from cached_path import cached_path
9
+
10
+ from f5_tts.infer.utils_infer import (
11
+ hop_length,
12
+ infer_process,
13
+ load_model,
14
+ load_vocoder,
15
+ preprocess_ref_audio_text,
16
+ remove_silence_for_generated_wav,
17
+ save_spectrogram,
18
+ target_sample_rate,
19
+ )
20
+ from f5_tts.model import DiT, UNetT
21
+ from f5_tts.model.utils import seed_everything
22
+
23
+
24
+ class F5TTS:
25
+ def __init__(
26
+ self,
27
+ model_type="F5-TTS",
28
+ ckpt_file="",
29
+ vocab_file="",
30
+ ode_method="euler",
31
+ use_ema=True,
32
+ vocoder_name="vocos",
33
+ local_path=None,
34
+ device=None,
35
+ ):
36
+ # Initialize parameters
37
+ self.final_wave = None
38
+ self.target_sample_rate = target_sample_rate
39
+ self.hop_length = hop_length
40
+ self.seed = -1
41
+ self.mel_spec_type = vocoder_name
42
+
43
+ # Set device
44
+ self.device = device or (
45
+ "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
46
+ )
47
+
48
+ # Load models
49
+ self.load_vocoder_model(vocoder_name, local_path)
50
+ self.load_ema_model(model_type, ckpt_file, vocoder_name, vocab_file, ode_method, use_ema)
51
+
52
+ def load_vocoder_model(self, vocoder_name, local_path):
53
+ self.vocoder = load_vocoder(vocoder_name, local_path is not None, local_path, self.device)
54
+
55
+ def load_ema_model(self, model_type, ckpt_file, mel_spec_type, vocab_file, ode_method, use_ema):
56
+ if model_type == "F5-TTS":
57
+ if not ckpt_file:
58
+ if mel_spec_type == "vocos":
59
+ ckpt_file = str(cached_path("hf://SWivid/F5-TTS/F5TTS_Base/model_1200000.safetensors"))
60
+ elif mel_spec_type == "bigvgan":
61
+ ckpt_file = str(cached_path("hf://SWivid/F5-TTS/F5TTS_Base_bigvgan/model_1250000.pt"))
62
+ model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
63
+ model_cls = DiT
64
+ elif model_type == "E2-TTS":
65
+ if not ckpt_file:
66
+ ckpt_file = str(cached_path("hf://SWivid/E2-TTS/E2TTS_Base/model_1200000.safetensors"))
67
+ model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4)
68
+ model_cls = UNetT
69
+ else:
70
+ raise ValueError(f"Unknown model type: {model_type}")
71
+
72
+ self.ema_model = load_model(
73
+ model_cls, model_cfg, ckpt_file, mel_spec_type, vocab_file, ode_method, use_ema, self.device
74
+ )
75
+
76
+ def export_wav(self, wav, file_wave, remove_silence=False):
77
+ sf.write(file_wave, wav, self.target_sample_rate)
78
+
79
+ if remove_silence:
80
+ remove_silence_for_generated_wav(file_wave)
81
+
82
+ def export_spectrogram(self, spect, file_spect):
83
+ save_spectrogram(spect, file_spect)
84
+
85
+ def infer(
86
+ self,
87
+ ref_file,
88
+ ref_text,
89
+ gen_text,
90
+ show_info=print,
91
+ progress=tqdm,
92
+ target_rms=0.1,
93
+ cross_fade_duration=0.15,
94
+ sway_sampling_coef=-1,
95
+ cfg_strength=2,
96
+ nfe_step=32,
97
+ speed=1.0,
98
+ fix_duration=None,
99
+ remove_silence=False,
100
+ file_wave=None,
101
+ file_spect=None,
102
+ seed=-1,
103
+ ):
104
+ if seed == -1:
105
+ seed = random.randint(0, sys.maxsize)
106
+ seed_everything(seed)
107
+ self.seed = seed
108
+
109
+ ref_file, ref_text = preprocess_ref_audio_text(ref_file, ref_text, device=self.device)
110
+
111
+ wav, sr, spect = infer_process(
112
+ ref_file,
113
+ ref_text,
114
+ gen_text,
115
+ self.ema_model,
116
+ self.vocoder,
117
+ self.mel_spec_type,
118
+ show_info=show_info,
119
+ progress=progress,
120
+ target_rms=target_rms,
121
+ cross_fade_duration=cross_fade_duration,
122
+ nfe_step=nfe_step,
123
+ cfg_strength=cfg_strength,
124
+ sway_sampling_coef=sway_sampling_coef,
125
+ speed=speed,
126
+ fix_duration=fix_duration,
127
+ device=self.device,
128
+ )
129
+
130
+ if file_wave is not None:
131
+ self.export_wav(wav, file_wave, remove_silence)
132
+
133
+ if file_spect is not None:
134
+ self.export_spectrogram(spect, file_spect)
135
+
136
+ return wav, sr, spect
137
+
138
+
139
+ if __name__ == "__main__":
140
+ f5tts = F5TTS()
141
+
142
+ wav, sr, spect = f5tts.infer(
143
+ ref_file=str(files("f5_tts").joinpath("infer/examples/basic/basic_ref_en.wav")),
144
+ ref_text="some call me nature, others call me mother nature.",
145
+ gen_text="""I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring. Respect me and I'll nurture you; ignore me and you shall face the consequences.""",
146
+ file_wave=str(files("f5_tts").joinpath("../../tests/api_out.wav")),
147
+ file_spect=str(files("f5_tts").joinpath("../../tests/api_out.png")),
148
+ seed=-1, # -1 means draw a random seed
149
+ )
150
+
151
+ print("seed :", f5tts.seed)
src/f5_tts/eval/README.md ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Evaluation
3
+
4
+ Install packages for evaluation:
5
+
6
+ ```bash
7
+ pip install -e .[eval]
8
+ ```
9
+
10
+ ## Generating Samples for Evaluation
11
+
12
+ ### Prepare Test Datasets
13
+
14
+ 1. *Seed-TTS testset*: Download from [seed-tts-eval](https://github.com/BytedanceSpeech/seed-tts-eval).
15
+ 2. *LibriSpeech test-clean*: Download from [OpenSLR](http://www.openslr.org/12/).
16
+ 3. Unzip the downloaded datasets and place them in the `data/` directory.
17
+ 4. Update the path for *LibriSpeech test-clean* data in `src/f5_tts/eval/eval_infer_batch.py`.
18
+ 5. Our filtered LibriSpeech-PC 4-10s subset: `data/librispeech_pc_test_clean_cross_sentence.lst`.
19
+
20
+ ### Batch Inference for Test Set
21
+
22
+ To run batch inference for evaluations, execute the following commands:
23
+
24
+ ```bash
25
+ # batch inference for evaluations
26
+ accelerate config # if not set before
27
+ bash src/f5_tts/eval/eval_infer_batch.sh
28
+ ```
29
+
30
+ ## Objective Evaluation on Generated Results
31
+
32
+ ### Download Evaluation Model Checkpoints
33
+
34
+ 1. Chinese ASR Model: [Paraformer-zh](https://huggingface.co/funasr/paraformer-zh)
35
+ 2. English ASR Model: [Faster-Whisper](https://huggingface.co/Systran/faster-whisper-large-v3)
36
+ 3. WavLM Model: Download from [Google Drive](https://drive.google.com/file/d/1-aE1NfzpRCLxA4GUxX9ITI3F9LlbtEGP/view).
37
+
38
+ Then update the paths in the following scripts to point to where you placed the evaluation model checkpoints.
39
+
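+ The paths are plain module-level constants; for instance, the block at the top
+ of `src/f5_tts/eval/eval_librispeech_test_clean.py` (visible further down in
+ this diff) looks like:
+
+ ```python
+ # placeholders to replace with your local paths before running
+ metalst = rel_path + "/data/librispeech_pc_test_clean_cross_sentence.lst"
+ librispeech_test_clean_path = "<SOME_PATH>/LibriSpeech/test-clean"  # test-clean path
+ gen_wav_dir = "PATH_TO_GENERATED"  # generated wavs
+ ```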
40
+ ### Objective Evaluation
41
+
42
+ Update the paths to your batch-inference results, then carry out the WER / SIM evaluations:
43
+ ```bash
44
+ # Evaluation for Seed-TTS test set
45
+ python src/f5_tts/eval/eval_seedtts_testset.py
46
+
47
+ # Evaluation for LibriSpeech-PC test-clean (cross-sentence)
48
+ python src/f5_tts/eval/eval_librispeech_test_clean.py
49
+ ```
src/f5_tts/eval/ecapa_tdnn.py ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # just for speaker similarity evaluation, third-party code
2
+
3
+ # From https://github.com/microsoft/UniSpeech/blob/main/downstreams/speaker_verification/models/
4
+ # part of the code is borrowed from https://github.com/lawlict/ECAPA-TDNN
5
+
6
+ import os
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+
11
+
12
+ """ Res2Conv1d + BatchNorm1d + ReLU
13
+ """
14
+
15
+
16
+ class Res2Conv1dReluBn(nn.Module):
17
+ """
18
+ in_channels == out_channels == channels
19
+ """
20
+
21
+ def __init__(self, channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=True, scale=4):
22
+ super().__init__()
23
+ assert channels % scale == 0, "{} % {} != 0".format(channels, scale)
24
+ self.scale = scale
25
+ self.width = channels // scale
26
+ self.nums = scale if scale == 1 else scale - 1
27
+
28
+ self.convs = []
29
+ self.bns = []
30
+ for i in range(self.nums):
31
+ self.convs.append(nn.Conv1d(self.width, self.width, kernel_size, stride, padding, dilation, bias=bias))
32
+ self.bns.append(nn.BatchNorm1d(self.width))
33
+ self.convs = nn.ModuleList(self.convs)
34
+ self.bns = nn.ModuleList(self.bns)
35
+
36
+ def forward(self, x):
37
+ out = []
38
+ spx = torch.split(x, self.width, 1)
39
+ for i in range(self.nums):
40
+ if i == 0:
41
+ sp = spx[i]
42
+ else:
43
+ sp = sp + spx[i]
44
+ # Order: conv -> relu -> bn
45
+ sp = self.convs[i](sp)
46
+ sp = self.bns[i](F.relu(sp))
47
+ out.append(sp)
48
+ if self.scale != 1:
49
+ out.append(spx[self.nums])
50
+ out = torch.cat(out, dim=1)
51
+
52
+ return out
53
+
54
+
55
+ """ Conv1d + BatchNorm1d + ReLU
56
+ """
57
+
58
+
59
+ class Conv1dReluBn(nn.Module):
60
+ def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1, bias=True):
61
+ super().__init__()
62
+ self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride, padding, dilation, bias=bias)
63
+ self.bn = nn.BatchNorm1d(out_channels)
64
+
65
+ def forward(self, x):
66
+ return self.bn(F.relu(self.conv(x)))
67
+
68
+
69
+ """ The SE connection of 1D case.
70
+ """
71
+
72
+
73
+ class SE_Connect(nn.Module):
74
+ def __init__(self, channels, se_bottleneck_dim=128):
75
+ super().__init__()
76
+ self.linear1 = nn.Linear(channels, se_bottleneck_dim)
77
+ self.linear2 = nn.Linear(se_bottleneck_dim, channels)
78
+
79
+ def forward(self, x):
80
+ out = x.mean(dim=2)
81
+ out = F.relu(self.linear1(out))
82
+ out = torch.sigmoid(self.linear2(out))
83
+ out = x * out.unsqueeze(2)
84
+
85
+ return out
86
+
87
+
88
+ """ SE-Res2Block of the ECAPA-TDNN architecture.
89
+ """
90
+
91
+ # def SE_Res2Block(channels, kernel_size, stride, padding, dilation, scale):
92
+ # return nn.Sequential(
93
+ # Conv1dReluBn(channels, 512, kernel_size=1, stride=1, padding=0),
94
+ # Res2Conv1dReluBn(512, kernel_size, stride, padding, dilation, scale=scale),
95
+ # Conv1dReluBn(512, channels, kernel_size=1, stride=1, padding=0),
96
+ # SE_Connect(channels)
97
+ # )
98
+
99
+
100
+ class SE_Res2Block(nn.Module):
101
+ def __init__(self, in_channels, out_channels, kernel_size, stride, padding, dilation, scale, se_bottleneck_dim):
102
+ super().__init__()
103
+ self.Conv1dReluBn1 = Conv1dReluBn(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
104
+ self.Res2Conv1dReluBn = Res2Conv1dReluBn(out_channels, kernel_size, stride, padding, dilation, scale=scale)
105
+ self.Conv1dReluBn2 = Conv1dReluBn(out_channels, out_channels, kernel_size=1, stride=1, padding=0)
106
+ self.SE_Connect = SE_Connect(out_channels, se_bottleneck_dim)
107
+
108
+ self.shortcut = None
109
+ if in_channels != out_channels:
110
+ self.shortcut = nn.Conv1d(
111
+ in_channels=in_channels,
112
+ out_channels=out_channels,
113
+ kernel_size=1,
114
+ )
115
+
116
+ def forward(self, x):
117
+ residual = x
118
+ if self.shortcut:
119
+ residual = self.shortcut(x)
120
+
121
+ x = self.Conv1dReluBn1(x)
122
+ x = self.Res2Conv1dReluBn(x)
123
+ x = self.Conv1dReluBn2(x)
124
+ x = self.SE_Connect(x)
125
+
126
+ return x + residual
127
+
128
+
129
+ """ Attentive weighted mean and standard deviation pooling.
130
+ """
131
+
132
+
133
+ class AttentiveStatsPool(nn.Module):
134
+ def __init__(self, in_dim, attention_channels=128, global_context_att=False):
135
+ super().__init__()
136
+ self.global_context_att = global_context_att
137
+
138
+ # Use Conv1d with stride == 1 rather than Linear, then we don't need to transpose inputs.
139
+ if global_context_att:
140
+ self.linear1 = nn.Conv1d(in_dim * 3, attention_channels, kernel_size=1) # equals W and b in the paper
141
+ else:
142
+ self.linear1 = nn.Conv1d(in_dim, attention_channels, kernel_size=1) # equals W and b in the paper
143
+ self.linear2 = nn.Conv1d(attention_channels, in_dim, kernel_size=1) # equals V and k in the paper
144
+
145
+ def forward(self, x):
146
+ if self.global_context_att:
147
+ context_mean = torch.mean(x, dim=-1, keepdim=True).expand_as(x)
148
+ context_std = torch.sqrt(torch.var(x, dim=-1, keepdim=True) + 1e-10).expand_as(x)
149
+ x_in = torch.cat((x, context_mean, context_std), dim=1)
150
+ else:
151
+ x_in = x
152
+
153
+ # DON'T use ReLU here! In experiments, training was hard to converge with ReLU.
154
+ alpha = torch.tanh(self.linear1(x_in))
155
+ # alpha = F.relu(self.linear1(x_in))
156
+ alpha = torch.softmax(self.linear2(alpha), dim=2)
157
+ mean = torch.sum(alpha * x, dim=2)
158
+ residuals = torch.sum(alpha * (x**2), dim=2) - mean**2
159
+ std = torch.sqrt(residuals.clamp(min=1e-9))
160
+ return torch.cat([mean, std], dim=1)
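+
+ # In math terms, with per-frame features x_t and attention weights
+ # alpha_t = softmax_t(V tanh(W x_t)) from the two Conv1d layers above:
+ # mu = sum_t alpha_t * x_t
+ # sigma = sqrt(sum_t alpha_t * x_t**2 - mu**2)
+ # i.e. an attention-weighted mean and standard deviation over time,
+ # concatenated into a single utterance-level vector.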
161
+
162
+
163
+ class ECAPA_TDNN(nn.Module):
164
+ def __init__(
165
+ self,
166
+ feat_dim=80,
167
+ channels=512,
168
+ emb_dim=192,
169
+ global_context_att=False,
170
+ feat_type="wavlm_large",
171
+ sr=16000,
172
+ feature_selection="hidden_states",
173
+ update_extract=False,
174
+ config_path=None,
175
+ ):
176
+ super().__init__()
177
+
178
+ self.feat_type = feat_type
179
+ self.feature_selection = feature_selection
180
+ self.update_extract = update_extract
181
+ self.sr = sr
182
+
183
+ torch.hub._validate_not_a_forked_repo = lambda a, b, c: True
184
+ try:
185
+ local_s3prl_path = os.path.expanduser("~/.cache/torch/hub/s3prl_s3prl_main")
186
+ self.feature_extract = torch.hub.load(local_s3prl_path, feat_type, source="local", config_path=config_path)
187
+ except: # noqa: E722
188
+ self.feature_extract = torch.hub.load("s3prl/s3prl", feat_type)
189
+
190
+ if len(self.feature_extract.model.encoder.layers) == 24 and hasattr(
191
+ self.feature_extract.model.encoder.layers[23].self_attn, "fp32_attention"
192
+ ):
193
+ self.feature_extract.model.encoder.layers[23].self_attn.fp32_attention = False
194
+ if len(self.feature_extract.model.encoder.layers) == 24 and hasattr(
195
+ self.feature_extract.model.encoder.layers[11].self_attn, "fp32_attention"
196
+ ):
197
+ self.feature_extract.model.encoder.layers[11].self_attn.fp32_attention = False
198
+
199
+ self.feat_num = self.get_feat_num()
200
+ self.feature_weight = nn.Parameter(torch.zeros(self.feat_num))
201
+
202
+ if feat_type != "fbank" and feat_type != "mfcc":
203
+ freeze_list = ["final_proj", "label_embs_concat", "mask_emb", "project_q", "quantizer"]
204
+ for name, param in self.feature_extract.named_parameters():
205
+ for freeze_val in freeze_list:
206
+ if freeze_val in name:
207
+ param.requires_grad = False
208
+ break
209
+
210
+ if not self.update_extract:
211
+ for param in self.feature_extract.parameters():
212
+ param.requires_grad = False
213
+
214
+ self.instance_norm = nn.InstanceNorm1d(feat_dim)
215
+ # self.channels = [channels] * 4 + [channels * 3]
216
+ self.channels = [channels] * 4 + [1536]
217
+
218
+ self.layer1 = Conv1dReluBn(feat_dim, self.channels[0], kernel_size=5, padding=2)
219
+ self.layer2 = SE_Res2Block(
220
+ self.channels[0],
221
+ self.channels[1],
222
+ kernel_size=3,
223
+ stride=1,
224
+ padding=2,
225
+ dilation=2,
226
+ scale=8,
227
+ se_bottleneck_dim=128,
228
+ )
229
+ self.layer3 = SE_Res2Block(
230
+ self.channels[1],
231
+ self.channels[2],
232
+ kernel_size=3,
233
+ stride=1,
234
+ padding=3,
235
+ dilation=3,
236
+ scale=8,
237
+ se_bottleneck_dim=128,
238
+ )
239
+ self.layer4 = SE_Res2Block(
240
+ self.channels[2],
241
+ self.channels[3],
242
+ kernel_size=3,
243
+ stride=1,
244
+ padding=4,
245
+ dilation=4,
246
+ scale=8,
247
+ se_bottleneck_dim=128,
248
+ )
249
+
250
+ # self.conv = nn.Conv1d(self.channels[-1], self.channels[-1], kernel_size=1)
251
+ cat_channels = channels * 3
252
+ self.conv = nn.Conv1d(cat_channels, self.channels[-1], kernel_size=1)
253
+ self.pooling = AttentiveStatsPool(
254
+ self.channels[-1], attention_channels=128, global_context_att=global_context_att
255
+ )
256
+ self.bn = nn.BatchNorm1d(self.channels[-1] * 2)
257
+ self.linear = nn.Linear(self.channels[-1] * 2, emb_dim)
258
+
259
+ def get_feat_num(self):
260
+ self.feature_extract.eval()
261
+ wav = [torch.randn(self.sr).to(next(self.feature_extract.parameters()).device)]
262
+ with torch.no_grad():
263
+ features = self.feature_extract(wav)
264
+ select_feature = features[self.feature_selection]
265
+ if isinstance(select_feature, (list, tuple)):
266
+ return len(select_feature)
267
+ else:
268
+ return 1
269
+
270
+ def get_feat(self, x):
271
+ if self.update_extract:
272
+ x = self.feature_extract([sample for sample in x])
273
+ else:
274
+ with torch.no_grad():
275
+ if self.feat_type == "fbank" or self.feat_type == "mfcc":
276
+ x = self.feature_extract(x) + 1e-6 # B x feat_dim x time_len
277
+ else:
278
+ x = self.feature_extract([sample for sample in x])
279
+
280
+ if self.feat_type == "fbank":
281
+ x = x.log()
282
+
283
+ if self.feat_type != "fbank" and self.feat_type != "mfcc":
284
+ x = x[self.feature_selection]
285
+ if isinstance(x, (list, tuple)):
286
+ x = torch.stack(x, dim=0)
287
+ else:
288
+ x = x.unsqueeze(0)
289
+ norm_weights = F.softmax(self.feature_weight, dim=-1).unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)
290
+ x = (norm_weights * x).sum(dim=0)
291
+ x = torch.transpose(x, 1, 2) + 1e-6
292
+
293
+ x = self.instance_norm(x)
294
+ return x
295
+
296
+ def forward(self, x):
297
+ x = self.get_feat(x)
298
+
299
+ out1 = self.layer1(x)
300
+ out2 = self.layer2(out1)
301
+ out3 = self.layer3(out2)
302
+ out4 = self.layer4(out3)
303
+
304
+ out = torch.cat([out2, out3, out4], dim=1)
305
+ out = F.relu(self.conv(out))
306
+ out = self.bn(self.pooling(out))
307
+ out = self.linear(out)
308
+
309
+ return out
310
+
311
+
312
+ def ECAPA_TDNN_SMALL(
313
+ feat_dim,
314
+ emb_dim=256,
315
+ feat_type="wavlm_large",
316
+ sr=16000,
317
+ feature_selection="hidden_states",
318
+ update_extract=False,
319
+ config_path=None,
320
+ ):
321
+ return ECAPA_TDNN(
322
+ feat_dim=feat_dim,
323
+ channels=512,
324
+ emb_dim=emb_dim,
325
+ feat_type=feat_type,
326
+ sr=sr,
327
+ feature_selection=feature_selection,
328
+ update_extract=update_extract,
329
+ config_path=config_path,
330
+ )
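+
+ # A usage sketch, mirroring how the SIM evaluation in utils_eval.py is expected
+ # to call this model (checkpoint loading omitted; wav1/wav2 are placeholders,
+ # (batch, samples) float tensors at 16 kHz):
+ #
+ # import torch.nn.functional as F
+ # model = ECAPA_TDNN_SMALL(feat_dim=1024, feat_type="wavlm_large")
+ # emb1, emb2 = model(wav1), model(wav2)
+ # similarity = F.cosine_similarity(emb1, emb2)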
src/f5_tts/eval/eval_infer_batch.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+
4
+ sys.path.append(os.getcwd())
5
+
6
+ import argparse
7
+ import time
8
+ from importlib.resources import files
9
+
10
+ import torch
11
+ import torchaudio
12
+ from accelerate import Accelerator
13
+ from tqdm import tqdm
14
+
15
+ from f5_tts.eval.utils_eval import (
16
+ get_inference_prompt,
17
+ get_librispeech_test_clean_metainfo,
18
+ get_seedtts_testset_metainfo,
19
+ )
20
+ from f5_tts.infer.utils_infer import load_checkpoint, load_vocoder
21
+ from f5_tts.model import CFM, DiT, UNetT
22
+ from f5_tts.model.utils import get_tokenizer
23
+
24
+ accelerator = Accelerator()
25
+ device = f"cuda:{accelerator.process_index}"
26
+
27
+
28
+ # --------------------- Dataset Settings -------------------- #
29
+
30
+ target_sample_rate = 24000
31
+ n_mel_channels = 100
32
+ hop_length = 256
33
+ win_length = 1024
34
+ n_fft = 1024
35
+ target_rms = 0.1
36
+
37
+
38
+ tokenizer = "pinyin"
39
+ rel_path = str(files("f5_tts").joinpath("../../"))
40
+
41
+
42
+ def main():
43
+ # ---------------------- infer setting ---------------------- #
44
+
45
+ parser = argparse.ArgumentParser(description="batch inference")
46
+
47
+ parser.add_argument("-s", "--seed", default=None, type=int)
48
+ parser.add_argument("-d", "--dataset", default="Emilia_ZH_EN")
49
+ parser.add_argument("-n", "--expname", required=True)
50
+ parser.add_argument("-c", "--ckptstep", default=1200000, type=int)
51
+ parser.add_argument("-m", "--mel_spec_type", default="vocos", type=str, choices=["bigvgan", "vocos"])
52
+
53
+ parser.add_argument("-nfe", "--nfestep", default=32, type=int)
54
+ parser.add_argument("-o", "--odemethod", default="euler")
55
+ parser.add_argument("-ss", "--swaysampling", default=-1, type=float)
56
+
57
+ parser.add_argument("-t", "--testset", required=True)
58
+
59
+ args = parser.parse_args()
60
+
61
+ seed = args.seed
62
+ dataset_name = args.dataset
63
+ exp_name = args.expname
64
+ ckpt_step = args.ckptstep
65
+ ckpt_path = rel_path + f"/ckpts/{exp_name}/model_{ckpt_step}.pt"
66
+ mel_spec_type = args.mel_spec_type
67
+
68
+ nfe_step = args.nfestep
69
+ ode_method = args.odemethod
70
+ sway_sampling_coef = args.swaysampling
71
+
72
+ testset = args.testset
73
+
74
+ infer_batch_size = 1 # max frames. 1 for ddp single inference (recommended)
75
+ cfg_strength = 2.0
76
+ speed = 1.0
77
+ use_truth_duration = False
78
+ no_ref_audio = False
79
+
80
+ if exp_name == "F5TTS_Base":
81
+ model_cls = DiT
82
+ model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
83
+
84
+ elif exp_name == "E2TTS_Base":
85
+ model_cls = UNetT
86
+ model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4)
87
+
88
+ if testset == "ls_pc_test_clean":
89
+ metalst = rel_path + "/data/librispeech_pc_test_clean_cross_sentence.lst"
90
+ librispeech_test_clean_path = "<SOME_PATH>/LibriSpeech/test-clean" # test-clean path
91
+ metainfo = get_librispeech_test_clean_metainfo(metalst, librispeech_test_clean_path)
92
+
93
+ elif testset == "seedtts_test_zh":
94
+ metalst = rel_path + "/data/seedtts_testset/zh/meta.lst"
95
+ metainfo = get_seedtts_testset_metainfo(metalst)
96
+
97
+ elif testset == "seedtts_test_en":
98
+ metalst = rel_path + "/data/seedtts_testset/en/meta.lst"
99
+ metainfo = get_seedtts_testset_metainfo(metalst)
100
+
101
+ # path to save generated wavs
102
+ output_dir = (
103
+ f"{rel_path}/"
104
+ f"results/{exp_name}_{ckpt_step}/{testset}/"
105
+ f"seed{seed}_{ode_method}_nfe{nfe_step}_{mel_spec_type}"
106
+ f"{f'_ss{sway_sampling_coef}' if sway_sampling_coef else ''}"
107
+ f"_cfg{cfg_strength}_speed{speed}"
108
+ f"{'_gt-dur' if use_truth_duration else ''}"
109
+ f"{'_no-ref-audio' if no_ref_audio else ''}"
110
+ )
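+ # e.g. with `-s 0 -n F5TTS_Base -t seedtts_test_zh -nfe 16` (as in eval_infer_batch.sh),
+ # this resolves to results/F5TTS_Base_1200000/seedtts_test_zh/seed0_euler_nfe16_vocos_ss-1.0_cfg2.0_speed1.0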
111
+
112
+ # -------------------------------------------------#
113
+
114
+ use_ema = True
115
+
116
+ prompts_all = get_inference_prompt(
117
+ metainfo,
118
+ speed=speed,
119
+ tokenizer=tokenizer,
120
+ target_sample_rate=target_sample_rate,
121
+ n_mel_channels=n_mel_channels,
122
+ hop_length=hop_length,
123
+ mel_spec_type=mel_spec_type,
124
+ target_rms=target_rms,
125
+ use_truth_duration=use_truth_duration,
126
+ infer_batch_size=infer_batch_size,
127
+ )
128
+
129
+ # Vocoder model
130
+ local = False
131
+ if mel_spec_type == "vocos":
132
+ vocoder_local_path = "../checkpoints/charactr/vocos-mel-24khz"
133
+ elif mel_spec_type == "bigvgan":
134
+ vocoder_local_path = "../checkpoints/bigvgan_v2_24khz_100band_256x"
135
+ vocoder = load_vocoder(vocoder_name=mel_spec_type, is_local=local, local_path=vocoder_local_path)
136
+
137
+ # Tokenizer
138
+ vocab_char_map, vocab_size = get_tokenizer(dataset_name, tokenizer)
139
+
140
+ # Model
141
+ model = CFM(
142
+ transformer=model_cls(**model_cfg, text_num_embeds=vocab_size, mel_dim=n_mel_channels),
143
+ mel_spec_kwargs=dict(
144
+ n_fft=n_fft,
145
+ hop_length=hop_length,
146
+ win_length=win_length,
147
+ n_mel_channels=n_mel_channels,
148
+ target_sample_rate=target_sample_rate,
149
+ mel_spec_type=mel_spec_type,
150
+ ),
151
+ odeint_kwargs=dict(
152
+ method=ode_method,
153
+ ),
154
+ vocab_char_map=vocab_char_map,
155
+ ).to(device)
156
+
157
+ dtype = torch.float32 if mel_spec_type == "bigvgan" else None
158
+ model = load_checkpoint(model, ckpt_path, device, dtype=dtype, use_ema=use_ema)
159
+
160
+ if not os.path.exists(output_dir) and accelerator.is_main_process:
161
+ os.makedirs(output_dir)
162
+
163
+ # start batch inference
164
+ accelerator.wait_for_everyone()
165
+ start = time.time()
166
+
167
+ with accelerator.split_between_processes(prompts_all) as prompts:
168
+ for prompt in tqdm(prompts, disable=not accelerator.is_local_main_process):
169
+ utts, ref_rms_list, ref_mels, ref_mel_lens, total_mel_lens, final_text_list = prompt
170
+ ref_mels = ref_mels.to(device)
171
+ ref_mel_lens = torch.tensor(ref_mel_lens, dtype=torch.long).to(device)
172
+ total_mel_lens = torch.tensor(total_mel_lens, dtype=torch.long).to(device)
173
+
174
+ # Inference
175
+ with torch.inference_mode():
176
+ generated, _ = model.sample(
177
+ cond=ref_mels,
178
+ text=final_text_list,
179
+ duration=total_mel_lens,
180
+ lens=ref_mel_lens,
181
+ steps=nfe_step,
182
+ cfg_strength=cfg_strength,
183
+ sway_sampling_coef=sway_sampling_coef,
184
+ no_ref_audio=no_ref_audio,
185
+ seed=seed,
186
+ )
187
+ # Final result
188
+ for i, gen in enumerate(generated):
189
+ gen = gen[ref_mel_lens[i] : total_mel_lens[i], :].unsqueeze(0)
190
+ gen_mel_spec = gen.permute(0, 2, 1)
191
+ if mel_spec_type == "vocos":
192
+ generated_wave = vocoder.decode(gen_mel_spec)
193
+ elif mel_spec_type == "bigvgan":
194
+ generated_wave = vocoder(gen_mel_spec)
195
+
196
+ if ref_rms_list[i] < target_rms:
197
+ generated_wave = generated_wave * ref_rms_list[i] / target_rms
198
+ torchaudio.save(f"{output_dir}/{utts[i]}.wav", generated_wave.squeeze(0).cpu(), target_sample_rate)
199
+
200
+ accelerator.wait_for_everyone()
201
+ if accelerator.is_main_process:
202
+ timediff = time.time() - start
203
+ print(f"Done batch inference in {timediff / 60 :.2f} minutes.")
204
+
205
+
206
+ if __name__ == "__main__":
207
+ main()
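+
+ # Note: infer_batch_size above is a budget in mel frames, not utterances
+ # (1 keeps one prompt per batch, the recommended setting for DDP inference).
+ # At 24 kHz with hop length 256 that is 24000 / 256 = 93.75 frames per second,
+ # so e.g. a 40000-frame budget would fit roughly 40000 / 93.75 ≈ 427 s of audio per batch.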
src/f5_tts/eval/eval_infer_batch.sh ADDED
@@ -0,0 +1,13 @@
+ #!/bin/bash
+
+ # e.g. F5-TTS, 16 NFE
+ accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "F5TTS_Base" -t "seedtts_test_zh" -nfe 16
+ accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "F5TTS_Base" -t "seedtts_test_en" -nfe 16
+ accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "F5TTS_Base" -t "ls_pc_test_clean" -nfe 16
+
+ # e.g. Vanilla E2 TTS, 32 NFE
+ accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "E2TTS_Base" -t "seedtts_test_zh" -o "midpoint" -ss 0
+ accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "E2TTS_Base" -t "seedtts_test_en" -o "midpoint" -ss 0
+ accelerate launch src/f5_tts/eval/eval_infer_batch.py -s 0 -n "E2TTS_Base" -t "ls_pc_test_clean" -o "midpoint" -ss 0
+
+ # etc.
src/f5_tts/eval/eval_librispeech_test_clean.py ADDED
@@ -0,0 +1,73 @@
+ # Evaluate with LibriSpeech test-clean: ~3s prompts to generate 4-10s audio (following the VALL-E / Voicebox evaluation setup)
+
+ import sys
+ import os
+
+ sys.path.append(os.getcwd())
+
+ import multiprocessing as mp
+ from importlib.resources import files
+
+ import numpy as np
+
+ from f5_tts.eval.utils_eval import (
+     get_librispeech_test,
+     run_asr_wer,
+     run_sim,
+ )
+
+ rel_path = str(files("f5_tts").joinpath("../../"))
+
+
+ eval_task = "wer"  # sim | wer
+ lang = "en"
+ metalst = rel_path + "/data/librispeech_pc_test_clean_cross_sentence.lst"
+ librispeech_test_clean_path = "<SOME_PATH>/LibriSpeech/test-clean"  # test-clean path
+ gen_wav_dir = "PATH_TO_GENERATED"  # generated wavs
+
+ gpus = [0, 1, 2, 3, 4, 5, 6, 7]
+ test_set = get_librispeech_test(metalst, gen_wav_dir, gpus, librispeech_test_clean_path)
+
+ ## In LibriSpeech, some speakers utilized varying voice characteristics for different characters in the book,
+ ## leading to a low similarity for the ground truth in some cases.
+ # test_set = get_librispeech_test(metalst, gen_wav_dir, gpus, librispeech_test_clean_path, eval_ground_truth=True)  # eval ground truth
+
+ local = False
+ if local:  # use local custom checkpoint dir
+     asr_ckpt_dir = "../checkpoints/Systran/faster-whisper-large-v3"
+ else:
+     asr_ckpt_dir = ""  # auto download to cache dir
+
+ wavlm_ckpt_dir = "../checkpoints/UniSpeech/wavlm_large_finetune.pth"
+
+
+ # --------------------------- WER ---------------------------
+
+ if eval_task == "wer":
+     wers = []
+
+     with mp.Pool(processes=len(gpus)) as pool:
+         args = [(rank, lang, sub_test_set, asr_ckpt_dir) for (rank, sub_test_set) in test_set]
+         results = pool.map(run_asr_wer, args)
+         for wers_ in results:
+             wers.extend(wers_)
+
+     wer = round(np.mean(wers) * 100, 3)
+     print(f"\nTotal {len(wers)} samples")
+     print(f"WER : {wer}%")
+
+
+ # --------------------------- SIM ---------------------------
+
+ if eval_task == "sim":
+     sim_list = []
+
+     with mp.Pool(processes=len(gpus)) as pool:
+         args = [(rank, sub_test_set, wavlm_ckpt_dir) for (rank, sub_test_set) in test_set]
+         results = pool.map(run_sim, args)
+         for sim_ in results:
+             sim_list.extend(sim_)
+
+     sim = round(sum(sim_list) / len(sim_list), 3)
+     print(f"\nTotal {len(sim_list)} samples")
+     print(f"SIM : {sim}")
src/f5_tts/eval/eval_seedtts_testset.py ADDED
@@ -0,0 +1,75 @@
+ # Evaluate with Seed-TTS testset
+
+ import sys
+ import os
+
+ sys.path.append(os.getcwd())
+
+ import multiprocessing as mp
+ from importlib.resources import files
+
+ import numpy as np
+
+ from f5_tts.eval.utils_eval import (
+     get_seed_tts_test,
+     run_asr_wer,
+     run_sim,
+ )
+
+ rel_path = str(files("f5_tts").joinpath("../../"))
+
+
+ eval_task = "wer"  # sim | wer
+ lang = "zh"  # zh | en
+ metalst = rel_path + f"/data/seedtts_testset/{lang}/meta.lst"  # seed-tts testset
+ # gen_wav_dir = rel_path + f"/data/seedtts_testset/{lang}/wavs"  # ground truth wavs
+ gen_wav_dir = "PATH_TO_GENERATED"  # generated wavs
+
+
+ # NOTE: the paraformer-zh result will differ slightly with the number of GPUs, because the batch size differs
+ # zh 1.254 seems to be a result of 4 workers for wer_seed_tts
+ gpus = [0, 1, 2, 3, 4, 5, 6, 7]
+ test_set = get_seed_tts_test(metalst, gen_wav_dir, gpus)
+
+ local = False
+ if local:  # use local custom checkpoint dir
+     if lang == "zh":
+         asr_ckpt_dir = "../checkpoints/funasr"  # paraformer-zh dir under funasr
+     elif lang == "en":
+         asr_ckpt_dir = "../checkpoints/Systran/faster-whisper-large-v3"
+ else:
+     asr_ckpt_dir = ""  # auto download to cache dir
+
+ wavlm_ckpt_dir = "../checkpoints/UniSpeech/wavlm_large_finetune.pth"
+
+
+ # --------------------------- WER ---------------------------
+
+ if eval_task == "wer":
+     wers = []
+
+     with mp.Pool(processes=len(gpus)) as pool:
+         args = [(rank, lang, sub_test_set, asr_ckpt_dir) for (rank, sub_test_set) in test_set]
+         results = pool.map(run_asr_wer, args)
+         for wers_ in results:
+             wers.extend(wers_)
+
+     wer = round(np.mean(wers) * 100, 3)
+     print(f"\nTotal {len(wers)} samples")
+     print(f"WER : {wer}%")
+
+
+ # --------------------------- SIM ---------------------------
+
+ if eval_task == "sim":
+     sim_list = []
+
+     with mp.Pool(processes=len(gpus)) as pool:
+         args = [(rank, sub_test_set, wavlm_ckpt_dir) for (rank, sub_test_set) in test_set]
+         results = pool.map(run_sim, args)
+         for sim_ in results:
+             sim_list.extend(sim_)
+
+     sim = round(sum(sim_list) / len(sim_list), 3)
+     print(f"\nTotal {len(sim_list)} samples")
+     print(f"SIM : {sim}")
src/f5_tts/eval/utils_eval.py ADDED
@@ -0,0 +1,405 @@
+ import math
+ import os
+ import random
+ import string
+
+ import torch
+ import torch.nn.functional as F
+ import torchaudio
+ from tqdm import tqdm
+
+ from f5_tts.eval.ecapa_tdnn import ECAPA_TDNN_SMALL
+ from f5_tts.model.modules import MelSpec
+ from f5_tts.model.utils import convert_char_to_pinyin
+
+
+ # seedtts testset metainfo: utt, prompt_text, prompt_wav, gt_text, gt_wav
+ def get_seedtts_testset_metainfo(metalst):
+     f = open(metalst)
+     lines = f.readlines()
+     f.close()
+     metainfo = []
+     for line in lines:
+         if len(line.strip().split("|")) == 5:
+             utt, prompt_text, prompt_wav, gt_text, gt_wav = line.strip().split("|")
+         elif len(line.strip().split("|")) == 4:
+             utt, prompt_text, prompt_wav, gt_text = line.strip().split("|")
+             gt_wav = os.path.join(os.path.dirname(metalst), "wavs", utt + ".wav")
+         if not os.path.isabs(prompt_wav):
+             prompt_wav = os.path.join(os.path.dirname(metalst), prompt_wav)
+         metainfo.append((utt, prompt_text, prompt_wav, gt_text, gt_wav))
+     return metainfo
+
+
+ # librispeech test-clean metainfo: gen_utt, ref_txt, ref_wav, gen_txt, gen_wav
+ def get_librispeech_test_clean_metainfo(metalst, librispeech_test_clean_path):
+     f = open(metalst)
+     lines = f.readlines()
+     f.close()
+     metainfo = []
+     for line in lines:
+         ref_utt, ref_dur, ref_txt, gen_utt, gen_dur, gen_txt = line.strip().split("\t")
+
+         # ref_txt = ref_txt[0] + ref_txt[1:].lower() + '.'  # if use librispeech test-clean (no-pc)
+         ref_spk_id, ref_chaptr_id, _ = ref_utt.split("-")
+         ref_wav = os.path.join(librispeech_test_clean_path, ref_spk_id, ref_chaptr_id, ref_utt + ".flac")
+
+         # gen_txt = gen_txt[0] + gen_txt[1:].lower() + '.'  # if use librispeech test-clean (no-pc)
+         gen_spk_id, gen_chaptr_id, _ = gen_utt.split("-")
+         gen_wav = os.path.join(librispeech_test_clean_path, gen_spk_id, gen_chaptr_id, gen_utt + ".flac")
+
+         metainfo.append((gen_utt, ref_txt, ref_wav, " " + gen_txt, gen_wav))
+
+     return metainfo
+
+
+ # pad a list of mels to the max length in the batch
+ def padded_mel_batch(ref_mels):
+     max_mel_length = torch.LongTensor([mel.shape[-1] for mel in ref_mels]).amax()
+     padded_ref_mels = []
+     for mel in ref_mels:
+         padded_ref_mel = F.pad(mel, (0, max_mel_length - mel.shape[-1]), value=0)
+         padded_ref_mels.append(padded_ref_mel)
+     padded_ref_mels = torch.stack(padded_ref_mels)
+     padded_ref_mels = padded_ref_mels.permute(0, 2, 1)
+     return padded_ref_mels
+
+
+ # get prompts from metainfo containing: utt, prompt_text, prompt_wav, gt_text, gt_wav
+
+
+ def get_inference_prompt(
+     metainfo,
+     speed=1.0,
+     tokenizer="pinyin",
+     polyphone=True,
+     target_sample_rate=24000,
+     n_fft=1024,
+     win_length=1024,
+     n_mel_channels=100,
+     hop_length=256,
+     mel_spec_type="vocos",
+     target_rms=0.1,
+     use_truth_duration=False,
+     infer_batch_size=1,
+     num_buckets=200,
+     min_secs=3,
+     max_secs=40,
+ ):
+     prompts_all = []
+
+     min_tokens = min_secs * target_sample_rate // hop_length
+     max_tokens = max_secs * target_sample_rate // hop_length
+
+     batch_accum = [0] * num_buckets
+     utts, ref_rms_list, ref_mels, ref_mel_lens, total_mel_lens, final_text_list = (
+         [[] for _ in range(num_buckets)] for _ in range(6)
+     )
+
+     mel_spectrogram = MelSpec(
+         n_fft=n_fft,
+         hop_length=hop_length,
+         win_length=win_length,
+         n_mel_channels=n_mel_channels,
+         target_sample_rate=target_sample_rate,
+         mel_spec_type=mel_spec_type,
+     )
+
+     for utt, prompt_text, prompt_wav, gt_text, gt_wav in tqdm(metainfo, desc="Processing prompts..."):
+         # Audio
+         ref_audio, ref_sr = torchaudio.load(prompt_wav)
+         ref_rms = torch.sqrt(torch.mean(torch.square(ref_audio)))
+         if ref_rms < target_rms:
+             ref_audio = ref_audio * target_rms / ref_rms
+         assert ref_audio.shape[-1] > 5000, f"Empty prompt wav: {prompt_wav}, or torchaudio backend issue."
+         if ref_sr != target_sample_rate:
+             resampler = torchaudio.transforms.Resample(ref_sr, target_sample_rate)
+             ref_audio = resampler(ref_audio)
+
+         # Text
+         if len(prompt_text[-1].encode("utf-8")) == 1:
+             prompt_text = prompt_text + " "
+         text = [prompt_text + gt_text]
+         if tokenizer == "pinyin":
+             text_list = convert_char_to_pinyin(text, polyphone=polyphone)
+         else:
+             text_list = text
+
+         # Duration, mel frame length
+         ref_mel_len = ref_audio.shape[-1] // hop_length
+         if use_truth_duration:
+             gt_audio, gt_sr = torchaudio.load(gt_wav)
+             if gt_sr != target_sample_rate:
+                 resampler = torchaudio.transforms.Resample(gt_sr, target_sample_rate)
+                 gt_audio = resampler(gt_audio)
+             total_mel_len = ref_mel_len + int(gt_audio.shape[-1] / hop_length / speed)
+
+             # # test vocoder resynthesis
+             # ref_audio = gt_audio
+         else:
+             ref_text_len = len(prompt_text.encode("utf-8"))
+             gen_text_len = len(gt_text.encode("utf-8"))
+             total_mel_len = ref_mel_len + int(ref_mel_len / ref_text_len * gen_text_len / speed)
+
+         # to mel spectrogram
+         ref_mel = mel_spectrogram(ref_audio)
+         ref_mel = ref_mel.squeeze(0)
+
+         # deal with batch
+         assert infer_batch_size > 0, "infer_batch_size should be greater than 0."
+         assert (
+             min_tokens <= total_mel_len <= max_tokens
+         ), f"Audio {utt} has duration {total_mel_len * hop_length // target_sample_rate}s out of range [{min_secs}, {max_secs}]."
+         bucket_i = math.floor((total_mel_len - min_tokens) / (max_tokens - min_tokens + 1) * num_buckets)
+
+         utts[bucket_i].append(utt)
+         ref_rms_list[bucket_i].append(ref_rms)
+         ref_mels[bucket_i].append(ref_mel)
+         ref_mel_lens[bucket_i].append(ref_mel_len)
+         total_mel_lens[bucket_i].append(total_mel_len)
+         final_text_list[bucket_i].extend(text_list)
+
+         batch_accum[bucket_i] += total_mel_len
+
+         if batch_accum[bucket_i] >= infer_batch_size:
+             # print(f"\n{len(ref_mels[bucket_i][0][0])}\n{ref_mel_lens[bucket_i]}\n{total_mel_lens[bucket_i]}")
+             prompts_all.append(
+                 (
+                     utts[bucket_i],
+                     ref_rms_list[bucket_i],
+                     padded_mel_batch(ref_mels[bucket_i]),
+                     ref_mel_lens[bucket_i],
+                     total_mel_lens[bucket_i],
+                     final_text_list[bucket_i],
+                 )
+             )
+             batch_accum[bucket_i] = 0
+             (
+                 utts[bucket_i],
+                 ref_rms_list[bucket_i],
+                 ref_mels[bucket_i],
+                 ref_mel_lens[bucket_i],
+                 total_mel_lens[bucket_i],
+                 final_text_list[bucket_i],
+             ) = [], [], [], [], [], []
+
+     # add residual
+     for bucket_i, bucket_frames in enumerate(batch_accum):
+         if bucket_frames > 0:
+             prompts_all.append(
+                 (
+                     utts[bucket_i],
+                     ref_rms_list[bucket_i],
+                     padded_mel_batch(ref_mels[bucket_i]),
+                     ref_mel_lens[bucket_i],
+                     total_mel_lens[bucket_i],
+                     final_text_list[bucket_i],
+                 )
+             )
+     # shuffle so the last workers are not left with only the easy (short) batches
+     random.seed(666)
+     random.shuffle(prompts_all)
+
+     return prompts_all
+
+
+ # get wav_res_ref_text of seed-tts test metalst
+ # https://github.com/BytedanceSpeech/seed-tts-eval
+
+
+ def get_seed_tts_test(metalst, gen_wav_dir, gpus):
+     f = open(metalst)
+     lines = f.readlines()
+     f.close()
+
+     test_set_ = []
+     for line in tqdm(lines):
+         if len(line.strip().split("|")) == 5:
+             utt, prompt_text, prompt_wav, gt_text, gt_wav = line.strip().split("|")
+         elif len(line.strip().split("|")) == 4:
+             utt, prompt_text, prompt_wav, gt_text = line.strip().split("|")
+
+         if not os.path.exists(os.path.join(gen_wav_dir, utt + ".wav")):
+             continue
+         gen_wav = os.path.join(gen_wav_dir, utt + ".wav")
+         if not os.path.isabs(prompt_wav):
+             prompt_wav = os.path.join(os.path.dirname(metalst), prompt_wav)
+
+         test_set_.append((gen_wav, prompt_wav, gt_text))
+
+     num_jobs = len(gpus)
+     if num_jobs == 1:
+         return [(gpus[0], test_set_)]
+
+     wav_per_job = len(test_set_) // num_jobs + 1
+     test_set = []
+     for i in range(num_jobs):
+         test_set.append((gpus[i], test_set_[i * wav_per_job : (i + 1) * wav_per_job]))
+
+     return test_set
+
+
+ # get librispeech test-clean cross sentence test
+
+
+ def get_librispeech_test(metalst, gen_wav_dir, gpus, librispeech_test_clean_path, eval_ground_truth=False):
+     f = open(metalst)
+     lines = f.readlines()
+     f.close()
+
+     test_set_ = []
+     for line in tqdm(lines):
+         ref_utt, ref_dur, ref_txt, gen_utt, gen_dur, gen_txt = line.strip().split("\t")
+
+         if eval_ground_truth:
+             gen_spk_id, gen_chaptr_id, _ = gen_utt.split("-")
+             gen_wav = os.path.join(librispeech_test_clean_path, gen_spk_id, gen_chaptr_id, gen_utt + ".flac")
+         else:
+             if not os.path.exists(os.path.join(gen_wav_dir, gen_utt + ".wav")):
+                 raise FileNotFoundError(f"Generated wav not found: {gen_utt}")
+             gen_wav = os.path.join(gen_wav_dir, gen_utt + ".wav")
+
+         ref_spk_id, ref_chaptr_id, _ = ref_utt.split("-")
+         ref_wav = os.path.join(librispeech_test_clean_path, ref_spk_id, ref_chaptr_id, ref_utt + ".flac")
+
+         test_set_.append((gen_wav, ref_wav, gen_txt))
+
+     num_jobs = len(gpus)
+     if num_jobs == 1:
+         return [(gpus[0], test_set_)]
+
+     wav_per_job = len(test_set_) // num_jobs + 1
+     test_set = []
+     for i in range(num_jobs):
+         test_set.append((gpus[i], test_set_[i * wav_per_job : (i + 1) * wav_per_job]))
+
+     return test_set
+
+
+ # load asr model
+
+
+ def load_asr_model(lang, ckpt_dir=""):
+     if lang == "zh":
+         from funasr import AutoModel
+
+         model = AutoModel(
+             model=os.path.join(ckpt_dir, "paraformer-zh"),
+             # vad_model = os.path.join(ckpt_dir, "fsmn-vad"),
+             # punc_model = os.path.join(ckpt_dir, "ct-punc"),
+             # spk_model = os.path.join(ckpt_dir, "cam++"),
+             disable_update=True,
+         )  # following seed-tts setting
+     elif lang == "en":
+         from faster_whisper import WhisperModel
+
+         model_size = "large-v3" if ckpt_dir == "" else ckpt_dir
+         model = WhisperModel(model_size, device="cuda", compute_type="float16")
+     return model
+
+
+ # WER Evaluation, the way Seed-TTS does
+
+
+ def run_asr_wer(args):
+     rank, lang, test_set, ckpt_dir = args
+
+     if lang == "zh":
+         import zhconv
+
+         torch.cuda.set_device(rank)
+     elif lang == "en":
+         os.environ["CUDA_VISIBLE_DEVICES"] = str(rank)
+     else:
+         raise NotImplementedError(
+             "lang support only 'zh' (funasr paraformer-zh), 'en' (faster-whisper-large-v3), for now."
+         )
+
+     asr_model = load_asr_model(lang, ckpt_dir=ckpt_dir)
+
+     from zhon.hanzi import punctuation
+
+     punctuation_all = punctuation + string.punctuation
+     wers = []
+
+     from jiwer import compute_measures
+
+     for gen_wav, prompt_wav, truth in tqdm(test_set):
+         if lang == "zh":
+             res = asr_model.generate(input=gen_wav, batch_size_s=300, disable_pbar=True)
+             hypo = res[0]["text"]
+             hypo = zhconv.convert(hypo, "zh-cn")
+         elif lang == "en":
+             segments, _ = asr_model.transcribe(gen_wav, beam_size=5, language="en")
+             hypo = ""
+             for segment in segments:
+                 hypo = hypo + " " + segment.text
+
+         # raw_truth = truth
+         # raw_hypo = hypo
+
+         for x in punctuation_all:
+             truth = truth.replace(x, "")
+             hypo = hypo.replace(x, "")
+
+         truth = truth.replace("  ", " ")
+         hypo = hypo.replace("  ", " ")
+
+         if lang == "zh":
+             truth = " ".join([x for x in truth])
+             hypo = " ".join([x for x in hypo])
+         elif lang == "en":
+             truth = truth.lower()
+             hypo = hypo.lower()
+
+         measures = compute_measures(truth, hypo)
+         wer = measures["wer"]
+
+         # ref_list = truth.split(" ")
+         # subs = measures["substitutions"] / len(ref_list)
+         # dele = measures["deletions"] / len(ref_list)
+         # inse = measures["insertions"] / len(ref_list)
+
+         wers.append(wer)
+
+     return wers
+
+
+ # SIM Evaluation
+
+
+ def run_sim(args):
+     rank, test_set, ckpt_dir = args
+     device = f"cuda:{rank}"
+
+     model = ECAPA_TDNN_SMALL(feat_dim=1024, feat_type="wavlm_large", config_path=None)
+     state_dict = torch.load(ckpt_dir, weights_only=True, map_location=lambda storage, loc: storage)
+     model.load_state_dict(state_dict["model"], strict=False)
+
+     use_gpu = torch.cuda.is_available()
+     if use_gpu:
+         model = model.cuda(device)
+     model.eval()
+
+     sim_list = []
+     for wav1, wav2, truth in tqdm(test_set):
+         wav1, sr1 = torchaudio.load(wav1)
+         wav2, sr2 = torchaudio.load(wav2)
+
+         resample1 = torchaudio.transforms.Resample(orig_freq=sr1, new_freq=16000)
+         resample2 = torchaudio.transforms.Resample(orig_freq=sr2, new_freq=16000)
+         wav1 = resample1(wav1)
+         wav2 = resample2(wav2)
+
+         if use_gpu:
+             wav1 = wav1.cuda(device)
+             wav2 = wav2.cuda(device)
+         with torch.no_grad():
+             emb1 = model(wav1)
+             emb2 = model(wav2)
+
+         sim = F.cosine_similarity(emb1, emb2)[0].item()
+         # print(f"VSim score between two audios: {sim:.4f} (-1.0, 1.0).")
+         sim_list.append(sim)
+
+     return sim_list
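+
+
+ # Quick sanity check of the duration heuristic in get_inference_prompt above:
+ # one mel frame covers hop_length / target_sample_rate = 256 / 24000 ≈ 10.7 ms,
+ # and the generated length is assumed proportional to UTF-8 text length
+ # (illustrative numbers only; run this module directly to print them).
+ if __name__ == "__main__":
+     target_sample_rate = 24000
+     hop_length = 256
+     speed = 1.0
+     ref_mel_len = (5 * target_sample_rate) // hop_length  # 5 s prompt -> 468 frames (~93.75 frames/s)
+     ref_text_len = len("Some call me nature.".encode("utf-8"))  # 20 bytes
+     gen_text_len = len("Others call me mother nature.".encode("utf-8"))  # 29 bytes
+     total_mel_len = ref_mel_len + int(ref_mel_len / ref_text_len * gen_text_len / speed)
+     print(total_mel_len)  # 468 + int(468 / 20 * 29) = 468 + 678 = 1146 frames, ~12.2 s total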
src/f5_tts/infer/README.md ADDED
@@ -0,0 +1,189 @@
+ # Inference
+
+ The pretrained model checkpoints can be reached at [🤗 Hugging Face](https://huggingface.co/SWivid/F5-TTS) and [🤖 Model Scope](https://www.modelscope.cn/models/SWivid/F5-TTS_Emilia-ZH-EN), or will be automatically downloaded when running inference scripts.
+
+ A single generation currently supports up to **30s**, which is the **total length** including both the prompt and the output audio. However, you can provide `infer_cli` and `infer_gradio` with longer text, and they will automatically chunk the generation. Long reference audio will be **clipped to ~15s**. (A scripting sketch follows the checklist below.)
+
+ To avoid possible inference failures, make sure you have read through the following instructions.
+
+ - Use reference audio <15s and leave some silence (e.g. 1s) at the end. Otherwise there is a risk of truncating in the middle of a word, leading to suboptimal generation.
+ - Uppercased letters will be uttered letter by letter, so use lowercased letters for normal words.
+ - Add some spaces (blank: " ") or punctuation (e.g. "," ".") to explicitly introduce some pauses.
+ - Preprocess numbers into Chinese characters if you want them read in Chinese; otherwise they will be read in English.
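+
+ A minimal scripting sketch, assuming the `F5TTS` class in `src/f5_tts/api.py` exposes an `infer()` method with the parameter names shown here (check `api.py` for the exact signature; the file paths are placeholders):
+
+ ```python
+ from f5_tts.api import F5TTS  # high-level wrapper in src/f5_tts/api.py
+
+ tts = F5TTS()  # downloads the default checkpoint on first use
+
+ # Long gen_text is chunked automatically; long reference audio is clipped to ~15s.
+ wav, sr, spect = tts.infer(
+     ref_file="ref_audio.wav",  # placeholder path to your reference audio
+     ref_text="The content, subtitle or transcription of reference audio.",
+     gen_text="Some text you want the TTS model to generate for you.",
+     file_wave="tests/out.wav",  # where to save the generated audio
+     seed=-1,  # -1 picks a random seed
+ )
+ ```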
+
+
+ ## Gradio App
+
+ Currently supported features:
+
+ - Basic TTS with Chunk Inference
+ - Multi-Style / Multi-Speaker Generation
+ - Voice Chat powered by Qwen2.5-3B-Instruct
+
+ The CLI command `f5-tts_infer-gradio` is equivalent to `python src/f5_tts/infer/infer_gradio.py`, which launches a Gradio app (web interface) for inference.
+
+ The script will load model checkpoints from Hugging Face. You can also manually download the files and update the path passed to `load_model()` in `infer_gradio.py`. Only the TTS models are loaded at startup; an ASR model is loaded to transcribe if `ref_text` is not provided, and an LLM is loaded if you use Voice Chat.
+
+ It can also be used as a component of a larger application.
+ ```python
+ import gradio as gr
+ from f5_tts.infer.infer_gradio import app
+
+ with gr.Blocks() as main_app:
+     gr.Markdown("# This is an example of using F5-TTS within a bigger Gradio app")
+
+     # ... other Gradio components
+
+     app.render()
+
+ main_app.launch()
+ ```
+
+
+ ## CLI Inference
+
+ The CLI command `f5-tts_infer-cli` is equivalent to `python src/f5_tts/infer/infer_cli.py`, which is a command-line tool for inference.
+
+ The script will load model checkpoints from Hugging Face. You can also manually download the files and use `--ckpt_file` to specify the model you want to load, or directly update the path in `infer_cli.py`.
+
+ To use a different vocab, pass your `vocab.txt` file with `--vocab_file`.
+
+ Basically, you can run inference with flags:
+ ```bash
+ # Leaving --ref_text "" will have the ASR model transcribe the reference audio (extra GPU memory usage)
+ f5-tts_infer-cli \
+ --model "F5-TTS" \
+ --ref_audio "ref_audio.wav" \
+ --ref_text "The content, subtitle or transcription of reference audio." \
+ --gen_text "Some text you want the TTS model to generate for you."
+
+ # Choose Vocoder
+ f5-tts_infer-cli --vocoder_name bigvgan --load_vocoder_from_local --ckpt_file <YOUR_CKPT_PATH, eg:ckpts/F5TTS_Base_bigvgan/model_1250000.pt>
+ f5-tts_infer-cli --vocoder_name vocos --load_vocoder_from_local --ckpt_file <YOUR_CKPT_PATH, eg:ckpts/F5TTS_Base/model_1200000.safetensors>
+ ```
+
+ A `.toml` file allows more flexible usage.
+
+ ```bash
+ f5-tts_infer-cli -c custom.toml
+ ```
+
+ For example, you can use a `.toml` file to pass in variables; refer to `src/f5_tts/infer/examples/basic/basic.toml`:
+
+ ```toml
+ # F5-TTS | E2-TTS
+ model = "F5-TTS"
+ ref_audio = "infer/examples/basic/basic_ref_en.wav"
+ # If an empty "", transcribes the reference audio automatically.
+ ref_text = "Some call me nature, others call me mother nature."
+ gen_text = "I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring."
+ # File with text to generate. Ignores the text above.
+ gen_file = ""
+ remove_silence = false
+ output_dir = "tests"
+ ```
+
+ You can also leverage a `.toml` file for multi-style generation; refer to `src/f5_tts/infer/examples/multi/story.toml`.
+
+ ```toml
+ # F5-TTS | E2-TTS
+ model = "F5-TTS"
+ ref_audio = "infer/examples/multi/main.flac"
+ # If an empty "", transcribes the reference audio automatically.
+ ref_text = ""
+ gen_text = ""
+ # File with text to generate. Ignores the text above.
+ gen_file = "infer/examples/multi/story.txt"
+ remove_silence = true
+ output_dir = "tests"
+
+ [voices.town]
+ ref_audio = "infer/examples/multi/town.flac"
+ ref_text = ""
+
+ [voices.country]
+ ref_audio = "infer/examples/multi/country.flac"
+ ref_text = ""
+ ```
+ Mark the text with `[main]`, `[town]`, or `[country]` wherever you want to switch voices; refer to `src/f5_tts/infer/examples/multi/story.txt`.
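+
+ For example, a short excerpt in that format (unmarked text at the start is read with the default `[main]` voice):
+
+ ```
+ The fare was not much to the taste of the guest, and presently he broke out with [town] “My poor dear friend, you live here no better than the ants. You must come and stay with me.” [main] So when he returned to town he took the Country Mouse with him.
+ ```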
+
+ ## Speech Editing
+
+ To test speech editing capabilities, use the following command:
+
+ ```bash
+ python src/f5_tts/infer/speech_edit.py
+ ```
+
+ ## Socket Realtime Client
+
+ To communicate with the socket server, you need to run
+ ```bash
+ python src/f5_tts/socket_server.py
+ ```
+
+ <details>
+ <summary>Then create a client to communicate</summary>
+
+ ```python
+ import socket
+ import numpy as np
+ import asyncio
+ import pyaudio
+
+ async def listen_to_voice(text, server_ip='localhost', server_port=9999):
+     client_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+     client_socket.connect((server_ip, server_port))
+
+     async def play_audio_stream():
+         buffer = b''
+         p = pyaudio.PyAudio()
+         stream = p.open(format=pyaudio.paFloat32,
+                         channels=1,
+                         rate=24000,  # Ensure this matches the server's sampling rate
+                         output=True,
+                         frames_per_buffer=2048)
+
+         try:
+             while True:
+                 chunk = await asyncio.get_event_loop().run_in_executor(None, client_socket.recv, 1024)
+                 if not chunk:  # End of stream
+                     break
+                 if b"END_OF_AUDIO" in chunk:
+                     buffer += chunk.replace(b"END_OF_AUDIO", b"")
+                     if buffer:
+                         audio_array = np.frombuffer(buffer, dtype=np.float32).copy()  # Make a writable copy
+                         stream.write(audio_array.tobytes())
+                     break
+                 buffer += chunk
+                 if len(buffer) >= 4096:
+                     audio_array = np.frombuffer(buffer[:4096], dtype=np.float32).copy()  # Make a writable copy
+                     stream.write(audio_array.tobytes())
+                     buffer = buffer[4096:]
+         finally:
+             stream.stop_stream()
+             stream.close()
+             p.terminate()
+
+     try:
+         # Send only the text to the server
+         await asyncio.get_event_loop().run_in_executor(None, client_socket.sendall, text.encode('utf-8'))
+         await play_audio_stream()
+         print("Audio playback finished.")
+
+     except Exception as e:
+         print(f"Error in listen_to_voice: {e}")
+
+     finally:
+         client_socket.close()
+
+ # Example usage: replace these with your actual server IP and port
+ async def main():
+     await listen_to_voice("my name is jenny..", server_ip='localhost', server_port=9999)
+
+ # Run the main async function
+ asyncio.run(main())
+ ```
+
+ </details>
+
src/f5_tts/infer/examples/basic/basic.toml ADDED
@@ -0,0 +1,10 @@
+ # F5-TTS | E2-TTS
+ model = "F5-TTS"
+ ref_audio = "infer/examples/basic/basic_ref_en.wav"
+ # If an empty "", transcribes the reference audio automatically.
+ ref_text = "Some call me nature, others call me mother nature."
+ gen_text = "I don't really care what you call me. I've been a silent spectator, watching species evolve, empires rise and fall. But always remember, I am mighty and enduring."
+ # File with text to generate. Ignores the text above.
+ gen_file = ""
+ remove_silence = false
+ output_dir = "tests"
src/f5_tts/infer/examples/basic/basic_ref_en.wav ADDED
Binary file (256 kB)
 
src/f5_tts/infer/examples/basic/basic_ref_zh.wav ADDED
Binary file (325 kB)
 
src/f5_tts/infer/examples/multi/country.flac ADDED
Binary file (180 kB)
 
src/f5_tts/infer/examples/multi/main.flac ADDED
Binary file (279 kB)
 
src/f5_tts/infer/examples/multi/story.toml ADDED
@@ -0,0 +1,19 @@
+ # F5-TTS | E2-TTS
+ model = "F5-TTS"
+ ref_audio = "infer/examples/multi/main.flac"
+ # If an empty "", transcribes the reference audio automatically.
+ ref_text = ""
+ gen_text = ""
+ # File with text to generate. Ignores the text above.
+ gen_file = "infer/examples/multi/story.txt"
+ remove_silence = true
+ output_dir = "tests"
+
+ [voices.town]
+ ref_audio = "infer/examples/multi/town.flac"
+ ref_text = ""
+
+ [voices.country]
+ ref_audio = "infer/examples/multi/country.flac"
+ ref_text = ""
+
@@ -0,0 +1 @@
 
 
1
+ A Town Mouse and a Country Mouse were acquaintances, and the Country Mouse one day invited his friend to come and see him at his home in the fields. The Town Mouse came, and they sat down to a dinner of barleycorns and roots, the latter of which had a distinctly earthy flavour. The fare was not much to the taste of the guest, and presently he broke out with [town] “My poor dear friend, you live here no better than the ants. Now, you should just see how I fare! My larder is a regular horn of plenty. You must come and stay with me, and I promise you you shall live on the fat of the land.” [main] So when he returned to town he took the Country Mouse with him, and showed him into a larder containing flour and oatmeal and figs and honey and dates. The Country Mouse had never seen anything like it, and sat down to enjoy the luxuries his friend provided: but before they had well begun, the door of the larder opened and someone came in. The two Mice scampered off and hid themselves in a narrow and exceedingly uncomfortable hole. Presently, when all was quiet, they ventured out again; but someone else came in, and off they scuttled again. This was too much for the visitor. [country] “Goodbye,” [main] said he, [country] “I’m off. You live in the lap of luxury, I can see, but you are surrounded by dangers; whereas at home I can enjoy my simple dinner of roots and corn in peace.”
src/f5_tts/infer/examples/multi/town.flac ADDED
Binary file (229 kB)
 
src/f5_tts/infer/examples/vocab.txt ADDED
@@ -0,0 +1,2545 @@
+ [vocab.txt is a 2545-line tokenizer vocabulary, one token per line: a leading blank (space) token; ASCII punctuation, digits, and upper/lowercase letters; pinyin syllables with tone-number suffixes (a1 ... zuo4); Latin-1 and extended Latin characters; IPA symbols; Greek, Cyrillic, Hebrew, and Arabic characters; and CJK characters. The CJK entries did not survive this rendering, and the listing here was truncated at entry 2180 of 2545; see the raw file for the full vocabulary.]
+
2181
+
2182
+
2183
+
2184
+
2185
+
2186
+
2187
+
2188
+
2189
+
2190
+
2191
+
2192
+
2193
+
2194
+
2195
+
2196
+
2197
+
2198
+
2199
+
2200
+
2201
+
2202
+
2203
+
2204
+
2205
+
2206
+
2207
+
2208
+
2209
+
2210
+
2211
+
2212
+
2213
+
2214
+
2215
+
2216
+
2217
+
2218
+
2219
+
2220
+
2221
+
2222
+
2223
+
2224
+
2225
+
2226
+
2227
+
2228
+
2229
+
2230
+
2231
+
2232
+
2233
+
2234
+
2235
+
2236
+
2237
+
2238
+
2239
+
2240
+
2241
+
2242
+
2243
+
2244
+
2245
+
2246
+
2247
+
2248
+
2249
+
2250
+
2251
+
2252
+
2253
+
2254
+
2255
+
2256
+
2257
+
2258
+
2259
+
2260
+
2261
+
2262
+
2263
+
2264
+
2265
+
2266
+
2267
+
2268
+
2269
+
2270
+
2271
+
2272
+
2273
+
2274
+
2275
+
2276
+
2277
+
2278
+
2279
+
2280
+
2281
+
2282
+
2283
+
2284
+
2285
+
2286
+
2287
+
2288
+
2289
+
2290
+
2291
+
2292
+
2293
+
2294
+
2295
+
2296
+
2297
+
2298
+
2299
+
2300
+
2301
+
2302
+
2303
+
2304
+
2305
+
2306
+
2307
+
2308
+
2309
+
2310
+
2311
+
2312
+
2313
+
2314
+
2315
+
2316
+
2317
+
2318
+
2319
+
2320
+
2321
+
2322
+
2323
+
2324
+
2325
+
2326
+
2327
+
2328
+
2329
+
2330
+
2331
+
2332
+
2333
+
2334
+
2335
+
2336
+
2337
+
2338
+
2339
+
2340
+
2341
+
2342
+
2343
+
2344
+
2345
+
2346
+
2347
+
2348
+
2349
+
2350
+
2351
+
2352
+
2353
+
2354
+
2355
+
2356
+
2357
+
2358
+
2359
+
2360
+
2361
+
2362
+
2363
+
2364
+
2365
+
2366
+
2367
+
2368
+
2369
+
2370
+
2371
+
2372
+
2373
+
2374
+
2375
+
2376
+
2377
+
2378
+
2379
+
2380
+
2381
+
2382
+
2383
+
2384
+
2385
+
2386
+
2387
+
2388
+
2389
+
2390
+
2391
+
2392
+
2393
+
2394
+
2395
+
2396
+
2397
+
2398
+
2399
+
2400
+
2401
+
2402
+
2403
+
2404
+
2405
+
2406
+
2407
+
2408
+
2409
+
2410
+
2411
+
2412
+
2413
+
2414
+
2415
+
2416
+
2417
+
2418
+
2419
+
2420
+
2421
+
2422
+
2423
+
2424
+
2425
+
2426
+
2427
+
2428
+
2429
+
2430
+
2431
+
2432
+
2433
+
2434
+
2435
+
2436
+
2437
+
2438
+
2439
+
2440
+
2441
+
2442
+
2443
+
2444
+
2445
+
2446
+
2447
+
2448
+
2449
+
2450
+
2451
+
2452
+
2453
+
2454
+
2455
+
2456
+
2457
+
2458
+
2459
+
2460
+
2461
+
2462
+
2463
+
2464
+
2465
+
2466
+
2467
+
2468
+
2469
+
2470
+
2471
+
2472
+
2473
+
2474
+
2475
+
2476
+
2477
+
2478
+
2479
+
2480
+
2481
+
2482
+
2483
+
2484
+
2485
+
2486
+
2487
+
2488
+
2489
+
2490
+
2491
+
2492
+
2493
+
2494
+
2495
+
2496
+
2497
+
2498
+
2499
+
2500
+
2501
+
2502
+
2503
+
2504
+
2505
+
2506
+
2507
+
2508
+
2509
+
2510
+
2511
+
2512
+
2513
+
2514
+
2515
+
2516
+
2517
+
2518
+
2519
+
2520
+
2521
+
2522
+
2523
+
2524
+
2525
+
2526
+
2527
+
2528
+
2529
+
2530
+
2531
+
2532
+
2533
+
2534
+
2535
+
2536
+
2537
+
2538
+
2539
+
2540
+
2541
+
2542
+
2543
+
2544
+
2545
+ 𠮶
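End of `vocab.txt`: one token per line (pinyin syllables with tone digits, punctuation, accented Latin, IPA, Greek/Cyrillic/Hebrew/Arabic, and CJK characters), with the 0-based line number serving as the token id. As a rough illustration of how `load_model` consumes this file through `get_tokenizer`, here is a minimal sketch; `load_vocab_map` is a hypothetical helper, not the library's actual implementation:

```python
# Minimal sketch (hypothetical helper, not f5_tts.model.utils.get_tokenizer):
# map each vocab.txt line to its 0-based line number as the token id.
def load_vocab_map(vocab_path: str) -> dict:
    vocab_char_map = {}
    with open(vocab_path, encoding="utf-8") as f:
        for i, line in enumerate(f):
            vocab_char_map[line.rstrip("\n")] = i
    return vocab_char_map

# vocab_size is then len(vocab_char_map); input text is looked up token by
# token (characters / pinyin syllables) before being fed to the model.
```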
src/f5_tts/infer/infer_cli.py ADDED
@@ -0,0 +1,220 @@
1
+ import argparse
2
+ import codecs
3
+ import os
4
+ import re
5
+ from importlib.resources import files
6
+ from pathlib import Path
7
+
8
+ import numpy as np
9
+ import soundfile as sf
10
+ import tomli
11
+ from cached_path import cached_path
12
+
13
+ from f5_tts.infer.utils_infer import (
14
+ infer_process,
15
+ load_model,
16
+ load_vocoder,
17
+ preprocess_ref_audio_text,
18
+ remove_silence_for_generated_wav,
19
+ )
20
+ from f5_tts.model import DiT, UNetT
21
+
22
+ parser = argparse.ArgumentParser(
23
+ prog="python3 infer_cli.py",
24
+ description="Commandline interface for E2/F5 TTS with Advanced Batch Processing.",
25
+ epilog="Specify options above to override one or more settings from config.",
26
+ )
27
+ parser.add_argument(
28
+ "-c",
29
+ "--config",
30
+ help="Configuration file. Default=infer/examples/basic/basic.toml",
31
+ default=os.path.join(files("f5_tts").joinpath("infer/examples/basic"), "basic.toml"),
32
+ )
33
+ parser.add_argument(
34
+ "-m",
35
+ "--model",
36
+ help="F5-TTS | E2-TTS",
37
+ )
38
+ parser.add_argument(
39
+ "-p",
40
+ "--ckpt_file",
41
+ help="The Checkpoint .pt",
42
+ )
43
+ parser.add_argument(
44
+ "-v",
45
+ "--vocab_file",
46
+ help="The vocab .txt",
47
+ )
48
+ parser.add_argument("-r", "--ref_audio", type=str, help="Reference audio file < 15 seconds.")
49
+ parser.add_argument("-s", "--ref_text", type=str, default="666", help="Subtitle for the reference audio.")
50
+ parser.add_argument(
51
+ "-t",
52
+ "--gen_text",
53
+ type=str,
54
+ help="Text to generate.",
55
+ )
56
+ parser.add_argument(
57
+ "-f",
58
+ "--gen_file",
59
+ type=str,
60
+ help="File with text to generate. Ignores --text",
61
+ )
62
+ parser.add_argument(
63
+ "-o",
64
+ "--output_dir",
65
+ type=str,
66
+ help="Path to output folder..",
67
+ )
68
+ parser.add_argument(
69
+ "--remove_silence",
70
+ help="Remove silence.",
71
+ )
72
+ parser.add_argument("--vocoder_name", type=str, default="vocos", choices=["vocos", "bigvgan"], help="vocoder name")
73
+ parser.add_argument(
74
+ "--load_vocoder_from_local",
75
+ action="store_true",
76
+ help="load vocoder from local. Default: ../checkpoints/charactr/vocos-mel-24khz",
77
+ )
78
+ parser.add_argument(
79
+ "--speed",
80
+ type=float,
81
+ default=1.0,
82
+ help="Adjust the speed of the audio generation (default: 1.0)",
83
+ )
84
+ args = parser.parse_args()
85
+
86
+ config = tomli.load(open(args.config, "rb"))
87
+
88
+ ref_audio = args.ref_audio if args.ref_audio else config["ref_audio"]
89
+ ref_text = args.ref_text if args.ref_text != "666" else config["ref_text"]
90
+ gen_text = args.gen_text if args.gen_text else config["gen_text"]
91
+ gen_file = args.gen_file if args.gen_file else config["gen_file"]
92
+
93
+ # patches for pip pkg user
94
+ if "infer/examples/" in ref_audio:
95
+ ref_audio = str(files("f5_tts").joinpath(f"{ref_audio}"))
96
+ if "infer/examples/" in gen_file:
97
+ gen_file = str(files("f5_tts").joinpath(f"{gen_file}"))
98
+ if "voices" in config:
99
+ for voice in config["voices"]:
100
+ voice_ref_audio = config["voices"][voice]["ref_audio"]
101
+ if "infer/examples/" in voice_ref_audio:
102
+ config["voices"][voice]["ref_audio"] = str(files("f5_tts").joinpath(f"{voice_ref_audio}"))
103
+
104
+ if gen_file:
105
+ gen_text = codecs.open(gen_file, "r", "utf-8").read()
106
+ output_dir = args.output_dir if args.output_dir else config["output_dir"]
107
+ model = args.model if args.model else config["model"]
108
+ ckpt_file = args.ckpt_file if args.ckpt_file else ""
109
+ vocab_file = args.vocab_file if args.vocab_file else ""
110
+ remove_silence = args.remove_silence if args.remove_silence else config["remove_silence"]
111
+ speed = args.speed
112
+ wave_path = Path(output_dir) / "infer_cli_out.wav"
113
+ # spectrogram_path = Path(output_dir) / "infer_cli_out.png"
114
+ if args.vocoder_name == "vocos":
115
+ vocoder_local_path = "../checkpoints/vocos-mel-24khz"
116
+ elif args.vocoder_name == "bigvgan":
117
+ vocoder_local_path = "../checkpoints/bigvgan_v2_24khz_100band_256x"
118
+ mel_spec_type = args.vocoder_name
119
+
120
+ vocoder = load_vocoder(vocoder_name=mel_spec_type, is_local=args.load_vocoder_from_local, local_path=vocoder_local_path)
121
+
122
+
123
+ # load models
124
+ if model == "F5-TTS":
125
+ model_cls = DiT
126
+ model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
127
+ if ckpt_file == "":
128
+ if args.vocoder_name == "vocos":
129
+ repo_name = "F5-TTS"
130
+ exp_name = "F5TTS_Base"
131
+ ckpt_step = 1200000
132
+ ckpt_file = str(cached_path(f"hf://SWivid/{repo_name}/{exp_name}/model_{ckpt_step}.safetensors"))
133
+ # ckpt_file = f"ckpts/{exp_name}/model_{ckpt_step}.pt" # .pt | .safetensors; local path
134
+ elif args.vocoder_name == "bigvgan":
135
+ repo_name = "F5-TTS"
136
+ exp_name = "F5TTS_Base_bigvgan"
137
+ ckpt_step = 1250000
138
+ ckpt_file = str(cached_path(f"hf://SWivid/{repo_name}/{exp_name}/model_{ckpt_step}.pt"))
139
+
140
+ elif model == "E2-TTS":
141
+ model_cls = UNetT
142
+ model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4)
143
+ if ckpt_file == "":
144
+ repo_name = "E2-TTS"
145
+ exp_name = "E2TTS_Base"
146
+ ckpt_step = 1200000
147
+ ckpt_file = str(cached_path(f"hf://SWivid/{repo_name}/{exp_name}/model_{ckpt_step}.safetensors"))
148
+ # ckpt_file = f"ckpts/{exp_name}/model_{ckpt_step}.pt" # .pt | .safetensors; local path
149
+ elif args.vocoder_name == "bigvgan": # TODO: need to test
150
+ repo_name = "F5-TTS"
151
+ exp_name = "F5TTS_Base_bigvgan"
152
+ ckpt_step = 1250000
153
+ ckpt_file = str(cached_path(f"hf://SWivid/{repo_name}/{exp_name}/model_{ckpt_step}.pt"))
154
+
155
+
156
+ print(f"Using {model}...")
157
+ ema_model = load_model(model_cls, model_cfg, ckpt_file, mel_spec_type=args.vocoder_name, vocab_file=vocab_file)
158
+
159
+
160
+ def main_process(ref_audio, ref_text, text_gen, model_obj, mel_spec_type, remove_silence, speed):
161
+ main_voice = {"ref_audio": ref_audio, "ref_text": ref_text}
162
+ if "voices" not in config:
163
+ voices = {"main": main_voice}
164
+ else:
165
+ voices = config["voices"]
166
+ voices["main"] = main_voice
167
+ for voice in voices:
168
+ voices[voice]["ref_audio"], voices[voice]["ref_text"] = preprocess_ref_audio_text(
169
+ voices[voice]["ref_audio"], voices[voice]["ref_text"]
170
+ )
171
+ print("Voice:", voice)
172
+ print("Ref_audio:", voices[voice]["ref_audio"])
173
+ print("Ref_text:", voices[voice]["ref_text"])
174
+
175
+ generated_audio_segments = []
176
+ reg1 = r"(?=\[\w+\])"
177
+ chunks = re.split(reg1, text_gen)
178
+ reg2 = r"\[(\w+)\]"
179
+ for text in chunks:
180
+ if not text.strip():
181
+ continue
182
+ match = re.match(reg2, text)
183
+ if match:
184
+ voice = match[1]
185
+ else:
186
+ print("No voice tag found, using main.")
187
+ voice = "main"
188
+ if voice not in voices:
189
+ print(f"Voice {voice} not found, using main.")
190
+ voice = "main"
191
+ text = re.sub(reg2, "", text)
192
+ gen_text = text.strip()
193
+ ref_audio = voices[voice]["ref_audio"]
194
+ ref_text = voices[voice]["ref_text"]
195
+ print(f"Voice: {voice}")
196
+ audio, final_sample_rate, spectrogram = infer_process(
197
+ ref_audio, ref_text, gen_text, model_obj, vocoder, mel_spec_type=mel_spec_type, speed=speed
198
+ )
199
+ generated_audio_segments.append(audio)
200
+
201
+ if generated_audio_segments:
202
+ final_wave = np.concatenate(generated_audio_segments)
203
+
204
+ if not os.path.exists(output_dir):
205
+ os.makedirs(output_dir)
206
+
207
+ with open(wave_path, "wb") as f:
208
+ sf.write(f.name, final_wave, final_sample_rate)
209
+ # Remove silence
210
+ if remove_silence:
211
+ remove_silence_for_generated_wav(f.name)
212
+ print(f.name)
213
+
214
+
215
+ def main():
216
+ main_process(ref_audio, ref_text, gen_text, ema_model, mel_spec_type, remove_silence, speed)
217
+
218
+
219
+ if __name__ == "__main__":
220
+ main()
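As a quick sanity check of the multi-voice chunking above, the `reg1`/`reg2` logic in `main_process` can be run on its own. The snippet below is a self-contained illustration with made-up text; it mirrors the splitting exactly:

```python
import re

# Mirrors main_process: a zero-width lookahead split keeps each [voice] tag
# attached to the chunk it introduces; chunks without a tag fall back to "main".
text_gen = "[main] Hola a todos. [town] Bienvenidos. [main] Gracias."
for chunk in re.split(r"(?=\[\w+\])", text_gen):
    if not chunk.strip():
        continue
    match = re.match(r"\[(\w+)\]", chunk)
    voice = match[1] if match else "main"
    print(voice, "->", re.sub(r"\[(\w+)\]", "", chunk).strip())
# main -> Hola a todos.
# town -> Bienvenidos.
# main -> Gracias.
```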
src/f5_tts/infer/infer_gradio.py ADDED
@@ -0,0 +1,744 @@
1
+ # ruff: noqa: E402
2
+ # Above allows ruff to ignore E402: module level import not at top of file
3
+
4
+ import re
5
+ import tempfile
6
+
7
+ import click
8
+ import gradio as gr
9
+ import numpy as np
10
+ import soundfile as sf
11
+ import torchaudio
12
+ from cached_path import cached_path
13
+ from transformers import AutoModelForCausalLM, AutoTokenizer
14
+ from num2words import num2words
15
+
16
+ try:
17
+ import spaces
18
+
19
+ USING_SPACES = True
20
+ except ImportError:
21
+ USING_SPACES = False
22
+
23
+
24
+ def gpu_decorator(func):
25
+ if USING_SPACES:
26
+ return spaces.GPU(func)
27
+ else:
28
+ return func
29
+
30
+
31
+ from f5_tts.model import DiT, UNetT
32
+ from f5_tts.infer.utils_infer import (
33
+ load_vocoder,
34
+ load_model,
35
+ preprocess_ref_audio_text,
36
+ infer_process,
37
+ remove_silence_for_generated_wav,
38
+ save_spectrogram,
39
+ )
40
+
41
+ vocoder = load_vocoder()
42
+
43
+
44
+ # load models
45
+ F5TTS_model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
46
+ F5TTS_ema_model = load_model(
47
+ DiT, F5TTS_model_cfg, str(cached_path("hf://jpgallegoar/F5-Spanish/model_1200000.safetensors"))
48
+ )
49
+
50
+ chat_model_state = None
51
+ chat_tokenizer_state = None
52
+
53
+
54
+ @gpu_decorator
55
+ def generate_response(messages, model, tokenizer):
56
+ """Generate response using Qwen"""
57
+ text = tokenizer.apply_chat_template(
58
+ messages,
59
+ tokenize=False,
60
+ add_generation_prompt=True,
61
+ )
62
+
63
+ model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
64
+ generated_ids = model.generate(
65
+ **model_inputs,
66
+ max_new_tokens=512,
67
+ temperature=0.7,
68
+ top_p=0.95,
69
+ )
70
+
71
+ generated_ids = [
72
+ output_ids[len(input_ids) :] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
73
+ ]
74
+ return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
75
+
76
+ def traducir_numero_a_texto(texto):
77
+ texto_separado = re.sub(r'([A-Za-z])(\d)', r'\1 \2', texto)
78
+ texto_separado = re.sub(r'(\d)([A-Za-z])', r'\1 \2', texto_separado)
79
+
80
+ def reemplazar_numero(match):
81
+ numero = match.group()
82
+ return num2words(int(numero), lang='es')
83
+
84
+ texto_traducido = re.sub(r'\b\d+\b', reemplazar_numero, texto_separado)
85
+
86
+ return texto_traducido
87
+
88
+ @gpu_decorator
89
+ def infer(
90
+ ref_audio_orig, ref_text, gen_text, model, remove_silence, cross_fade_duration=0.15, speed=1, show_info=gr.Info
91
+ ):
92
+ ref_audio, ref_text = preprocess_ref_audio_text(ref_audio_orig, ref_text, show_info=show_info)
93
+
94
+ ema_model = F5TTS_ema_model
95
+
96
+ if not gen_text.startswith(" "):
97
+ gen_text = " " + gen_text
98
+ if not gen_text.endswith(". "):
99
+ gen_text += ". "
100
+
101
+ gen_text = gen_text.lower()
102
+ gen_text = traducir_numero_a_texto(gen_text)
103
+
104
+ final_wave, final_sample_rate, combined_spectrogram = infer_process(
105
+ ref_audio,
106
+ ref_text,
107
+ gen_text,
108
+ ema_model,
109
+ vocoder,
110
+ cross_fade_duration=cross_fade_duration,
111
+ speed=speed,
112
+ show_info=show_info,
113
+ progress=gr.Progress(),
114
+ )
115
+
116
+ # Remove silence
117
+ if remove_silence:
118
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
119
+ sf.write(f.name, final_wave, final_sample_rate)
120
+ remove_silence_for_generated_wav(f.name)
121
+ final_wave, _ = torchaudio.load(f.name)
122
+ final_wave = final_wave.squeeze().cpu().numpy()
123
+
124
+ # Save the spectrogram
125
+ with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_spectrogram:
126
+ spectrogram_path = tmp_spectrogram.name
127
+ save_spectrogram(combined_spectrogram, spectrogram_path)
128
+
129
+ return (final_sample_rate, final_wave), spectrogram_path
130
+
131
+
132
+ with gr.Blocks() as app_credits:
133
+ gr.Markdown("""
134
+ # Créditos
135
+
136
+ * [mrfakename](https://github.com/fakerybakery) por el [demo online original](https://huggingface.co/spaces/mrfakename/E2-F5-TTS)
137
+ * [RootingInLoad](https://github.com/RootingInLoad) por la generación inicial de fragmentos y exploración de la aplicación de podcast
138
+ * [jpgallegoar](https://github.com/jpgallegoar) por la generación de múltiples tipos de habla, chat de voz y afinación en español
139
+ """)
140
+
141
+
142
+ with gr.Blocks() as app_tts:
143
+ gr.Markdown("# TTS por Lotes")
144
+ ref_audio_input = gr.Audio(label="Audio de Referencia", type="filepath")
145
+ gen_text_input = gr.Textbox(label="Texto para Generar", lines=10)
146
+ model_choice = gr.Radio(choices=["F5-TTS"], label="Seleccionar Modelo TTS", value="F5-TTS")
147
+ generate_btn = gr.Button("Sintetizar", variant="primary")
148
+ with gr.Accordion("Configuraciones Avanzadas", open=False):
149
+ ref_text_input = gr.Textbox(
150
+ label="Texto de Referencia",
151
+ info="Deja en blanco para transcribir automáticamente el audio de referencia. Si ingresas texto, sobrescribirá la transcripción automática.",
152
+ lines=2,
153
+ )
154
+ remove_silence = gr.Checkbox(
155
+ label="Eliminar Silencios",
156
+ info="El modelo tiende a producir silencios, especialmente en audios más largos. Podemos eliminar manualmente los silencios si es necesario. Ten en cuenta que esta es una característica experimental y puede producir resultados extraños. Esto también aumentará el tiempo de generación.",
157
+ value=False,
158
+ )
159
+ speed_slider = gr.Slider(
160
+ label="Velocidad",
161
+ minimum=0.3,
162
+ maximum=2.0,
163
+ value=1.0,
164
+ step=0.1,
165
+ info="Ajusta la velocidad del audio.",
166
+ )
167
+ cross_fade_duration_slider = gr.Slider(
168
+ label="Duración del Cross-Fade (s)",
169
+ minimum=0.0,
170
+ maximum=1.0,
171
+ value=0.15,
172
+ step=0.01,
173
+ info="Establece la duración del cross-fade entre clips de audio.",
174
+ )
175
+
176
+ audio_output = gr.Audio(label="Audio Sintetizado")
177
+ spectrogram_output = gr.Image(label="Espectrograma")
178
+
179
+ generate_btn.click(
180
+ infer,
181
+ inputs=[
182
+ ref_audio_input,
183
+ ref_text_input,
184
+ gen_text_input,
185
+ model_choice,
186
+ remove_silence,
187
+ cross_fade_duration_slider,
188
+ speed_slider,
189
+ ],
190
+ outputs=[audio_output, spectrogram_output],
191
+ )
192
+
193
+
194
+ def parse_speechtypes_text(gen_text):
195
+ # Pattern to find {speechtype}
196
+ pattern = r"\{(.*?)\}"
197
+
198
+ # Split the text by the pattern
199
+ tokens = re.split(pattern, gen_text)
200
+
201
+ segments = []
202
+
203
+ current_style = "Regular"
204
+
205
+ for i in range(len(tokens)):
206
+ if i % 2 == 0:
207
+ # This is text
208
+ text = tokens[i].strip()
209
+ if text:
210
+ segments.append({"style": current_style, "text": text})
211
+ else:
212
+ # This is style
213
+ style = tokens[i].strip()
214
+ current_style = style
215
+
216
+ return segments
217
+
218
+
219
+ with gr.Blocks() as app_multistyle:
220
+ # New section for multistyle generation
221
+ gr.Markdown(
222
+ """
223
+ # Generación de Múltiples Tipos de Habla
224
+
225
+ Esta sección te permite generar múltiples tipos de habla o las voces de múltiples personas. Ingresa tu texto en el formato mostrado a continuación, y el sistema generará el habla utilizando el tipo apropiado. Si no se especifica, el modelo utilizará el tipo de habla regular. El tipo de habla actual se usará hasta que se especifique el siguiente tipo de habla.
226
+ """
227
+ )
228
+
229
+ with gr.Row():
230
+ gr.Markdown(
231
+ """
232
+ **Entrada de Ejemplo:**
233
+ {Regular} Hola, me gustaría pedir un sándwich, por favor.
234
+ {Sorprendido} ¿Qué quieres decir con que no tienen pan?
235
+ {Triste} Realmente quería un sándwich...
236
+ {Enojado} ¡Sabes qué, maldición a ti y a tu pequeña tienda!
237
+ {Susurro} Solo volveré a casa y lloraré ahora.
238
+ {Gritando} ¿Por qué yo?!
239
+ """
240
+ )
241
+
242
+ gr.Markdown(
243
+ """
244
+ **Entrada de Ejemplo 2:**
245
+ {Speaker1_Feliz} Hola, me gustaría pedir un sándwich, por favor.
246
+ {Speaker2_Regular} Lo siento, nos hemos quedado sin pan.
247
+ {Speaker1_Triste} Realmente quería un sándwich...
248
+ {Speaker2_Susurro} Te daré el último que estaba escondiendo.
249
+ """
250
+ )
251
+
252
+ gr.Markdown(
253
+ "Sube diferentes clips de audio para cada tipo de habla. El primer tipo de habla es obligatorio. Puedes agregar tipos de habla adicionales haciendo clic en el botón 'Agregar Tipo de Habla'."
254
+ )
255
+
256
+ # Regular speech type (mandatory)
257
+ with gr.Row():
258
+ with gr.Column():
259
+ regular_name = gr.Textbox(value="Regular", label="Nombre del Tipo de Habla")
260
+ regular_insert = gr.Button("Insertar", variant="secondary")
261
+ regular_audio = gr.Audio(label="Audio de Referencia Regular", type="filepath")
262
+ regular_ref_text = gr.Textbox(label="Texto de Referencia (Regular)", lines=2)
263
+
264
+ # Additional speech types (up to 99 more)
265
+ max_speech_types = 100
266
+ speech_type_rows = []
267
+ speech_type_names = [regular_name]
268
+ speech_type_audios = []
269
+ speech_type_ref_texts = []
270
+ speech_type_delete_btns = []
271
+ speech_type_insert_btns = []
272
+ speech_type_insert_btns.append(regular_insert)
273
+
274
+ for i in range(max_speech_types - 1):
275
+ with gr.Row(visible=False) as row:
276
+ with gr.Column():
277
+ name_input = gr.Textbox(label="Nombre del Tipo de Habla")
278
+ delete_btn = gr.Button("Eliminar", variant="secondary")
279
+ insert_btn = gr.Button("Insertar", variant="secondary")
280
+ audio_input = gr.Audio(label="Audio de Referencia", type="filepath")
281
+ ref_text_input = gr.Textbox(label="Texto de Referencia", lines=2)
282
+ speech_type_rows.append(row)
283
+ speech_type_names.append(name_input)
284
+ speech_type_audios.append(audio_input)
285
+ speech_type_ref_texts.append(ref_text_input)
286
+ speech_type_delete_btns.append(delete_btn)
287
+ speech_type_insert_btns.append(insert_btn)
288
+
289
+ # Button to add speech type
290
+ add_speech_type_btn = gr.Button("Agregar Tipo de Habla")
291
+
292
+ # Keep track of current number of speech types
293
+ speech_type_count = gr.State(value=0)
294
+
295
+ # Function to add a speech type
296
+ def add_speech_type_fn(speech_type_count):
297
+ if speech_type_count < max_speech_types - 1:
298
+ speech_type_count += 1
299
+ # Prepare updates for the rows
300
+ row_updates = []
301
+ for i in range(max_speech_types - 1):
302
+ if i < speech_type_count:
303
+ row_updates.append(gr.update(visible=True))
304
+ else:
305
+ row_updates.append(gr.update())
306
+ else:
307
+ # Optionally, show a warning
308
+ row_updates = [gr.update() for _ in range(max_speech_types - 1)]
309
+ return [speech_type_count] + row_updates
310
+
311
+ add_speech_type_btn.click(
312
+ add_speech_type_fn, inputs=speech_type_count, outputs=[speech_type_count] + speech_type_rows
313
+ )
314
+
315
+ # Function to delete a speech type
316
+ def make_delete_speech_type_fn(index):
317
+ def delete_speech_type_fn(speech_type_count):
318
+ # Prepare updates
319
+ row_updates = []
320
+
321
+ for i in range(max_speech_types - 1):
322
+ if i == index:
323
+ row_updates.append(gr.update(visible=False))
324
+ else:
325
+ row_updates.append(gr.update())
326
+
327
+ speech_type_count = max(0, speech_type_count - 1)
328
+
329
+ return [speech_type_count] + row_updates
330
+
331
+ return delete_speech_type_fn
332
+
333
+ # Update delete button clicks
334
+ for i, delete_btn in enumerate(speech_type_delete_btns):
335
+ delete_fn = make_delete_speech_type_fn(i)
336
+ delete_btn.click(delete_fn, inputs=speech_type_count, outputs=[speech_type_count] + speech_type_rows)
337
+
338
+ # Text input for the prompt
339
+ gen_text_input_multistyle = gr.Textbox(
340
+ label="Texto para Generar",
341
+ lines=10,
342
+ placeholder="Ingresa el guion con los nombres de los hablantes (o tipos de emociones) al inicio de cada bloque, por ejemplo:\n\n{Regular} Hola, me gustaría pedir un sándwich, por favor.\n{Sorprendido} ¿Qué quieres decir con que no tienen pan?\n{Triste} Realmente quería un sándwich...\n{Enojado} ¡Sabes qué, maldición a ti y a tu pequeña tienda!\n{Susurro} Solo volveré a casa y lloraré ahora.\n{Gritando} ¿Por qué yo?!",
343
+ )
344
+
345
+ def make_insert_speech_type_fn(index):
346
+ def insert_speech_type_fn(current_text, speech_type_name):
347
+ current_text = current_text or ""
348
+ speech_type_name = speech_type_name or "Ninguno"
349
+ updated_text = current_text + f"{{{speech_type_name}}} "
350
+ return gr.update(value=updated_text)
351
+
352
+ return insert_speech_type_fn
353
+
354
+ for i, insert_btn in enumerate(speech_type_insert_btns):
355
+ insert_fn = make_insert_speech_type_fn(i)
356
+ insert_btn.click(
357
+ insert_fn,
358
+ inputs=[gen_text_input_multistyle, speech_type_names[i]],
359
+ outputs=gen_text_input_multistyle,
360
+ )
361
+
362
+ # Model choice
363
+ model_choice_multistyle = gr.Radio(choices=["F5-TTS"], label="Seleccionar Modelo TTS", value="F5-TTS")
364
+
365
+ with gr.Accordion("Configuraciones Avanzadas", open=False):
366
+ remove_silence_multistyle = gr.Checkbox(
367
+ label="Eliminar Silencios",
368
+ value=False,
369
+ )
370
+
371
+ # Generate button
372
+ generate_multistyle_btn = gr.Button("Generar Habla Multi-Estilo", variant="primary")
373
+
374
+ # Output audio
375
+ audio_output_multistyle = gr.Audio(label="Audio Sintetizado")
376
+
377
+ @gpu_decorator
378
+ def generate_multistyle_speech(
379
+ regular_audio,
380
+ regular_ref_text,
381
+ gen_text,
382
+ *args,
383
+ ):
384
+ num_additional_speech_types = max_speech_types - 1
385
+ speech_type_names_list = args[:num_additional_speech_types]
386
+ speech_type_audios_list = args[num_additional_speech_types : 2 * num_additional_speech_types]
387
+ speech_type_ref_texts_list = args[2 * num_additional_speech_types : 3 * num_additional_speech_types]
388
+ model_choice = args[3 * num_additional_speech_types]
389
+ remove_silence = args[3 * num_additional_speech_types + 1]
390
+
391
+ # Collect the speech types and their audios into a dict
392
+ speech_types = {"Regular": {"audio": regular_audio, "ref_text": regular_ref_text}}
393
+
394
+ for name_input, audio_input, ref_text_input in zip(
395
+ speech_type_names_list, speech_type_audios_list, speech_type_ref_texts_list
396
+ ):
397
+ if name_input and audio_input:
398
+ speech_types[name_input] = {"audio": audio_input, "ref_text": ref_text_input}
399
+
400
+ # Parse the gen_text into segments
401
+ segments = parse_speechtypes_text(gen_text)
402
+
403
+ # For each segment, generate speech
404
+ generated_audio_segments = []
405
+ current_style = "Regular"
406
+
407
+ for segment in segments:
408
+ style = segment["style"]
409
+ text = segment["text"]
410
+
411
+ if style in speech_types:
412
+ current_style = style
413
+ else:
414
+ # If style not available, default to Regular
415
+ current_style = "Regular"
416
+
417
+ ref_audio = speech_types[current_style]["audio"]
418
+ ref_text = speech_types[current_style].get("ref_text", "")
419
+
420
+ # Generate speech for this segment
421
+ audio, _ = infer(
422
+ ref_audio, ref_text, text, model_choice, remove_silence, 0, show_info=print
423
+ ) # show_info=print no pull to top when generating
424
+ sr, audio_data = audio
425
+
426
+ generated_audio_segments.append(audio_data)
427
+
428
+ # Concatenate all audio segments
429
+ if generated_audio_segments:
430
+ final_audio_data = np.concatenate(generated_audio_segments)
431
+ return (sr, final_audio_data)
432
+ else:
433
+ gr.Warning("No se generó ningún audio.")
434
+ return None
435
+
436
+ generate_multistyle_btn.click(
437
+ generate_multistyle_speech,
438
+ inputs=[
439
+ regular_audio,
440
+ regular_ref_text,
441
+ gen_text_input_multistyle,
442
+ ]
443
+ + speech_type_names
444
+ + speech_type_audios
445
+ + speech_type_ref_texts
446
+ + [
447
+ model_choice_multistyle,
448
+ remove_silence_multistyle,
449
+ ],
450
+ outputs=audio_output_multistyle,
451
+ )
452
+
453
+ # Validation function to disable Generate button if speech types are missing
454
+ def validate_speech_types(gen_text, regular_name, *args):
455
+ num_additional_speech_types = max_speech_types - 1
456
+ speech_type_names_list = args[:num_additional_speech_types]
457
+
458
+ # Collect the speech types names
459
+ speech_types_available = set()
460
+ if regular_name:
461
+ speech_types_available.add(regular_name)
462
+ for name_input in speech_type_names_list:
463
+ if name_input:
464
+ speech_types_available.add(name_input)
465
+
466
+ # Parse the gen_text to get the speech types used
467
+ segments = parse_speechtypes_text(gen_text)
468
+ speech_types_in_text = set(segment["style"] for segment in segments)
469
+
470
+ # Check if all speech types in text are available
471
+ missing_speech_types = speech_types_in_text - speech_types_available
472
+
473
+ if missing_speech_types:
474
+ # Disable the generate button
475
+ return gr.update(interactive=False)
476
+ else:
477
+ # Enable the generate button
478
+ return gr.update(interactive=True)
479
+
480
+ gen_text_input_multistyle.change(
481
+ validate_speech_types,
482
+ inputs=[gen_text_input_multistyle, regular_name] + speech_type_names,
483
+ outputs=generate_multistyle_btn,
484
+ )
485
+
486
+
487
+ with gr.Blocks() as app_chat:
488
+ gr.Markdown(
489
+ """
490
+ # Chat de Voz
491
+ ¡Mantén una conversación con una IA usando tu voz de referencia!
492
+ 1. Sube un clip de audio de referencia y opcionalmente su transcripción.
493
+ 2. Carga el modelo de chat.
494
+ 3. Graba tu mensaje a través de tu micrófono.
495
+ 4. La IA responderá usando la voz de referencia.
496
+ """
497
+ )
498
+
499
+ if not USING_SPACES:
500
+ load_chat_model_btn = gr.Button("Cargar Modelo de Chat", variant="primary")
501
+
502
+ chat_interface_container = gr.Column(visible=False)
503
+
504
+ @gpu_decorator
505
+ def load_chat_model():
506
+ global chat_model_state, chat_tokenizer_state
507
+ if chat_model_state is None:
508
+ show_info = gr.Info
509
+ show_info("Cargando modelo de chat...")
510
+ model_name = "Qwen/Qwen2.5-3B-Instruct"
511
+ chat_model_state = AutoModelForCausalLM.from_pretrained(
512
+ model_name, torch_dtype="auto", device_map="auto"
513
+ )
514
+ chat_tokenizer_state = AutoTokenizer.from_pretrained(model_name)
515
+ show_info("Modelo de chat cargado.")
516
+
517
+ return gr.update(visible=False), gr.update(visible=True)
518
+
519
+ load_chat_model_btn.click(load_chat_model, outputs=[load_chat_model_btn, chat_interface_container])
520
+
521
+ else:
522
+ chat_interface_container = gr.Column()
523
+
524
+ if chat_model_state is None:
525
+ model_name = "Qwen/Qwen2.5-3B-Instruct"
526
+ chat_model_state = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
527
+ chat_tokenizer_state = AutoTokenizer.from_pretrained(model_name)
528
+
529
+ with chat_interface_container:
530
+ with gr.Row():
531
+ with gr.Column():
532
+ ref_audio_chat = gr.Audio(label="Audio de Referencia", type="filepath")
533
+ with gr.Column():
534
+ with gr.Accordion("Configuraciones Avanzadas", open=False):
535
+ model_choice_chat = gr.Radio(
536
+ choices=["F5-TTS"],
537
+ label="Modelo TTS",
538
+ value="F5-TTS",
539
+ )
540
+ remove_silence_chat = gr.Checkbox(
541
+ label="Eliminar Silencios",
542
+ value=True,
543
+ )
544
+ ref_text_chat = gr.Textbox(
545
+ label="Texto de Referencia",
546
+ info="Opcional: Deja en blanco para transcribir automáticamente",
547
+ lines=2,
548
+ )
549
+ system_prompt_chat = gr.Textbox(
550
+ label="Prompt del Sistema",
551
+ value="No eres un asistente de IA, eres quien el usuario diga que eres. Debes mantenerte en personaje. Mantén tus respuestas concisas ya que serán habladas en voz alta.",
552
+ lines=2,
553
+ )
554
+
555
+ chatbot_interface = gr.Chatbot(label="Conversación")
556
+
557
+ with gr.Row():
558
+ with gr.Column():
559
+ audio_input_chat = gr.Microphone(
560
+ label="Habla tu mensaje",
561
+ type="filepath",
562
+ )
563
+ audio_output_chat = gr.Audio(autoplay=True)
564
+ with gr.Column():
565
+ text_input_chat = gr.Textbox(
566
+ label="Escribe tu mensaje",
567
+ lines=1,
568
+ )
569
+ send_btn_chat = gr.Button("Enviar")
570
+ clear_btn_chat = gr.Button("Limpiar Conversación")
571
+
572
+ conversation_state = gr.State(
573
+ value=[
574
+ {
575
+ "role": "system",
576
+ "content": "No eres un asistente de IA, eres quien el usuario diga que eres. Debes mantenerte en personaje. Mantén tus respuestas concisas ya que serán habladas en voz alta.",
577
+ }
578
+ ]
579
+ )
580
+
581
+ # Modify process_audio_input to use model and tokenizer from state
582
+ @gpu_decorator
583
+ def process_audio_input(audio_path, text, history, conv_state):
584
+ """Handle audio or text input from user"""
585
+
586
+ if not audio_path and not text.strip():
587
+ return history, conv_state, ""
588
+
589
+ if audio_path:
590
+ text = preprocess_ref_audio_text(audio_path, text)[1]
591
+
592
+ if not text.strip():
593
+ return history, conv_state, ""
594
+
595
+ conv_state.append({"role": "user", "content": text})
596
+ history.append((text, None))
597
+
598
+ response = generate_response(conv_state, chat_model_state, chat_tokenizer_state)
599
+
600
+ conv_state.append({"role": "assistant", "content": response})
601
+ history[-1] = (text, response)
602
+
603
+ return history, conv_state, ""
604
+
605
+ @gpu_decorator
606
+ def generate_audio_response(history, ref_audio, ref_text, model, remove_silence):
607
+ """Generate TTS audio for AI response"""
608
+ if not history or not ref_audio:
609
+ return None
610
+
611
+ last_user_message, last_ai_response = history[-1]
612
+ if not last_ai_response:
613
+ return None
614
+
615
+ audio_result, _ = infer(
616
+ ref_audio,
617
+ ref_text,
618
+ last_ai_response,
619
+ model,
620
+ remove_silence,
621
+ cross_fade_duration=0.15,
622
+ speed=1.0,
623
+ show_info=print, # show_info=print no pull to top when generating
624
+ )
625
+ return audio_result
626
+
627
+ def clear_conversation():
628
+ """Reset the conversation"""
629
+ return [], [
630
+ {
631
+ "role": "system",
632
+ "content": "No eres un asistente de IA, eres quien el usuario diga que eres. Debes mantenerte en personaje. Mantén tus respuestas concisas ya que serán habladas en voz alta.",
633
+ }
634
+ ]
635
+
636
+ def update_system_prompt(new_prompt):
637
+ """Update the system prompt and reset the conversation"""
638
+ new_conv_state = [{"role": "system", "content": new_prompt}]
639
+ return [], new_conv_state
640
+
641
+ # Handle audio input
642
+ audio_input_chat.stop_recording(
643
+ process_audio_input,
644
+ inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state],
645
+ outputs=[chatbot_interface, conversation_state],
646
+ ).then(
647
+ generate_audio_response,
648
+ inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, model_choice_chat, remove_silence_chat],
649
+ outputs=[audio_output_chat],
650
+ ).then(
651
+ lambda: None,
652
+ None,
653
+ audio_input_chat,
654
+ )
655
+
656
+ # Handle text input
657
+ text_input_chat.submit(
658
+ process_audio_input,
659
+ inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state],
660
+ outputs=[chatbot_interface, conversation_state],
661
+ ).then(
662
+ generate_audio_response,
663
+ inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, model_choice_chat, remove_silence_chat],
664
+ outputs=[audio_output_chat],
665
+ ).then(
666
+ lambda: None,
667
+ None,
668
+ text_input_chat,
669
+ )
670
+
671
+ # Handle send button
672
+ send_btn_chat.click(
673
+ process_audio_input,
674
+ inputs=[audio_input_chat, text_input_chat, chatbot_interface, conversation_state],
675
+ outputs=[chatbot_interface, conversation_state],
676
+ ).then(
677
+ generate_audio_response,
678
+ inputs=[chatbot_interface, ref_audio_chat, ref_text_chat, model_choice_chat, remove_silence_chat],
679
+ outputs=[audio_output_chat],
680
+ ).then(
681
+ lambda: None,
682
+ None,
683
+ text_input_chat,
684
+ )
685
+
686
+ # Handle clear button
687
+ clear_btn_chat.click(
688
+ clear_conversation,
689
+ outputs=[chatbot_interface, conversation_state],
690
+ )
691
+
692
+ # Handle system prompt change and reset conversation
693
+ system_prompt_chat.change(
694
+ update_system_prompt,
695
+ inputs=system_prompt_chat,
696
+ outputs=[chatbot_interface, conversation_state],
697
+ )
698
+
699
+
700
+ with gr.Blocks() as app:
701
+ gr.Markdown(
702
+ """
703
+ # Spanish-F5
704
+
705
+ Esta es una interfaz web para F5 TTS, con un finetuning para poder hablar en castellano
706
+
707
+ Implementación original:
708
+ * [F5-TTS](https://arxiv.org/abs/2410.06885) (A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching)
709
+
710
+ El modelo sólo soporta el castellano.
711
+
712
+ Para los mejores resultados, intenta convertir tu audio de referencia a WAV o MP3, asegurarte de que duren entre 11 y 14 segundos, que comiencen y acaben con entre medio segundo y un segundo de silencio, y a ser posible que acabe con el final de la frase.
713
+
714
+ **NOTA: El texto de referencia será transcrito automáticamente con Whisper si no se proporciona. Para mejores resultados, mantén tus clips de referencia cortos (<15s). Asegúrate de que el audio esté completamente subido antes de generar. Se utiliza la librería num2words para convertir los números a palabras.**
715
+ """
716
+ )
717
+ gr.TabbedInterface(
718
+ [app_tts, app_multistyle, app_chat, app_credits],
719
+ ["TTS", "Multi-Habla", "Chat de Voz", "Créditos"],
720
+ )
721
+
722
+
723
+ @click.command()
724
+ @click.option("--port", "-p", default=None, type=int, help="Puerto para ejecutar la aplicación")
725
+ @click.option("--host", "-H", default=None, help="Host para ejecutar la aplicación")
726
+ @click.option(
727
+ "--share",
728
+ "-s",
729
+ default=False,
730
+ is_flag=True,
731
+ help="Compartir la aplicación a través de un enlace compartido de Gradio",
732
+ )
733
+ @click.option("--api", "-a", default=True, is_flag=True, help="Permitir acceso a la API")
734
+ def main(port, host, share, api):
735
+ global app
736
+ print("Iniciando la aplicación...")
737
+ app.queue(api_open=api).launch(server_name=host, server_port=port, share=share, show_api=api)
738
+
739
+
740
+ if __name__ == "__main__":
741
+ if not USING_SPACES:
742
+ main()
743
+ else:
744
+ app.queue().launch()
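Two of the helpers above are easy to verify in isolation: `parse_speechtypes_text` turns `{style}` tags into (style, text) segments, and `traducir_numero_a_texto` relies on `num2words` to spell out digits in Spanish. A minimal check with made-up strings (the function body is copied from the app purely for illustration):

```python
import re
from num2words import num2words

def parse_speechtypes_text(gen_text):  # same logic as the app's helper above
    tokens = re.split(r"\{(.*?)\}", gen_text)  # odd indices are style names
    segments, current_style = [], "Regular"
    for i, tok in enumerate(tokens):
        if i % 2 == 0:
            if tok.strip():
                segments.append({"style": current_style, "text": tok.strip()})
        else:
            current_style = tok.strip()
    return segments

print(parse_speechtypes_text("{Regular} Hola. {Triste} Adiós."))
# [{'style': 'Regular', 'text': 'Hola.'}, {'style': 'Triste', 'text': 'Adiós.'}]

print(num2words(42, lang="es"))  # "cuarenta y dos", as used by traducir_numero_a_texto
```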
src/f5_tts/infer/speech_edit.py ADDED
@@ -0,0 +1,191 @@
1
+ import os
2
+
3
+ import torch
4
+ import torch.nn.functional as F
5
+ import torchaudio
6
+
7
+ from f5_tts.infer.utils_infer import load_checkpoint, load_vocoder, save_spectrogram
8
+ from f5_tts.model import CFM, DiT, UNetT
9
+ from f5_tts.model.utils import convert_char_to_pinyin, get_tokenizer
10
+
11
+ device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
12
+
13
+
14
+ # --------------------- Dataset Settings -------------------- #
15
+
16
+ target_sample_rate = 24000
17
+ n_mel_channels = 100
18
+ hop_length = 256
19
+ win_length = 1024
20
+ n_fft = 1024
21
+ mel_spec_type = "vocos" # 'vocos' or 'bigvgan'
22
+ target_rms = 0.1
23
+
24
+ tokenizer = "pinyin"
25
+ dataset_name = "Emilia_ZH_EN"
26
+
27
+
28
+ # ---------------------- infer setting ---------------------- #
29
+
30
+ seed = None # int | None
31
+
32
+ exp_name = "F5TTS_Base" # F5TTS_Base | E2TTS_Base
33
+ ckpt_step = 1200000
34
+
35
+ nfe_step = 32 # 16, 32
36
+ cfg_strength = 2.0
37
+ ode_method = "euler" # euler | midpoint
38
+ sway_sampling_coef = -1.0
39
+ speed = 1.0
40
+
41
+ if exp_name == "F5TTS_Base":
42
+ model_cls = DiT
43
+ model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
44
+
45
+ elif exp_name == "E2TTS_Base":
46
+ model_cls = UNetT
47
+ model_cfg = dict(dim=1024, depth=24, heads=16, ff_mult=4)
48
+
49
+ ckpt_path = f"ckpts/{exp_name}/model_{ckpt_step}.safetensors"
50
+ output_dir = "tests"
51
+
52
+ # [leverage https://github.com/MahmoudAshraf97/ctc-forced-aligner to get char level alignment]
53
+ # pip install git+https://github.com/MahmoudAshraf97/ctc-forced-aligner.git
54
+ # [write the origin_text into a file, e.g. tests/test_edit.txt]
55
+ # ctc-forced-aligner --audio_path "src/f5_tts/infer/examples/basic/basic_ref_en.wav" --text_path "tests/test_edit.txt" --language "zho" --romanize --split_size "char"
56
+ # [result will be saved at same path of audio file]
57
+ # [--language "zho" for Chinese, "eng" for English]
58
+ # [if local ckpt, set --alignment_model "../checkpoints/mms-300m-1130-forced-aligner"]
59
+
60
+ audio_to_edit = "src/f5_tts/infer/examples/basic/basic_ref_en.wav"
61
+ origin_text = "Some call me nature, others call me mother nature."
62
+ target_text = "Some call me optimist, others call me realist."
63
+ parts_to_edit = [
64
+ [1.42, 2.44],
65
+ [4.04, 4.9],
66
+ ] # stard_ends of "nature" & "mother nature", in seconds
67
+ fix_duration = [
68
+ 1.2,
69
+ 1,
70
+ ] # fix duration for "optimist" & "realist", in seconds
71
+
72
+ # audio_to_edit = "src/f5_tts/infer/examples/basic/basic_ref_zh.wav"
73
+ # origin_text = "对,这就是我,万人敬仰的太乙真人。"
74
+ # target_text = "对,那就是你,万人敬仰的太白金星。"
75
+ # parts_to_edit = [[0.84, 1.4], [1.92, 2.4], [4.26, 6.26], ]
76
+ # fix_duration = None # use origin text duration
77
+
78
+
79
+ # -------------------------------------------------#
80
+
81
+ use_ema = True
82
+
83
+ if not os.path.exists(output_dir):
84
+ os.makedirs(output_dir)
85
+
86
+ # Vocoder model
87
+ local = False
88
+ if mel_spec_type == "vocos":
89
+ vocoder_local_path = "../checkpoints/charactr/vocos-mel-24khz"
90
+ elif mel_spec_type == "bigvgan":
91
+ vocoder_local_path = "../checkpoints/bigvgan_v2_24khz_100band_256x"
92
+ vocoder = load_vocoder(vocoder_name=mel_spec_type, is_local=local, local_path=vocoder_local_path)
93
+
94
+ # Tokenizer
95
+ vocab_char_map, vocab_size = get_tokenizer(dataset_name, tokenizer)
96
+
97
+ # Model
98
+ model = CFM(
99
+ transformer=model_cls(**model_cfg, text_num_embeds=vocab_size, mel_dim=n_mel_channels),
100
+ mel_spec_kwargs=dict(
101
+ n_fft=n_fft,
102
+ hop_length=hop_length,
103
+ win_length=win_length,
104
+ n_mel_channels=n_mel_channels,
105
+ target_sample_rate=target_sample_rate,
106
+ mel_spec_type=mel_spec_type,
107
+ ),
108
+ odeint_kwargs=dict(
109
+ method=ode_method,
110
+ ),
111
+ vocab_char_map=vocab_char_map,
112
+ ).to(device)
113
+
114
+ dtype = torch.float32 if mel_spec_type == "bigvgan" else None
115
+ model = load_checkpoint(model, ckpt_path, device, dtype=dtype, use_ema=use_ema)
116
+
117
+ # Audio
118
+ audio, sr = torchaudio.load(audio_to_edit)
119
+ if audio.shape[0] > 1:
120
+ audio = torch.mean(audio, dim=0, keepdim=True)
121
+ rms = torch.sqrt(torch.mean(torch.square(audio)))
122
+ if rms < target_rms:
123
+ audio = audio * target_rms / rms
124
+ if sr != target_sample_rate:
125
+ resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
126
+ audio = resampler(audio)
127
+ offset = 0
128
+ audio_ = torch.zeros(1, 0)
129
+ edit_mask = torch.zeros(1, 0, dtype=torch.bool)
130
+ for part in parts_to_edit:
131
+ start, end = part
132
+ part_dur = end - start if fix_duration is None else fix_duration.pop(0)
133
+ part_dur = part_dur * target_sample_rate
134
+ start = start * target_sample_rate
135
+ audio_ = torch.cat((audio_, audio[:, round(offset) : round(start)], torch.zeros(1, round(part_dur))), dim=-1)
136
+ edit_mask = torch.cat(
137
+ (
138
+ edit_mask,
139
+ torch.ones(1, round((start - offset) / hop_length), dtype=torch.bool),
140
+ torch.zeros(1, round(part_dur / hop_length), dtype=torch.bool),
141
+ ),
142
+ dim=-1,
143
+ )
144
+ offset = end * target_sample_rate
145
+ # audio = torch.cat((audio_, audio[:, round(offset):]), dim = -1)
146
+ edit_mask = F.pad(edit_mask, (0, audio.shape[-1] // hop_length - edit_mask.shape[-1] + 1), value=True)
147
+ audio = audio.to(device)
148
+ edit_mask = edit_mask.to(device)
149
+
150
+ # Text
151
+ text_list = [target_text]
152
+ if tokenizer == "pinyin":
153
+ final_text_list = convert_char_to_pinyin(text_list)
154
+ else:
155
+ final_text_list = [text_list]
156
+ print(f"text : {text_list}")
157
+ print(f"pinyin: {final_text_list}")
158
+
159
+ # Duration
160
+ ref_audio_len = 0
161
+ duration = audio.shape[-1] // hop_length
162
+
163
+ # Inference
164
+ with torch.inference_mode():
165
+ generated, trajectory = model.sample(
166
+ cond=audio,
167
+ text=final_text_list,
168
+ duration=duration,
169
+ steps=nfe_step,
170
+ cfg_strength=cfg_strength,
171
+ sway_sampling_coef=sway_sampling_coef,
172
+ seed=seed,
173
+ edit_mask=edit_mask,
174
+ )
175
+ print(f"Generated mel: {generated.shape}")
176
+
177
+ # Final result
178
+ generated = generated.to(torch.float32)
179
+ generated = generated[:, ref_audio_len:, :]
180
+ gen_mel_spec = generated.permute(0, 2, 1)
181
+ if mel_spec_type == "vocos":
182
+ generated_wave = vocoder.decode(gen_mel_spec)
183
+ elif mel_spec_type == "bigvgan":
184
+ generated_wave = vocoder(gen_mel_spec)
185
+
186
+ if rms < target_rms:
187
+ generated_wave = generated_wave * rms / target_rms
188
+
189
+ save_spectrogram(gen_mel_spec[0].cpu().numpy(), f"{output_dir}/speech_edit_out.png")
190
+ torchaudio.save(f"{output_dir}/speech_edit_out.wav", generated_wave.squeeze(0).cpu(), target_sample_rate)
191
+ print(f"Generated wav: {generated_wave.shape}")
src/f5_tts/infer/utils_infer.py ADDED
@@ -0,0 +1,511 @@
1
+ # A unified script for inference process
2
+ # Make adjustments inside functions, and consider both gradio and cli scripts if need to change func output format
3
+ import os
4
+ import sys
5
+
6
+ sys.path.append(f"../../{os.path.dirname(os.path.abspath(__file__))}/third_party/BigVGAN/")
7
+
8
+ import hashlib
9
+ import re
10
+ import tempfile
11
+ from importlib.resources import files
12
+
13
+ import matplotlib
14
+
15
+ matplotlib.use("Agg")
16
+
17
+ import matplotlib.pylab as plt
18
+ import numpy as np
19
+ import torch
20
+ import torchaudio
21
+ import tqdm
22
+ from pydub import AudioSegment, silence
23
+ from transformers import pipeline
24
+ from vocos import Vocos
25
+
26
+ from f5_tts.model import CFM
27
+ from f5_tts.model.utils import (
28
+ get_tokenizer,
29
+ convert_char_to_pinyin,
30
+ )
31
+
32
+ _ref_audio_cache = {}
33
+
34
+ device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
35
+
36
+ # -----------------------------------------
37
+
38
+ target_sample_rate = 24000
39
+ n_mel_channels = 100
40
+ hop_length = 256
41
+ win_length = 1024
42
+ n_fft = 1024
43
+ mel_spec_type = "vocos"
44
+ target_rms = 0.1
45
+ cross_fade_duration = 0.15
46
+ ode_method = "euler"
47
+ nfe_step = 32 # 16, 32
48
+ cfg_strength = 2.0
49
+ sway_sampling_coef = -1.0
50
+ speed = 1.0
51
+ fix_duration = None
52
+
53
+ # -----------------------------------------
54
+
55
+
56
+ # chunk text into smaller pieces
57
+
58
+
59
+ def chunk_text(text, max_chars=135):
60
+ """
61
+ Splits the input text into chunks, each with a maximum number of characters.
62
+
63
+ Args:
64
+ text (str): The text to be split.
65
+ max_chars (int): The maximum chunk size, measured in UTF-8 bytes.
66
+
67
+ Returns:
68
+ List[str]: A list of text chunks.
69
+ """
70
+ chunks = []
71
+ current_chunk = ""
72
+ # Split the text into sentences based on punctuation followed by whitespace
73
+ sentences = re.split(r"(?<=[;:,.!?])\s+|(?<=[;:,。!?])", text)
74
+
75
+ for sentence in sentences:
76
+ if len(current_chunk.encode("utf-8")) + len(sentence.encode("utf-8")) <= max_chars:
77
+ current_chunk += sentence + " " if sentence and len(sentence[-1].encode("utf-8")) == 1 else sentence
78
+ else:
79
+ if current_chunk:
80
+ chunks.append(current_chunk.strip())
81
+ current_chunk = sentence + " " if sentence and len(sentence[-1].encode("utf-8")) == 1 else sentence
82
+
83
+ if current_chunk:
84
+ chunks.append(current_chunk.strip())
85
+
86
+ return chunks
87
+
88
+
89
+ # load vocoder
90
+ def load_vocoder(vocoder_name="vocos", is_local=False, local_path="", device=device):
91
+ if vocoder_name == "vocos":
92
+ if is_local:
93
+ print(f"Load vocos from local path {local_path}")
94
+ vocoder = Vocos.from_hparams(f"{local_path}/config.yaml")
95
+ state_dict = torch.load(f"{local_path}/pytorch_model.bin", map_location="cpu")
96
+ vocoder.load_state_dict(state_dict)
97
+ vocoder = vocoder.eval().to(device)
98
+ else:
99
+ print("Download Vocos from huggingface charactr/vocos-mel-24khz")
100
+ vocoder = Vocos.from_pretrained("charactr/vocos-mel-24khz").to(device)
101
+ elif vocoder_name == "bigvgan":
102
+ try:
103
+ from third_party.BigVGAN import bigvgan
104
+ except ImportError:
105
+ print("You need to follow the README to init submodule and change the BigVGAN source code.")
106
+ if is_local:
107
+ """download from https://huggingface.co/nvidia/bigvgan_v2_24khz_100band_256x/tree/main"""
108
+ vocoder = bigvgan.BigVGAN.from_pretrained(local_path, use_cuda_kernel=False)
109
+ else:
110
+ vocoder = bigvgan.BigVGAN.from_pretrained("nvidia/bigvgan_v2_24khz_100band_256x", use_cuda_kernel=False)
111
+
112
+ vocoder.remove_weight_norm()
113
+ vocoder = vocoder.eval().to(device)
114
+ return vocoder
115
+
116
+
117
+ # load asr pipeline
118
+
119
+ asr_pipe = None
120
+
121
+
122
+ def initialize_asr_pipeline(device=device, dtype=None):
123
+ if dtype is None:
124
+ dtype = (
125
+ torch.float16 if device == "cuda" and torch.cuda.get_device_properties(device).major >= 6 else torch.float32
126
+ )
127
+ global asr_pipe
128
+ asr_pipe = pipeline(
129
+ "automatic-speech-recognition",
130
+ model="openai/whisper-large-v3-turbo",
131
+ torch_dtype=dtype,
132
+ device=device,
133
+ )
134
+
135
+
136
+ # load model checkpoint for inference
137
+
138
+
139
+ def load_checkpoint(model, ckpt_path, device, dtype=None, use_ema=True):
140
+ if dtype is None:
141
+ dtype = (
142
+ torch.float16 if device == "cuda" and torch.cuda.get_device_properties(device).major >= 6 else torch.float32
143
+ )
144
+ model = model.to(dtype)
145
+
146
+ ckpt_type = ckpt_path.split(".")[-1]
147
+ if ckpt_type == "safetensors":
148
+ from safetensors.torch import load_file
149
+
150
+ checkpoint = load_file(ckpt_path)
151
+ else:
152
+ checkpoint = torch.load(ckpt_path, weights_only=True)
153
+
154
+ if use_ema:
155
+ if ckpt_type == "safetensors":
156
+ checkpoint = {"ema_model_state_dict": checkpoint}
157
+ checkpoint["model_state_dict"] = {
158
+ k.replace("ema_model.", ""): v
159
+ for k, v in checkpoint["ema_model_state_dict"].items()
160
+ if k not in ["initted", "step"]
161
+ }
162
+
163
+ # patch for backward compatibility, 305e3ea
164
+ for key in ["mel_spec.mel_stft.mel_scale.fb", "mel_spec.mel_stft.spectrogram.window"]:
165
+ if key in checkpoint["model_state_dict"]:
166
+ del checkpoint["model_state_dict"][key]
167
+
168
+ model.load_state_dict(checkpoint["model_state_dict"])
169
+ else:
170
+ if ckpt_type == "safetensors":
171
+ checkpoint = {"model_state_dict": checkpoint}
172
+ model.load_state_dict(checkpoint["model_state_dict"])
173
+
174
+ return model.to(device)
175
+
176
+
177
+ # load model for inference
+ 
+ 
+ def load_model(
+     model_cls,
+     model_cfg,
+     ckpt_path,
+     mel_spec_type=mel_spec_type,
+     vocab_file="",
+     ode_method=ode_method,
+     use_ema=True,
+     device=device,
+ ):
+     if vocab_file == "":
+         vocab_file = str(files("f5_tts").joinpath("infer/examples/vocab.txt"))
+     tokenizer = "custom"
+ 
+     print("\nvocab : ", vocab_file)
+     print("tokenizer : ", tokenizer)
+     print("model : ", ckpt_path, "\n")
+ 
+     vocab_char_map, vocab_size = get_tokenizer(vocab_file, tokenizer)
+     model = CFM(
+         transformer=model_cls(**model_cfg, text_num_embeds=vocab_size, mel_dim=n_mel_channels),
+         mel_spec_kwargs=dict(
+             n_fft=n_fft,
+             hop_length=hop_length,
+             win_length=win_length,
+             n_mel_channels=n_mel_channels,
+             target_sample_rate=target_sample_rate,
+             mel_spec_type=mel_spec_type,
+         ),
+         odeint_kwargs=dict(
+             method=ode_method,
+         ),
+         vocab_char_map=vocab_char_map,
+     ).to(device)
+ 
+     dtype = torch.float32 if mel_spec_type == "bigvgan" else None
+     model = load_checkpoint(model, ckpt_path, device, dtype=dtype, use_ema=use_ema)
+ 
+     return model
+ 
+ 
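A sketch of loading a DiT-backbone model; the hyperparameters below match the F5-TTS base configuration used elsewhere in this repo, but treat them, and the checkpoint path, as assumptions:

```python
from f5_tts.model import DiT

model_cfg = dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4)
ema_model = load_model(DiT, model_cfg, "ckpts/F5TTS_Base/model_1200000.safetensors")
```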
+ def remove_silence_edges(audio, silence_threshold=-42):
+     # Remove silence from the start
+     non_silent_start_idx = silence.detect_leading_silence(audio, silence_threshold=silence_threshold)
+     audio = audio[non_silent_start_idx:]
+ 
+     # Remove silence from the end
+     non_silent_end_duration = audio.duration_seconds
+     for ms in reversed(audio):
+         if ms.dBFS > silence_threshold:
+             break
+         non_silent_end_duration -= 0.001
+     trimmed_audio = audio[: int(non_silent_end_duration * 1000)]
+ 
+     return trimmed_audio
+ 
+ 
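A minimal pydub round trip for illustration (`in.wav`/`out.wav` are placeholder paths):

```python
from pydub import AudioSegment

seg = AudioSegment.from_file("in.wav")
trimmed = remove_silence_edges(seg, silence_threshold=-42)
trimmed.export("out.wav", format="wav")
```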
+ # preprocess reference audio and text
+ 
+ 
+ def preprocess_ref_audio_text(ref_audio_orig, ref_text, clip_short=True, show_info=print, device=device):
+     show_info("Converting audio...")
+     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
+         aseg = AudioSegment.from_file(ref_audio_orig)
+ 
+         if clip_short:
+             # 1. try to find long silence for clipping
+             non_silent_segs = silence.split_on_silence(
+                 aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=1000, seek_step=10
+             )
+             non_silent_wave = AudioSegment.silent(duration=0)
+             for non_silent_seg in non_silent_segs:
+                 if len(non_silent_wave) > 6000 and len(non_silent_wave + non_silent_seg) > 15000:
+                     show_info("Audio is over 15s, clipping short. (1)")
+                     break
+                 non_silent_wave += non_silent_seg
+ 
+             # 2. try to find short silence for clipping if 1. failed
+             if len(non_silent_wave) > 15000:
+                 non_silent_segs = silence.split_on_silence(
+                     aseg, min_silence_len=100, silence_thresh=-40, keep_silence=1000, seek_step=10
+                 )
+                 non_silent_wave = AudioSegment.silent(duration=0)
+                 for non_silent_seg in non_silent_segs:
+                     if len(non_silent_wave) > 6000 and len(non_silent_wave + non_silent_seg) > 15000:
+                         show_info("Audio is over 15s, clipping short. (2)")
+                         break
+                     non_silent_wave += non_silent_seg
+ 
+             aseg = non_silent_wave
+ 
+             # 3. if no proper silence found for clipping
+             if len(aseg) > 15000:
+                 aseg = aseg[:15000]
+                 show_info("Audio is over 15s, clipping short. (3)")
+ 
+         aseg = remove_silence_edges(aseg) + AudioSegment.silent(duration=50)
+         aseg.export(f.name, format="wav")
+         ref_audio = f.name
+ 
+     # Compute a hash of the reference audio file
+     with open(ref_audio, "rb") as audio_file:
+         audio_data = audio_file.read()
+         audio_hash = hashlib.md5(audio_data).hexdigest()
+ 
+     global _ref_audio_cache
+     if audio_hash in _ref_audio_cache:
+         # Use cached reference text
+         show_info("Using cached reference text...")
+         ref_text = _ref_audio_cache[audio_hash]
+     else:
+         if not ref_text.strip():
+             global asr_pipe
+             if asr_pipe is None:
+                 initialize_asr_pipeline(device=device)
+             show_info("No reference text provided, transcribing reference audio...")
+             ref_text = asr_pipe(
+                 ref_audio,
+                 chunk_length_s=30,
+                 batch_size=128,
+                 generate_kwargs={"task": "transcribe"},
+                 return_timestamps=False,
+             )["text"].strip()
+             show_info("Finished transcription")
+         else:
+             show_info("Using custom reference text...")
+         # Cache the reference text (transcribed or user-provided)
+         _ref_audio_cache[audio_hash] = ref_text
+ 
+     # Ensure ref_text ends with proper sentence-ending punctuation
+     if not ref_text.endswith(". ") and not ref_text.endswith("。"):
+         if ref_text.endswith("."):
+             ref_text += " "
+         else:
+             ref_text += ". "
+ 
+     return ref_audio, ref_text
+ 
+ 
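Typical usage, passing an empty `ref_text` so Whisper transcribes the clip (using the `ref_audio.wav` sample shipped at the repo root):

```python
ref_audio, ref_text = preprocess_ref_audio_text("ref_audio.wav", "")
print(ref_audio, "->", ref_text)
```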
+ # infer process: chunk text -> infer batches [i.e. infer_batch_process()]
+ 
+ 
+ def infer_process(
+     ref_audio,
+     ref_text,
+     gen_text,
+     model_obj,
+     vocoder,
+     mel_spec_type=mel_spec_type,
+     show_info=print,
+     progress=tqdm,
+     target_rms=target_rms,
+     cross_fade_duration=cross_fade_duration,
+     nfe_step=nfe_step,
+     cfg_strength=cfg_strength,
+     sway_sampling_coef=sway_sampling_coef,
+     speed=speed,
+     fix_duration=fix_duration,
+     device=device,
+ ):
+     # Split the input text into batches
+     audio, sr = torchaudio.load(ref_audio)
+     max_chars = int(len(ref_text.encode("utf-8")) / (audio.shape[-1] / sr) * (25 - audio.shape[-1] / sr))
+     gen_text_batches = chunk_text(gen_text, max_chars=max_chars)
+     for i, gen_text in enumerate(gen_text_batches):
+         print(f"gen_text {i}", gen_text)
+ 
+     show_info(f"Generating audio in {len(gen_text_batches)} batches...")
+     return infer_batch_process(
+         (audio, sr),
+         ref_text,
+         gen_text_batches,
+         model_obj,
+         vocoder,
+         mel_spec_type=mel_spec_type,
+         progress=progress,
+         target_rms=target_rms,
+         cross_fade_duration=cross_fade_duration,
+         nfe_step=nfe_step,
+         cfg_strength=cfg_strength,
+         sway_sampling_coef=sway_sampling_coef,
+         speed=speed,
+         fix_duration=fix_duration,
+         device=device,
+     )
+ 
+ 
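The `max_chars` heuristic budgets each text batch so that reference plus generated audio stays under roughly 25 s: it multiplies the reference's bytes-per-second rate by the seconds remaining. For a 5 s reference whose text is 60 UTF-8 bytes, that gives `60 / 5 * (25 - 5) = 240` characters per batch. A minimal end-to-end sketch tying the helpers above together (`ema_model` is the model from the `load_model` example; the Spanish sentence is arbitrary):

```python
vocoder = load_vocoder("vocos")
ref_audio, ref_text = preprocess_ref_audio_text("ref_audio.wav", "")
wave, sr, spec = infer_process(ref_audio, ref_text, "Hola, ¿qué tal estás?", ema_model, vocoder)
```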
+ # infer batches
+ 
+ 
+ def infer_batch_process(
+     ref_audio,
+     ref_text,
+     gen_text_batches,
+     model_obj,
+     vocoder,
+     mel_spec_type="vocos",
+     progress=tqdm,
+     target_rms=0.1,
+     cross_fade_duration=0.15,
+     nfe_step=32,
+     cfg_strength=2.0,
+     sway_sampling_coef=-1,
+     speed=1,
+     fix_duration=None,
+     device=None,
+ ):
+     audio, sr = ref_audio
+     if audio.shape[0] > 1:
+         audio = torch.mean(audio, dim=0, keepdim=True)
+ 
+     rms = torch.sqrt(torch.mean(torch.square(audio)))
+     if rms < target_rms:
+         audio = audio * target_rms / rms
+     if sr != target_sample_rate:
+         resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
+         audio = resampler(audio)
+     audio = audio.to(device)
+ 
+     generated_waves = []
+     spectrograms = []
+ 
+     if len(ref_text[-1].encode("utf-8")) == 1:  # pad a trailing space if ref_text ends with a single-byte (ASCII) char
+         ref_text = ref_text + " "
+     for i, gen_text in enumerate(progress.tqdm(gen_text_batches)):
+         # Prepare the text
+         text_list = [ref_text + gen_text]
+         final_text_list = convert_char_to_pinyin(text_list)
+ 
+         ref_audio_len = audio.shape[-1] // hop_length
+         if fix_duration is not None:
+             duration = int(fix_duration * target_sample_rate / hop_length)
+         else:
+             # Calculate duration
+             ref_text_len = len(ref_text.encode("utf-8"))
+             gen_text_len = len(gen_text.encode("utf-8"))
+             duration = ref_audio_len + int(ref_audio_len / ref_text_len * gen_text_len / speed)
+ 
+         # inference
+         with torch.inference_mode():
+             generated, _ = model_obj.sample(
+                 cond=audio,
+                 text=final_text_list,
+                 duration=duration,
+                 steps=nfe_step,
+                 cfg_strength=cfg_strength,
+                 sway_sampling_coef=sway_sampling_coef,
+             )
+ 
+             generated = generated.to(torch.float32)
+             generated = generated[:, ref_audio_len:, :]
+             generated_mel_spec = generated.permute(0, 2, 1)
+             if mel_spec_type == "vocos":
+                 generated_wave = vocoder.decode(generated_mel_spec)
+             elif mel_spec_type == "bigvgan":
+                 generated_wave = vocoder(generated_mel_spec)
+             if rms < target_rms:
+                 generated_wave = generated_wave * rms / target_rms
+ 
+             # wav -> numpy
+             generated_wave = generated_wave.squeeze().cpu().numpy()
+ 
+             generated_waves.append(generated_wave)
+             spectrograms.append(generated_mel_spec[0].cpu().numpy())
+ 
+     # Combine all generated waves with cross-fading
+     if cross_fade_duration <= 0:
+         # Simply concatenate
+         final_wave = np.concatenate(generated_waves)
+     else:
+         final_wave = generated_waves[0]
+         for i in range(1, len(generated_waves)):
+             prev_wave = final_wave
+             next_wave = generated_waves[i]
+ 
+             # Calculate cross-fade samples, ensuring it does not exceed wave lengths
+             cross_fade_samples = int(cross_fade_duration * target_sample_rate)
+             cross_fade_samples = min(cross_fade_samples, len(prev_wave), len(next_wave))
+ 
+             if cross_fade_samples <= 0:
+                 # No overlap possible, concatenate
+                 final_wave = np.concatenate([prev_wave, next_wave])
+                 continue
+ 
+             # Overlapping parts
+             prev_overlap = prev_wave[-cross_fade_samples:]
+             next_overlap = next_wave[:cross_fade_samples]
+ 
+             # Fade out and fade in
+             fade_out = np.linspace(1, 0, cross_fade_samples)
+             fade_in = np.linspace(0, 1, cross_fade_samples)
+ 
+             # Cross-faded overlap
+             cross_faded_overlap = prev_overlap * fade_out + next_overlap * fade_in
+ 
+             # Combine
+             new_wave = np.concatenate(
+                 [prev_wave[:-cross_fade_samples], cross_faded_overlap, next_wave[cross_fade_samples:]]
+             )
+ 
+             final_wave = new_wave
+ 
+     # Create a combined spectrogram
+     combined_spectrogram = np.concatenate(spectrograms, axis=1)
+ 
+     return final_wave, target_sample_rate, combined_spectrogram
+ 
+ 
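The joins use a plain linear cross-fade; at the defaults, `cross_fade_samples = int(0.15 * 24000) = 3600`, so each seam blends about 0.15 s from both neighbours. A standalone check of the overlap arithmetic:

```python
import numpy as np

prev_wave, next_wave = np.ones(8000), np.zeros(8000)
n = 3600  # cross-fade length in samples
fade_out, fade_in = np.linspace(1, 0, n), np.linspace(0, 1, n)
joined = np.concatenate([prev_wave[:-n], prev_wave[-n:] * fade_out + next_wave[:n] * fade_in, next_wave[n:]])
assert len(joined) == len(prev_wave) + len(next_wave) - n  # each join consumes n samples
```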
+ # remove silence from generated wav
+ 
+ 
+ def remove_silence_for_generated_wav(filename):
+     aseg = AudioSegment.from_file(filename)
+     non_silent_segs = silence.split_on_silence(
+         aseg, min_silence_len=1000, silence_thresh=-50, keep_silence=500, seek_step=10
+     )
+     non_silent_wave = AudioSegment.silent(duration=0)
+     for non_silent_seg in non_silent_segs:
+         non_silent_wave += non_silent_seg
+     aseg = non_silent_wave
+     aseg.export(filename, format="wav")
+ 
+ 
+ # save spectrogram
+ 
+ 
+ def save_spectrogram(spectrogram, path):
+     plt.figure(figsize=(12, 4))
+     plt.imshow(spectrogram, origin="lower", aspect="auto")
+     plt.colorbar()
+     plt.savefig(path)
+     plt.close()
src/f5_tts/model/__init__.py ADDED
@@ -0,0 +1,10 @@
+ from f5_tts.model.cfm import CFM
+ 
+ from f5_tts.model.backbones.unett import UNetT
+ from f5_tts.model.backbones.dit import DiT
+ from f5_tts.model.backbones.mmdit import MMDiT
+ 
+ from f5_tts.model.trainer import Trainer
+ 
+ 
+ __all__ = ["CFM", "UNetT", "DiT", "MMDiT", "Trainer"]
src/f5_tts/model/backbones/README.md ADDED
@@ -0,0 +1,20 @@
+ ## Backbones quick introduction
+ 
+ 
+ ### unett.py
+ - flat unet transformer
+ - same structure as in the E2-TTS & Voicebox papers, except rotary positional embedding is used
+ - update: optional abs pos emb & ConvNeXt V2 blocks for the embedded text before concatenation
+ 
+ ### dit.py
+ - adaLN-zero DiT
+ - embedded timestep as condition
+ - noised_input + masked_cond + embedded_text concatenated, then a linear projection in (see the sketch after this list)
+ - optional abs pos emb & ConvNeXt V2 blocks for the embedded text before concatenation
+ - optional long skip connection (first layer to last layer)
+ 
+ ### mmdit.py
+ - SD3 structure
+ - timestep as condition
+ - left stream: text, embedded and given an abs pos emb
+ - right stream: masked_cond & noised_input concatenated, with the same conv pos emb as unett
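To make the dit.py input path concrete, here is a minimal sketch of the concatenate-then-project step; the dimensions are illustrative, not the shipped configuration:

```python
import torch
import torch.nn as nn

mel_dim, text_dim, dim, frames = 100, 512, 1024, 600

noised_input = torch.randn(1, frames, mel_dim)    # x_t
masked_cond = torch.randn(1, frames, mel_dim)     # reference mel, zeroed where audio is generated
embedded_text = torch.randn(1, frames, text_dim)  # text embedding, length-aligned to the mel frames

proj_in = nn.Linear(mel_dim + mel_dim + text_dim, dim)  # the "linear projection in"
stream = proj_in(torch.cat([noised_input, masked_cond, embedded_text], dim=-1))
print(stream.shape)  # torch.Size([1, 600, 1024])
```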