Spaces:
Sleeping
Sleeping
| # Copyright 2022-2023 Xiaomi Corp. (authors: Fangjun Kuang) | |
| # | |
| # See LICENSE for clarification regarding multiple authors | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| from functools import lru_cache | |
| import sherpa_onnx | |
| from huggingface_hub import hf_hub_download | |
| def get_file( | |
| repo_id: str, | |
| filename: str, | |
| subfolder: str = ".", | |
| ) -> str: | |
| model_filename = hf_hub_download( | |
| repo_id=repo_id, | |
| filename=filename, | |
| subfolder=subfolder, | |
| ) | |
| return model_filename | |
| def _get_vits_vctk(repo_id: str, speed: float) -> sherpa_onnx.OfflineTts: | |
| assert repo_id == "csukuangfj/vits-vctk" | |
| model = get_file( | |
| repo_id=repo_id, | |
| filename="vits-vctk.onnx", | |
| subfolder=".", | |
| ) | |
| lexicon = get_file( | |
| repo_id=repo_id, | |
| filename="lexicon.txt", | |
| subfolder=".", | |
| ) | |
| tokens = get_file( | |
| repo_id=repo_id, | |
| filename="tokens.txt", | |
| subfolder=".", | |
| ) | |
| tts_config = sherpa_onnx.OfflineTtsConfig( | |
| model=sherpa_onnx.OfflineTtsModelConfig( | |
| vits=sherpa_onnx.OfflineTtsVitsModelConfig( | |
| model=model, | |
| lexicon=lexicon, | |
| tokens=tokens, | |
| length_scale=1.0 / speed, | |
| ), | |
| provider="cpu", | |
| debug=True, | |
| num_threads=2, | |
| ) | |
| ) | |
| tts = sherpa_onnx.OfflineTts(tts_config) | |
| return tts | |
| def _get_vits_ljs(repo_id: str, speed: float) -> sherpa_onnx.OfflineTts: | |
| assert repo_id == "csukuangfj/vits-ljs" | |
| model = get_file( | |
| repo_id=repo_id, | |
| filename="vits-ljs.onnx", | |
| subfolder=".", | |
| ) | |
| lexicon = get_file( | |
| repo_id=repo_id, | |
| filename="lexicon.txt", | |
| subfolder=".", | |
| ) | |
| tokens = get_file( | |
| repo_id=repo_id, | |
| filename="tokens.txt", | |
| subfolder=".", | |
| ) | |
| tts_config = sherpa_onnx.OfflineTtsConfig( | |
| model=sherpa_onnx.OfflineTtsModelConfig( | |
| vits=sherpa_onnx.OfflineTtsVitsModelConfig( | |
| model=model, | |
| lexicon=lexicon, | |
| tokens=tokens, | |
| length_scale=1.0 / speed, | |
| ), | |
| provider="cpu", | |
| debug=True, | |
| num_threads=2, | |
| ) | |
| ) | |
| tts = sherpa_onnx.OfflineTts(tts_config) | |
| return tts | |
| def _get_vits_piper(repo_id: str, speed: float) -> sherpa_onnx.OfflineTts: | |
| n = len("vits-piper-") | |
| name = repo_id.split("/")[1][n:] | |
| model = get_file( | |
| repo_id=repo_id, | |
| filename=f"{name}.onnx", | |
| subfolder=".", | |
| ) | |
| tokens = get_file( | |
| repo_id=repo_id, | |
| filename="tokens.txt", | |
| subfolder=".", | |
| ) | |
| tts_config = sherpa_onnx.OfflineTtsConfig( | |
| model=sherpa_onnx.OfflineTtsModelConfig( | |
| vits=sherpa_onnx.OfflineTtsVitsModelConfig( | |
| model=model, | |
| lexicon="", | |
| data_dir="/tmp/espeak-ng-data", | |
| tokens=tokens, | |
| length_scale=1.0 / speed, | |
| ), | |
| provider="cpu", | |
| debug=True, | |
| num_threads=2, | |
| ) | |
| ) | |
| tts = sherpa_onnx.OfflineTts(tts_config) | |
| return tts | |
| def _get_vits_zh_aishell3(repo_id: str, speed: float) -> sherpa_onnx.OfflineTts: | |
| assert repo_id == "csukuangfj/vits-zh-aishell3" | |
| model = get_file( | |
| repo_id=repo_id, | |
| filename="vits-aishell3.onnx", | |
| subfolder=".", | |
| ) | |
| lexicon = get_file( | |
| repo_id=repo_id, | |
| filename="lexicon.txt", | |
| subfolder=".", | |
| ) | |
| tokens = get_file( | |
| repo_id=repo_id, | |
| filename="tokens.txt", | |
| subfolder=".", | |
| ) | |
| rule_fst = get_file( | |
| repo_id=repo_id, | |
| filename="rule.fst", | |
| subfolder=".", | |
| ) | |
| tts_config = sherpa_onnx.OfflineTtsConfig( | |
| model=sherpa_onnx.OfflineTtsModelConfig( | |
| vits=sherpa_onnx.OfflineTtsVitsModelConfig( | |
| model=model, | |
| lexicon=lexicon, | |
| tokens=tokens, | |
| length_scale=1.0 / speed, | |
| ), | |
| provider="cpu", | |
| debug=True, | |
| num_threads=2, | |
| ), | |
| rule_fsts=rule_fst, | |
| ) | |
| tts = sherpa_onnx.OfflineTts(tts_config) | |
| return tts | |
| def _get_vits_hf(repo_id: str, speed: float) -> sherpa_onnx.OfflineTts: | |
| if "fanchen" in repo_id or "vits-cantonese-hf-xiaomaiiwn" in repo_id: | |
| model = repo_id.split("/")[-1] | |
| elif "coqui" in repo_id: | |
| model = "model" | |
| else: | |
| model = repo_id.split("-")[-1] | |
| model = get_file( | |
| repo_id=repo_id, | |
| filename=f"{model}.onnx", | |
| subfolder=".", | |
| ) | |
| lexicon = get_file( | |
| repo_id=repo_id, | |
| filename="lexicon.txt", | |
| subfolder=".", | |
| ) | |
| tokens = get_file( | |
| repo_id=repo_id, | |
| filename="tokens.txt", | |
| subfolder=".", | |
| ) | |
| if "coqui" not in repo_id: | |
| rule_fst = get_file( | |
| repo_id=repo_id, | |
| filename="rule.fst", | |
| subfolder=".", | |
| ) | |
| else: | |
| rule_fst = "" | |
| tts_config = sherpa_onnx.OfflineTtsConfig( | |
| model=sherpa_onnx.OfflineTtsModelConfig( | |
| vits=sherpa_onnx.OfflineTtsVitsModelConfig( | |
| model=model, | |
| lexicon=lexicon, | |
| tokens=tokens, | |
| length_scale=1.0 / speed, | |
| ), | |
| provider="cpu", | |
| debug=True, | |
| num_threads=2, | |
| ), | |
| rule_fsts=rule_fst, | |
| ) | |
| tts = sherpa_onnx.OfflineTts(tts_config) | |
| return tts | |
| def get_pretrained_model(repo_id: str, speed: float) -> sherpa_onnx.OfflineTts: | |
| if repo_id in chinese_models: | |
| return chinese_models[repo_id](repo_id, speed) | |
| if repo_id in cantonese_models: | |
| return cantonese_models[repo_id](repo_id, speed) | |
| elif repo_id in english_models: | |
| return english_models[repo_id](repo_id, speed) | |
| elif repo_id in german_models: | |
| return german_models[repo_id](repo_id, speed) | |
| elif repo_id in spanish_models: | |
| return spanish_models[repo_id](repo_id, speed) | |
| elif repo_id in french_models: | |
| return french_models[repo_id](repo_id, speed) | |
| elif repo_id in ukrainian_models: | |
| return ukrainian_models[repo_id](repo_id, speed) | |
| elif repo_id in russian_models: | |
| return russian_models[repo_id](repo_id, speed) | |
| else: | |
| raise ValueError(f"Unsupported repo_id: {repo_id}") | |
| cantonese_models = { | |
| "csukuangfj/vits-cantonese-hf-xiaomaiiwn": _get_vits_hf, | |
| } | |
| chinese_models = { | |
| "csukuangfj/vits-zh-hf-theresa": _get_vits_hf, | |
| "csukuangfj/vits-zh-hf-eula": _get_vits_hf, | |
| "csukuangfj/vits-zh-hf-echo": _get_vits_hf, | |
| "csukuangfj/vits-zh-hf-bronya": _get_vits_hf, | |
| "csukuangfj/vits-zh-aishell3": _get_vits_zh_aishell3, | |
| "csukuangfj/vits-zh-hf-fanchen-wnj": _get_vits_hf, | |
| "csukuangfj/vits-zh-hf-fanchen-C": _get_vits_hf, | |
| "csukuangfj/vits-zh-hf-fanchen-ZhiHuiLaoZhe": _get_vits_hf, | |
| "csukuangfj/vits-zh-hf-fanchen-ZhiHuiLaoZhe_new": _get_vits_hf, | |
| "csukuangfj/vits-zh-hf-fanchen-unity": _get_vits_hf, | |
| "csukuangfj/vits-zh-hf-doom": _get_vits_hf, | |
| "csukuangfj/vits-zh-hf-zenyatta": _get_vits_hf, # 804 | |
| "csukuangfj/vits-zh-hf-abyssinvoker": _get_vits_hf, | |
| "csukuangfj/vits-zh-hf-keqing": _get_vits_hf, | |
| # "csukuangfj/vits-piper-zh_CN-huayan-x_low": _get_vits_piper, | |
| # "csukuangfj/vits-piper-zh_CN-huayan-medium": _get_vits_piper, | |
| } | |
| english_models = { | |
| "csukuangfj/vits-vctk": _get_vits_vctk, # 109 speakers | |
| "csukuangfj/vits-ljs": _get_vits_ljs, | |
| # coqui-ai | |
| "csukuangfj/vits-coqui-en-vctk": _get_vits_hf, | |
| "csukuangfj/vits-coqui-en-ljspeech": _get_vits_hf, | |
| "csukuangfj/vits-coqui-en-ljspeech-neon": _get_vits_hf, | |
| # piper, US | |
| "csukuangfj/vits-piper-en_US-amy-low": _get_vits_piper, | |
| "csukuangfj/vits-piper-en_US-amy-medium": _get_vits_piper, | |
| "csukuangfj/vits-piper-en_US-arctic-medium": _get_vits_piper, # 18 speakers | |
| "csukuangfj/vits-piper-en_US-danny-low": _get_vits_piper, | |
| "csukuangfj/vits-piper-en_US-hfc_male-medium": _get_vits_piper, | |
| "csukuangfj/vits-piper-en_US-joe-medium": _get_vits_piper, | |
| "csukuangfj/vits-piper-en_US-kathleen-low": _get_vits_piper, | |
| "csukuangfj/vits-piper-en_US-kusal-medium": _get_vits_piper, | |
| "csukuangfj/vits-piper-en_US-l2arctic-medium": _get_vits_piper, # 24 speakers | |
| "csukuangfj/vits-piper-en_US-lessac-low": _get_vits_piper, | |
| "csukuangfj/vits-piper-en_US-lessac-medium": _get_vits_piper, | |
| "csukuangfj/vits-piper-en_US-lessac-high": _get_vits_piper, | |
| "csukuangfj/vits-piper-en_US-libritts-high": _get_vits_piper, # 904 speakers | |
| "csukuangfj/vits-piper-en_US-libritts_r-medium": _get_vits_piper, # 904 speakers | |
| "csukuangfj/vits-piper-en_US-ryan-low": _get_vits_piper, | |
| "csukuangfj/vits-piper-en_US-ryan-medium": _get_vits_piper, | |
| "csukuangfj/vits-piper-en_US-ryan-high": _get_vits_piper, | |
| # piper, GB | |
| "csukuangfj/vits-piper-en_GB-alan-low": _get_vits_piper, | |
| "csukuangfj/vits-piper-en_GB-alan-medium": _get_vits_piper, | |
| "csukuangfj/vits-piper-en_GB-alba-medium": _get_vits_piper, | |
| "csukuangfj/vits-piper-en_GB-jenny_dioco-medium": _get_vits_piper, | |
| "csukuangfj/vits-piper-en_GB-northern_english_male-medium": _get_vits_piper, | |
| "csukuangfj/vits-piper-en_GB-semaine-medium": _get_vits_piper, | |
| "csukuangfj/vits-piper-en_GB-southern_english_female-low": _get_vits_piper, | |
| "csukuangfj/vits-piper-en_GB-vctk-medium": _get_vits_piper, | |
| } | |
| german_models = { | |
| "csukuangfj/vits-piper-de_DE-eva_k-x_low": _get_vits_piper, | |
| "csukuangfj/vits-piper-de_DE-karlsson-low": _get_vits_piper, | |
| "csukuangfj/vits-piper-de_DE-kerstin-low": _get_vits_piper, | |
| "csukuangfj/vits-piper-de_DE-pavoque-low": _get_vits_piper, | |
| "csukuangfj/vits-piper-de_DE-ramona-low": _get_vits_piper, | |
| "csukuangfj/vits-piper-de_DE-thorsten-low": _get_vits_piper, | |
| "csukuangfj/vits-piper-de_DE-thorsten-medium": _get_vits_piper, | |
| "csukuangfj/vits-piper-de_DE-thorsten-high": _get_vits_piper, | |
| "csukuangfj/vits-piper-de_DE-thorsten_emotional-medium": _get_vits_piper, # 8 speakers | |
| } | |
| spanish_models = { | |
| "csukuangfj/vits-piper-es_ES-carlfm-x_low": _get_vits_piper, | |
| "csukuangfj/vits-piper-es_ES-davefx-medium": _get_vits_piper, | |
| "csukuangfj/vits-piper-es_ES-mls_10246-low": _get_vits_piper, | |
| "csukuangfj/vits-piper-es_ES-mls_9972-low": _get_vits_piper, | |
| "csukuangfj/vits-piper-es_ES-sharvard-medium": _get_vits_piper, # 2 speakers | |
| "csukuangfj/vits-piper-es_MX-ald-medium": _get_vits_piper, | |
| } | |
| french_models = { | |
| # "csukuangfj/vits-piper-fr_FR-gilles-low": _get_vits_piper, | |
| # "csukuangfj/vits-piper-fr_FR-mls_1840-low": _get_vits_piper, | |
| "csukuangfj/vits-piper-fr_FR-upmc-medium": _get_vits_piper, # 2 speakers, 0-femal, 1-male | |
| "csukuangfj/vits-piper-fr_FR-siwis-low": _get_vits_piper, # female | |
| "csukuangfj/vits-piper-fr_FR-siwis-medium": _get_vits_piper, | |
| "csukuangfj/vits-piper-fr_FR-tjiho-model1": _get_vits_piper, | |
| "csukuangfj/vits-piper-fr_FR-tjiho-model2": _get_vits_piper, | |
| "csukuangfj/vits-piper-fr_FR-tjiho-model3": _get_vits_piper, | |
| } | |
| ukrainian_models = { | |
| "csukuangfj/vits-piper-uk_UA-lada-x_low": _get_vits_piper, | |
| # "csukuangfj/vits-piper-uk_UA-ukrainian_tts-medium": _get_vits_piper, # does not work somehow | |
| } | |
| russian_models = { | |
| "csukuangfj/vits-piper-ru_RU-denis-medium": _get_vits_piper, | |
| "csukuangfj/vits-piper-ru_RU-dmitri-medium": _get_vits_piper, | |
| "csukuangfj/vits-piper-ru_RU-irina-medium": _get_vits_piper, | |
| "csukuangfj/vits-piper-ru_RU-ruslan-medium": _get_vits_piper, | |
| } | |
| language_to_models = { | |
| "English": list(english_models.keys()), | |
| "Chinese (Mandarin, 普通话)": list(chinese_models.keys()), | |
| "Cantonese (粤语)": list(cantonese_models.keys()), | |
| "German": list(german_models.keys()), | |
| "Spanish": list(spanish_models.keys()), | |
| "French": list(french_models.keys()), | |
| "Ukrainian": list(ukrainian_models.keys()), | |
| "Russian": list(russian_models.keys()), | |
| } | |