import os
import json
import logging
import torch
import config
import numpy as np
from utils.utils import check_is_none
from vits import VITS
from voice import TTS

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
lang_dict = {
    "english_cleaners": ["en"],
    "english_cleaners2": ["en"],
    "japanese_cleaners": ["ja"],
    "japanese_cleaners2": ["ja"],
    "korean_cleaners": ["ko"],
    "chinese_cleaners": ["zh"],
    "zh_ja_mixture_cleaners": ["zh", "ja"],
    "sanskrit_cleaners": ["sa"],
    "cjks_cleaners": ["zh", "ja", "ko", "sa"],
    "cjke_cleaners": ["zh", "ja", "ko", "en"],
    "cjke_cleaners2": ["zh", "ja", "ko", "en"],
    "cje_cleaners": ["zh", "ja", "en"],
    "cje_cleaners2": ["zh", "ja", "en"],
    "thai_cleaners": ["th"],
    "shanghainese_cleaners": ["sh"],
    "chinese_dialect_cleaners": ["zh", "ja", "sh", "gd", "en", "SZ", "WX", "CZ", "HZ", "SX", "NB", "JJ", "YX", "JD",
                                 "ZR", "PH", "TX", "JS", "HN", "LP", "XS", "FY", "RA", "CX", "SM", "TT", "WZ", "SC",
                                 "YB"],
    "bert_chinese_cleaners": ["zh"],
}
def analysis(model_config_json):
    """Infer the model type (vits / hubert / w2v2 / bert_vits2) from an open config.json file object."""
    model_config = json.load(model_config_json)
    symbols = model_config.get("symbols", None)
    emotion_embedding = model_config.get("data", {}).get("emotion_embedding", False)
    # Bert-VITS2 configs are identified by the speaker-conditioned encoder flag in the model section
    if "use_spk_conditioned_encoder" in model_config.get("model", {}):
        model_type = "bert_vits2"
        return model_type
    if symbols is not None:
        if not emotion_embedding:
            model_type = "vits"
        else:
            model_type = "w2v2"
    else:
        model_type = "hubert"
    return model_type
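
# Illustrative mapping of config shapes to the returned type (shapes assumed from the
# checks above, not taken from any specific checkpoint):
#   {"model": {"use_spk_conditioned_encoder": true, ...}, ...}         -> "bert_vits2"
#   {"symbols": [...], "data": {"emotion_embedding": false, ...}, ...} -> "vits"
#   {"symbols": [...], "data": {"emotion_embedding": true, ...}, ...}  -> "w2v2"
#   {"data": {...}, "model": {...}}  (no top-level "symbols")          -> "hubert"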
def load_npy(model_):
    if isinstance(model_, list):
        # every entry must be a .npy file
        for i in model_:
            _model_extension = os.path.splitext(i)[1]
            if _model_extension != ".npy":
                raise ValueError(f"Unsupported model type: {_model_extension}")
        # merge npy files
        emotion_reference = np.empty((0, 1024))
        for i in model_:
            tmp = np.load(i).reshape(-1, 1024)
            emotion_reference = np.append(emotion_reference, tmp, axis=0)
    elif os.path.isdir(model_):
        emotion_reference = np.empty((0, 1024))
        for root, dirs, files in os.walk(model_):
            for file_name in files:
                # skip anything that is not a .npy file
                _model_extension = os.path.splitext(file_name)[1]
                if _model_extension != ".npy":
                    continue
                file_path = os.path.join(root, file_name)
                # merge npy files
                tmp = np.load(file_path).reshape(-1, 1024)
                emotion_reference = np.append(emotion_reference, tmp, axis=0)
    elif os.path.isfile(model_):
        # a single file must also be a .npy file
        _model_extension = os.path.splitext(model_)[1]
        if _model_extension != ".npy":
            raise ValueError(f"Unsupported model type: {_model_extension}")
        emotion_reference = np.load(model_)
    else:
        raise ValueError(f"Invalid DIMENSIONAL_EMOTION_NPY path: {model_}")

    logging.info(f"Loaded emotional dimension npy range: {len(emotion_reference)}")
    return emotion_reference
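
# load_npy accepts a list of .npy paths, a directory (walked recursively), or a single
# .npy file, and returns the merged dimensional-emotion reference array (reshaped to
# (-1, 1024) when multiple files are merged).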
def merge_model(merging_model):
    vits_obj = []
    vits_speakers = []
    hubert_vits_obj = []
    hubert_vits_speakers = []
    w2v2_vits_obj = []
    w2v2_vits_speakers = []
    bert_vits2_obj = []
    bert_vits2_speakers = []

    # model list
    vits_list = []
    hubert_vits_list = []
    w2v2_vits_list = []
    bert_vits2_list = []

    for l in merging_model:
        with open(l[1], 'r', encoding='utf-8') as model_config:
            model_type = analysis(model_config)
        if model_type == "vits":
            vits_list.append(l)
        elif model_type == "hubert":
            hubert_vits_list.append(l)
        elif model_type == "w2v2":
            w2v2_vits_list.append(l)
        elif model_type == "bert_vits2":
            bert_vits2_list.append(l)
    # merge vits
    new_id = 0
    for obj_id, i in enumerate(vits_list):
        obj = VITS(model=i[0], config=i[1], model_type="vits", device=device)
        lang = lang_dict.get(obj.get_cleaner(), ["unknown"])
        for id, name in enumerate(obj.get_speakers()):
            vits_obj.append([int(id), obj, obj_id])
            vits_speakers.append({"id": new_id, "name": name, "lang": lang})
            new_id += 1
    # merge hubert-vits
    if len(hubert_vits_list) != 0:
        if getattr(config, "HUBERT_SOFT_MODEL", None) is None or check_is_none(config.HUBERT_SOFT_MODEL):
            raise ValueError("Please configure HUBERT_SOFT_MODEL path in config.py")
        try:
            from vits.hubert_model import hubert_soft
            hubert = hubert_soft(config.HUBERT_SOFT_MODEL)
        except Exception as e:
            raise ValueError(f"Load HUBERT_SOFT_MODEL failed {e}")

    new_id = 0
    for obj_id, i in enumerate(hubert_vits_list):
        obj = VITS(model=i[0], config=i[1], model_=hubert, model_type="hubert", device=device)
        lang = lang_dict.get(obj.get_cleaner(), ["unknown"])
        for id, name in enumerate(obj.get_speakers()):
            hubert_vits_obj.append([int(id), obj, obj_id])
            hubert_vits_speakers.append({"id": new_id, "name": name, "lang": lang})
            new_id += 1
    # merge w2v2-vits
    emotion_reference = None
    if len(w2v2_vits_list) != 0:
        if getattr(config, "DIMENSIONAL_EMOTION_NPY", None) is None or check_is_none(config.DIMENSIONAL_EMOTION_NPY):
            raise ValueError("Please configure DIMENSIONAL_EMOTION_NPY path in config.py")
        try:
            emotion_reference = load_npy(config.DIMENSIONAL_EMOTION_NPY)
        except Exception as e:
            raise ValueError(f"Load DIMENSIONAL_EMOTION_NPY failed {e}")

    new_id = 0
    for obj_id, i in enumerate(w2v2_vits_list):
        obj = VITS(model=i[0], config=i[1], model_=emotion_reference, model_type="w2v2", device=device)
        lang = lang_dict.get(obj.get_cleaner(), ["unknown"])
        for id, name in enumerate(obj.get_speakers()):
            w2v2_vits_obj.append([int(id), obj, obj_id])
            w2v2_vits_speakers.append({"id": new_id, "name": name, "lang": lang})
            new_id += 1
    # merge Bert_VITS2
    new_id = 0
    for obj_id, i in enumerate(bert_vits2_list):
        from bert_vits2 import Bert_VITS2
        obj = Bert_VITS2(model=i[0], config=i[1], device=device)
        lang = ["ZH"]
        for id, name in enumerate(obj.get_speakers()):
            bert_vits2_obj.append([int(id), obj, obj_id])
            bert_vits2_speakers.append({"id": new_id, "name": name, "lang": lang})
            new_id += 1

    voice_obj = {"VITS": vits_obj, "HUBERT-VITS": hubert_vits_obj, "W2V2-VITS": w2v2_vits_obj,
                 "BERT-VITS2": bert_vits2_obj}
    voice_speakers = {"VITS": vits_speakers, "HUBERT-VITS": hubert_vits_speakers, "W2V2-VITS": w2v2_vits_speakers,
                      "BERT-VITS2": bert_vits2_speakers}

    w2v2_emotion_count = len(emotion_reference) if emotion_reference is not None else 0

    tts = TTS(voice_obj, voice_speakers, w2v2_emotion_count=w2v2_emotion_count, device=device)
    return tts
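
# Minimal usage sketch (hypothetical paths; the caller passes [model_path, config_path]
# pairs, e.g. a MODEL_LIST defined in config.py -- name assumed, adjust to your setup):
#
#   tts = merge_model([["Model/example/G_latest.pth", "Model/example/config.json"]])
#   # tts is a voice.TTS instance holding every loaded speaker, grouped by model type.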