Spaces:
Build error
Build error
| import torch | |
| from lib.infer_pack.models import ( | |
| SynthesizerTrnMs256NSFsid, | |
| SynthesizerTrnMs256NSFsid_nono, | |
| SynthesizerTrnMs768NSFsid, | |
| SynthesizerTrnMs768NSFsid_nono, | |
| ) | |
| from vc_infer_pipeline import VC | |
| import traceback, pdb | |
| from lib.audio import load_audio | |
| import numpy as np | |
| import os | |
| from fairseq import checkpoint_utils | |
| import soundfile as sf | |
| from gtts import gTTS | |
| import edge_tts | |
| import asyncio | |
| import nest_asyncio | |
# model load
def get_vc(sid, to_return_protect0, to_return_protect1):
    """Load (or unload) the voice model selected by `sid` into module globals.

    When `sid` is empty, frees the currently loaded model and HuBERT encoder
    and returns a single hide-update dict for the UI. Otherwise loads the
    checkpoint `weight_root/sid`, builds the matching synthesizer (v1/v2,
    with/without f0), and returns a 3-tuple of gradio-style update dicts:
    (speaker-count slider, protect0, protect1).

    NOTE(review): relies on globals (`weight_root`, `config`, `hubert_model`)
    being initialized elsewhere (see ClassVoices.__call__) — order-sensitive.
    """
    global n_spk, tgt_sr, net_g, vc, cpt, version
    if sid == "" or sid == []:
        global hubert_model
        if hubert_model is not None:  # change model or not
            print("clean_empty_cache")
            # Drop every cached global so CUDA memory can actually be released.
            del net_g, n_spk, vc, hubert_model, tgt_sr  # ,cpt
            hubert_model = net_g = n_spk = vc = hubert_model = tgt_sr = None
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
            ### if clean
            # Re-instantiate the synthesizer from the still-held checkpoint and
            # immediately delete it together with `cpt` — presumably done so the
            # checkpoint tensors lose their last references; TODO confirm intent.
            if_f0 = cpt.get("f0", 1)
            version = cpt.get("version", "v1")
            if version == "v1":
                if if_f0 == 1:
                    net_g = SynthesizerTrnMs256NSFsid(
                        *cpt["config"], is_half=config.is_half
                    )
                else:
                    net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
            elif version == "v2":
                if if_f0 == 1:
                    net_g = SynthesizerTrnMs768NSFsid(
                        *cpt["config"], is_half=config.is_half
                    )
                else:
                    net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
            del net_g, cpt
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        return {"visible": False, "__type__": "update"}
    person = "%s/%s" % (weight_root, sid)
    print("loading %s" % person)
    cpt = torch.load(person, map_location="cpu")
    # Checkpoint layout: config[-1] is the target sample rate, config[-3] the
    # speaker count (rewritten here from the embedding table's actual size).
    tgt_sr = cpt["config"][-1]
    cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
    if_f0 = cpt.get("f0", 1)
    if if_f0 == 0:
        # No-f0 models have no "protect" knobs: hide both sliders.
        to_return_protect0 = to_return_protect1 = {
            "visible": False,
            "value": 0.5,
            "__type__": "update",
        }
    else:
        to_return_protect0 = {
            "visible": True,
            "value": to_return_protect0,
            "__type__": "update",
        }
        to_return_protect1 = {
            "visible": True,
            "value": to_return_protect1,
            "__type__": "update",
        }
    version = cpt.get("version", "v1")
    if version == "v1":
        if if_f0 == 1:
            net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half)
        else:
            net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
    elif version == "v2":
        if if_f0 == 1:
            net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half)
        else:
            net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
    # enc_q is only used during training; drop it before loading weights.
    del net_g.enc_q
    print(net_g.load_state_dict(cpt["weight"], strict=False))
    net_g.eval().to(config.device)
    if config.is_half:
        net_g = net_g.half()
    else:
        net_g = net_g.float()
    vc = VC(tgt_sr, config)
    n_spk = cpt["config"][-3]
    return (
        {"visible": True, "maximum": n_spk, "__type__": "update"},
        to_return_protect0,
        to_return_protect1,
    )
# inference
def vc_single(
    sid,
    input_audio_path,
    f0_up_key,
    f0_file,
    f0_method,
    file_index,
    file_index2,
    # file_big_npy,
    index_rate,
    filter_radius,
    resample_sr,
    rms_mix_rate,
    protect,
):
    """Run one voice-conversion pass over a single audio file.

    Args mirror the RVC pipeline: `sid` speaker id, `f0_up_key` semitone
    transpose, `f0_method` pitch extractor (e.g. "pm"), `file_index`/
    `file_index2` faiss index paths, plus mixing/protection knobs.

    Returns:
        (info_message, (sample_rate, audio)) on success,
        ("You need to upload an audio", None) when no input was given,
        (traceback_text, (None, None)) on any pipeline failure.
    """
    global tgt_sr, net_g, vc, hubert_model, version, cpt
    if input_audio_path is None:
        return "You need to upload an audio", None
    f0_up_key = int(f0_up_key)
    try:
        # The pipeline works at 16 kHz; normalize peaks above ~0.95 full scale.
        audio = load_audio(input_audio_path, 16000)
        audio_max = np.abs(audio).max() / 0.95
        if audio_max > 1:
            audio /= audio_max
        times = [0, 0, 0]  # accumulates npy / f0 / infer timings
        if not hubert_model:
            load_hubert()
        if_f0 = cpt.get("f0", 1)
        # Strip stray quotes/whitespace, prefer the "added_" index variant,
        # and fall back to file_index2 when no explicit index was provided.
        file_index = (
            (
                file_index.strip(" ")
                .strip('"')
                .strip("\n")
                .strip('"')
                .strip(" ")
                .replace("trained", "added")
            )
            if file_index != ""
            else file_index2
        )  # reemplace for 2
        # file_big_npy = (
        #     file_big_npy.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
        # )
        audio_opt = vc.pipeline(
            hubert_model,
            net_g,
            sid,
            audio,
            input_audio_path,
            times,
            f0_up_key,
            f0_method,
            file_index,
            # file_big_npy,
            index_rate,
            if_f0,
            filter_radius,
            tgt_sr,
            resample_sr,
            rms_mix_rate,
            version,
            protect,
            f0_file=f0_file,
        )
        # Chained comparison: resample only when requested (>= 16 kHz) AND
        # different from the model's native rate.
        if tgt_sr != resample_sr >= 16000:
            tgt_sr = resample_sr
        index_info = (
            "Using index:%s." % file_index
            if os.path.exists(file_index)
            else "Index not used."
        )
        return "Success.\n %s\nTime:\n npy:%ss, f0:%ss, infer:%ss" % (
            index_info,
            times[0],
            times[1],
            times[2],
        ), (tgt_sr, audio_opt)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; pipeline errors are reported back to the caller.
        info = traceback.format_exc()
        print(info)
        return info, (None, None)
# hubert model
def load_hubert():
    """Load the HuBERT feature extractor into the global `hubert_model`.

    Reads ``hubert_base.pt`` from the working directory, moves the model to
    the configured device, casts it to half/float per `config.is_half`, and
    leaves it in eval mode.
    """
    global hubert_model
    loaded_models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
        ["hubert_base.pt"],
        suffix="",
    )
    model = loaded_models[0].to(config.device)
    # Precision must match the synthesizer's, otherwise dtypes clash downstream.
    hubert_model = model.half() if config.is_half else model.float()
    hubert_model.eval()
# config cpu
def use_fp32_config():
    """Force fp32 in every training config by flipping JSON "true" to "false".

    Rewrites each file under ``configs/`` in place. Note this is a blunt
    textual replace over the whole file, not a targeted JSON edit.
    """
    config_names = (
        "32k.json",
        "40k.json",
        "48k.json",
        "48k_v2.json",
        "32k_v2.json",
    )
    for name in config_names:
        path = f"configs/{name}"
        with open(path, "r") as fh:
            patched = fh.read().replace("true", "false")
        with open(path, "w") as fh:
            fh.write(patched)
# config device and torch type
class Config:
    """Holds the inference device, precision, and audio chunking parameters.

    `device_config` probes CUDA/MPS/CPU availability, may downgrade `is_half`
    (and patch project config files on disk as a side effect), and picks the
    padding/query/window sizes used by the conversion pipeline.
    """

    def __init__(self, device, is_half):
        self.device = device    # e.g. "cuda:0", "mps", "cpu"
        self.is_half = is_half  # fp16 inference when True
        self.n_cpu = 2  # set cpu cores ####################
        self.gpu_name = None
        self.gpu_mem = None     # total VRAM in whole GiB (rounded), CUDA only
        self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()

    @staticmethod
    def _replace_in_file(path, old, new):
        """Textually replace every `old` with `new` in the file at `path`, in place."""
        with open(path, "r") as f:
            content = f.read().replace(old, new)
        with open(path, "w") as f:
            f.write(content)

    def device_config(self) -> tuple:
        """Probe the device and return (x_pad, x_query, x_center, x_max)."""
        if torch.cuda.is_available():
            i_device = int(self.device.split(":")[-1])
            self.gpu_name = torch.cuda.get_device_name(i_device)
            if (
                ("16" in self.gpu_name and "V100" not in self.gpu_name.upper())
                or "P40" in self.gpu_name.upper()
                or "1060" in self.gpu_name
                or "1070" in self.gpu_name
                or "1080" in self.gpu_name
            ):
                print("16 series / 10 series graphics cards and P40 force single precision")
                self.is_half = False
                # These cards lack usable fp16: force fp32 in training configs
                # and reduce the preprocessing volume threshold.
                for config_file in ["32k.json", "40k.json", "48k.json"]:
                    self._replace_in_file(f"configs/{config_file}", "true", "false")
                self._replace_in_file("trainset_preprocess_pipeline_print.py", "3.7", "3.0")
            else:
                self.gpu_name = None
            self.gpu_mem = int(
                torch.cuda.get_device_properties(i_device).total_memory
                / 1024
                / 1024
                / 1024
                + 0.4
            )
            if self.gpu_mem <= 4:
                # Low-VRAM cards also get the reduced preprocessing threshold.
                self._replace_in_file("trainset_preprocess_pipeline_print.py", "3.7", "3.0")
        elif torch.backends.mps.is_available():
            print("Supported N-card not found, using MPS for inference")
            self.device = "mps"
        else:
            print("No supported N-card found, using CPU for inference")
            self.device = "cpu"
            self.is_half = False
            use_fp32_config()
        if self.n_cpu == 0:
            # Fix: cpu_count was referenced but never imported at file level,
            # which raised NameError whenever n_cpu was set to 0.
            from multiprocessing import cpu_count
            self.n_cpu = cpu_count()
        if self.is_half:
            # 6GB VRAM configuration
            x_pad = 3
            x_query = 10
            x_center = 60
            x_max = 65
        else:
            # 5GB VRAM configuration
            x_pad = 1
            x_query = 6
            x_center = 38
            x_max = 41
        if self.gpu_mem is not None and self.gpu_mem <= 4:
            x_pad = 1
            x_query = 5
            x_center = 30
            x_max = 32
        print(self.device, self.is_half)
        return x_pad, x_query, x_center, x_max
# call inference
class ClassVoices:
    """Drives per-speaker voice conversion over a batch of audio files.

    Speaker slots 00-05 and 99 are configured via `apply_conf`; calling the
    instance groups files by speaker label (e.g. "SPEAKER_00") and converts
    each group in place with that slot's model.
    """

    # Speaker tags this pipeline knows how to route.
    _SPEAKER_TAGS = ("00", "01", "02", "03", "04", "05", "99")

    def __init__(self):
        self.file_index = ""  # root

    def apply_conf(self, f0method,
                   model_voice_path00, transpose00, file_index2_00,
                   model_voice_path01, transpose01, file_index2_01,
                   model_voice_path02, transpose02, file_index2_02,
                   model_voice_path03, transpose03, file_index2_03,
                   model_voice_path04, transpose04, file_index2_04,
                   model_voice_path05, transpose05, file_index2_05,
                   model_voice_path99, transpose99, file_index2_99):
        """Store the per-speaker model path / transpose / index configuration.

        Replaces 21 copy-pasted assignments with a loop that sets the exact
        same attribute names (model_voice_pathNN, transposeNN, file_index2NN).
        """
        self.f0method = f0method  # pm
        slot_values = (
            (model_voice_path00, transpose00, file_index2_00),
            (model_voice_path01, transpose01, file_index2_01),
            (model_voice_path02, transpose02, file_index2_02),
            (model_voice_path03, transpose03, file_index2_03),
            (model_voice_path04, transpose04, file_index2_04),
            (model_voice_path05, transpose05, file_index2_05),
            (model_voice_path99, transpose99, file_index2_99),
        )
        for tag, (path, trans, index2) in zip(self._SPEAKER_TAGS, slot_values):
            setattr(self, "model_voice_path" + tag, path)
            setattr(self, "transpose" + tag, trans)
            setattr(self, "file_index2" + tag, index2)
        return "CONFIGURATION APPLIED"

    def custom_voice(self,
                     _values,       # filter indices
                     audio_files,   # all audio files
                     model_voice_path='',
                     transpose=0,
                     f0method='pm',
                     file_index='',
                     file_index2='',
                     ):
        """Convert the selected audio files in place using one voice model."""
        get_vc(
            sid=model_voice_path,  # model path
            to_return_protect0=0.33,
            to_return_protect1=0.33
        )
        for _value_item in _values:
            # "test" is a sentinel used by make_test: take the first file as-is.
            filename = "audio2/" + audio_files[_value_item] if _value_item != "test" else audio_files[0]
            try:
                print(audio_files[_value_item], model_voice_path)
            except Exception:
                pass  # best-effort logging only
            info_, (sample_, audio_output_) = vc_single(
                sid=0,
                input_audio_path=filename,
                f0_up_key=transpose,  # transpose for m to f and reverse 0 12
                f0_file=None,
                f0_method=f0method,
                file_index=file_index,  # dir pwd?
                file_index2=file_index2,
                # file_big_npy1,
                index_rate=float(0.66),
                filter_radius=int(3),
                resample_sr=int(0),
                rms_mix_rate=float(0.25),
                protect=float(0.33),
            )
            # Overwrite the source file with the converted audio.
            sf.write(
                file=filename,
                samplerate=sample_,
                data=audio_output_
            )

    def make_test(self,
                  tts_text,
                  tts_voice,
                  model_path,
                  index_path,
                  transpose,
                  f0_method,
                  ):
        """Synthesize `tts_text` with TTS, convert it with the given model,
        and return (converted_path, original_path).

        Tries edge-tts first, then gTTS, then a one-character gTTS placeholder
        so downstream code always has a file to work with.
        """
        os.system("rm -rf test")
        filename = "test/test.wav"
        if "SET_LIMIT" == os.getenv("DEMO"):
            if len(tts_text) > 60:
                tts_text = tts_text[:60]
                print("DEMO; limit to 60 characters")
        language = tts_voice[:2]
        try:
            os.system("mkdir test")
            # nest_asyncio.apply() # gradio;not
            # edge-tts voice ids carry a trailing label; drop it for Communicate.
            asyncio.run(edge_tts.Communicate(tts_text, "-".join(tts_voice.split('-')[:-1])).save(filename))
        except Exception:
            try:
                tts = gTTS(tts_text, lang=language)
                tts.save(filename)
                # Fix: removed a stray no-op `tts.save` attribute access here.
                print(f'No audio was received. Please change the tts voice for {tts_voice}. USING gTTS.')
            except Exception:
                tts = gTTS('a', lang=language)
                tts.save(filename)
                print('Error: Audio will be replaced.')
        os.system("cp test/test.wav test/real_test.wav")
        self([], [])  # start modules (initializes globals/config via __call__)
        self.custom_voice(
            ["test"],            # filter indices
            ["test/test.wav"],   # all audio files
            model_voice_path=model_path,
            transpose=transpose,
            f0method=f0_method,
            file_index='',
            file_index2=index_path,
        )
        return "test/test.wav", "test/real_test.wav"

    def __call__(self, speakers_list, audio_files):
        """Group file indices by speaker label and convert each known group."""
        speakers_indices = {}
        for index, speak_ in enumerate(speakers_list):
            speakers_indices.setdefault(speak_, []).append(index)
        # find models and index
        global weight_root, index_root, config, hubert_model
        weight_root = "weights"
        names = [name for name in os.listdir(weight_root) if name.endswith(".pth")]
        index_root = "logs"
        index_paths = [name for name in os.listdir(index_root) if name.endswith(".index")]
        print(names, index_paths)
        # config machine
        hubert_model = None
        config = Config('cuda:0', is_half=True)  # config = Config('cpu', is_half=False) # cpu
        # filter by speaker: dispatch each known SPEAKER_NN label to its slot,
        # replacing seven identical copy-pasted elif branches. Unknown labels
        # are skipped, matching the original `else: pass`.
        for _speak, _values in speakers_indices.items():
            for tag in self._SPEAKER_TAGS:
                if _speak == "SPEAKER_" + tag:
                    self.custom_voice(
                        _values,
                        audio_files,
                        model_voice_path=getattr(self, "model_voice_path" + tag),
                        file_index2=getattr(self, "file_index2" + tag),
                        transpose=getattr(self, "transpose" + tag),
                        f0method=self.f0method,
                        file_index=self.file_index,
                    )
                    break