Spaces:
Runtime error
Runtime error
| #from turtle import title | |
| import gradio as gr | |
| import git | |
| import os | |
# Bootstrap the runtime environment at import time: fetch the Coqui-TTS fork
# and install the packages the rest of this script imports. Each command runs
# through the shell and its exit status is ignored, so a failing step only
# surfaces later as an import error.
for _bootstrap_cmd in (
    'git clone https://github.com/Edresson/Coqui-TTS -b multilingual-torchaudio-SE TTS',
    'pip install -q -e TTS/',
    'pip install -q torchaudio==0.9.0',
    'pip install voicefixer --upgrade',
):
    os.system(_bootstrap_cmd)

# Imported here rather than at the top of the file because the package is
# installed by the commands above.
from voicefixer import VoiceFixer

# Shared VoiceFixer instance used by greet() to restore the synthesized audio.
voicefixer = VoiceFixer()

import sys

# Put the cloned checkout on the import path in case TTS is not installed
# globally.
TTS_PATH = "TTS/"
sys.path.append(TTS_PATH)
| import os | |
| import string | |
| import time | |
| import argparse | |
| import json | |
| import numpy as np | |
| import IPython | |
| from IPython.display import Audio | |
| import torch | |
| import torchaudio | |
| from speechbrain.pretrained import SpectralMaskEnhancement | |
# Speech-enhancement model used by greet() as the final clean-up pass.
# The hparams are taken from the given source and stored under `savedir`.
# No run_opts are passed, so the library's default device is used.
enhance_model = SpectralMaskEnhancement.from_hparams(
    savedir="pretrained_models/metricgan-plus-voicebank",
    source="speechbrain/metricgan-plus-voicebank",
)
| from TTS.tts.utils.synthesis import synthesis | |
| from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols | |
# The previous try/except retried the exact same import in its handler, so it
# could never recover from a failure; it only hid the first traceback behind a
# bare `except:`. A single import raises the same error, with a clean trace.
from TTS.utils.audio import AudioProcessor
| from TTS.tts.models import setup_model | |
| from TTS.config import load_config | |
| from TTS.tts.models.vits import * | |
# ---------------------------------------------------------------------------
# Paths and device selection
# ---------------------------------------------------------------------------
# Directory that receives the raw synthesized wav; created if missing.
OUT_PATH = 'out/'
os.makedirs(OUT_PATH, exist_ok=True)

# Checkpoint, config and lookup files expected inside the app directory.
MODEL_PATH = '/home/user/app/best_model_latest.pth.tar'
CONFIG_PATH = '/home/user/app/config.json'
TTS_LANGUAGES = "/home/user/app/language_ids.json"
TTS_SPEAKERS = "/home/user/app/speakers.json"

USE_CUDA = torch.cuda.is_available()

# ---------------------------------------------------------------------------
# Config, audio processor and model construction
# ---------------------------------------------------------------------------
C = load_config(CONFIG_PATH)
ap = AudioProcessor(**C.audio)

speaker_embedding = None

# Point the model at the speaker d-vector file and switch off the
# speaker-encoder loss before the model is built from the config.
C.model_args['d_vector_file'] = TTS_SPEAKERS
C.model_args['use_speaker_encoder_as_loss'] = False

model = setup_model(C)
model.language_manager.set_language_ids_from_file(TTS_LANGUAGES)

# ---------------------------------------------------------------------------
# Checkpoint loading
# ---------------------------------------------------------------------------
# Always deserialize onto the CPU; the model is moved to the GPU afterwards
# when one is available.
cp = torch.load(MODEL_PATH, map_location="cpu")

# Work on a copy of the stored weights and drop every entry that belongs to
# the speaker encoder before loading them into the model.
model_weights = cp['model'].copy()
for key in list(model_weights):
    if "speaker_encoder" in key:
        model_weights.pop(key)

model.load_state_dict(model_weights)
model.eval()

if USE_CUDA:
    model = model.cuda()

# Module-level flag kept from the original script.
use_griffin_lim = False
# Install pydub (imported below) and ffmpeg-normalize (the command-line tool
# greet() runs on reference clips). The exit status is ignored.
os.system('pip install -q pydub ffmpeg-normalize')

from TTS.tts.utils.speakers import SpeakerManager
from pydub import AudioSegment
import librosa

# Speaker-encoder config and checkpoint, given as relative paths.
CONFIG_SE_PATH = "config_se.json"
CHECKPOINT_SE_PATH = "SE_checkpoint.pth.tar"

# Speaker manager that greet() uses to turn a reference clip into a d-vector.
SE_speaker_manager = SpeakerManager(
    use_cuda=USE_CUDA,
    encoder_config_path=CONFIG_SE_PATH,
    encoder_model_path=CHECKPOINT_SE_PATH,
)
def compute_spec(ref_file):
    """Return the spectrogram of *ref_file* as a float tensor.

    The audio is loaded with librosa at the audio processor's sample rate,
    passed through ``ap.spectrogram`` and wrapped in a ``torch.FloatTensor``
    with one extra leading dimension added by ``unsqueeze(0)``.
    """
    waveform, _ = librosa.load(ref_file, sr=ap.sample_rate)
    return torch.FloatTensor(ap.spectrogram(waveform)).unsqueeze(0)
# Request limits advertised in the UI and in the error message below.
_MAX_REFERENCE_BYTES = 30 * 1024 * 1024
_MAX_TEXT_CHARS = 2000


def _normalize_reference(sample):
    """Loudness-normalize the reference clip in place (best effort).

    Runs ``ffmpeg-normalize`` with an argument list instead of a shell string,
    so the file path is passed through verbatim and never interpreted by a
    shell. A missing executable or a non-zero exit status is not fatal, which
    matches the ignored exit status of the earlier ``os.system`` call.
    """
    import subprocess

    command = [
        "ffmpeg-normalize", sample,
        "-nt", "rms",
        "-t=-27",
        "-o", sample,
        "-ar", "16000",
        "-f",
    ]
    try:
        subprocess.run(command, check=False)
    except OSError as err:
        print(" > ffmpeg-normalize could not be started: {}".format(err))


def greet(Text, Voicetoclone, VoiceMicrophone):
    """Synthesize *Text* in the voice of a reference clip.

    Args:
        Text: Text to speak; at most 2000 characters.
        Voicetoclone: Path of an uploaded reference clip, or ``None``.
        VoiceMicrophone: Path of a microphone recording, used only when
            ``Voicetoclone`` is ``None``.

    Returns:
        The path ``"enhanced.wav"``, written in the working directory.

    Raises:
        ValueError: If no reference clip is supplied, the text is longer than
            2000 characters, or the reference file is larger than 30 MB.
    """
    text = str(Text)

    reference = Voicetoclone if Voicetoclone is not None else VoiceMicrophone
    if reference is None:
        raise ValueError(
            "No reference voice was provided. Please upload a file or record one with the microphone."
        )
    reference_file = str(reference)

    # Check the text length first so the file is only inspected when needed.
    # The limit is applied to the real size of the file on disk.
    if len(text) > _MAX_TEXT_CHARS or os.path.getsize(reference_file) > _MAX_REFERENCE_BYTES:
        message = "File is greater than 30mb or Text inserted is longer than 2000 characters. Please re-try with smaller sizes."
        print(message)
        # Raise a regular exception: SystemExit is meant to stop the
        # interpreter, not to reject a single request.
        raise ValueError(message)

    print("path url")
    print(reference)

    _normalize_reference(reference_file)
    reference_emb = SE_speaker_manager.compute_d_vector_from_clip(reference_file)

    model.length_scale = 1  # scaler for the duration predictor. The larger it is, the slower the speech.
    model.inference_noise_scale = 0.3  # defines the noise variance applied to the random z vector at inference.
    model.inference_noise_scale_dp = 0.3  # defines the noise variance applied to the duration predictor z vector at inference.

    # NOTE(review): language id 0 is hard-coded; presumably the English entry
    # of language_ids.json -- confirm against that file.
    language_id = 0

    print(" > text: {}".format(text))
    wav, _, _, _ = synthesis(
        model,
        text,
        C,
        "cuda" in str(next(model.parameters()).device),
        ap,
        speaker_id=None,
        d_vector=reference_emb,
        style_wav=None,
        language_id=language_id,
        enable_eos_bos_chars=C.enable_eos_bos_chars,
        use_griffin_lim=True,
        do_trim_silence=False,
    ).values()
    print("Generated Audio")
    IPython.display.display(Audio(wav, rate=ap.sample_rate))

    # Stage 1: write the raw synthesis result.
    out_path = os.path.join(OUT_PATH, "Audio.wav")
    print(" > Saving output to {}".format(out_path))
    ap.save_wav(wav, out_path)

    # Stage 2: restore the audio with VoiceFixer on the CPU (mode 0).
    voicefixer.restore(
        input=out_path,
        output="audio1.wav",
        cuda=False,
        mode=0,
    )

    # Stage 3: run the enhancement model and save the result with a
    # 16000 Hz sample rate.
    noisy = enhance_model.load_audio("audio1.wav").unsqueeze(0)
    enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.]))
    torchaudio.save("enhanced.wav", enhanced.cpu(), 16000)
    return "enhanced.wav"
# Build the web UI: one text box plus two alternative audio inputs (file
# upload or microphone), wired to greet(), which returns an audio file path.
# NOTE(review): the text box uses the legacy `gr.inputs` namespace while the
# audio inputs use top-level `gr.Audio` -- confirm both exist in the pinned
# gradio version.
demo = gr.Interface(
    fn=greet,
    inputs=[
        gr.inputs.Textbox(label='请输入您想要合成的文字,请自觉合法合规使用!'),
        gr.Audio(
            type="filepath",
            source="upload",
            label='请上传您喜欢的声音(wav/mp3文件, max. 30mb)',
        ),
        gr.Audio(
            source="microphone",
            type="filepath",
            label='请用麦克风上传您喜欢的声音,与文件上传二选一即可',
        ),
    ],
    outputs="audio",
    title="🥳💬💕 - Voice Cloning/声音合成测试版(目前只支持英文文本合成,中文版正在开发中,敬请期待)",
    description="注意❗:请不要生成会对个人以及组织造成侵害的内容,此程序仅供科研、学习使用。用户生成内容与程序开发者无关,请自觉合法合规使用,违反者一切后果自负。",
    article="🤖 - 让有人文关怀的AI造福每一个人!AI向善,文明璀璨!TalktoAI - Enable the future!",
)
demo.launch()