Spaces:
Sleeping
Sleeping
| import sys, os | |
| sys.path.append(os.path.dirname(os.path.abspath(__file__))) | |
| import torch | |
| import librosa | |
| import argparse | |
| import numpy as np | |
| from scipy.io.wavfile import write | |
| from vad.utils import init_jit_model, get_speech_timestamps | |
def load_audio(file: str, sr: int = 16000):
    """Read an audio file through librosa at the requested sample rate.

    Args:
        file: Path of the audio file to read.
        sr: Sample rate passed to ``librosa.load``.

    Returns:
        The sample array returned by ``librosa.load``; the rate that
        librosa returns alongside it is discarded.
    """
    samples, _ = librosa.load(file, sr=sr)
    return samples
# Sample rate the reference audio is loaded at and the VAD model is run with.
REF_SAMPLE_RATE = 16000
# Sample rate the converted (svc) audio is loaded at and the output is written with.
SVC_SAMPLE_RATE = 32000
# Speech-probability threshold handed to get_speech_timestamps.
VAD_THRESHOLD = 0.2
# Location of the VAD model, relative to the working directory or to this script.
VAD_MODEL_RELPATH = os.path.join('vad/assets', 'silero_vad.jit')


def resolve_vad_model_path():
    """Return the path of the VAD model file.

    The path relative to the current working directory is used whenever it
    exists. Otherwise the same relative path is tried next to this script,
    so the tool also works when launched from another directory. If neither
    exists, the working-directory path is returned so the loader reports
    the failure against the usual location.
    """
    if os.path.exists(VAD_MODEL_RELPATH):
        return VAD_MODEL_RELPATH
    script_dir = os.path.dirname(os.path.abspath(__file__))
    candidate = os.path.join(script_dir, VAD_MODEL_RELPATH)
    return candidate if os.path.exists(candidate) else VAD_MODEL_RELPATH


def build_speech_mask(num_samples, timestamps, upsample=1):
    """Build a boolean mask that is True inside the given speech segments.

    Args:
        num_samples: Length of the signal the timestamps refer to.
        timestamps: Iterable of mappings with integer ``"start"`` and
            ``"end"`` sample indices (end exclusive, used as a slice).
        upsample: Number of times each mask entry is repeated, to stretch
            the mask onto a signal sampled ``upsample`` times faster.

    Returns:
        A 1-D boolean array of length ``num_samples * upsample``.
    """
    mask = np.zeros(num_samples, dtype=bool)
    for tag in timestamps:
        mask[tag["start"]:tag["end"]] = True
    return np.repeat(mask, upsample, -1)


def mute_non_speech(svc_wave, mask):
    """Zero every sample of ``svc_wave`` that the mask marks as non-speech.

    Both inputs are trimmed to their common length first. The input wave is
    not modified; a trimmed copy is returned.

    Args:
        svc_wave: 1-D array of audio samples.
        mask: 1-D boolean array, True where the audio should be kept.

    Returns:
        A new array of length ``min(len(mask), len(svc_wave))``.
    """
    min_len = min(len(mask), len(svc_wave))
    muted = svc_wave[:min_len].copy()
    muted[~mask[:min_len]] = 0
    return muted


def main():
    """Silence the parts of a converted wave where the reference has no speech.

    Runs voice-activity detection on ``--ref``, stretches the resulting
    speech mask to the rate of ``--svc``, mutes everything outside the mask
    and writes the result to ``--out``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--ref', type=str, required=True,
                        help="Path of ref audio.")
    parser.add_argument('--svc', type=str, required=True,
                        help="Path of svc audio.")
    parser.add_argument('--out', type=str, required=True,
                        help="Path of out audio.")
    args = parser.parse_args()

    print("svc in wave :", args.ref)
    print("svc out wave :", args.svc)
    print("svc post wave :", args.out)

    model = init_jit_model(resolve_vad_model_path())
    model.eval()

    ref_wave = load_audio(args.ref, sr=REF_SAMPLE_RATE)
    tmp_wave = torch.from_numpy(ref_wave).squeeze(0)
    tag_wave = get_speech_timestamps(
        tmp_wave, model, threshold=VAD_THRESHOLD,
        sampling_rate=REF_SAMPLE_RATE)

    # The mask lives in its own array: torch.from_numpy shares memory with
    # ref_wave, so the reference samples are left untouched.
    mask = build_speech_mask(
        len(ref_wave), tag_wave, SVC_SAMPLE_RATE // REF_SAMPLE_RATE)

    svc_wave = load_audio(args.svc, sr=SVC_SAMPLE_RATE)
    write(args.out, SVC_SAMPLE_RATE, mute_non_speech(svc_wave, mask))


if __name__ == '__main__':
    main()