ameerazam08's picture
Upload folder using huggingface_hub
e34aada verified
history blame
7.78 kB
import os
import subprocess
import tempfile
import traceback
import uuid
import torch
from scipy.signal import medfilt
from import librosa_wav2spec
from import save_wav
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
import librosa
import numpy as np
from scipy.interpolate import interp1d
from import crepe_with_corrector, crepe_predict, frequency_to_bins, \
from import find_nearest_stft_bin, find_best_f0_using_har_energy
# Registry mapping an extractor name to the callable that implements it.
PITCH_EXTRACTOR = {}


def register_pitch_extractor(name):
    """Return a decorator that registers a pitch extractor under ``name``.

    The decorated object is stored in ``PITCH_EXTRACTOR`` and returned
    unchanged, so the decorator can be stacked or applied as a plain call.

    Args:
        name: Key under which the extractor is later looked up.

    Returns:
        A decorator taking the extractor and returning it unmodified.
    """

    def register_pitch_extractor_(cls):
        # Without this assignment get_pitch_extractor() can never find anything.
        PITCH_EXTRACTOR[name] = cls
        return cls

    return register_pitch_extractor_


def get_pitch_extractor(name):
    """Look up a previously registered pitch extractor.

    Args:
        name: Key that was passed to ``register_pitch_extractor``.

    Returns:
        The registered extractor.

    Raises:
        KeyError: If nothing is registered under ``name``; the message lists
            the names that are available.
    """
    try:
        return PITCH_EXTRACTOR[name]
    except KeyError:
        raise KeyError(
            f'unknown pitch extractor {name!r}; registered: {sorted(PITCH_EXTRACTOR)}'
        ) from None
def extract_pitch_simple(wav):
    """Extract pitch from ``wav`` using the settings in the global ``hparams``.

    Reads ``pitch_extractor``, ``hop_size``, ``win_size``,
    ``audio_sample_rate``, ``f0_min`` and ``f0_max`` from ``hparams`` and
    delegates to ``extract_pitch``.

    Args:
        wav: Audio samples; only ``len(wav)`` is used here, the data itself is
            handed to the selected extractor.

    Returns:
        Whatever the selected extractor returns.
    """
    from utils.commons.hparams import hparams

    hop_size = hparams['hop_size']
    n_mel_frames = (len(wav) + 1) // hop_size - hparams['win_size'] // hop_size
    # NOTE(review): the tail of this call was lost in the source (it ended on a
    # trailing comma). n_mel_frames was computed but never used, so it is
    # forwarded as a keyword argument here — confirm against the upstream file.
    return extract_pitch(
        hparams['pitch_extractor'], wav,
        hop_size, hparams['audio_sample_rate'],
        f0_min=hparams['f0_min'], f0_max=hparams['f0_max'],
        n_mel_frames=n_mel_frames,
    )
def extract_pitch(extractor_name, wav_data, hop_size, audio_sample_rate, f0_min=75, f0_max=800, **kwargs):
    """Run the pitch extractor registered as ``extractor_name``.

    Args:
        extractor_name: Name passed to ``get_pitch_extractor``.
        wav_data: Audio samples, forwarded untouched to the extractor.
        hop_size: Forwarded as the extractor's second positional argument.
        audio_sample_rate: Forwarded as the third positional argument.
        f0_min: Lower pitch bound, forwarded positionally (default 75).
        f0_max: Upper pitch bound, forwarded positionally (default 800).
        **kwargs: Extra keyword arguments forwarded to the extractor.

    Returns:
        The extractor's return value.
    """
    extractor = get_pitch_extractor(extractor_name)
    return extractor(wav_data, hop_size, audio_sample_rate, f0_min, f0_max, **kwargs)
def harvest(wav_data, hop_size, audio_sample_rate, *args, **kwargs):
    """Estimate F0 with pyworld's Harvest and resample it to one value per hop.

    Args:
        wav_data: NumPy array of audio samples (cast to float64 for pyworld).
        hop_size: Hop length in samples; the result has
            ``len(wav_data) // hop_size`` entries.
        audio_sample_rate: Sampling rate passed to pyworld.
        *args, **kwargs: Accepted so the signature matches the other
            extractors (e.g. ``f0_min`` / ``f0_max``); they are not used.

    Returns:
        1-D array with ``len(wav_data) // hop_size`` F0 values. It is empty
        when the input is shorter than one hop.
    """
    n_mel_frames = int(len(wav_data) // hop_size)
    if n_mel_frames <= 0:
        # Shorter than one hop: nothing to estimate, and 1 / n_mel_frames
        # below would raise ZeroDivisionError.
        return np.zeros(0, dtype=np.float64)

    import pyworld as pw

    f0, _ = pw.harvest(wav_data.astype(np.double), audio_sample_rate)

    # Place both frame grids on [0, 1] and pick the nearest pyworld frame for
    # every output frame. The slice guards against arange emitting one extra
    # element through floating-point rounding.
    x_old = np.arange(0, 1, 1 / len(f0))[:len(f0)]
    x_old[-1] = 1.0  # stretch the last source point so x_new is always in range
    x_new = np.arange(0, 1, 1 / n_mel_frames)[:n_mel_frames]
    return interp1d(x_old, f0, 'nearest')(x_new)
def dio(wav_data, hop_size, audio_sample_rate, *args, **kwargs):
    """Estimate F0 with pyworld's DIO + StoneMask and resample it per hop.

    Args:
        wav_data: NumPy array of audio samples (cast to float64 for pyworld).
        hop_size: Hop length in samples; the result has
            ``len(wav_data) // hop_size`` entries.
        audio_sample_rate: Sampling rate passed to pyworld.
        *args, **kwargs: Accepted so the signature matches the other
            extractors (e.g. ``f0_min`` / ``f0_max``); they are not used.

    Returns:
        1-D array with ``len(wav_data) // hop_size`` F0 values. It is empty
        when the input is shorter than one hop.
    """
    n_mel_frames = int(len(wav_data) // hop_size)
    if n_mel_frames <= 0:
        # Shorter than one hop: nothing to estimate, and 1 / n_mel_frames
        # below would raise ZeroDivisionError.
        return np.zeros(0, dtype=np.float64)

    import pyworld as pw

    samples = wav_data.astype(np.double)
    coarse_f0, t = pw.dio(samples, audio_sample_rate)
    # StoneMask refines the coarse DIO estimate on the same time axis.
    f0 = pw.stonemask(samples, coarse_f0, t, audio_sample_rate)

    # Nearest-neighbour resampling from pyworld's frame grid to the hop grid,
    # with both grids normalised to [0, 1].
    x_old = np.arange(0, 1, 1 / len(f0))[:len(f0)]
    x_old[-1] = 1.0  # stretch the last source point so x_new is always in range
    x_new = np.arange(0, 1, 1 / n_mel_frames)[:n_mel_frames]
    return interp1d(x_old, f0, 'nearest')(x_new)
def parselmouth_pitch(wav_data, hop_size, audio_sample_rate, f0_min, f0_max,
                      voicing_threshold=0.45, *args, **kwargs):
    """Estimate F0 with Praat's autocorrelation method via parselmouth.

    The Praat track is fitted to exactly ``len(wav_data) // hop_size`` frames:
    zero-padded on both sides when it is shorter, centre-cropped when it is
    longer.

    Args:
        wav_data: Audio samples handed to ``parselmouth.Sound``.
        hop_size: Hop length in samples; it also sets Praat's time step.
        audio_sample_rate: Sampling rate of ``wav_data``.
        f0_min: Passed to Praat as ``pitch_floor``.
        f0_max: Passed to Praat as ``pitch_ceiling``.
        voicing_threshold: Passed to Praat unchanged.
        *args, **kwargs: Accepted for signature compatibility; not used.

    Returns:
        1-D array of ``len(wav_data) // hop_size`` F0 values. It is empty when
        the input is shorter than one hop.
    """
    n_mel_frames = int(len(wav_data) // hop_size)
    if n_mel_frames <= 0:
        return np.zeros(0, dtype=np.float64)

    import parselmouth

    time_step = hop_size / audio_sample_rate * 1000  # hop duration in ms
    pitch = parselmouth.Sound(wav_data, audio_sample_rate).to_pitch_ac(
        time_step=time_step / 1000, voicing_threshold=voicing_threshold,
        pitch_floor=f0_min, pitch_ceiling=f0_max)
    f0_pm = pitch.selected_array['frequency']

    missing = n_mel_frames - len(f0_pm)
    if missing < 0:
        # More Praat frames than hops: np.pad would reject the negative width,
        # so drop the surplus evenly from both ends instead.
        start = (-missing) // 2
        return f0_pm[start:start + n_mel_frames]

    # Fewer frames than hops: centre the track, extra frame goes on the left.
    pad_size = (missing + 1) // 2
    return np.pad(f0_pm, [[pad_size, missing - pad_size]], mode='constant')
def reaper_extract_f0(wav_data, hop_size, audio_sample_rate, f0_min, f0_max, denoise=True,
                      *args, return_denoised_wav=False, **kwargs):
    """Extract F0 with the external STRAIGHT / REAPER tool chain.

    The audio is written to a scratch directory, optionally denoised with
    ``sox``, and handed to ``reaper_extract_f0_``. The resulting track is then
    rescaled and decimated so it lines up with ``hop_size``.

    NOTE(review): the source of this function had lost its indentation and
    several lines (``else:`` branches, the ``try:`` / generic ``except`` of the
    retry loop, the head of the sox call). The control flow below is a
    reconstruction — confirm against the upstream file.

    Args:
        wav_data: 1-D NumPy array of audio samples.
        hop_size: Hop length in samples; only 256 and 240 are handled.
        audio_sample_rate: Sampling rate; 24000 and 48000 get special handling.
        f0_min: Lower pitch bound forwarded to the external tools.
        f0_max: Upper pitch bound forwarded to the external tools.
        denoise: If true, run sox noise reduction before pitch tracking.
        *args, **kwargs: Accepted for signature compatibility; not used.
        return_denoised_wav: Keyword-only. If true, also return the waveform
            that was fed to the tracker. The original body read this name
            without ever defining it.

    Returns:
        ``f0``, or ``(f0, wav_data)`` when ``return_denoised_wav`` is true.

    Raises:
        RuntimeError: If extraction still fails after ten attempts.
    """

    def write_tracker_input(samples, path):
        # With hop_size 256 the file is labelled with a nominal rate of
        # 25600 / 51200 instead of the real one; the F0 values are scaled back
        # after extraction. Presumably this aligns the tools' fixed frame
        # shift with the hop — TODO confirm.
        if hop_size == 256:
            if audio_sample_rate == 24000:
                save_wav(samples, path, 25600, norm=False)
            if audio_sample_rate == 48000:
                save_wav(samples, path, 51200, norm=False)
        else:
            assert hop_size == 240
            save_wav(samples, path, audio_sample_rate, norm=False)

    scratch_root = '/tmp/reaper_tmp'
    os.makedirs(scratch_root, exist_ok=True)
    # The original created a uuid-named directory that was never removed (its
    # TemporaryDirectory only cleaned an unused child). Making the working
    # directory itself temporary keeps names unique and removes it on exit.
    with tempfile.TemporaryDirectory(dir=scratch_root, prefix=f'{len(wav_data)}_') as dirname:
        write_tracker_input(wav_data, f'{dirname}/1.wav')

        if denoise:
            # NOTE(review): the module path was missing in the source line
            # (`from import trim_long_silences`); reconstructed — confirm.
            from utils.audio.vad import trim_long_silences

            raw_wav = wav_data
            _, audio_mask, _ = trim_long_silences(
                wav_data, audio_sample_rate, vad_max_silence_length=20)
            sr = audio_sample_rate
            # Samples outside the voice mask serve as the noise profile.
            wav_noise = wav_data[~audio_mask]
            new_fn = f'{dirname}/0'
            save_wav(wav_noise, f'{new_fn}-noise.wav', sr=sr)
            save_wav(wav_data, f'{new_fn}.wav', sr=sr)
            # NOTE(review): the call head was lost in the source; check_call is
            # assumed because the rest of this file uses it.
            subprocess.check_call(
                f'sox {new_fn}-noise.wav -n noiseprof {new_fn}; '
                f'sox {new_fn}.wav {new_fn}.denoised.wav noisered {new_fn} 0.21; ', shell=True)
            wav_data, _ = librosa.load(f'{new_fn}.denoised.wav', sr=sr)
            # Append the last 1024 raw samples to the denoised signal.
            wav_data = np.concatenate([wav_data, raw_wav[-1024:]], 0)

        write_tracker_input(wav_data, f'{dirname}/2.wav')

        last_error = None
        for _ in range(10):
            try:
                # Remove stale *f0 outputs so the helper recomputes them
                # instead of reusing results from a failed attempt.
                for entry in os.listdir(dirname):
                    if entry.endswith('f0'):
                        os.remove(os.path.join(dirname, entry))
                # 2.wav (possibly denoised) is the first input, 1.wav (raw) the
                # second; the last 8 frames of the track are discarded.
                f0 = reaper_extract_f0_(
                    f'{dirname}/2.wav', f'{dirname}/1.wav', dirname, f0_min, f0_max)[:-8]
                break
            except Exception as err:  # KeyboardInterrupt is not caught here
                traceback.print_exc()
                last_error = err
        else:
            # Previously this fell through to an unbound `f0`.
            raise RuntimeError('reaper F0 extraction failed after 10 attempts') from last_error

    if audio_sample_rate == 24000:
        if hop_size == 256:
            # Undo the nominal-rate labelling used when writing the wav.
            f0 = f0 * audio_sample_rate / 25600
        if len(f0) % 2:
            # reshape(-1, 2) needs an even count; drop the trailing frame.
            f0 = f0[:-1]
        # Average frame pairs. Unvoiced frames (0) are pushed far negative so
        # that any pair containing one averages below zero and is zeroed.
        f0[f0 == 0] = -100000
        f0 = f0.reshape(-1, 2).mean(-1)
        f0[f0 < 0] = 0
    if audio_sample_rate == 48000:
        if hop_size == 256:
            f0 = f0 * audio_sample_rate / 51200

    if return_denoised_wav:
        return f0, wav_data
    return f0
def reaper_extract_f0_(fwav1, fwav2, temp_dir, pitch_lower, pitch_upper):
    """Run the bundled STRAIGHT / REAPER binaries and load the F0 track.

    Intermediate results are cached in ``temp_dir`` as ``1.sf0`` (STRAIGHT),
    ``1.rf0`` (REAPER) and ``1.tf0`` (interpolated). A step is skipped when
    its output file already exists.

    Args:
        fwav1: Wav path given to ``ExtractF0ByStraight``.
        fwav2: Wav path given to ``ReaperF0``.
        temp_dir: Directory holding the intermediate F0 files.
        pitch_lower: Lower pitch bound, formatted as an integer.
        pitch_upper: Upper pitch bound, formatted as an integer.

    Returns:
        ``np.float32`` array loaded from the interpolated F0 file.

    Raises:
        subprocess.CalledProcessError: If a binary exits with non-zero status.
        subprocess.TimeoutExpired: If a binary runs longer than 20 seconds.
    """
    frame_shift = 5  # presumably milliseconds — TODO confirm against the binary
    use_reaper = True
    bin_dir = 'utils/audio/pitch/bin'

    # Commands are passed as argv lists rather than shell strings, so paths
    # containing spaces or shell metacharacters are handled safely.
    straight_f0_file = f'{temp_dir}/1.sf0'
    if not os.path.exists(straight_f0_file):
        subprocess.check_call([
            f'{bin_dir}/ExtractF0ByStraight',
            'frame_shift=%d' % frame_shift,
            'min_f0=%d' % pitch_lower,
            'max_f0=%d' % pitch_upper,
            f'wave={fwav1}',
            f'output={straight_f0_file}',
        ], timeout=20)

    if use_reaper:
        reaper_f0_file = f'{temp_dir}/1.rf0'
        if not os.path.exists(reaper_f0_file):
            subprocess.check_call([
                f'{bin_dir}/ReaperF0',
                f'wave={fwav2}',
                f'output={reaper_f0_file}',
                'f0_min=%d' % pitch_lower,
                'f0_max=%d' % pitch_upper,
            ], timeout=20)

        interp_f0_file = f'{temp_dir}/1.tf0'
        if not os.path.exists(interp_f0_file):
            subprocess.check_call([
                f'{bin_dir}/InterpF0',
                f'straight={straight_f0_file}',
                f'reaper={reaper_f0_file}',
                f'output={interp_f0_file}',
            ], timeout=20)
        # The interpolated track replaces the raw STRAIGHT one as the result.
        straight_f0_file = interp_f0_file

    return np.loadtxt(straight_f0_file, dtype=np.float32)