|
from skimage.transform import resize |
|
import struct |
|
import webrtcvad |
|
from scipy.ndimage.morphology import binary_dilation, binary_erosion |
|
import librosa |
|
import numpy as np |
|
import pyloudnorm as pyln |
|
import warnings |
|
|
|
# pyloudnorm warns when loudness normalization pushes samples past full scale;
# we peak-limit afterwards ourselves, so the warning is just noise here.
warnings.filterwarnings("ignore", message="Possible clipped samples in output")



# Largest value representable by signed 16-bit PCM; used to scale float
# waveforms in [-1, 1] to int16 for webrtcvad.
int16_max = (2 ** 15) - 1
|
|
|
|
|
def trim_long_silences(path, sr=None, return_raw_wav=False, norm=True, vad_max_silence_length=12):
    """
    Trim long stretches of non-voice audio from a waveform using WebRTC VAD.

    :param path: either a filesystem path to an audio file, or the raw
        waveform as a 1-D numpy float array.
    :param sr: sample rate to load/interpret the audio at. When ``path`` is a
        file path this is forwarded to librosa (``None`` keeps the native
        rate); when ``path`` is an array and ``sr`` is ``None``, 16 kHz is
        assumed.
    :param return_raw_wav: if True, return the (possibly normalized) full
        waveform untrimmed, together with the voice-activity mask.
    :param norm: if True, loudness-normalize to -20 LUFS and peak-limit the
        waveform to 0.95 before analysis.
    :param vad_max_silence_length: maximum number of consecutive silent VAD
        frames a kept segment can contain.
    :return: ``(wav, audio_mask, sr)`` where ``wav`` is the trimmed waveform
        (or the raw one when ``return_raw_wav``), ``audio_mask`` is a boolean
        per-sample mask over the raw waveform marking voiced regions, and
        ``sr`` is the sample rate of the returned waveform.
    """
    # webrtcvad only supports 8/16/32/48 kHz; VAD analysis always runs at 16 kHz.
    sampling_rate = 16000
    if isinstance(path, str):
        # librosa.load is the stable public API (librosa.core.load is deprecated).
        wav_raw, sr = librosa.load(path, sr=sr)
    else:
        wav_raw = path
        # BUGFIX: only default to 16 kHz when the caller did not supply a
        # sample rate; previously a caller-provided sr was silently clobbered.
        if sr is None:
            sr = 16000

    if norm:
        # Loudness-normalize to -20 LUFS (BS.1770), then peak-limit so the
        # signal never exceeds 0.95 full scale.
        meter = pyln.Meter(sr)
        loudness = meter.integrated_loudness(wav_raw)
        wav_raw = pyln.normalize.loudness(wav_raw, loudness, -20.0)
        if np.abs(wav_raw).max() > 0.95:
            wav_raw = wav_raw / np.abs(wav_raw).max() * 0.95

    # Resample a working copy to 16 kHz for VAD. Keyword arguments are used
    # for compatibility with librosa >= 0.10, which removed the positional
    # sample-rate parameters.
    wav = librosa.resample(wav_raw, orig_sr=sr, target_sr=sampling_rate, res_type='kaiser_best')

    # VAD window size in milliseconds (webrtcvad accepts 10, 20 or 30 ms).
    vad_window_length = 30

    # Width (in VAD frames) of the moving average used to smooth the raw
    # per-window speech/no-speech flags.
    vad_moving_average_width = 8

    samples_per_window = (vad_window_length * sampling_rate) // 1000

    # Truncate to a whole number of VAD windows.
    wav = wav[:len(wav) - (len(wav) % samples_per_window)]

    # Convert the float waveform to 16-bit mono PCM bytes, as webrtcvad requires.
    pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))

    # Run the VAD over each window (mode 3 = most aggressive filtering).
    voice_flags = []
    vad = webrtcvad.Vad(mode=3)
    for window_start in range(0, len(wav), samples_per_window):
        window_end = window_start + samples_per_window
        # Each int16 sample is 2 bytes, hence the *2 in the byte offsets.
        voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
                                         sample_rate=sampling_rate))
    voice_flags = np.array(voice_flags)

    def moving_average(array, width):
        # Centered moving average via cumulative sum; zero-padding keeps the
        # output the same length as the input.
        array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
        ret = np.cumsum(array_padded, dtype=float)
        ret[width:] = ret[width:] - ret[:-width]
        return ret[width - 1:] / width

    # Smooth the binary flags, then re-binarize.
    audio_mask = moving_average(voice_flags, vad_moving_average_width)
    audio_mask = np.round(audio_mask).astype(bool)

    # Dilate voiced regions so short silences (up to vad_max_silence_length
    # frames) between speech are kept rather than cut out.
    audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
    # Expand the per-window mask to per-sample at 16 kHz, then resize it to
    # the length of the original-rate waveform.
    audio_mask = np.repeat(audio_mask, samples_per_window)
    audio_mask = resize(audio_mask, (len(wav_raw),)) > 0
    if return_raw_wav:
        return wav_raw, audio_mask, sr
    return wav_raw[audio_mask], audio_mask, sr
|
|