# Audio loading and preprocessing utilities: yt-dlp/FFmpeg decoding and Demucs vocal isolation.
import subprocess
import warnings
import ffmpeg
import torch
import torchaudio
import numpy as np
from typing import Union, Optional
from whisper.audio import SAMPLE_RATE
def is_ytdlp_available():
    """Return ``True`` if the ``yt-dlp`` command-line tool runs successfully on this system."""
    result = subprocess.run('yt-dlp -h', shell=True, capture_output=True)
    return result.returncode == 0
def _load_file(file: Union[str, bytes], verbose: bool = False, only_ffmpeg: bool = False):
if isinstance(file, str) and '://' in file:
if is_ytdlp_available():
verbosity = ' -q' if verbose is None else (' --progress' if verbose else ' --progress -q')
p = subprocess.run(
f'yt-dlp "{file}" -f ba/w -I 1{verbosity} -o -',
shell=True,
stdout=subprocess.PIPE
)
if len(p.stdout) == 0:
raise RuntimeError(f'Failed to download media from "{file}" with yt-dlp')
return p.stdout
else:
warnings.warn('URL detected but yt-dlp not available. '
'To handle a greater variety of URLs (i.e. non-direct links), '
'install yt-dlp, \'pip install yt-dlp\' (repo: https://github.com/yt-dlp/yt-dlp).')
if not only_ffmpeg:
if is_ytdlp_available():
verbosity = ' -q' if verbose is None else (' --progress' if verbose else ' --progress -q')
p = subprocess.run(
f'yt-dlp "{file}" -f ba/w -I 1{verbosity} -o -',
shell=True,
stdout=subprocess.PIPE
)
if p.returncode != 0 or len(p.stdout) == 0:
raise RuntimeError(f'Failed to download media from "{file}" with yt-dlp')
return p.stdout
else:
warnings.warn('URL detected but yt-dlp not available. '
'To handle a greater variety of URLs (i.e. non-direct links), '
'install yt-dlp, \'pip install yt-dlp\' (repo: https://github.com/yt-dlp/yt-dlp).')
return file
# modified version of whisper.audio.load_audio
def load_audio(file: Union[str, bytes], sr: int = SAMPLE_RATE, verbose: bool = True, only_ffmpeg: bool = False):
    """
    Open an audio file and read as mono waveform, resampling as necessary.

    Parameters
    ----------
    file : str or bytes
        The audio file to open, bytes of file, or URL to audio/video.
    sr : int, default ``whisper.model.SAMPLE_RATE``
        The sample rate to resample the audio if necessary.
    verbose : bool, default True
        Whether to print yt-dlp log.
    only_ffmpeg : bool, default False
        Whether to use only FFmpeg (instead of yt-dlp) for URLs.

    Returns
    -------
    numpy.ndarray
        An array containing the audio waveform in float32.
    """
    file = _load_file(file, verbose=verbose, only_ffmpeg=only_ffmpeg)
    # raw bytes are fed to FFmpeg through stdin; paths/URLs are passed directly
    if isinstance(file, bytes):
        stdin_data = file
        source = 'pipe:'
    else:
        stdin_data = None
        source = file
    try:
        # Decode with the ffmpeg CLI (via the `ffmpeg-python` package), down-mixing
        # to mono and resampling to `sr` in a single subprocess.
        stream = (
            ffmpeg.input(source, threads=0)
            .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
        )
        out, _ = stream.run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True, input=stdin_data)
    except ffmpeg.Error as e:
        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
    # 16-bit signed PCM -> float32 in [-1.0, 1.0)
    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
def voice_freq_filter(wf: Union[torch.Tensor, np.ndarray], sr: int,
                      upper_freq: Optional[int] = None,
                      lower_freq: Optional[int] = None) -> torch.Tensor:
    """
    Band-pass ``wf`` to the frequency range where the majority of human speech lies.

    Parameters
    ----------
    wf : torch.Tensor or numpy.ndarray
        Audio waveform.
    sr : int
        Sample rate of ``wf``.
    upper_freq : int, optional
        Upper cutoff frequency in Hz (default: 5000).
    lower_freq : int, optional
        Lower cutoff frequency in Hz (default: 200).

    Returns
    -------
    torch.Tensor
        The filtered waveform.
    """
    if isinstance(wf, np.ndarray):
        wf = torch.from_numpy(wf)
    if upper_freq is None:
        upper_freq = 5000
    if lower_freq is None:
        lower_freq = 200
    # fixed typo in the message ("must but" -> "must be")
    assert upper_freq > lower_freq, f'upper_freq {upper_freq} must be greater than lower_freq {lower_freq}'
    # low-pass then high-pass -> band-pass between lower_freq and upper_freq
    return torchaudio.functional.highpass_biquad(torchaudio.functional.lowpass_biquad(wf, sr, upper_freq),
                                                 sr,
                                                 lower_freq)
def is_demucs_available():
    """Raise :class:`ModuleNotFoundError` with install instructions if Demucs is not installed."""
    from importlib.util import find_spec
    demucs_missing = find_spec('demucs') is None
    if demucs_missing:
        raise ModuleNotFoundError("Please install Demucs; "
                                  "'pip install -U demucs' or "
                                  "'pip install -U git+https://github.com/facebookresearch/demucs#egg=demucs'; "
                                  "Official Demucs repo: https://github.com/facebookresearch/demucs")
def load_demucs_model():
    """Load the pretrained ``htdemucs`` Demucs model on CPU in eval mode (requires Demucs)."""
    is_demucs_available()
    from demucs.pretrained import get_model_from_args
    # get_model_from_args only reads `.name` and `.repo`, so a bare attribute
    # container stands in for an argparse namespace here
    model_args = type('args', (object,), dict(name='htdemucs', repo=None))
    return get_model_from_args(model_args).cpu().eval()
def demucs_audio(audio: Union[torch.Tensor, str],
                 input_sr: Optional[int] = None,
                 output_sr: Optional[int] = None,
                 model=None,
                 device=None,
                 verbose: bool = True,
                 track_name: Optional[str] = None,
                 save_path: Optional[str] = None,
                 **demucs_options) -> torch.Tensor:
    """
    Isolates vocals / remove noise from ``audio`` with Demucs.

    Official repo, https://github.com/facebookresearch/demucs.

    Parameters
    ----------
    audio : torch.Tensor or str
        Waveform tensor, or a path/URL/bytes accepted by :func:`load_audio`.
    input_sr : int, optional
        Sample rate of ``audio`` when it is a tensor; required if it differs from
        the model's sample rate.
    output_sr : int, optional
        Sample rate to resample the isolated vocals to before returning.
    model : optional
        A preloaded Demucs model; loaded with :func:`load_demucs_model` when None.
    device : str, optional
        Device for inference; defaults to CUDA when available, else CPU.
    verbose : bool, default True
        Whether to print progress.
    track_name : str, optional
        Name shown in the progress message.
    save_path : str, optional
        If given, the isolated vocals are also saved to this path as a WAV file.
    demucs_options
        Extra keyword arguments forwarded to ``demucs.apply.apply_model``.

    Returns
    -------
    torch.Tensor
        Mono waveform of the isolated vocals.
    """
    if model is None:
        model = load_demucs_model()
    else:
        # a model was supplied; still verify the demucs package is importable
        is_demucs_available()
    from demucs.apply import apply_model
    if track_name:
        track_name = f'"{track_name}"'
    if isinstance(audio, (str, bytes)):
        if isinstance(audio, str) and not track_name:
            track_name = f'"{audio}"'
        # decode/download and resample to the model's expected sample rate
        audio = torch.from_numpy(load_audio(audio, model.samplerate))
    elif input_sr != model.samplerate:
        if input_sr is None:
            raise ValueError('No [input_sr] specified for audio tensor.')
        audio = torchaudio.functional.resample(audio,
                                               orig_freq=input_sr,
                                               new_freq=model.samplerate)
    if not track_name:
        track_name = 'audio track'
    audio_dims = audio.dim()
    # Demucs expects (batch, channels, samples) with 2 channels;
    # mono input is duplicated into a stereo pair along the channel axis.
    if audio_dims == 1:
        audio = audio[None, None].repeat_interleave(2, -2)
    else:
        if audio.shape[-2] == 1:
            audio = audio.repeat_interleave(2, -2)
        if audio_dims < 3:
            audio = audio[None]
    if 'mix' in demucs_options:
        # a caller-provided 'mix' overrides the prepared audio entirely
        audio = demucs_options.pop('mix')
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    vocals_idx = model.sources.index('vocals')
    if verbose:
        print(f'Isolating vocals from {track_name}')
    apply_kwarg = dict(
        model=model,
        mix=audio,
        device=device,
        split=True,
        overlap=.25,
        progress=verbose is not None,
    )
    # caller-supplied options take precedence over the defaults above
    apply_kwarg.update(demucs_options)
    # take the vocals source of the first batch item and down-mix channels to mono
    vocals = apply_model(
        **apply_kwarg
    )[0, vocals_idx].mean(0)
    if device != 'cpu':
        torch.cuda.empty_cache()
    if output_sr is not None and model.samplerate != output_sr:
        vocals = torchaudio.functional.resample(vocals,
                                                orig_freq=model.samplerate,
                                                new_freq=output_sr)
    if save_path is not None:
        if isinstance(save_path, str) and not save_path.lower().endswith('.wav'):
            save_path += '.wav'
        torchaudio.save(save_path, vocals[None], output_sr or model.samplerate)
        print(f'Saved: {save_path}')
    return vocals
def get_samplerate(audiofile: Union[str, bytes]) -> Optional[int]:
    """
    Return the sample rate of the first audio stream reported by FFmpeg.

    Parameters
    ----------
    audiofile : str or bytes
        Path to a media file, or the raw bytes of one (piped to FFmpeg's stdin).

    Returns
    -------
    int or None
        Sample rate in Hz, or None if it could not be determined.
    """
    import re
    if isinstance(audiofile, str):
        # Quote the path so spaces/special characters survive the shell.
        # NOTE(review): still a shell string; a list + shell=False would be safer
        # for untrusted paths but changes failure behavior when ffmpeg is missing.
        metadata = subprocess.run(f'ffmpeg -i "{audiofile}"', capture_output=True, shell=True).stderr.decode()
    else:
        p = subprocess.Popen('ffmpeg -i -', stderr=subprocess.PIPE, stdin=subprocess.PIPE, shell=True)
        try:
            p.stdin.write(audiofile)
        except BrokenPipeError:
            # ffmpeg may stop reading once it has enough header bytes;
            # the metadata is still emitted on stderr
            pass
        finally:
            metadata = p.communicate()[-1]
        if metadata is not None:
            metadata = metadata.decode()
    # FFmpeg prints e.g. "Stream #0:0: Audio: ... 44100 Hz" on stderr
    sr = re.findall(r'\n.+Stream.+Audio.+\D+(\d+) Hz', metadata)
    if sr:
        return int(sr[0])
    return None
def prep_audio(
        audio: Union[str, np.ndarray, torch.Tensor, bytes],
        demucs: Union[bool, torch.nn.Module] = False,
        demucs_options: dict = None,
        only_voice_freq: bool = False,
        only_ffmpeg: bool = False,
        verbose: Optional[bool] = False,
        sr: int = None
) -> torch.Tensor:
    """
    Converts input audio of many types into a mono waveform as a torch.Tensor.

    Parameters
    ----------
    audio : str or numpy.ndarray or torch.Tensor or bytes
        Path/URL to the audio file, the audio waveform, or bytes of audio file.
        If audio is :class:`numpy.ndarray` or :class:`torch.Tensor`, the audio must already be sampled at 16kHz.
    demucs : bool or torch.nn.Module, default False
        Whether to preprocess ``audio`` with Demucs to isolate vocals / remove noise. Set ``demucs`` to an instance of
        a Demucs model to avoid reloading the model for each run.
        Demucs must be installed to use. Official repo, https://github.com/facebookresearch/demucs.
    demucs_options : dict, optional
        Options to use for :func:`stable_whisper.audio.demucs_audio`.
    only_voice_freq : bool, default False
        Whether to only use sound between 200 - 5000 Hz, where the majority of human speech is.
    only_ffmpeg : bool, default False
        Whether to use only FFmpeg (and not yt-dlp) for URLs.
    verbose : bool, default False
        Whether to print yt-dlp log.
    sr : int, default None, meaning ``whisper.audio.SAMPLE_RATE``, 16kHz
        The sample rate of ``audio``.

    Returns
    -------
    torch.Tensor
        A mono waveform.
    """
    if not sr:
        sr = SAMPLE_RATE
    if isinstance(audio, (str, bytes)):
        # file path / URL / raw file bytes: decoding happens inside demucs_audio or load_audio
        if demucs:
            demucs_kwargs = dict(
                audio=audio,
                output_sr=sr,
                verbose=verbose,
            )
            if isinstance(demucs, torch.nn.Module):
                # fix: forward the preloaded model so it is not reloaded each run,
                # as promised in the docstring
                demucs_kwargs['model'] = demucs
            demucs_kwargs.update(demucs_options or {})
            audio = demucs_audio(**demucs_kwargs)
        else:
            audio = torch.from_numpy(load_audio(audio, sr=sr, verbose=verbose, only_ffmpeg=only_ffmpeg))
    else:
        # already a waveform; only convert the container type and optionally denoise
        if isinstance(audio, np.ndarray):
            audio = torch.from_numpy(audio)
        if demucs:
            demucs_kwargs = dict(
                audio=audio,
                input_sr=sr,
                output_sr=sr,
                verbose=verbose,
            )
            if isinstance(demucs, torch.nn.Module):
                demucs_kwargs['model'] = demucs
            demucs_kwargs.update(demucs_options or {})
            audio = demucs_audio(**demucs_kwargs)
    if only_voice_freq:
        audio = voice_freq_filter(audio, sr)
    return audio