import re
import shutil
import subprocess
import warnings
from typing import Union, Optional

import ffmpeg
import numpy as np
import torch
import torchaudio

from whisper.audio import SAMPLE_RATE

# Extracts the sample rate from an audio "Stream" line of FFmpeg's stderr banner.
_AUDIO_SAMPLE_RATE_PATTERN = re.compile(r'\n.+Stream.+Audio.+\D+(\d+) Hz')


def is_ytdlp_available() -> bool:
    """
    Check whether the ``yt-dlp`` executable can be found and run.

    Returns
    -------
    bool
        True if ``yt-dlp -h`` exits with return code 0, otherwise False.
    """
    executable = shutil.which('yt-dlp')
    if executable is None:
        return False
    try:
        return subprocess.run([executable, '-h'], capture_output=True).returncode == 0
    except OSError:
        return False


def _load_file(file: Union[str, bytes], verbose: Optional[bool] = False, only_ffmpeg: bool = False):
    """
    Download ``file`` with yt-dlp if it is a URL, otherwise return it unchanged.

    Parameters
    ----------
    file : str or bytes
        Path/URL of the media, or the bytes of the media file.
    verbose : bool or None, default False
        Controls yt-dlp output: None passes ``-q``, True passes ``--progress``,
        and False passes ``--progress -q``.
    only_ffmpeg : bool, default False
        Whether to skip yt-dlp and return the URL unchanged, leaving it to FFmpeg.

    Returns
    -------
    str or bytes
        The downloaded media as bytes when yt-dlp was used, otherwise ``file``.

    Raises
    ------
    RuntimeError
        If yt-dlp exits with a non-zero return code or produces no output.
    """
    is_url = isinstance(file, str) and '://' in file
    if not is_url or only_ffmpeg:
        return file

    if not is_ytdlp_available():
        warnings.warn('URL detected but yt-dlp not available. '
                      'To handle a greater variety of URLs (i.e. non-direct links), '
                      'install yt-dlp, \'pip install yt-dlp\' (repo: https://github.com/yt-dlp/yt-dlp).')
        return file

    if verbose is None:
        verbosity = ['-q']
    elif verbose:
        verbosity = ['--progress']
    else:
        verbosity = ['--progress', '-q']

    executable = shutil.which('yt-dlp') or 'yt-dlp'
    # An argument list (no shell) keeps a crafted URL from injecting shell commands;
    # '--' stops yt-dlp from parsing a URL that begins with '-' as an option.
    p = subprocess.run(
        [executable, '-f', 'ba/w', '-I', '1', *verbosity, '-o', '-', '--', file],
        stdout=subprocess.PIPE
    )
    if p.returncode != 0 or len(p.stdout) == 0:
        raise RuntimeError(f'Failed to download media from "{file}" with yt-dlp')
    return p.stdout


# modified version of whisper.audio.load_audio
def load_audio(file: Union[str, bytes], sr: int = SAMPLE_RATE, verbose: bool = True, only_ffmpeg: bool = False):
    """
    Open an audio file and read as mono waveform then resamples as necessary.

    Parameters
    ----------
    file : str or bytes
        The audio file to open, bytes of file, or URL to audio/video.
    sr : int, default ``whisper.audio.SAMPLE_RATE``
        The sample rate to resample the audio if necessary.
    verbose : bool, default True
        Whether to print yt-dlp log.
    only_ffmpeg : bool, default False
        Whether to use only FFmpeg (instead of yt-dlp) for URLs.

    Returns
    -------
    numpy.ndarray
        An array containing the audio waveform in float32.

    Raises
    ------
    RuntimeError
        If FFmpeg fails to decode the audio, or yt-dlp fails to download it.
    """
    file = _load_file(file, verbose=verbose, only_ffmpeg=only_ffmpeg)
    if isinstance(file, bytes):
        # Raw media bytes are fed to FFmpeg through its stdin.
        inp, file = file, 'pipe:'
    else:
        inp = None
    try:
        # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
        # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
        out, _ = (
            ffmpeg.input(file, threads=0)
            .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
            .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True, input=inp)
        )
    except ffmpeg.Error as e:
        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e

    # Scale signed 16-bit PCM samples to float32.
    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0


def voice_freq_filter(wf: Union[torch.Tensor, np.ndarray], sr: int,
                      upper_freq: Optional[int] = None,
                      lower_freq: Optional[int] = None) -> torch.Tensor:
    """
    Apply a low-pass then a high-pass biquad filter to keep ``lower_freq`` - ``upper_freq``.

    Parameters
    ----------
    wf : torch.Tensor or numpy.ndarray
        The audio waveform.
    sr : int
        The sample rate of ``wf``.
    upper_freq : int, optional
        Cutoff of the low-pass filter. Defaults to 5000 when None.
    lower_freq : int, optional
        Cutoff of the high-pass filter. Defaults to 200 when None.

    Returns
    -------
    torch.Tensor
        The filtered waveform.
    """
    if isinstance(wf, np.ndarray):
        wf = torch.from_numpy(wf)
    if upper_freq is None:
        upper_freq = 5000
    if lower_freq is None:
        lower_freq = 200
    assert upper_freq > lower_freq, f'upper_freq {upper_freq} must but greater than lower_freq {lower_freq}'
    low_passed = torchaudio.functional.lowpass_biquad(wf, sr, upper_freq)
    return torchaudio.functional.highpass_biquad(low_passed, sr, lower_freq)


def is_demucs_available():
    """
    Ensure the ``demucs`` package can be imported.

    Raises
    ------
    ModuleNotFoundError
        If no module spec is found for ``demucs``.
    """
    from importlib.util import find_spec
    if find_spec('demucs') is None:
        raise ModuleNotFoundError("Please install Demucs; "
                                  "'pip install -U demucs' or "
                                  "'pip install -U git+https://github.com/facebookresearch/demucs#egg=demucs'; "
                                  "Official Demucs repo: https://github.com/facebookresearch/demucs")


def load_demucs_model():
    """
    Load the ``htdemucs`` Demucs model, moved to CPU and set to eval mode.

    Raises
    ------
    ModuleNotFoundError
        If Demucs is not installed.
    """
    is_demucs_available()
    from demucs.pretrained import get_model_from_args
    # get_model_from_args only needs an object exposing ``name`` and ``repo`` attributes.
    return get_model_from_args(type('args', (object,), dict(name='htdemucs', repo=None))).cpu().eval()


def demucs_audio(audio: Union[torch.Tensor, str, bytes],
                 input_sr: Optional[int] = None,
                 output_sr: Optional[int] = None,
                 model=None,
                 device=None,
                 verbose: Optional[bool] = True,
                 track_name: Optional[str] = None,
                 save_path: Optional[str] = None,
                 only_ffmpeg: bool = False,
                 **demucs_options) -> torch.Tensor:
    """
    Isolates vocals / remove noise from ``audio`` with Demucs.

    Official repo, https://github.com/facebookresearch/demucs.

    Parameters
    ----------
    audio : torch.Tensor or str or bytes
        The audio waveform, path/URL to the audio file, or bytes of audio file.
    input_sr : int, optional
        The sample rate of ``audio``. Required when ``audio`` is a tensor whose sample rate
        differs from the sample rate of ``model``.
    output_sr : int, optional
        The sample rate to resample the isolated vocals to. No resampling when None.
    model : optional
        A loaded Demucs model. The default model is loaded when None.
    device : optional
        The device to run Demucs on. Defaults to "cuda" if available, otherwise "cpu".
    verbose : bool or None, default True
        Whether to print the track being processed. Demucs progress is shown unless None.
    track_name : str, optional
        Name of the track shown in the printed message.
    save_path : str, optional
        Path to save the isolated vocals to as a WAV file; ``.wav`` is appended if missing.
    only_ffmpeg : bool, default False
        Whether to use only FFmpeg (and not yt-dlp) when ``audio`` is a URL.
    demucs_options
        Additional options passed to ``demucs.apply.apply_model``. If ``mix`` is specified,
        it replaces the prepared audio.

    Returns
    -------
    torch.Tensor
        The vocals track averaged over its channels.

    Raises
    ------
    ValueError
        If ``audio`` is a tensor, ``input_sr`` is None, and the model has a sample rate.
    ModuleNotFoundError
        If Demucs is not installed.
    """
    if model is None:
        model = load_demucs_model()
    else:
        is_demucs_available()
    from demucs.apply import apply_model

    if track_name:
        track_name = f'"{track_name}"'

    if isinstance(audio, (str, bytes)):
        if isinstance(audio, str) and not track_name:
            track_name = f'"{audio}"'
        audio = torch.from_numpy(
            load_audio(audio, model.samplerate, verbose=verbose, only_ffmpeg=only_ffmpeg)
        )
    elif input_sr != model.samplerate:
        if input_sr is None:
            raise ValueError('No [input_sr] specified for audio tensor.')
        audio = torchaudio.functional.resample(audio,
                                               orig_freq=input_sr,
                                               new_freq=model.samplerate)
    if not track_name:
        track_name = 'audio track'

    # Expand to a 3-D tensor, duplicating a single channel into two.
    audio_dims = audio.dim()
    if audio_dims == 1:
        audio = audio[None, None].repeat_interleave(2, -2)
    else:
        if audio.shape[-2] == 1:
            audio = audio.repeat_interleave(2, -2)
        if audio_dims < 3:
            audio = audio[None]

    if 'mix' in demucs_options:
        audio = demucs_options.pop('mix')

    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    vocals_idx = model.sources.index('vocals')
    if verbose:
        print(f'Isolating vocals from {track_name}')
    apply_kwarg = dict(
        model=model,
        mix=audio,
        device=device,
        split=True,
        overlap=.25,
        progress=verbose is not None,
    )
    apply_kwarg.update(demucs_options)
    # Take the vocals source of the first batch item and average its channels.
    vocals = apply_model(**apply_kwarg)[0, vocals_idx].mean(0)

    if device != 'cpu':
        torch.cuda.empty_cache()

    if output_sr is not None and model.samplerate != output_sr:
        vocals = torchaudio.functional.resample(vocals,
                                                orig_freq=model.samplerate,
                                                new_freq=output_sr)

    if save_path is not None:
        if isinstance(save_path, str) and not save_path.lower().endswith('.wav'):
            save_path += '.wav'
        torchaudio.save(save_path, vocals[None], output_sr or model.samplerate)
        print(f'Saved: {save_path}')

    return vocals


def get_samplerate(audiofile: Union[str, bytes]) -> Optional[int]:
    """
    Read the sample rate of the first audio stream reported by FFmpeg.

    Parameters
    ----------
    audiofile : str or bytes
        Path/URL to the audio file, or bytes of audio file.

    Returns
    -------
    int or None
        The sample rate in Hz, or None if FFmpeg could not be run or reported no audio stream.
    """
    # Running ``ffmpeg -i`` without an output makes FFmpeg print the stream metadata to stderr.
    # An argument list (no shell) keeps paths with spaces or shell metacharacters intact.
    try:
        if isinstance(audiofile, str):
            result = subprocess.run(['ffmpeg', '-i', audiofile], capture_output=True)
        else:
            result = subprocess.run(['ffmpeg', '-i', '-'], input=audiofile, capture_output=True)
    except OSError:
        # FFmpeg could not be launched, so no metadata is available.
        return None

    metadata = result.stderr.decode(errors='replace')
    found = _AUDIO_SAMPLE_RATE_PATTERN.search(metadata)
    if found is None:
        return None
    return int(found.group(1))


def prep_audio(
        audio: Union[str, np.ndarray, torch.Tensor, bytes],
        demucs: Union[bool, torch.nn.Module] = False,
        demucs_options: dict = None,
        only_voice_freq: bool = False,
        only_ffmpeg: bool = False,
        verbose: Optional[bool] = False,
        sr: int = None
) -> torch.Tensor:
    """
    Converts input audio of many types into a mono waveform as a torch.Tensor.

    Parameters
    ----------
    audio : str or numpy.ndarray or torch.Tensor or bytes
        Path/URL to the audio file, the audio waveform, or bytes of audio file.
        If audio is :class:`numpy.ndarray` or :class:`torch.Tensor`, the audio must already be sampled at ``sr``.
    demucs : bool or torch.nn.Module, default False
        Whether to preprocess ``audio`` with Demucs to isolate vocals / remove noise. Set ``demucs`` to an instance of
        a Demucs model to avoid reloading the model for each run.
        Demucs must be installed to use. Official repo, https://github.com/facebookresearch/demucs.
    demucs_options : dict, optional
        Options to use for :func:`stable_whisper.audio.demucs_audio`. These take precedence over the values
        derived from the other parameters.
    only_voice_freq : bool, default False
        Whether to only use sound between 200 - 5000 Hz, where the majority of human speech is.
    only_ffmpeg : bool, default False
        Whether to use only FFmpeg (and not yt-dlp) for URLs.
    verbose : bool, default False
        Whether to print yt-dlp log.
    sr : int, default None, meaning ``whisper.audio.SAMPLE_RATE``, 16kHz
        The sample rate of ``audio``.

    Returns
    -------
    torch.Tensor
        A mono waveform.
    """
    if not sr:
        sr = SAMPLE_RATE

    needs_loading = isinstance(audio, (str, bytes))
    if isinstance(audio, np.ndarray):
        audio = torch.from_numpy(audio)

    if demucs:
        demucs_kwargs = dict(
            audio=audio,
            output_sr=sr,
            verbose=verbose,
        )
        if needs_loading:
            demucs_kwargs['only_ffmpeg'] = only_ffmpeg
        else:
            demucs_kwargs['input_sr'] = sr
        if isinstance(demucs, torch.nn.Module):
            # Reuse the supplied model instead of loading the default one on every call.
            demucs_kwargs['model'] = demucs
        demucs_kwargs.update(demucs_options or {})
        audio = demucs_audio(**demucs_kwargs)
    elif needs_loading:
        audio = torch.from_numpy(load_audio(audio, sr=sr, verbose=verbose, only_ffmpeg=only_ffmpeg))

    if only_voice_freq:
        audio = voice_freq_filter(audio, sr)

    return audio