import warnings

import numpy as np
import resampy
import torch
import tqdm

import crepe


__all__ = ['CENTS_PER_BIN',
           'MAX_FMAX',
           'PITCH_BINS',
           'SAMPLE_RATE',
           'WINDOW_SIZE',
           'UNVOICED',
           'embed',
           'embed_from_file',
           'embed_from_file_to_file',
           'embed_from_files_to_files',
           'infer',
           'predict',
           'predict_from_file',
           'predict_from_file_to_file',
           'predict_from_files_to_files',
           'preprocess',
           'postprocess',
           'resample']


###############################################################################
# Constants
###############################################################################


CENTS_PER_BIN = 20  # cents
MAX_FMAX = 2006.  # hz
PITCH_BINS = 360
SAMPLE_RATE = 16000  # hz
WINDOW_SIZE = 1024  # samples
UNVOICED = np.nan


###############################################################################
# Crepe pitch prediction
###############################################################################


def predict(audio,
            sample_rate,
            hop_length=None,
            fmin=50.,
            fmax=MAX_FMAX,
            model='full',
            decoder=crepe.decode.viterbi,
            return_harmonicity=False,
            return_periodicity=False,
            batch_size=None,
            device='cpu',
            pad=True):
    """Performs pitch estimation

    Arguments
        audio (torch.tensor [shape=(1, time)])
            The audio signal
        sample_rate (int)
            The sampling rate in Hz
        hop_length (int)
            The hop_length in samples
        fmin (float)
            The minimum allowable frequency in Hz
        fmax (float)
            The maximum allowable frequency in Hz
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        decoder (function)
            The decoder to use. See decode.py for decoders.
        return_harmonicity (bool) [DEPRECATED]
            Whether to also return the network confidence
        return_periodicity (bool)
            Whether to also return the network confidence
        batch_size (int)
            The number of frames per batch
        device (string)
            The device used to run inference
        pad (bool)
            Whether to zero-pad the audio

    Returns
        pitch (torch.tensor [shape=(1, 1 + int(time // hop_length))])
        (Optional) periodicity (torch.tensor
                                [shape=(1, 1 + int(time // hop_length))])
    """
    # Deprecate return_harmonicity
    if return_harmonicity:
        message = (
            'The crepe return_harmonicity argument is deprecated and '
            'will be removed in a future release. Please use '
            'return_periodicity. Rationale: if network confidence measured '
            'harmonics, the value would be low for non-harmonic, periodic '
            'sounds (e.g., sine waves). But this is not observed.')
        warnings.warn(message, DeprecationWarning)
        return_periodicity = return_harmonicity

    results = []

    # Postprocessing breaks gradients, so just don't compute them
    with torch.no_grad():

        # Preprocess audio
        generator = preprocess(audio,
                               sample_rate,
                               hop_length,
                               batch_size,
                               device,
                               pad)
        for frames in generator:

            # Infer independent probabilities for each pitch bin
            probabilities = infer(frames, model)

            # shape=(batch, 360, time / hop_length)
            probabilities = probabilities.reshape(
                audio.size(0), -1, PITCH_BINS).transpose(1, 2)

            # Convert probabilities to F0 and periodicity
            result = postprocess(probabilities,
                                 fmin,
                                 fmax,
                                 decoder,
                                 return_harmonicity,
                                 return_periodicity)

            # Place on same device as audio to allow very long inputs
            if isinstance(result, tuple):
                result = (result[0].to(audio.device),
                          result[1].to(audio.device))
            else:
                result = result.to(audio.device)

            results.append(result)

    # Split pitch and periodicity
    if return_periodicity:
        pitch, periodicity = zip(*results)
        return torch.cat(pitch, 1), torch.cat(periodicity, 1)

    # Concatenate
    return torch.cat(results, 1)
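

# A minimal usage sketch for `predict`, left as a comment so the module stays
# importable. The 'audio.wav' path and the 550 Hz fmax are hypothetical; this
# assumes the package is importable as `crepe`.
#
#     audio, sample_rate = crepe.load.audio('audio.wav')
#     pitch, periodicity = crepe.predict(audio,
#                                        sample_rate,
#                                        hop_length=sample_rate // 100,
#                                        fmin=50.,
#                                        fmax=550.,
#                                        return_periodicity=True)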


def predict_from_file(audio_file,
                      hop_length=None,
                      fmin=50.,
                      fmax=MAX_FMAX,
                      model='full',
                      decoder=crepe.decode.viterbi,
                      return_harmonicity=False,
                      return_periodicity=False,
                      batch_size=None,
                      device='cpu',
                      pad=True):
    """Performs pitch estimation from file on disk

    Arguments
        audio_file (string)
            The file to perform pitch tracking on
        hop_length (int)
            The hop_length in samples
        fmin (float)
            The minimum allowable frequency in Hz
        fmax (float)
            The maximum allowable frequency in Hz
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        decoder (function)
            The decoder to use. See decode.py for decoders.
        return_harmonicity (bool) [DEPRECATED]
            Whether to also return the network confidence
        return_periodicity (bool)
            Whether to also return the network confidence
        batch_size (int)
            The number of frames per batch
        device (string)
            The device used to run inference
        pad (bool)
            Whether to zero-pad the audio

    Returns
        pitch (torch.tensor [shape=(1, 1 + int(time // hop_length))])
        (Optional) periodicity (torch.tensor
                                [shape=(1, 1 + int(time // hop_length))])
    """
    # Load audio
    audio, sample_rate = crepe.load.audio(audio_file)

    # Predict
    return predict(audio,
                   sample_rate,
                   hop_length,
                   fmin,
                   fmax,
                   model,
                   decoder,
                   return_harmonicity,
                   return_periodicity,
                   batch_size,
                   device,
                   pad)


def predict_from_file_to_file(audio_file,
                              output_pitch_file,
                              output_harmonicity_file=None,
                              output_periodicity_file=None,
                              hop_length=None,
                              fmin=50.,
                              fmax=MAX_FMAX,
                              model='full',
                              decoder=crepe.decode.viterbi,
                              batch_size=None,
                              device='cpu',
                              pad=True):
    """Performs pitch estimation from file on disk

    Arguments
        audio_file (string)
            The file to perform pitch tracking on
        output_pitch_file (string)
            The file to save predicted pitch
        output_harmonicity_file (string or None) [DEPRECATED]
            The file to save predicted harmonicity
        output_periodicity_file (string or None)
            The file to save predicted periodicity
        hop_length (int)
            The hop_length in samples
        fmin (float)
            The minimum allowable frequency in Hz
        fmax (float)
            The maximum allowable frequency in Hz
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        decoder (function)
            The decoder to use. See decode.py for decoders.
        batch_size (int)
            The number of frames per batch
        device (string)
            The device used to run inference
        pad (bool)
            Whether to zero-pad the audio
    """
    # Deprecate output_harmonicity_file
    if output_harmonicity_file is not None:
        message = (
            'The crepe output_harmonicity_file argument is deprecated and '
            'will be removed in a future release. Please use '
            'output_periodicity_file. Rationale: if network confidence measured '
            'harmonic content, the value would be low for non-harmonic, periodic '
            'sounds (e.g., sine waves). But this is not observed.')
        warnings.warn(message, DeprecationWarning)
        output_periodicity_file = output_harmonicity_file

    # Predict from file
    prediction = predict_from_file(audio_file,
                                   hop_length,
                                   fmin,
                                   fmax,
                                   model,
                                   decoder,
                                   False,
                                   output_periodicity_file is not None,
                                   batch_size,
                                   device,
                                   pad)

    # Save to disk
    if output_periodicity_file is not None:
        torch.save(prediction[0].detach(), output_pitch_file)
        torch.save(prediction[1].detach(), output_periodicity_file)
    else:
        torch.save(prediction.detach(), output_pitch_file)
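

# A minimal usage sketch for `predict_from_file_to_file`, left as a comment.
# The file paths are hypothetical; outputs are written with torch.save, so
# they can be read back with torch.load.
#
#     crepe.predict_from_file_to_file('audio.wav',
#                                     'pitch.pt',
#                                     output_periodicity_file='periodicity.pt')
#     pitch = torch.load('pitch.pt')
#     periodicity = torch.load('periodicity.pt')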


def predict_from_files_to_files(audio_files,
                                output_pitch_files,
                                output_harmonicity_files=None,
                                output_periodicity_files=None,
                                hop_length=None,
                                fmin=50.,
                                fmax=MAX_FMAX,
                                model='full',
                                decoder=crepe.decode.viterbi,
                                batch_size=None,
                                device='cpu',
                                pad=True):
    """Performs pitch estimation from files on disk without reloading model

    Arguments
        audio_files (list[string])
            The files to perform pitch tracking on
        output_pitch_files (list[string])
            The files to save predicted pitch
        output_harmonicity_files (list[string] or None) [DEPRECATED]
            The files to save predicted harmonicity
        output_periodicity_files (list[string] or None)
            The files to save predicted periodicity
        hop_length (int)
            The hop_length in samples
        fmin (float)
            The minimum allowable frequency in Hz
        fmax (float)
            The maximum allowable frequency in Hz
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        decoder (function)
            The decoder to use. See decode.py for decoders.
        batch_size (int)
            The number of frames per batch
        device (string)
            The device used to run inference
        pad (bool)
            Whether to zero-pad the audio
    """
    # Deprecate output_harmonicity_files
    if output_harmonicity_files is not None:
        message = (
            'The crepe output_harmonicity_files argument is deprecated and '
            'will be removed in a future release. Please use '
            'output_periodicity_files. Rationale: if network confidence measured '
            'harmonic content, the value would be low for non-harmonic, periodic '
            'sounds (e.g., sine waves). But this is not observed.')
        warnings.warn(message, DeprecationWarning)
        output_periodicity_files = output_harmonicity_files

    if output_periodicity_files is None:
        output_periodicity_files = len(audio_files) * [None]

    # Setup iterator
    iterator = zip(audio_files, output_pitch_files, output_periodicity_files)
    iterator = tqdm.tqdm(iterator, desc='crepe', dynamic_ncols=True)
    for audio_file, output_pitch_file, output_periodicity_file in iterator:

        # Predict a file
        predict_from_file_to_file(audio_file,
                                  output_pitch_file,
                                  None,
                                  output_periodicity_file,
                                  hop_length,
                                  fmin,
                                  fmax,
                                  model,
                                  decoder,
                                  batch_size,
                                  device,
                                  pad)
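

# A minimal usage sketch for `predict_from_files_to_files`, left as a comment.
# The file lists are hypothetical; the model is loaded once and reused across
# files.
#
#     crepe.predict_from_files_to_files(
#         ['a.wav', 'b.wav'],
#         ['a-pitch.pt', 'b-pitch.pt'],
#         output_periodicity_files=['a-periodicity.pt', 'b-periodicity.pt'])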


###############################################################################
# Crepe pitch embedding
###############################################################################


def embed(audio,
          sample_rate,
          hop_length=None,
          model='full',
          batch_size=None,
          device='cpu',
          pad=True):
    """Embeds audio to the output of CREPE's fifth maxpool layer

    Arguments
        audio (torch.tensor [shape=(1, time)])
            The audio signals
        sample_rate (int)
            The sampling rate in Hz
        hop_length (int)
            The hop_length in samples
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        batch_size (int)
            The number of frames per batch
        device (string)
            The device to run inference on
        pad (bool)
            Whether to zero-pad the audio

    Returns
        embedding (torch.tensor [shape=(1,
                                        1 + int(time // hop_length), 32, -1)])
    """
    results = []

    # Preprocess audio
    generator = preprocess(audio,
                           sample_rate,
                           hop_length,
                           batch_size,
                           device,
                           pad)
    for frames in generator:

        # Infer pitch embeddings
        embedding = infer(frames, model, embed=True)

        # shape=(batch, time / hop_length, 32, embedding_size)
        result = embedding.reshape(audio.size(0), frames.size(0), 32, -1)

        # Place on same device as audio. This allows for large inputs.
        results.append(result.to(audio.device))

    # Concatenate
    return torch.cat(results, 1)
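

# A minimal usage sketch for `embed`, left as a comment. The 'audio.wav' path
# is hypothetical.
#
#     audio, sample_rate = crepe.load.audio('audio.wav')
#     embedding = crepe.embed(audio, sample_rate, model='tiny')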


def embed_from_file(audio_file,
                    hop_length=None,
                    model='full',
                    batch_size=None,
                    device='cpu',
                    pad=True):
    """Embeds audio from disk to the output of CREPE's fifth maxpool layer

    Arguments
        audio_file (string)
            The wav file containing the audio to embed
        hop_length (int)
            The hop_length in samples
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        batch_size (int)
            The number of frames per batch
        device (string)
            The device to run inference on
        pad (bool)
            Whether to zero-pad the audio

    Returns
        embedding (torch.tensor [shape=(1,
                                        1 + int(time // hop_length), 32, -1)])
    """
    # Load audio
    audio, sample_rate = crepe.load.audio(audio_file)

    # Embed
    return embed(audio,
                 sample_rate,
                 hop_length,
                 model,
                 batch_size,
                 device,
                 pad)


def embed_from_file_to_file(audio_file,
                            output_file,
                            hop_length=None,
                            model='full',
                            batch_size=None,
                            device='cpu',
                            pad=True):
    """Embeds audio from disk and saves to disk

    Arguments
        audio_file (string)
            The wav file containing the audio to embed
        output_file (string)
            The file to save the embedding
        hop_length (int)
            The hop_length in samples
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        batch_size (int)
            The number of frames per batch
        device (string)
            The device to run inference on
        pad (bool)
            Whether to zero-pad the audio
    """
    # No use computing gradients if we're just saving to file
    with torch.no_grad():

        # Embed
        embedding = embed_from_file(audio_file,
                                    hop_length,
                                    model,
                                    batch_size,
                                    device,
                                    pad)

    # Save to disk
    torch.save(embedding.detach(), output_file)


def embed_from_files_to_files(audio_files,
                              output_files,
                              hop_length=None,
                              model='full',
                              batch_size=None,
                              device='cpu',
                              pad=True):
    """Embeds audio from disk and saves to disk without reloading model

    Arguments
        audio_files (list[string])
            The wav files containing the audio to embed
        output_files (list[string])
            The files to save the embeddings
        hop_length (int)
            The hop_length in samples
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        batch_size (int)
            The number of frames per batch
        device (string)
            The device to run inference on
        pad (bool)
            Whether to zero-pad the audio
    """
    # Setup iterator
    iterator = zip(audio_files, output_files)
    iterator = tqdm.tqdm(iterator, desc='crepe', dynamic_ncols=True)
    for audio_file, output_file in iterator:

        # Embed a file
        embed_from_file_to_file(audio_file,
                                output_file,
                                hop_length,
                                model,
                                batch_size,
                                device,
                                pad)
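

# A minimal usage sketch for `embed_from_files_to_files`, left as a comment.
# The file lists and the 'cuda:0' device are hypothetical (assumes a GPU is
# available).
#
#     crepe.embed_from_files_to_files(['a.wav', 'b.wav'],
#                                     ['a-embed.pt', 'b-embed.pt'],
#                                     device='cuda:0')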


###############################################################################
# Components for step-by-step prediction
###############################################################################


def infer(frames, model='full', embed=False):
    """Forward pass through the model

    Arguments
        frames (torch.tensor [shape=(time / hop_length, 1024)])
            The network input
        model (string)
            The model capacity. One of 'full' or 'tiny'.
        embed (bool)
            Whether to stop inference at the intermediate embedding layer

    Returns
        logits (torch.tensor [shape=(1 + int(time // hop_length), 360)]) OR
        embedding (torch.tensor [shape=(1 + int(time // hop_length),
                                        embedding_size)])
    """
    # Load the model if necessary
    if not hasattr(infer, 'model') or not hasattr(infer, 'capacity') or \
            (hasattr(infer, 'capacity') and infer.capacity != model):
        crepe.load.model(frames.device, model)

    # Move model to correct device (no-op if devices are the same)
    infer.model = infer.model.to(frames.device)

    # Apply model
    return infer.model(frames, embed=embed)


def postprocess(probabilities,
                fmin=0.,
                fmax=MAX_FMAX,
                decoder=crepe.decode.viterbi,
                return_harmonicity=False,
                return_periodicity=False):
    """Convert model output to F0 and periodicity

    Arguments
        probabilities (torch.tensor [shape=(1, 360, time / hop_length)])
            The probabilities for each pitch bin inferred by the network
        fmin (float)
            The minimum allowable frequency in Hz
        fmax (float)
            The maximum allowable frequency in Hz
        decoder (function)
            The decoder to use. See decode.py for decoders.
        return_harmonicity (bool) [DEPRECATED]
            Whether to also return the network confidence
        return_periodicity (bool)
            Whether to also return the network confidence

    Returns
        pitch (torch.tensor [shape=(1, 1 + int(time // hop_length))])
        periodicity (torch.tensor [shape=(1, 1 + int(time // hop_length))])
    """
    # Sampling is non-differentiable, so remove from graph
    probabilities = probabilities.detach()

    # Convert frequency range to pitch bin range
    minidx = crepe.convert.frequency_to_bins(torch.tensor(fmin))
    maxidx = crepe.convert.frequency_to_bins(torch.tensor(fmax),
                                             torch.ceil)

    # Remove frequencies outside of allowable range
    probabilities[:, :minidx] = -float('inf')
    probabilities[:, maxidx:] = -float('inf')

    # Perform argmax or viterbi sampling
    bins, pitch = decoder(probabilities)

    # Deprecate return_harmonicity
    if return_harmonicity:
        message = (
            'The crepe return_harmonicity argument is deprecated and '
            'will be removed in a future release. Please use '
            'return_periodicity. Rationale: if network confidence measured '
            'harmonics, the value would be low for non-harmonic, periodic '
            'sounds (e.g., sine waves). But this is not observed.')
        warnings.warn(message, DeprecationWarning)
        return_periodicity = return_harmonicity

    if not return_periodicity:
        return pitch

    # Compute periodicity from probabilities and decoded pitch bins
    return pitch, periodicity(probabilities, bins)


def preprocess(audio,
               sample_rate,
               hop_length=None,
               batch_size=None,
               device='cpu',
               pad=True):
    """Convert audio to model input

    Arguments
        audio (torch.tensor [shape=(1, time)])
            The audio signals
        sample_rate (int)
            The sampling rate in Hz
        hop_length (int)
            The hop_length in samples
        batch_size (int)
            The number of frames per batch
        device (string)
            The device to run inference on
        pad (bool)
            Whether to zero-pad the audio

    Returns
        frames (torch.tensor [shape=(1 + int(time // hop_length), 1024)])
    """
    # Default hop length of 10 ms
    hop_length = sample_rate // 100 if hop_length is None else hop_length

    # Resample
    if sample_rate != SAMPLE_RATE:
        audio = resample(audio, sample_rate)
        hop_length = int(hop_length * SAMPLE_RATE / sample_rate)

    # Get total number of frames; maybe pad
    if pad:
        total_frames = 1 + int(audio.size(1) // hop_length)
        audio = torch.nn.functional.pad(
            audio,
            (WINDOW_SIZE // 2, WINDOW_SIZE // 2))
    else:
        total_frames = 1 + int((audio.size(1) - WINDOW_SIZE) // hop_length)

    # Default to running all frames in a single batch
    batch_size = total_frames if batch_size is None else batch_size

    # Generate batches
    for i in range(0, total_frames, batch_size):

        # Batch indices
        start = max(0, i * hop_length)
        end = min(audio.size(1),
                  (i + batch_size - 1) * hop_length + WINDOW_SIZE)

        # Chunk
        frames = torch.nn.functional.unfold(
            audio[:, None, None, start:end],
            kernel_size=(1, WINDOW_SIZE),
            stride=(1, hop_length))

        # shape=(1 + int(time // hop_length), 1024)
        frames = frames.transpose(1, 2).reshape(-1, WINDOW_SIZE)

        # Place on device
        frames = frames.to(device)

        # Mean-center
        frames -= frames.mean(dim=1, keepdim=True)

        # Scale
        # Note: during silent frames, this produces very large values. But
        # this seems to be what the network expects.
        frames /= torch.max(torch.tensor(1e-10, device=frames.device),
                            frames.std(dim=1, keepdim=True))

        yield frames
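

# A minimal sketch of step-by-step prediction using preprocess, infer, and
# postprocess directly, left as a comment. It mirrors the loop in `predict`;
# `audio` is assumed to be a shape=(1, time) tensor sampled at `sample_rate`.
#
#     pitch = []
#     with torch.no_grad():
#         for frames in crepe.preprocess(audio, sample_rate):
#             probabilities = crepe.infer(frames, model='full')
#             probabilities = probabilities.reshape(
#                 audio.size(0), -1, crepe.PITCH_BINS).transpose(1, 2)
#             pitch.append(crepe.postprocess(probabilities))
#     pitch = torch.cat(pitch, 1)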


###############################################################################
# Utilities
###############################################################################


def periodicity(probabilities, bins):
    """Computes the periodicity from the network output and pitch bins"""
    # shape=(batch * time / hop_length, 360)
    probs_stacked = probabilities.transpose(1, 2).reshape(-1, PITCH_BINS)

    # shape=(batch * time / hop_length, 1)
    bins_stacked = bins.reshape(-1, 1).to(torch.int64)

    # Use maximum logit over pitch bins as periodicity
    periodicity = probs_stacked.gather(1, bins_stacked)

    # shape=(batch, time / hop_length)
    return periodicity.reshape(probabilities.size(0), probabilities.size(2))


def resample(audio, sample_rate):
    """Resample audio"""
    # Store device for later placement
    device = audio.device

    # Convert to numpy
    audio = audio.detach().cpu().numpy().squeeze(0)

    # Resample
    # We have to use resampy if we want numbers to match Crepe
    audio = resampy.resample(audio, sample_rate, SAMPLE_RATE)

    # Convert to pytorch
    return torch.tensor(audio, device=device).unsqueeze(0)