# Third Party Imports
import torch
import onnxruntime as ort

# Local Imports
from vocal_isolation.models.mdx_net import Conv_TDF_net_trimm

# Global Variables
from vocal_isolation.constants import EXECUTION_PROVIDER_LIST, COMPUTATION_DEVICE
from vocal_isolation.constants import ONNX_MODEL_PATH, PRETRAINED_MODELS_DIRECTORY
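
# Note: the exact values live in vocal_isolation.constants; EXECUTION_PROVIDER_LIST
# is assumed to be an onnxruntime provider list such as
# ["CUDAExecutionProvider", "CPUExecutionProvider"], with COMPUTATION_DEVICE the
# matching torch device ("cuda" or "cpu").
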
class KimVocal:
    def __init__(self):
        self.models = [
            Conv_TDF_net_trimm(
                ONNX_MODEL_PATH, use_onnx=True, target_name='vocals',
                L=11, l=3, g=48, bn=8, bias=False,
                dim_f=11, dim_t=8
            )
        ]

    def demix_both(self, music_tensor, sample_rate):
        """
        Isolates vocals AND instrumental from a mix using an ONNX model.
        Assumes the audio was loaded correctly at a 44100 Hz sample rate.

        Args:
            music_tensor (torch.Tensor): Input mix tensor of shape (2, num_samples).
            sample_rate (int): Sample rate of the input audio.

        Returns:
            tuple[torch.Tensor, torch.Tensor]: The instrumental (mix minus vocals)
            and the isolated vocals.
        """
        number_of_samples = music_tensor.shape[1]

        # * Extracting vocals
        overlap = self.models[0].overlap
        chunk_size = self.models[0].chunk_size
        gen_size = chunk_size - 2 * overlap
        pad_size = gen_size - number_of_samples % gen_size
        # Pad the mix on both ends along the sample dimension so its length divides
        # evenly into gen_size-sample chunks; the extra `overlap` samples on each
        # side give every chunk context that is trimmed away after inference.
        mix_padded = torch.cat(
            [torch.zeros(2, overlap), music_tensor, torch.zeros(2, pad_size + overlap)], 1
        )
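        # For illustration (hypothetical sizes, not the model's real constants):
        # with chunk_size = 100_000 and overlap = 5_000, gen_size = 90_000. A
        # 200_000-sample mix then gets pad_size = 90_000 - (200_000 % 90_000)
        # = 70_000, so mix_padded holds 5_000 + 200_000 + 75_000 = 280_000
        # samples and is consumed as three overlapping 100_000-sample chunks.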
        ort_session = ort.InferenceSession(
            f'{PRETRAINED_MODELS_DIRECTORY}/{self.models[0].target_name}.onnx',
            providers=EXECUTION_PROVIDER_LIST,
        )
        # Process one chunk at a time (batch_size = 1).
        demixed_chunks = []
        i = 0
        while i < number_of_samples + pad_size:
            chunk = mix_padded[:, i : i + chunk_size]
            # STFT -> ONNX inference -> inverse STFT, then trim the overlap context.
            x = self.models[0].stft(chunk.unsqueeze(0).to(COMPUTATION_DEVICE))
            with torch.no_grad():
                x = torch.tensor(ort_session.run(None, {'input': x.cpu().numpy()})[0])
            x = self.models[0].stft.inverse(x).squeeze(0)
            x = x[..., overlap:-overlap]
            demixed_chunks.append(x)
            i += gen_size
        # Concatenate the chunks and drop the right-hand padding.
        vocals_tensor = torch.cat(demixed_chunks, -1)[..., :-pad_size].cpu()
        # The instrumental is the input mix minus the isolated vocals.
        music_minus_vocals_tensor = music_tensor - vocals_tensor
        return music_minus_vocals_tensor, vocals_tensor

    def demix_vocals(self, music_tensor, sample_rate):
        """
        Isolates vocals from a mix using an ONNX model.
        Assumes the audio was loaded correctly at a 44100 Hz sample rate.

        Args:
            music_tensor (torch.Tensor): Input mix tensor of shape (2, num_samples).
            sample_rate (int): Sample rate of the input audio.

        Returns:
            torch.Tensor: The isolated vocals.
        """
        number_of_samples = music_tensor.shape[1]
        overlap = self.models[0].overlap
        # chunk_size is fixed by the model configuration, not by the sample rate.
        chunk_size = self.models[0].chunk_size
        gen_size = chunk_size - 2 * overlap
        pad_size = gen_size - number_of_samples % gen_size
        # Pad the mix on both ends, exactly as in demix_both.
        mix_padded = torch.cat(
            [torch.zeros(2, overlap), music_tensor, torch.zeros(2, pad_size + overlap)],
            1,
        )
        # Start an ONNX Runtime inference session for the model.
        ort_session = ort.InferenceSession(
            ONNX_MODEL_PATH, providers=EXECUTION_PROVIDER_LIST
        )
        # Process one chunk at a time (batch_size = 1).
        demixed_chunks = []
        i = 0
        while i < number_of_samples + pad_size:
            chunk = mix_padded[:, i : i + chunk_size]
            # STFT -> ONNX inference -> inverse STFT, then trim the overlap context.
            x = self.models[0].stft(chunk.unsqueeze(0).to(COMPUTATION_DEVICE))
            with torch.no_grad():
                x = torch.tensor(ort_session.run(None, {"input": x.cpu().numpy()})[0])
            x = self.models[0].stft.inverse(x).squeeze(0)
            x = x[..., overlap:-overlap]
            demixed_chunks.append(x)
            i += gen_size
        # Concatenate the chunks and drop the right-hand padding.
        vocals_output = torch.cat(demixed_chunks, -1)[..., :-pad_size].cpu()
        return vocals_output
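
# Minimal usage sketch (illustrative; not part of the original module). Assumes
# torchaudio is installed and that "song.wav" is a hypothetical stereo 44100 Hz
# file; swap in your own paths.
if __name__ == "__main__":
    import torchaudio

    music_tensor, sample_rate = torchaudio.load("song.wav")  # (2, num_samples)
    separator = KimVocal()
    instrumental, vocals = separator.demix_both(music_tensor, sample_rate)
    torchaudio.save("vocals.wav", vocals, sample_rate)
    torchaudio.save("instrumental.wav", instrumental, sample_rate)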