# Third Party Imports
import torch
import onnxruntime as ort

# Local Imports
from vocal_isolation.models.mdx_net import Conv_TDF_net_trimm

# Global Variables
from vocal_isolation.constants import EXECUTION_PROVIDER_LIST, COMPUTATION_DEVICE
from vocal_isolation.constants import ONNX_MODEL_PATH, PRETRAINED_MODELS_DIRECTORY


class KimVocal:
    def __init__(self):
        self.models = [
            Conv_TDF_net_trimm(
                ONNX_MODEL_PATH,
                use_onnx=True,
                target_name="vocals",
                L=11,
                l=3,
                g=48,
                bn=8,
                bias=False,
                dim_f=11,
                dim_t=8,
            )
        ]

    def demix_both(self, music_tensor, sample_rate):
        """
        Isolate vocals AND instrumental using an ONNX model.

        Assumes the audio is loaded correctly at a 44100 Hz sample rate.

        Args:
            music_tensor (torch.Tensor): Input mix of shape (2, number_of_samples).
            sample_rate (int): Sample rate of the input audio.

        Returns:
            tuple[torch.Tensor, torch.Tensor]: The instrumental tensor and the
            vocals tensor.
        """
        number_of_samples = music_tensor.shape[1]

        # * Extracting vocals
        overlap = self.models[0].overlap
        chunk_size = self.models[0].chunk_size
        gen_size = chunk_size - 2 * overlap
        pad_size = gen_size - number_of_samples % gen_size

        # Pad the left and right sides of the mix along the sample dimension so
        # every chunk is full-length; the extra `overlap` samples on each side
        # ensure neighbouring chunks overlap.
        mix_padded = torch.cat(
            [torch.zeros(2, overlap), music_tensor, torch.zeros(2, pad_size + overlap)],
            1,
        )

        ort_session = ort.InferenceSession(
            f"{PRETRAINED_MODELS_DIRECTORY}/{self.models[0].target_name}.onnx",
            providers=EXECUTION_PROVIDER_LIST,
        )

        # Process one chunk at a time (batch_size=1).
        demixed_chunks = []
        i = 0
        while i < number_of_samples + pad_size:
            chunk = mix_padded[:, i : i + chunk_size]
            x = self.models[0].stft(chunk.unsqueeze(0).to(COMPUTATION_DEVICE))
            with torch.no_grad():
                x = torch.tensor(ort_session.run(None, {"input": x.cpu().numpy()})[0])
            x = self.models[0].stft.inverse(x).squeeze(0)
            # Trim the overlap regions so concatenated chunks line up seamlessly.
            x = x[..., overlap:-overlap]
            demixed_chunks.append(x)
            i += gen_size

        # Concatenate the chunks and trim the padding added above.
        vocals_tensor = torch.cat(demixed_chunks, -1)[..., :-pad_size].cpu()

        # Subtract the vocals output from the input mix to get the instrumental.
        music_minus_vocals_tensor = music_tensor - vocals_tensor

        return music_minus_vocals_tensor, vocals_tensor

    def demix_vocals(self, music_tensor, sample_rate):
        """
        Isolate vocals using an ONNX model.

        Assumes the audio is loaded correctly at a 44100 Hz sample rate.

        Args:
            music_tensor (torch.Tensor): Input mix of shape (2, number_of_samples).
            sample_rate (int): Sample rate of the input audio.

        Returns:
            torch.Tensor: Vocals tensor after passing through the network.
        """
        number_of_samples = music_tensor.shape[1]
        overlap = self.models[0].overlap

        # chunk_size and gen_size come from the model configuration.
        chunk_size = self.models[0].chunk_size
        gen_size = chunk_size - 2 * overlap
        pad_size = gen_size - number_of_samples % gen_size

        mix_padded = torch.cat(
            [torch.zeros(2, overlap), music_tensor, torch.zeros(2, pad_size + overlap)],
            1,
        )

        # Start an inference session for the model.
        ort_session = ort.InferenceSession(
            ONNX_MODEL_PATH, providers=EXECUTION_PROVIDER_LIST
        )

        # Process one chunk at a time (batch_size=1).
        demixed_chunks = []
        i = 0
        while i < number_of_samples + pad_size:
            chunk = mix_padded[:, i : i + chunk_size]
            x = self.models[0].stft(chunk.unsqueeze(0).to(COMPUTATION_DEVICE))
            with torch.no_grad():
                x = torch.tensor(ort_session.run(None, {"input": x.cpu().numpy()})[0])
            x = self.models[0].stft.inverse(x).squeeze(0)
            x = x[..., overlap:-overlap]
            demixed_chunks.append(x)
            i += gen_size

        vocals_output = torch.cat(demixed_chunks, -1)[..., :-pad_size].cpu()

        return vocals_output