# Third Party Imports
import torch
import onnxruntime as ort

# Local Imports
from vocal_isolation.models.mdx_net import Conv_TDF_net_trimm

# Global Variables
from vocal_isolation.constants import EXECUTION_PROVIDER_LIST, COMPUTATION_DEVICE
from vocal_isolation.constants import ONNX_MODEL_PATH, PRETRAINED_MODELS_DIRECTORY
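
# Note: the exact values live in vocal_isolation.constants; EXECUTION_PROVIDER_LIST
# is assumed to be an onnxruntime provider list such as
# ["CUDAExecutionProvider", "CPUExecutionProvider"], with COMPUTATION_DEVICE the
# matching torch device ("cuda" or "cpu").
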
class KimVocal:
    def __init__(self):
        self.models = [
            Conv_TDF_net_trimm(
                ONNX_MODEL_PATH, use_onnx=True, target_name='vocals',
                L=11, l=3, g=48, bn=8, bias=False,
                dim_f=11, dim_t=8
            )
        ]

    def demix_both(self, music_tensor, sample_rate):
        """
        Isolates vocals AND instrumental from a mix using an ONNX model.
        Assumes the audio was loaded correctly at a 44100 Hz sample rate.

        Args:
            music_tensor (torch.Tensor): Input mix tensor of shape (2, num_samples).
            sample_rate (int): Sample rate of the input audio.

        Returns:
            tuple[torch.Tensor, torch.Tensor]: The instrumental (mix minus vocals)
            and the isolated vocals.
        """
        number_of_samples = music_tensor.shape[1]

        # * Extracting vocals
        overlap = self.models[0].overlap
        chunk_size = self.models[0].chunk_size
        gen_size = chunk_size - 2 * overlap
        pad_size = gen_size - number_of_samples % gen_size
        # Pad the mix on both ends along the sample dimension so its length divides
        # evenly into gen_size-sample chunks; the extra `overlap` samples on each
        # side give every chunk context that is trimmed away after inference.
        mix_padded = torch.cat(
            [torch.zeros(2, overlap), music_tensor, torch.zeros(2, pad_size + overlap)], 1
        )
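        # For illustration (hypothetical sizes, not the model's real constants):
        # with chunk_size = 100_000 and overlap = 5_000, gen_size = 90_000. A
        # 200_000-sample mix then gets pad_size = 90_000 - (200_000 % 90_000)
        # = 70_000, so mix_padded holds 5_000 + 200_000 + 75_000 = 280_000
        # samples and is consumed as three overlapping 100_000-sample chunks.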
        ort_session = ort.InferenceSession(
            f'{PRETRAINED_MODELS_DIRECTORY}/{self.models[0].target_name}.onnx',
            providers=EXECUTION_PROVIDER_LIST,
        )
        # Process one chunk at a time (batch_size = 1).
        demixed_chunks = []
        i = 0
        while i < number_of_samples + pad_size:
            chunk = mix_padded[:, i : i + chunk_size]
            # STFT -> ONNX inference -> inverse STFT, then trim the overlap context.
            x = self.models[0].stft(chunk.unsqueeze(0).to(COMPUTATION_DEVICE))
            with torch.no_grad():
                x = torch.tensor(ort_session.run(None, {'input': x.cpu().numpy()})[0])
            x = self.models[0].stft.inverse(x).squeeze(0)
            x = x[..., overlap:-overlap]
            demixed_chunks.append(x)
            i += gen_size
        # Concatenate the chunks and drop the right-hand padding.
        vocals_tensor = torch.cat(demixed_chunks, -1)[..., :-pad_size].cpu()
        # The instrumental is the input mix minus the isolated vocals.
        music_minus_vocals_tensor = music_tensor - vocals_tensor
        return music_minus_vocals_tensor, vocals_tensor

    def demix_vocals(self, music_tensor, sample_rate):
        """
        Isolates vocals from a mix using an ONNX model.
        Assumes the audio was loaded correctly at a 44100 Hz sample rate.

        Args:
            music_tensor (torch.Tensor): Input mix tensor of shape (2, num_samples).
            sample_rate (int): Sample rate of the input audio.

        Returns:
            torch.Tensor: The isolated vocals.
        """
        number_of_samples = music_tensor.shape[1]
        overlap = self.models[0].overlap
        # chunk_size is fixed by the model configuration, not by the sample rate.
        chunk_size = self.models[0].chunk_size
        gen_size = chunk_size - 2 * overlap
        pad_size = gen_size - number_of_samples % gen_size
        # Pad the mix on both ends, exactly as in demix_both.
        mix_padded = torch.cat(
            [torch.zeros(2, overlap), music_tensor, torch.zeros(2, pad_size + overlap)],
            1,
        )
        # Start an ONNX Runtime inference session for the model.
        ort_session = ort.InferenceSession(
            ONNX_MODEL_PATH, providers=EXECUTION_PROVIDER_LIST
        )
        # Process one chunk at a time (batch_size = 1).
        demixed_chunks = []
        i = 0
        while i < number_of_samples + pad_size:
            chunk = mix_padded[:, i : i + chunk_size]
            # STFT -> ONNX inference -> inverse STFT, then trim the overlap context.
            x = self.models[0].stft(chunk.unsqueeze(0).to(COMPUTATION_DEVICE))
            with torch.no_grad():
                x = torch.tensor(ort_session.run(None, {"input": x.cpu().numpy()})[0])
            x = self.models[0].stft.inverse(x).squeeze(0)
            x = x[..., overlap:-overlap]
            demixed_chunks.append(x)
            i += gen_size
        # Concatenate the chunks and drop the right-hand padding.
        vocals_output = torch.cat(demixed_chunks, -1)[..., :-pad_size].cpu()
        return vocals_output
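
# Minimal usage sketch (illustrative; not part of the original module). Assumes
# torchaudio is installed and that "song.wav" is a hypothetical stereo 44100 Hz
# file; swap in your own paths.
if __name__ == "__main__":
    import torchaudio

    music_tensor, sample_rate = torchaudio.load("song.wav")  # (2, num_samples)
    separator = KimVocal()
    instrumental, vocals = separator.demix_both(music_tensor, sample_rate)
    torchaudio.save("vocals.wav", vocals, sample_rate)
    torchaudio.save("instrumental.wav", instrumental, sample_rate)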