# latentgranular / app.py
#%%
import gradio as gr
import librosa, torch
import numpy as np
from music2latent import EncoderDecoder
from scipy.spatial.distance import cdist
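
# Granular-style resynthesis in music2latent's latent space: short latent
# "grains" harvested from uploaded source sounds are used to rebuild a target
# sound, with a Gradio front end for uploads and playback.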
class LatentGranularSynthesis:
    def __init__(self):
        self.encdec = EncoderDecoder()
        if torch.cuda.is_available():
            print('Using GPU')

        self.unit = 2
        self.stride = 2
        self.temperature = 0.01
        self.threshold = 1.0
        self.files = None
        self.aug = False  # remember whether augmentation was applied (used when set_unit rebuilds)

        self.pitch_aug = [-5, -2, 2, 5]
        self.vol_aug = [0.3, 0.7]
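
    # Sampling controls: `temperature` (scaled by 0.01) sharpens or flattens the
    # softmax over grain distances; `threshold` is the cosine distance beyond
    # which the target's own grain is kept instead of a sampled source grain.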
    def set_temperature(self, temperature, threshold):
        self.temperature = temperature * 0.01
        self.threshold = threshold
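
    # Grain geometry: `unit` is the grain length in latent frames and `stride`
    # the hop between target windows; changing them rebuilds the grain database.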
    def set_unit(self, unit, stride):
        self.unit = unit
        self.stride = stride
        if self.files is not None:
            # Rebuild the grain database with the new unit/stride, reusing the
            # previous augmentation setting.
            self.build_dataset(self.files, self.aug)
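
    # Encode every uploaded source file (optionally volume/pitch augmented),
    # concatenate the latents, and slice them into unit-length grains in self.db.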
    def build_dataset(self, files, aug_checkbox: bool):
        self.files = files
        self.aug = aug_checkbox
        self.codedb = torch.tensor([])
        n_files = 0
        for path in files:
            try:
                y, sr = librosa.load(path, sr=44100)
                # Normalize audio
                y = librosa.util.normalize(y)

                if aug_checkbox:
                    # Apply volume augmentation
                    for vol in self.vol_aug:
                        y_vol = y * vol
                        y = np.hstack((y, y_vol))

                    # Apply pitch augmentation
                    for pitch in self.pitch_aug:
                        y_pitch = librosa.effects.pitch_shift(y, sr=sr, n_steps=pitch)
                        y = np.hstack((y, y_pitch))

                # Encode audio
                latent = self.encdec.encode(y, max_waveform_length=44100*1).cpu()
                self.codedb = torch.cat((self.codedb, latent), dim=-1)
                n_files += 1
            except Exception as e:
                print(e)

        # Slice the concatenated latents into unit-length grains (hop of 1 frame)
        self.db = torch.tensor([])
        for i in range(0, self.codedb.shape[-1], 1):
            code = self.codedb[:, :, i:i+self.unit]
            if code.shape[-1] != self.unit:
                continue
            self.db = torch.cat((self.db, code), dim=0)

        return f"Done! {n_files} files processed."
    def morph_audio(self, target_file):
        # Load target audio
        y, sr = librosa.load(target_file, sr=44100, mono=True)
        target_codes = self.encdec.encode(y)

        reconstructed = torch.zeros_like(target_codes).to(target_codes.device)
        # Duplicate the buffer to make the output stereo
        reconstructed = torch.vstack([reconstructed, reconstructed])

        # Find a close grain in the database for every target window
        for i in range(0, target_codes.shape[-1], self.stride):
            target_code = target_codes[:, :, i:i+self.unit]
            if target_code.shape[-1] != self.unit:
                continue
            distances = cdist(self.db.reshape(self.db.shape[0], -1).cpu().numpy(),
                              target_code.reshape(1, -1).cpu().numpy(), 'cosine').squeeze()

            # Temperature-scaled softmax over negative distances
            logits = -distances / (self.temperature + 1e-8)
            probabilities = np.exp(logits) / (np.sum(np.exp(logits)) + 1e-8)
            probabilities = np.nan_to_num(probabilities)

            for j in range(2):  # fill both stereo channels independently
                code_closest = self.db[np.random.choice(self.db.shape[0], p=probabilities/np.sum(probabilities))]
                # If even the nearest grain is too far away, keep the target's own grain
                if min(distances) > self.threshold:
                    code_closest = target_code
                if i+self.unit < reconstructed.shape[-1]:
                    reconstructed[j, :, i:i+self.unit] = code_closest
                else:
                    reconstructed[j, :, i:] = code_closest[:, :reconstructed.shape[-1]-i]

        # Decode and scale slightly below int16 full scale (32767)
        y2 = self.encdec.decode(reconstructed)
        sr = 44100
        return sr, (y2.cpu().numpy().squeeze().transpose() * 31000).astype(np.int16)


synth = LatentGranularSynthesis()
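
# Module-level wrappers used as Gradio callbacks.
# Note: `temperature` and `unit` are defined but not wired to any control in the UI below.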
def build_dataset(files, aug_checkbox):
    return synth.build_dataset(files, aug_checkbox)

def morph_audio(target_file):
    return synth.morph_audio(target_file)

def temperature(temperature, threshold):
    return synth.set_temperature(temperature, threshold)

def unit(unit, stride):
    return synth.set_unit(unit, stride)
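
# Two-column UI: the left column builds the grain database from uploaded source
# sounds, the right column morphs a target file and plays the result.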
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            # gr.Label("Upload your audio files to train a model")
            db_file = gr.File(file_count="multiple", label="Source Sounds")
            aug_checkbox = gr.Checkbox(label="Apply Augmentation")
            b1 = gr.Button("Process source sounds")
            text = gr.Textbox(label="Result")
        with gr.Column():
            # gr.Label("Upload a target audio file to morph")
            target_file = gr.File(label="Target sound")
            b2 = gr.Button("Morph Audio")
            audioplayer = gr.Audio(label="Output")

    b1.click(build_dataset, inputs=[db_file, aug_checkbox], outputs=text)
    b2.click(morph_audio, inputs=target_file, outputs=audioplayer)

demo.launch()
#%%