#%%
import gradio as gr
import librosa
import numpy as np
import torch
from music2latent import EncoderDecoder
from scipy.spatial.distance import cdist

# Sample rate used for every load and for the returned audio.
SAMPLE_RATE = 44100


class LatentGranularSynthesis:
    """Granular synthesis performed on music2latent codes.

    Source sounds are encoded and sliced into overlapping windows
    ("grains") of ``unit`` latent frames. A target sound is then rebuilt by
    replacing each of its windows with a grain sampled from that database
    according to cosine similarity, and decoding the result.
    """

    def __init__(self):
        self.encdec = EncoderDecoder()
        if torch.cuda.is_available():
            print('Using GPU')
        self.unit = 2            # grain length, in latent frames
        self.stride = 2          # hop between target windows, in latent frames
        self.temperature = 0.01  # softmax temperature used when sampling grains
        self.threshold = 1.0     # max cosine distance before keeping the target
        self.files = None        # last list of source files, kept for rebuilds
        self.augment = False     # augmentation flag of the last build
        self.codedb = torch.tensor([])  # concatenated source latents
        self.db = torch.tensor([])      # grain database, one grain per row
        self.pitch_aug = [-5, -2, 2, 5]  # pitch shifts (n_steps) for augmentation
        self.vol_aug = [0.3, 0.7]        # gain factors for augmentation

    def set_temperature(self, temperature, threshold):
        """Set the sampling temperature (given in units of 0.01) and threshold."""
        self.temperature = temperature * 0.01
        self.threshold = threshold

    def set_unit(self, unit, stride):
        """Set grain length and hop, rebuilding the database if one exists."""
        self.unit = unit
        self.stride = stride
        if self.files is not None:
            # Rebuild with the same augmentation setting as the last build.
            self.build_dataset(self.files, self.augment)

    def build_dataset(self, files, aug_checkbox: bool = False):
        """Encode the source files and slice them into the grain database.

        Args:
            files: Iterable of audio file paths; ``None`` is treated as empty.
            aug_checkbox: When true, append volume- and pitch-augmented
                copies of each file before encoding.

        Returns:
            A dict with a ``"message"`` entry reporting how many files were
            processed. Files that fail to load or encode are skipped and
            the error is printed.
        """
        self.files = files
        self.augment = bool(aug_checkbox)
        self.codedb = torch.tensor([])
        n_files = 0
        for path in files or []:
            # Best effort: one unreadable file must not abort the whole build.
            try:
                y, sr = librosa.load(path, sr=SAMPLE_RATE)
                y = librosa.util.normalize(y)
                if aug_checkbox:
                    # NOTE(review): each pass extends ``y`` itself, so later
                    # augmentations are applied to the already-augmented
                    # signal and its length doubles every pass (64x after
                    # all six). Confirm this compounding is intended.
                    for vol in self.vol_aug:
                        y = np.hstack((y, y * vol))
                    for pitch in self.pitch_aug:
                        shifted = librosa.effects.pitch_shift(
                            y, sr=sr, n_steps=pitch
                        )
                        y = np.hstack((y, shifted))
                latent = self.encdec.encode(
                    y, max_waveform_length=SAMPLE_RATE * 1
                ).cpu()
                # Latents are joined along the last (frame) axis.
                self.codedb = torch.cat((self.codedb, latent), dim=-1)
                n_files += 1
            except Exception as e:
                print(e)

        # Slice every full window of ``unit`` frames (hop 1) and stack them
        # along dim 0 with a single concatenation.
        n_frames = self.codedb.shape[-1]
        grains = [
            self.codedb[:, :, i:i + self.unit]
            for i in range(max(n_frames - self.unit + 1, 0))
        ]
        self.db = torch.cat(grains, dim=0) if grains else torch.tensor([])
        return {"message": f"Done! {n_files} files processed."}

    def morph_audio(self, target_file):
        """Rebuild the target sound from grains of the source database.

        Args:
            target_file: Path of the target audio file.

        Returns:
            ``(sample_rate, samples)`` where ``samples`` is an int16 array
            built from two independently sampled channels.

        Raises:
            RuntimeError: If no grain database has been built yet.
        """
        if self.db.shape[0] == 0:
            raise RuntimeError(
                "Grain database is empty; process source sounds first."
            )

        y, sr = librosa.load(target_file, sr=SAMPLE_RATE, mono=True)
        # Assumes encode() returns (1, channels, frames) for mono input
        # -- TODO confirm against music2latent.
        target_codes = self.encdec.encode(y)
        silence = torch.zeros_like(target_codes)
        # Two copies stacked on dim 0 form the stereo buffer.
        reconstructed = torch.vstack([silence, silence])

        # Flattened grain matrix is loop-invariant; compute it once.
        n_grains = self.db.shape[0]
        db_flat = self.db.reshape(n_grains, -1).cpu().numpy()

        n_frames = target_codes.shape[-1]
        # Only full windows are processed; frames not covered by any window
        # stay zero.
        for i in range(0, n_frames - self.unit + 1, self.stride):
            target_code = target_codes[:, :, i:i + self.unit]
            query = target_code.reshape(1, -1).cpu().numpy()
            # Column 0 keeps the result 1-D even with a single grain.
            distances = cdist(db_flat, query, 'cosine')[:, 0]
            # Cosine distance is undefined for zero vectors; treat those
            # grains as maximally distant.
            distances = np.nan_to_num(distances, nan=2.0)

            # Softmax over negative distances, shifted by the max logit so
            # a very small temperature cannot underflow every term to zero.
            logits = -distances / (self.temperature + 1e-8)
            weights = np.exp(logits - logits.max())
            probabilities = weights / weights.sum()

            # With no grain close enough, keep the target's own code.
            keep_target = distances.min() > self.threshold
            for j in range(2):  # each stereo channel draws its own grain
                if keep_target:
                    code = target_code[0]
                else:
                    code = self.db[np.random.choice(n_grains, p=probabilities)]
                reconstructed[j, :, i:i + self.unit] = code

        y2 = self.encdec.decode(reconstructed)
        audio = y2.cpu().numpy().squeeze().transpose()
        # Scale by 31000 (headroom below the int16 maximum of 32767) and
        # clip so out-of-range samples saturate instead of wrapping around.
        pcm = np.clip(audio * 31000, -32768, 32767).astype(np.int16)
        return SAMPLE_RATE, pcm


synth = LatentGranularSynthesis()


def build_dataset(files, aug_checkbox):
    """Gradio callback: build the grain database from the uploaded files."""
    return synth.build_dataset(files, aug_checkbox)


def morph_audio(target_file):
    """Gradio callback: morph the uploaded target sound."""
    return synth.morph_audio(target_file)


def temperature(temperature, threshold):
    """Forward temperature/threshold settings to the shared synth."""
    return synth.set_temperature(temperature, threshold)


def unit(unit, stride):
    """Forward grain length/hop settings to the shared synth."""
    return synth.set_unit(unit, stride)


with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            # gr.Label("Upload your audio files to train a model")
            db_file = gr.File(file_count="multiple", label="Source Sounds")
            aug_checkbox = gr.Checkbox(label="Apply Augmentation")
            b1 = gr.Button("Process source sounds")
            text = gr.Textbox(label="Result")
        with gr.Column():
            # gr.Label("Upload a target audio file to morph")
            target_file = gr.File(label="Target sound")
            b2 = gr.Button("Morph Audio")
            audioplayer = gr.Audio(label="Output")
    b1.click(build_dataset, inputs=[db_file, aug_checkbox], outputs=text)
    b2.click(morph_audio, inputs=target_file, outputs=audioplayer)

demo.launch()
#%%