# latentgranular / app.py
#%%
import gradio as gr
import librosa, torch
import numpy as np
from music2latent import EncoderDecoder
from scipy.spatial.distance import cdist
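
# Granular-style resynthesis in music2latent's latent space: short latent
# "grains" harvested from uploaded source sounds are used to rebuild a target
# sound, with a Gradio front end for uploads and playback.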
class LatentGranularSynthesis:
    def __init__(self):
        self.encdec = EncoderDecoder()
        if torch.cuda.is_available():
            print('Using GPU')

        self.unit = 2
        self.stride = 2
        self.temperature = 0.01
        self.threshold = 1.0
        self.files = None
        self.aug = False  # remember whether augmentation was applied (used when set_unit rebuilds)

        self.pitch_aug = [-5, -2, 2, 5]
        self.vol_aug = [0.3, 0.7]
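
    # Sampling controls: `temperature` (scaled by 0.01) sharpens or flattens the
    # softmax over grain distances; `threshold` is the cosine distance beyond
    # which the target's own grain is kept instead of a sampled source grain.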
    def set_temperature(self, temperature, threshold):
        self.temperature = temperature * 0.01
        self.threshold = threshold
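
    # Grain geometry: `unit` is the grain length in latent frames and `stride`
    # the hop between target windows; changing them rebuilds the grain database.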
    def set_unit(self, unit, stride):
        self.unit = unit
        self.stride = stride
        if self.files is not None:
            # Rebuild the grain database with the new unit/stride, reusing the
            # previous augmentation setting.
            self.build_dataset(self.files, self.aug)
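
    # Encode every uploaded source file (optionally volume/pitch augmented),
    # concatenate the latents, and slice them into unit-length grains in self.db.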
    def build_dataset(self, files, aug_checkbox: bool):
        self.files = files
        self.aug = aug_checkbox
        self.codedb = torch.tensor([])
        n_files = 0
        for path in files:
            try:
                y, sr = librosa.load(path, sr=44100)
                # Normalize audio
                y = librosa.util.normalize(y)

                if aug_checkbox:
                    # Apply volume augmentation
                    for vol in self.vol_aug:
                        y_vol = y * vol
                        y = np.hstack((y, y_vol))

                    # Apply pitch augmentation
                    for pitch in self.pitch_aug:
                        y_pitch = librosa.effects.pitch_shift(y, sr=sr, n_steps=pitch)
                        y = np.hstack((y, y_pitch))

                # Encode audio
                latent = self.encdec.encode(y, max_waveform_length=44100*1).cpu()
                self.codedb = torch.cat((self.codedb, latent), dim=-1)
                n_files += 1
            except Exception as e:
                print(e)

        # Slice the concatenated latents into unit-length grains (hop of 1 frame)
        self.db = torch.tensor([])
        for i in range(0, self.codedb.shape[-1], 1):
            code = self.codedb[:, :, i:i+self.unit]
            if code.shape[-1] != self.unit:
                continue
            self.db = torch.cat((self.db, code), dim=0)

        return f"Done! {n_files} files processed."
    def morph_audio(self, target_file):
        # Load target audio
        y, sr = librosa.load(target_file, sr=44100, mono=True)
        target_codes = self.encdec.encode(y)

        reconstructed = torch.zeros_like(target_codes).to(target_codes.device)
        # Duplicate the buffer to make the output stereo
        reconstructed = torch.vstack([reconstructed, reconstructed])

        # Find a close grain in the database for every target window
        for i in range(0, target_codes.shape[-1], self.stride):
            target_code = target_codes[:, :, i:i+self.unit]
            if target_code.shape[-1] != self.unit:
                continue
            distances = cdist(self.db.reshape(self.db.shape[0], -1).cpu().numpy(),
                              target_code.reshape(1, -1).cpu().numpy(), 'cosine').squeeze()

            # Temperature-scaled softmax over negative distances
            logits = -distances / (self.temperature + 1e-8)
            probabilities = np.exp(logits) / (np.sum(np.exp(logits)) + 1e-8)
            probabilities = np.nan_to_num(probabilities)

            for j in range(2):  # fill both stereo channels independently
                code_closest = self.db[np.random.choice(self.db.shape[0], p=probabilities/np.sum(probabilities))]
                # If even the nearest grain is too far away, keep the target's own grain
                if min(distances) > self.threshold:
                    code_closest = target_code
                if i+self.unit < reconstructed.shape[-1]:
                    reconstructed[j, :, i:i+self.unit] = code_closest
                else:
                    reconstructed[j, :, i:] = code_closest[:, :reconstructed.shape[-1]-i]

        # Decode and scale slightly below int16 full scale (32767)
        y2 = self.encdec.decode(reconstructed)
        sr = 44100
        return sr, (y2.cpu().numpy().squeeze().transpose() * 31000).astype(np.int16)


synth = LatentGranularSynthesis()
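
# Module-level wrappers used as Gradio callbacks.
# Note: `temperature` and `unit` are defined but not wired to any control in the UI below.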
def build_dataset(files, aug_checkbox):
    return synth.build_dataset(files, aug_checkbox)

def morph_audio(target_file):
    return synth.morph_audio(target_file)

def temperature(temperature, threshold):
    return synth.set_temperature(temperature, threshold)

def unit(unit, stride):
    return synth.set_unit(unit, stride)
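
# Two-column UI: the left column builds the grain database from uploaded source
# sounds, the right column morphs a target file and plays the result.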
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            # gr.Label("Upload your audio files to train a model")
            db_file = gr.File(file_count="multiple", label="Source Sounds")
            aug_checkbox = gr.Checkbox(label="Apply Augmentation")
            b1 = gr.Button("Process source sounds")
            text = gr.Textbox(label="Result")
        with gr.Column():
            # gr.Label("Upload a target audio file to morph")
            target_file = gr.File(label="Target sound")
            b2 = gr.Button("Morph Audio")
            audioplayer = gr.Audio(label="Output")

    b1.click(build_dataset, inputs=[db_file, aug_checkbox], outputs=text)
    b2.click(morph_audio, inputs=target_file, outputs=audioplayer)

demo.launch()
#%%