#%%
import gradio as gr
import librosa
import numpy as np
import torch
from music2latent import EncoderDecoder
from scipy.spatial.distance import cdist
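
# Overview: this demo performs granular synthesis in the latent space of
# music2latent. Source sounds are encoded into latent frames and sliced into
# fixed-size grains (self.unit frames each), which form a grain database.
# A target sound is then rebuilt grain by grain: for each target grain, a
# database grain is sampled with probability given by a temperature-scaled
# softmax over negative cosine distances, and the result is decoded back
# to audio.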
class LatentGranularSynthesis:
    def __init__(self):
        self.encdec = EncoderDecoder()
        if torch.cuda.is_available():
            print('Using GPU')
        self.unit = 2            # grain length, in latent frames
        self.stride = 2          # hop between target grains, in latent frames
        self.temperature = 0.01  # softmax temperature for grain sampling
        self.threshold = 1.0     # max cosine distance before falling back to the target grain
        self.files = None
        self.aug = False         # last-used augmentation setting, kept for rebuilds
        self.pitch_aug = [-5, -2, 2, 5]  # pitch-shift offsets in semitones
        self.vol_aug = [0.3, 0.7]        # gain factors for volume augmentation
    def set_temperature(self, temperature, threshold):
        # The incoming value is scaled down so usable temperatures stay small
        self.temperature = temperature * 0.01
        self.threshold = threshold
    def set_unit(self, unit, stride):
        self.unit = unit
        self.stride = stride
        if self.files is not None:
            # Rebuild the grain database with the new unit size,
            # reusing the last augmentation setting
            self.build_dataset(self.files, self.aug)
    def build_dataset(self, files, aug_checkbox: bool):
        self.files = files
        self.aug = aug_checkbox
        latents = []
        n_files = 0
        for path in files:
            try:
                y, sr = librosa.load(path, sr=44100)
                # Normalize audio
                y = librosa.util.normalize(y)
                if aug_checkbox:
                    # Volume augmentation: append attenuated copies
                    for vol in self.vol_aug:
                        y = np.hstack((y, y * vol))
                    # Pitch augmentation: append pitch-shifted copies
                    for pitch in self.pitch_aug:
                        y_pitch = librosa.effects.pitch_shift(y, sr=sr, n_steps=pitch)
                        y = np.hstack((y, y_pitch))
                # Encode audio to latent frames
                latent = self.encdec.encode(y, max_waveform_length=44100*1).cpu()
                latents.append(latent)
                n_files += 1
            except Exception as e:
                print(e)
        self.codedb = torch.cat(latents, dim=-1)
        # Slice the latent sequence into overlapping grains of self.unit frames
        grains = []
        for i in range(self.codedb.shape[-1]):
            code = self.codedb[:, :, i:i+self.unit]
            if code.shape[-1] != self.unit:
                continue
            grains.append(code)
        self.db = torch.cat(grains, dim=0)
        return f"Done! {n_files} files processed."
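
    # Grain-matching rule used below: cosine distances d_k from the target
    # grain to every database grain become sampling weights
    #   p_k = exp(-d_k / T) / sum_j exp(-d_j / T),   T = self.temperature.
    # Small T approaches nearest-neighbour selection; larger T samples more
    # freely. If even the best match is farther than self.threshold, the
    # original target grain is kept instead.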
    def morph_audio(self, target_file):
        # Load target audio
        y, sr = librosa.load(target_file, sr=44100, mono=True)
        target_codes = self.encdec.encode(y)
        reconstructed = torch.zeros_like(target_codes).to(target_codes.device)
        # Stack two copies to build a stereo latent buffer
        reconstructed = torch.vstack([reconstructed, reconstructed])
        # Flatten the grain database once for distance computation
        db_flat = self.db.reshape(self.db.shape[0], -1).cpu().numpy()
        # Replace each target grain with a grain sampled from the database
        for i in range(0, target_codes.shape[-1], self.stride):
            target_code = target_codes[:, :, i:i+self.unit]
            if target_code.shape[-1] != self.unit:
                continue
            distances = cdist(db_flat, target_code.reshape(1, -1).cpu().numpy(), 'cosine').squeeze()
            # Temperature-scaled softmax over negative distances
            # (subtract the max logit for numerical stability)
            logits = -distances / (self.temperature + 1e-8)
            logits -= logits.max()
            probabilities = np.exp(logits) / (np.sum(np.exp(logits)) + 1e-8)
            probabilities = np.nan_to_num(probabilities)
            for j in range(2):  # fill left and right channels independently
                code_closest = self.db[np.random.choice(self.db.shape[0], p=probabilities/np.sum(probabilities))]
                if distances.min() > self.threshold:
                    # No database grain is close enough: keep the target grain
                    code_closest = target_code.squeeze(0)
                if i + self.unit < reconstructed.shape[-1]:
                    reconstructed[j, :, i:i+self.unit] = code_closest
                else:
                    reconstructed[j, :, i:] = code_closest[:, :reconstructed.shape[-1]-i]
        # Decode latents back to audio
        y2 = self.encdec.decode(reconstructed)
        sr = 44100
        # Scale to int16; 31000 (rather than full-scale 32767) leaves a little headroom
        return sr, (y2.cpu().numpy().squeeze().transpose() * 31000).astype(np.int16)
synth = LatentGranularSynthesis()

def build_dataset(files, aug_checkbox):
    return synth.build_dataset(files, aug_checkbox)

def morph_audio(target_file):
    return synth.morph_audio(target_file)

def temperature(temperature, threshold):
    return synth.set_temperature(temperature, threshold)

def unit(unit, stride):
    return synth.set_unit(unit, stride)
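
# Note: temperature() and unit() are not wired to any control in the UI
# below; they could be attached to sliders (e.g. gr.Slider change events)
# if interactive parameter control is desired.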
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            # gr.Label("Upload your audio files to train a model")
            db_file = gr.File(file_count="multiple", label="Source Sounds")
            aug_checkbox = gr.Checkbox(label="Apply Augmentation")
            b1 = gr.Button("Process source sounds")
            text = gr.Textbox(label="Result")
        with gr.Column():
            # gr.Label("Upload a target audio file to morph")
            target_file = gr.File(label="Target sound")
            b2 = gr.Button("Morph Audio")
            audioplayer = gr.Audio(label="Output")
    b1.click(build_dataset, inputs=[db_file, aug_checkbox], outputs=text)
    b2.click(morph_audio, inputs=target_file, outputs=audioplayer)

demo.launch()
#%%