#%%
import gradio as gr
import librosa, torch
import numpy as np
from music2latent import EncoderDecoder
from scipy.spatial.distance import cdist

class LatentGranularSynthesis:
    """Granular resynthesis in the latent space of a music2latent ``EncoderDecoder``.

    Source sounds are encoded and cut into overlapping windows of ``unit``
    latent frames (the database, ``self.db``).  A target sound is rebuilt by
    replacing each of its windows with a database window sampled according to
    cosine similarity, and the result is decoded back to audio.
    """

    # Sample rate every file is resampled to on load, and the rate reported
    # for the morphed output.
    SAMPLE_RATE = 44100

    def __init__(self):
        self.encdec = EncoderDecoder()
        if torch.cuda.is_available():
            print('Using GPU')

        self.unit = 2            # window length, in latent frames
        self.stride = 2          # hop between target windows, in latent frames
        self.temperature = 0.01  # softmax temperature used when sampling matches
        self.threshold = 1.0     # best-match cosine distance above which the target is kept
        self.files = None        # files passed to the last build_dataset call
        self.codedb = None       # concatenated source latents (set by build_dataset)
        self.db = None           # sliding windows cut from codedb
        self.pitch_aug = [-5, -2, 2, 5]  # pitch shifts (n_steps) used for augmentation
        self.vol_aug = [0.3, 0.7]        # gain factors used for augmentation

    def set_temperature(self, temperature, threshold):
        """Set the sampling temperature (scaled by 0.01) and the distance threshold."""
        self.temperature = temperature * 0.01
        self.threshold = threshold

    def set_unit(self, unit, stride):
        """Set the window length and hop, re-windowing an existing database.

        Args:
            unit: Window length in latent frames (>= 1).
            stride: Hop between consecutive target windows (>= 1).

        Raises:
            ValueError: If ``unit`` or ``stride`` is smaller than 1.
        """
        unit, stride = int(unit), int(stride)
        if unit < 1 or stride < 1:
            raise ValueError("unit and stride must both be >= 1")
        self.unit = unit
        self.stride = stride
        # Only the windowing depends on ``unit``; the encoded latents are
        # reused, so the source files do not have to be re-encoded.
        if self.codedb is not None:
            self._build_windows()

    def build_dataset(self, files, aug_checkbox: bool = False):
        """Encode the source files and build the window database.

        Args:
            files: Iterable of audio files accepted by ``librosa.load``;
                ``None`` is treated as an empty list.
            aug_checkbox: When true, volume and pitch augmented copies are
                appended to each signal before encoding.

        Returns:
            A dict with a ``"message"`` entry reporting how many files were
            processed.  Files that fail to load or encode are reported on
            stdout and skipped.
        """
        self.files = files
        self.codedb = torch.tensor([])
        n_files = 0
        for path in files or []:
            try:
                y, sr = librosa.load(path, sr=self.SAMPLE_RATE)

                # Normalize audio
                y = librosa.util.normalize(y)

                if aug_checkbox:
                    y = self._augment(y, sr)

                # Encode audio
                latent = self.encdec.encode(
                    y, max_waveform_length=self.SAMPLE_RATE * 1
                ).cpu()
                self.codedb = torch.cat((self.codedb, latent), dim=-1)
                n_files += 1
            except Exception as e:
                # Best effort: one unreadable file must not abort the batch.
                print(f"Skipping {path}: {e}")

        self._build_windows()

        return {"message": f"Done! {n_files} files processed."}

    def _augment(self, y, sr):
        """Append volume-scaled and pitch-shifted copies to the signal ``y``."""
        # NOTE(review): each step transforms everything accumulated so far, so
        # the signal doubles in length per augmentation value and the
        # augmentations compound. Kept as in the original implementation.
        for vol in self.vol_aug:
            y = np.hstack((y, y * vol))
        for pitch in self.pitch_aug:
            shifted = librosa.effects.pitch_shift(y, sr=sr, n_steps=pitch)
            y = np.hstack((y, shifted))
        return y

    def _build_windows(self):
        """Cut ``self.codedb`` into hop-1 windows of ``self.unit`` frames (``self.db``)."""
        n_frames = self.codedb.shape[-1]
        # Only full-length windows are kept; an empty codedb yields no windows.
        windows = [
            self.codedb[:, :, start:start + self.unit]
            for start in range(n_frames - self.unit + 1)
        ]
        # A single concatenation instead of growing the tensor per window.
        self.db = torch.cat(windows, dim=0) if windows else torch.tensor([])

    def morph_audio(self, target_file):
        """Rebuild the target sound from database windows and decode it.

        Args:
            target_file: Audio file accepted by ``librosa.load``.

        Returns:
            ``(sample_rate, samples)`` where ``samples`` is an ``int16`` array
            obtained from the decoded two-entry latent buffer.

        Raises:
            ValueError: If no database has been built yet, the database is
                empty, or no target file was given.
        """
        db = getattr(self, "db", None)
        if db is None or db.shape[0] == 0:
            raise ValueError(
                "The source database is empty: process source sounds first."
            )
        if target_file is None:
            raise ValueError("No target sound provided.")

        # load target audio
        y, _ = librosa.load(target_file, sr=self.SAMPLE_RATE, mono=True)
        target_codes = self.encdec.encode(y)

        # Two stacked copies of the buffer, one per output channel (stereo).
        reconstructed = torch.zeros_like(target_codes)
        reconstructed = torch.vstack([reconstructed, reconstructed])

        # The flattened database does not change while matching.
        db_flat = db.reshape(db.shape[0], -1).cpu().numpy()
        n_frames = target_codes.shape[-1]

        # Visit every full-length target window; frames not covered by any
        # window keep their zero initialisation.
        for i in range(0, n_frames - self.unit + 1, self.stride):
            target_code = target_codes[:, :, i:i + self.unit]
            query = target_code.reshape(1, -1).cpu().numpy()

            # Column 0 instead of squeeze(): stays 1-D for a one-entry database.
            distances = cdist(db_flat, query, 'cosine')[:, 0]
            # Cosine distance is undefined for all-zero vectors; treat such
            # entries as maximally distant.
            distances = np.nan_to_num(distances, nan=2.0)

            if distances.min() > self.threshold:
                # Nothing in the database is close enough: keep the target.
                # assumes a single leading batch entry for mono input — TODO confirm
                fallback = target_code[0]
                for j in range(2):
                    reconstructed[j, :, i:i + self.unit] = fallback
                continue

            # Temperature-scaled softmax over negative distances. Shifting by
            # the maximum keeps at least one weight at 1, so the sum can never
            # underflow to zero at small temperatures.
            logits = -distances / (self.temperature + 1e-8)
            weights = np.exp(logits - logits.max())
            probabilities = weights / weights.sum()

            for j in range(2):  # independent draw per stereo channel
                idx = np.random.choice(db.shape[0], p=probabilities)
                reconstructed[j, :, i:i + self.unit] = db[idx].to(
                    reconstructed.device
                )

        # decode
        y2 = self.encdec.decode(reconstructed)
        audio = y2.cpu().numpy().squeeze().transpose()
        # Clip before the cast so out-of-range samples saturate instead of
        # wrapping around in int16.
        pcm = np.clip(audio * 31000, -32768, 32767).astype(np.int16)
        return self.SAMPLE_RATE, pcm


# Single module-level synthesizer instance shared by the callback wrappers below.
synth = LatentGranularSynthesis()

def build_dataset(files, aug_checkbox):
    """Forward the files and augmentation flag to ``synth.build_dataset``."""
    result = synth.build_dataset(files, aug_checkbox)
    return result

def morph_audio(target_file):
    """Forward the target file to ``synth.morph_audio`` and return its result."""
    morphed = synth.morph_audio(target_file)
    return morphed

def temperature(temperature, threshold):
    """Forward the temperature and threshold to ``synth.set_temperature``."""
    outcome = synth.set_temperature(temperature, threshold)
    return outcome

def unit(unit, stride):
    """Forward the unit and stride to ``synth.set_unit``."""
    outcome = synth.set_unit(unit, stride)
    return outcome


# Gradio UI: one row with two columns, one for building the source database
# and one for morphing a target sound.
with gr.Blocks() as demo:
    with gr.Row():
        # First column: source-sound upload, augmentation toggle, build
        # button and a textbox for the build result.
        with gr.Column():
            # gr.Label("Upload your audio files to train a model")
            db_file = gr.File(file_count="multiple", label="Source Sounds")
            aug_checkbox = gr.Checkbox(label="Apply Augmentation")
            b1 = gr.Button("Process source sounds")
            text = gr.Textbox(label="Result")

        # Second column: target-sound upload, morph button and audio output.
        with gr.Column():
            # gr.Label("Upload a target audio file to morph")
            target_file = gr.File(label="Target sound")
            b2 = gr.Button("Morph Audio")
            audioplayer = gr.Audio(label="Output")

    # Wire the buttons to the module-level callbacks.
    b1.click(build_dataset, inputs=[db_file, aug_checkbox], outputs=text)
    b2.click(morph_audio, inputs=target_file, outputs=audioplayer)

# Start the Gradio app; this runs whenever the module is executed.
demo.launch()

#%%