import librosa
import numpy as np
import moviepy.editor as mpy
import random
import torch
from tqdm import tqdm
import dnnlib
import legacy

target_sr = 22050  # librosa.load resamples to 22050 Hz by default, so this matches the loaded audio

def visualize(audio_file,
              network,
              truncation,
              tempo_sensitivity,
              jitter,
              frame_length,
              duration,
              ):
    print(audio_file)

    if audio_file:
        print('\nReading audio \n')
        audio, sr = librosa.load(audio_file, duration=duration)
    else:
        raise ValueError("you must provide an audio file via the --song argument")

    # (manual dtype conversion, mono mixing, and resampling are not needed here:
    #  librosa.load already returns mono float32 audio resampled to 22050 Hz)



    # TODO:
    batch_size = 1
    resolution = 512
    outfile="output.mp4"

    # rescale sensitivity so the per-frame latent step stays comparable across hop lengths
    tempo_sensitivity = tempo_sensitivity * frame_length / 512

    #set device (the .cuda() calls below assume a CUDA-capable GPU)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load pre-trained model
    with dnnlib.util.open_url(network) as f:
        G = legacy.load_network_pkl(f)['G_ema'].to(device) # type: ignore
        G.eval()

    # warm-up pass to confirm the generator runs
    with torch.no_grad():
        z = torch.randn([1, G.z_dim], device=device)    # latent codes
        c = None                                        # class labels (not used in this example)
        img = G(z, c)                                    # NCHW, float32, dynamic range [-1, +1], no truncation

    #create spectrogram
    spec = librosa.feature.melspectrogram(y=audio, sr=target_sr, n_mels=512,fmax=8000, hop_length=frame_length)
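
    # each spectrogram column corresponds to one video frame (hop_length=frame_length),
    # so the number of mel frames sets the number of generated frames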

    #get mean power at each time point
    specm=np.mean(spec,axis=0)

    #compute power gradient across time points
    gradm=np.gradient(specm)

    #set max to 1
    gradm=gradm/np.max(gradm)

    #set negative gradient time points to zero
    gradm = gradm.clip(min=0)

    #normalize mean power between 0-1
    specm=(specm-np.min(specm))/np.ptp(specm)
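
    # specm (overall loudness) and gradm (rising loudness / onsets) together determine
    # how far the latent vector moves on each frame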

    #initialize first noise vector
    nv1 = torch.randn([G.z_dim]).cuda()

    #initialize list of class and noise vectors
    noise_vectors=[nv1]

    #initialize previous vectors (will be used to track the previous frame)
    nvlast=nv1

    #initialize the direction of noise vector unit updates (each unit starts by moving toward zero)
    update_dir=np.zeros(512)
    for ni,n in enumerate(nv1):
        if n<0:
            update_dir[ni] = 1
        else:
            update_dir[ni] = -1

    #initialize noise unit update
    update_last=np.zeros(512)

    #get new jitters: a random ~half of latent units keep full sensitivity, the rest are scaled down to (1-jitter)
    def new_jitters(jitter):
        jitters=np.zeros(512)
        for j in range(512):
            if random.uniform(0,1)<0.5:
                jitters[j]=1
            else:
                jitters[j]=1-jitter
        return jitters


    #get new update directions: reverse a unit's direction once it drifts past +/-(2*truncation - tempo_sensitivity)
    def new_update_dir(nv2,update_dir):
        for ni,n in enumerate(nv2):
            if n >= 2*truncation - tempo_sensitivity:
                update_dir[ni] = -1

            elif n < -2*truncation + tempo_sensitivity:
                update_dir[ni] = 1
        return update_dir

    print('\nGenerating input vectors \n')
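
    # walk the latent one step per spectrogram frame: each unit moves by
    # tempo_sensitivity * (gradm[i] + specm[i]) in its current direction, scaled by the jitter mask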
    for i in tqdm(range(len(gradm))):

        #update jitter vector every 200 frames by setting ~half of noise vector units to lower sensitivity
        if i%200==0:
            jitters=new_jitters(jitter)

        #get last noise vector
        nv1=nvlast

        #set noise vector update based on direction, sensitivity, jitter, and combination of overall power and gradient of power
        update = np.full(512, tempo_sensitivity) * (gradm[i]+specm[i]) * update_dir * jitters

        #smooth the update with the previous update (to avoid overly sharp frame transitions)
        update=(update+update_last*3)/4

        #set last update
        update_last=update

        #update noise vector
        nv2=nv1.cpu()+torch.from_numpy(update).float()   # keep float32 so torch.stack below sees a single dtype

        #append to noise vectors
        noise_vectors.append(nv2)

        #set last noise vector
        nvlast=nv2

        #update the direction of noise units
        update_dir=new_update_dir(nv2,update_dir)

    noise_vectors = torch.stack([nv.to(device) for nv in noise_vectors])
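    # noise_vectors now has shape [len(gradm) + 1, G.z_dim]: one latent per spectrogram frame plus the initial vector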


    print('\n\nGenerating frames \n')
    frames = []
    for i in tqdm(range(noise_vectors.shape[0] // batch_size)):

        noise_vector=noise_vectors[i*batch_size:(i+1)*batch_size]

        c = None  # class labels (not used in this example)
        with torch.no_grad():
            img = G(noise_vector, c, truncation_psi=truncation, noise_mode='const').cpu().numpy()  # NCHW, float32, dynamic range [-1, +1]
            img = np.transpose(img, (0,2,3,1)) # NCHW -> NHWC
            img = np.clip((img * 127.5 + 128), 0, 255).astype(np.uint8)

        # add to frames
        for im in img:
            frames.append(im)


    #Save video
    aud = mpy.AudioFileClip(audio_file)

    if duration and duration < aud.duration:
        aud = aud.set_duration(duration)

    fps = target_sr / frame_length   # one video frame per spectrogram hop
    clip = mpy.ImageSequenceClip(frames, fps=fps)
    clip = clip.set_audio(aud)
    clip.write_videofile(outfile, audio_codec='aac', ffmpeg_params=[
        # "-vf", "scale=-1:2160:flags=lanczos",
        "-bf", "2",
        "-g", f"{fps/2}",
        "-crf", "18",
        "-movflags", "faststart"
    ])

    return outfile
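

# Example call (illustrative values only; 'song.mp3' and 'network.pkl' are hypothetical
# placeholders, and the repo's CLI is assumed to supply --song and the other arguments):
#
# visualize(audio_file='song.mp3',
#           network='network.pkl',
#           truncation=1.0,
#           tempo_sensitivity=0.25,
#           jitter=0.5,
#           frame_length=512,
#           duration=30)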