import librosa
import numpy as np
import moviepy.editor as mpy
import random
import torch
from tqdm import tqdm
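
# dnnlib and legacy come from NVlabs' stylegan2-ada-pytorch repository and
# must be importable from this directory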
import dnnlib
import legacy

target_sr = 22050  # librosa.load's default sample rate; the spectrogram and video fps assume it


def visualize(audio_file,
              network,
              truncation,
              tempo_sensitivity,
              jitter,
              frame_length,
              duration,
              ):
    print(audio_file)
    if audio_file:
        print('\nReading audio \n')
        audio, sr = librosa.load(audio_file, duration=duration)
    else:
        raise ValueError("you must enter an audio file name in the --song argument")
    # (The manual int16/int32 -> float32 conversion, mono mixdown, and
    # resampling that used to live here are unnecessary: librosa.load already
    # returns float32 mono audio at its default 22050 Hz, matching target_sr.)

    batch_size = 1
    resolution = 512  # expected generator output resolution (currently unused)
    outfile = "output.mp4"

    # Scale sensitivity by the hop length so behaviour is comparable across
    # frame_length settings (512 samples is the reference hop)
    tempo_sensitivity = tempo_sensitivity * frame_length / 512
    # Set device and load the pre-trained generator
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    with dnnlib.util.open_url(network) as f:
        G = legacy.load_network_pkl(f)['G_ema'].to(device)  # type: ignore
    G.eval()

    # Warm-up / sanity-check forward pass; the output is discarded
    with torch.no_grad():
        z = torch.randn([1, G.z_dim]).to(device)  # latent code
        c = None  # class labels (not used in this example)
        G(z, c)  # NCHW, float32, dynamic range [-1, +1]

    # Create mel spectrogram; one column per hop of frame_length samples
    spec = librosa.feature.melspectrogram(y=audio, sr=target_sr, n_mels=512, fmax=8000, hop_length=frame_length)

    # Get mean power at each time point
    specm = np.mean(spec, axis=0)

    # Compute power gradient across time points
    gradm = np.gradient(specm)

    # Set max to 1
    gradm = gradm / np.max(gradm)

    # Set negative gradient time points to zero
    gradm = gradm.clip(min=0)

    # Normalize mean power between 0-1
    specm = (specm - np.min(specm)) / np.ptp(specm)
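
    # specm and gradm jointly drive the latent walk below: specm scales each
    # step by overall loudness, and gradm adds an extra push on rising onsets,
    # so the imagery moves fastest on loud, attacking passages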

    # Initialize first noise vector
    nv1 = torch.randn([G.z_dim]).to(device)

    # Initialize list of noise vectors
    noise_vectors = [nv1]

    # Track the previous frame's noise vector
    nvlast = nv1

    # Initialize the direction of noise vector unit updates
    update_dir = np.zeros(G.z_dim)
    for ni, n in enumerate(nv1):
        if n < 0:
            update_dir[ni] = 1
        else:
            update_dir[ni] = -1

    # Initialize noise unit update
    update_last = np.zeros(G.z_dim)

    # Get new jitters: keep ~half of the latent units at full sensitivity and
    # damp the rest by a factor of (1 - jitter)
    def new_jitters(jitter):
        jitters = np.zeros(G.z_dim)
        for j in range(G.z_dim):
            if random.uniform(0, 1) < 0.5:
                jitters[j] = 1
            else:
                jitters[j] = 1 - jitter
        return jitters

    # Get new update directions: reverse a unit's direction when it approaches
    # the +/- 2*truncation boundary
    def new_update_dir(nv2, update_dir):
        for ni, n in enumerate(nv2):
            if n >= 2 * truncation - tempo_sensitivity:
                update_dir[ni] = -1
            elif n < -2 * truncation + tempo_sensitivity:
                update_dir[ni] = 1
        return update_dir

    print('\nGenerating input vectors \n')
    for i in tqdm(range(len(gradm))):
        # Update jitter vector every 200 frames by setting ~half of noise vector units to lower sensitivity
        if i % 200 == 0:
            jitters = new_jitters(jitter)

        # Get last noise vector
        nv1 = nvlast

        # Set noise vector update based on direction, sensitivity, jitter, and combination of overall power and gradient of power
        update = np.full(G.z_dim, tempo_sensitivity) * (gradm[i] + specm[i]) * update_dir * jitters

        # Smooth the update with the previous update (to avoid overly sharp frame transitions)
        update = (update + update_last * 3) / 4

        # Set last update
        update_last = update

        # Update noise vector (on CPU, as float32 to match G's weights)
        nv2 = nv1.cpu() + torch.from_numpy(update).float()

        # Append to noise vectors
        noise_vectors.append(nv2)

        # Set last noise vector
        nvlast = nv2

        # Update the direction of noise units
        update_dir = new_update_dir(nv2, update_dir)

    noise_vectors = torch.stack([nv.to(device) for nv in noise_vectors])

    print('\n\nGenerating frames \n')
    frames = []
    for i in tqdm(range(noise_vectors.shape[0] // batch_size)):
        noise_vector = noise_vectors[i * batch_size:(i + 1) * batch_size]
        c = None  # class labels (not used in this example)
        with torch.no_grad():
            # NCHW, float32, dynamic range [-1, +1], truncated by truncation_psi
            img = G(noise_vector, c, truncation_psi=truncation, noise_mode='const').cpu().numpy()
        img = np.transpose(img, (0, 2, 3, 1))  # NCHW -> NHWC
        img = np.clip((img * 127.5 + 128), 0, 255).astype(np.uint8)
        # Add to frames
        for im in img:
            frames.append(im)

    # Save video
    aud = mpy.AudioFileClip(audio_file)
    if duration and duration < aud.duration:
        aud = aud.set_duration(duration)

    # One spectrogram column per frame_length-sample hop -> video frame rate
    fps = target_sr / frame_length
    clip = mpy.ImageSequenceClip(frames, fps=fps)
    clip = clip.set_audio(aud)
    clip.write_videofile(outfile, audio_codec='aac', ffmpeg_params=[
        # "-vf", "scale=-1:2160:flags=lanczos",
        "-bf", "2",
        "-g", f"{int(fps / 2)}",  # GOP size must be an integer frame count
        "-crf", "18",
        "-movflags", "faststart",
    ])
    return outfile
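

# Minimal usage sketch (not part of the original script): the file names below
# are placeholders -- point audio_file at a real track and network at a
# StyleGAN2-ADA checkpoint (.pkl) before running.
if __name__ == "__main__":
    visualize(
        audio_file="song.mp3",             # placeholder audio file
        network="network-snapshot.pkl",    # placeholder generator checkpoint
        truncation=1.0,
        tempo_sensitivity=0.25,
        jitter=0.5,
        frame_length=512,                  # hop length in samples (~43 fps at 22050 Hz)
        duration=30,                       # seconds of audio to visualize
    )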