import librosa
import numpy as np
import moviepy.editor as mpy
import random
import torch
from tqdm import tqdm
import dnnlib
import legacy

target_sr = 22050  # librosa.load resamples to 22050 Hz by default, so this matches the loaded audio

def visualize(audio_file,
              network,
              truncation,
              tempo_sensitivity,
              jitter,
              frame_length,
              duration,
              ):
    print(audio_file)

    if audio_file:
        print('\nReading audio \n')
        audio, sr = librosa.load(audio_file, duration=duration)
    else:
        raise ValueError("you must provide an audio file via the --song argument")

    # (manual dtype conversion, mono mixing, and resampling are not needed here:
    #  librosa.load already returns mono float32 audio resampled to 22050 Hz)



    # TODO:
    batch_size = 1
    resolution = 512
    outfile="output.mp4"

    # rescale sensitivity so the per-frame latent step stays comparable across hop lengths
    tempo_sensitivity = tempo_sensitivity * frame_length / 512

    #set device (the .cuda() calls below assume a CUDA-capable GPU)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load pre-trained model
    with dnnlib.util.open_url(network) as f:
        G = legacy.load_network_pkl(f)['G_ema'].to(device) # type: ignore
        G.eval()

    # warm-up pass to confirm the generator runs
    with torch.no_grad():
        z = torch.randn([1, G.z_dim], device=device)    # latent codes
        c = None                                        # class labels (not used in this example)
        img = G(z, c)                                    # NCHW, float32, dynamic range [-1, +1], no truncation

    #create spectrogram
    spec = librosa.feature.melspectrogram(y=audio, sr=target_sr, n_mels=512,fmax=8000, hop_length=frame_length)
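
    # each spectrogram column corresponds to one video frame (hop_length=frame_length),
    # so the number of mel frames sets the number of generated frames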

    #get mean power at each time point
    specm=np.mean(spec,axis=0)

    #compute power gradient across time points
    gradm=np.gradient(specm)

    #set max to 1
    gradm=gradm/np.max(gradm)

    #set negative gradient time points to zero
    gradm = gradm.clip(min=0)

    #normalize mean power between 0-1
    specm=(specm-np.min(specm))/np.ptp(specm)
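
    # specm (overall loudness) and gradm (rising loudness / onsets) together determine
    # how far the latent vector moves on each frame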

    #initialize first noise vector
    nv1 = torch.randn([G.z_dim]).cuda()

    #initialize list of class and noise vectors
    noise_vectors=[nv1]

    #initialize previous vectors (will be used to track the previous frame)
    nvlast=nv1

    #initialize the direction of noise vector unit updates (each unit starts by moving toward zero)
    update_dir=np.zeros(512)
    for ni,n in enumerate(nv1):
        if n<0:
            update_dir[ni] = 1
        else:
            update_dir[ni] = -1

    #initialize noise unit update
    update_last=np.zeros(512)

    #get new jitters: a random ~half of latent units keep full sensitivity, the rest are scaled down to (1-jitter)
    def new_jitters(jitter):
        jitters=np.zeros(512)
        for j in range(512):
            if random.uniform(0,1)<0.5:
                jitters[j]=1
            else:
                jitters[j]=1-jitter
        return jitters


    #get new update directions: reverse a unit's direction once it drifts past +/-(2*truncation - tempo_sensitivity)
    def new_update_dir(nv2,update_dir):
        for ni,n in enumerate(nv2):
            if n >= 2*truncation - tempo_sensitivity:
                update_dir[ni] = -1

            elif n < -2*truncation + tempo_sensitivity:
                update_dir[ni] = 1
        return update_dir

    print('\nGenerating input vectors \n')
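
    # walk the latent one step per spectrogram frame: each unit moves by
    # tempo_sensitivity * (gradm[i] + specm[i]) in its current direction, scaled by the jitter mask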
    for i in tqdm(range(len(gradm))):

        #update jitter vector every 200 frames by setting ~half of noise vector units to lower sensitivity
        if i%200==0:
            jitters=new_jitters(jitter)

        #get last noise vector
        nv1=nvlast

        #set noise vector update based on direction, sensitivity, jitter, and combination of overall power and gradient of power
        update = np.full(512, tempo_sensitivity) * (gradm[i]+specm[i]) * update_dir * jitters

        #smooth the update with the previous update (to avoid overly sharp frame transitions)
        update=(update+update_last*3)/4

        #set last update
        update_last=update

        #update noise vector
        nv2=nv1.cpu()+torch.from_numpy(update).float()   # keep float32 so torch.stack below sees a single dtype

        #append to noise vectors
        noise_vectors.append(nv2)

        #set last noise vector
        nvlast=nv2

        #update the direction of noise units
        update_dir=new_update_dir(nv2,update_dir)

    noise_vectors = torch.stack([nv.to(device) for nv in noise_vectors])
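    # noise_vectors now has shape [len(gradm) + 1, G.z_dim]: one latent per spectrogram frame plus the initial vector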


    print('\n\nGenerating frames \n')
    frames = []
    for i in tqdm(range(noise_vectors.shape[0] // batch_size)):

        noise_vector=noise_vectors[i*batch_size:(i+1)*batch_size]

        c = None  # class labels (not used in this example)
        with torch.no_grad():
            img = G(noise_vector, c, truncation_psi=truncation, noise_mode='const').cpu().numpy()  # NCHW, float32, dynamic range [-1, +1]
            img = np.transpose(img, (0,2,3,1)) # NCHW -> NHWC
            img = np.clip((img * 127.5 + 128), 0, 255).astype(np.uint8)

        # add to frames
        for im in img:
            frames.append(im)


    #Save video
    aud = mpy.AudioFileClip(audio_file)

    if duration and duration < aud.duration:
        aud = aud.set_duration(duration)

    fps = target_sr / frame_length   # one video frame per spectrogram hop
    clip = mpy.ImageSequenceClip(frames, fps=fps)
    clip = clip.set_audio(aud)
    clip.write_videofile(outfile, audio_codec='aac', ffmpeg_params=[
        # "-vf", "scale=-1:2160:flags=lanczos",
        "-bf", "2",
        "-g", f"{fps/2}",
        "-crf", "18",
        "-movflags", "faststart"
    ])

    return outfile
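

# Example call (illustrative values only; 'song.mp3' and 'network.pkl' are hypothetical
# placeholders, and the repo's CLI is assumed to supply --song and the other arguments):
#
# visualize(audio_file='song.mp3',
#           network='network.pkl',
#           truncation=1.0,
#           tempo_sensitivity=0.25,
#           jitter=0.5,
#           frame_length=512,
#           duration=30)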