Jeremy Hummel committed on
Commit 0a72300 · 1 Parent(s): cda6f07

Adds options for generation

Files changed (2)
  1. app.py +10 -3
  2. visualize.py +54 -17
app.py CHANGED
@@ -32,12 +32,19 @@ network_choices = [
     'https://api.ngc.nvidia.com/v2/models/nvidia/research/stylegan2/versions/1/files/stylegan2-metfacesu-1024x1024.pkl'
 ]
 
+
 demo = gr.Interface(
     fn=visualize,
-    inputs=[gr.File(label="Audio file"),
+    inputs=[
+        gr.Audio(label="Audio File"),
+        # gr.File(),
         gr.Dropdown(choices=network_choices, value=network_choices[0], label="Network"),
-        gr.Slider(minimum=0.0, value=1.0, maximum=2.0, step=0.1, label="Truncation"),
-        gr.Slider(minimum=1, value=16, maximum=64, step=48, label="Batch Size")],
+        gr.Slider(minimum=0.0, value=1.0, maximum=2.0, label="Truncation"),
+        gr.Slider(minimum=0.0, value=0.25, maximum=2.0, label="Tempo Sensitivity"),
+        gr.Slider(minimum=0.0, value=0.5, maximum=2.0, label="Jitter"),
+        gr.Slider(minimum=64, value=512, maximum=1024, step=64, label="Frame Length (samples)"),
+        gr.Slider(minimum=1, value=300, maximum=600, step=1, label="Max Duration (seconds)"),
+    ],
     outputs=gr.Video()
 )
 demo.launch()
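
Note: swapping gr.File for gr.Audio changes what the callback receives. With gr.Audio's default type="numpy", Gradio calls the function with a (sample_rate, samples) tuple instead of a file wrapper, which is why visualize.py below unpacks sr, audio = audio_file. A minimal sketch of that contract (visualize_stub is a placeholder name, not part of the commit):

import gradio as gr

def visualize_stub(audio_file, network, truncation, tempo_sensitivity, jitter, frame_length, duration):
    # gr.Audio (type="numpy") delivers a (sample_rate, samples) tuple;
    # samples is typically int16, shaped (n,) for mono or (n, 2) for stereo.
    sr, audio = audio_file
    print(sr, audio.dtype, audio.shape)
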
visualize.py CHANGED
@@ -3,32 +3,63 @@ import numpy as np
 import moviepy.editor as mpy
 import random
 import torch
+from moviepy.audio.AudioClip import AudioArrayClip
 from tqdm import tqdm
 import stylegan3
 
+target_sr = 22050
 
-def visualize(audio_file, network, truncation, batch_size, *args, **kwargs):
+def visualize(audio_file,
+              network,
+              truncation,
+              tempo_sensitivity,
+              jitter,
+              frame_length,
+              duration,
+              ):
     # print(audio_file, truncation, network)
     # print(args)
     # print(kwargs)
 
     if audio_file:
         print('\nReading audio \n')
-        y, sr = librosa.load(audio_file.name)
+        # audio, sr = librosa.load(audio_file.name)
+        sr, audio = audio_file
     else:
         raise ValueError("you must enter an audio file name in the --song argument")
 
-    resolution = 512
+    print(sr)
+    print(audio.dtype)
+    print(audio.shape)
+    if audio.shape[0] < duration * sr:
+        duration = None
+    else:
+        frames = duration * sr
+        audio = audio[:frames]
+
+    print(audio.dtype)
+    print(audio.shape)
+    if audio.dtype == np.int16:
+        audio = audio.astype(np.float32, order='C') / 32768.0
+    audio = audio.T
+    audio = librosa.to_mono(audio)
+    audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr, res_type="kaiser_best")
+    print(audio.dtype)
+    print(audio.shape)
+
+    if audio.shape[0] / target_sr < duration:
+        duration = None
+    else:
+        frames = duration * sr
+        audio = audio[:frames]
 
-    duration = None
 
-    frame_length = 512
+    # TODO:
+    batch_size = 1
+    resolution = 512
 
-    tempo_sensitivity = 0.25
     tempo_sensitivity = tempo_sensitivity * frame_length / 512
 
-    jitter = 0.5
-
     outfile = "output.mp4"
 
     # Load pre-trained model
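
Note: Gradio hands over raw PCM (usually int16) while librosa expects float32 in [-1, 1], so the hunk above rescales by 1/32768, transposes to channels-first for librosa.to_mono, and resamples to target_sr; one wrinkle is that the second trim still computes frames = duration * sr with the pre-resample rate. A standalone sketch of the same preprocessing under those assumptions (prepare_audio is a hypothetical helper; the sketch trims at the post-resample rate):

import numpy as np
import librosa

def prepare_audio(sr, audio, target_sr=22050, max_seconds=300):
    # Rescale 16-bit PCM to float32 in [-1.0, 1.0), as librosa expects.
    if audio.dtype == np.int16:
        audio = audio.astype(np.float32) / 32768.0
    # librosa.to_mono averages the leading (channel) axis, so transpose (n, 2) -> (2, n).
    if audio.ndim == 2:
        audio = librosa.to_mono(audio.T)
    # Bring everything to the rate the rest of the pipeline assumes.
    audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr, res_type="kaiser_best")
    # Trim using the post-resample rate.
    return audio[: int(max_seconds * target_sr)]
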
 
@@ -46,7 +77,7 @@ def visualize(audio_file, network, truncation, batch_size, *args, **kwargs):
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
     #create spectrogram
-    spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=512,fmax=8000, hop_length=frame_length)
+    spec = librosa.feature.melspectrogram(y=audio, sr=target_sr, n_mels=512,fmax=8000, hop_length=frame_length)
 
     #get mean power at each time point
     specm=np.mean(spec,axis=0)
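
Note: hop_length=frame_length is what ties the audio analysis to the video timeline: each spectrogram column covers frame_length samples and later becomes one noise vector, hence one video frame, so the frame rate computed near the end of this file is target_sr / frame_length. A quick check of the arithmetic at the defaults:

target_sr = 22050
frame_length = 512
fps = target_sr / frame_length  # ~43.07 video frames per second
# With librosa's default center=True, an n-sample clip yields
# 1 + n // frame_length spectrogram columns (one per video frame).
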
 
@@ -143,9 +174,6 @@ def visualize(audio_file, network, truncation, batch_size, *args, **kwargs):
     frames = []
     for i in tqdm(range(noise_vectors.shape[0] // batch_size)):
 
-        #print progress
-        pass
-
         noise_vector=noise_vectors[i*batch_size:(i+1)*batch_size]
 
         c = None # class labels (not used in this example)
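
Note: this hunk only removes dead code; the floor division in the loop header still skips any trailing vectors past a multiple of batch_size. With batch_size hard-coded to 1 in this commit nothing is dropped, but a remainder-safe variant would look like the fragment below (reusing names from the diff, not part of the commit):

for start in range(0, noise_vectors.shape[0], batch_size):
    noise_vector = noise_vectors[start:start + batch_size]  # final batch may be short
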
 
@@ -160,15 +188,24 @@ def visualize(audio_file, network, truncation, batch_size, *args, **kwargs):
 
 
     #Save video
-    aud = mpy.AudioFileClip(audio_file.name, fps = 44100)
+    sr, audio = audio_file
+    if audio.dtype == np.int16:
+        audio = audio.astype(np.float32, order='C') / 32768.0
+    with AudioArrayClip(audio, sr) as aud:  # from a numeric array
+        pass  # Close is implicitly performed by context manager.
 
-    if duration:
+    if duration is not None:
         aud.duration = duration
 
-    fps = 22050/frame_length
+    fps = target_sr / frame_length
     clip = mpy.ImageSequenceClip(frames, fps=fps)
     clip = clip.set_audio(aud)
-    clip.write_videofile(outfile, audio_codec='aac', ffmpeg_params=["-vf", "scale=-1:2160:flags=lanczos", "-bf", "2", "-g", f"{fps/2}", "-crf", "18", "-movflags", "faststart"])
-
+    clip.write_videofile(outfile, audio_codec='aac', ffmpeg_params=[
+        # "-vf", "scale=-1:2160:flags=lanczos",
+        "-bf", "2",
+        "-g", f"{fps/2}",
+        "-crf", "18",
+        "-movflags", "faststart"
+    ])
 
     return outfile
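
Note: AudioArrayClip expects float samples shaped (n_samples, n_channels), so a mono Gradio recording shaped (n,) needs a channel axis before wrapping, along with the same 1/32768 rescale used earlier. A sketch of that conversion (audio_clip_from_gradio is a hypothetical helper):

import numpy as np
from moviepy.audio.AudioClip import AudioArrayClip

def audio_clip_from_gradio(audio_file):
    sr, audio = audio_file
    if audio.dtype == np.int16:
        audio = audio.astype(np.float32) / 32768.0  # rescale 16-bit PCM
    if audio.ndim == 1:
        audio = audio[:, np.newaxis]  # (n,) -> (n, 1) for AudioArrayClip
    return AudioArrayClip(audio, fps=sr)
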