Jeremy Hummel committed
Commit c5b2d3d · 1 Parent(s): b2fe805

Fixes audio, adds description, examples

Files changed (2):
  1. app.py +46 -2
  2. visualize.py +26 -33
app.py CHANGED

@@ -32,12 +32,55 @@ network_choices = [
     'https://api.ngc.nvidia.com/v2/models/nvidia/research/stylegan2/versions/1/files/stylegan2-metfacesu-1024x1024.pkl'
 ]
 
+description = \
+"""
+Generate visualizations on an input audio file using [StyleGAN3](https://nvlabs.github.io/stylegan3/) (Karras, Tero, et al. "Alias-free generative adversarial networks." Advances in Neural Information Processing Systems 34 (2021): 852-863.).
+Inspired by [Deep Music Visualizer](https://github.com/msieg/deep-music-visualizer), which used BigGAN (Brock et al., 2018).
+Developed by Jeremy Hummel at [Lambda](https://lambdalabs.com/).
+"""
+
+article = \
+"""
+## How does this work?
+The audio is transformed to a spectral representation with the Short-time Fourier transform (STFT), using [librosa](https://librosa.org/).
+Starting with an initial noise vector, we perform a random walk, adjusting the length of each step with the power gradient.
+This pushes the noise vector to move around more when the sound changes.
+
+## Parameter info:
+*Network*: various pre-trained models from NVIDIA; "afhqv2" is animals, "ffhq" is faces, "metfaces" is artwork.
+
+*Truncation*: controls how far the noise vector can be from the origin. `0.7` will generate more realistic but less diverse samples,
+while `1.2` can yield more interesting but less realistic images.
+
+*Tempo Sensitivity*: controls how the size of each step scales with the audio features.
+
+*Jitter*: prevents the exact same noise vectors from cycling repetitively; if set to `0`, the images will repeat during
+repetitive parts of the audio.
+
+*Frame Length*: controls the number of audio frames per video frame in the output.
+If you want a higher frame rate for visualizing very rapid music, lower the frame length.
+If you want a lower frame rate (which will complete the job faster), raise the frame length.
+
+*Max Duration*: controls the max length of the visualization, in seconds. Use a shorter value here to get output
+more quickly, especially for testing different combinations of parameters.
+
+Media sources:
+[Maple Leaf Rag - Scott Joplin (1916, public domain)](https://commons.wikimedia.org/wiki/File:Maple_leaf_rag_-_played_by_Scott_Joplin_1916_V2.ogg)
+[Moonlight Sonata Op. 27 No. 2, movement 3 - Ludwig van Beethoven, played by Muriel Nguyen Xuan (2008, CC BY-SA 3.0)](https://commons.wikimedia.org/wiki/File:Muriel-Nguyen-Xuan-Beethovens-Moonlight-Sonata-mvt-3.oga)
+"""
+
+examples = [
+    ["examples/Maple_leaf_rag_-_played_by_Scott_Joplin_1916_V2.ogg", network_choices[0], 1.0, 0.25, 0.5, 512, 45],
+    ["examples/Muriel-Nguyen-Xuan-Beethovens-Moonlight-Sonata-mvt-3.ogx", network_choices[4], 1.2, 0.3, 0.5, 384, 22],
+]
 
 demo = gr.Interface(
     fn=visualize,
+    title="Generative Music Visualizer",
+    description=description,
+    article=article,
     inputs=[
-        gr.Audio(label="Audio File"),
-        # gr.File(),
+        gr.Audio(label="Audio File", type="filepath"),
         gr.Dropdown(choices=network_choices, value=network_choices[0], label="Network"),
         gr.Slider(minimum=0.0, value=1.0, maximum=2.0, label="Truncation"),
         gr.Slider(minimum=0.0, value=0.25, maximum=2.0, label="Tempo Sensitivity"),
@@ -45,6 +88,7 @@ demo = gr.Interface(
         gr.Slider(minimum=64, value=512, maximum=1024, step=64, label="Frame Length (samples)"),
         gr.Slider(minimum=1, value=300, maximum=600, step=1, label="Max Duration (seconds)"),
     ],
+    examples=examples,
     outputs=gr.Video()
 )
 demo.launch()
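The `article` text added above describes the core technique: an STFT-derived power envelope drives the step size of a random walk through the generator's latent space, with jitter breaking up exact repeats. A minimal Python sketch of that idea follows; the helper name `latent_walk`, the envelope normalization, and the jitter scaling are illustrative assumptions, not the repo's actual visualize.py.

import librosa
import numpy as np

def latent_walk(audio_path, z_dim=512, tempo_sensitivity=0.25,
                jitter=0.5, frame_length=512, seed=0):
    # Illustrative sketch, not the repo's implementation.
    # librosa.load returns float32 mono at 22050 Hz by default.
    audio, sr = librosa.load(audio_path)

    # STFT magnitude -> per-frame power envelope, normalized to [0, 1].
    spec = np.abs(librosa.stft(audio, hop_length=frame_length))
    power = spec.sum(axis=0)
    power /= power.max() + 1e-8

    # Gradient of the envelope: large wherever the sound is changing.
    grad = np.abs(np.diff(power, prepend=power[0]))

    rng = np.random.default_rng(seed)
    z = rng.standard_normal(z_dim)
    latents = []
    for g in grad:
        # Step length scales with the power gradient (Tempo Sensitivity);
        # the jitter term keeps repetitive audio from retracing the exact
        # same latent path (Jitter = 0 would reproduce the repeats).
        z = z + rng.standard_normal(z_dim) * tempo_sensitivity * g
        z = z + rng.standard_normal(z_dim) * 0.01 * jitter
        latents.append(z.copy())
    return np.stack(latents)  # one latent vector per video frame

Each latent would then be mapped through the selected StyleGAN3 network, with the chosen Truncation, to render one video frame; in this sketch `hop_length` plays the role of the Frame Length slider, which is why raising it lowers the output frame rate.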
visualize.py CHANGED

@@ -3,7 +3,6 @@ import numpy as np
 import moviepy.editor as mpy
 import random
 import torch
-from moviepy.audio.AudioClip import AudioArrayClip
 from tqdm import tqdm
 import dnnlib
 import legacy
@@ -18,37 +17,37 @@ def visualize(audio_file,
               frame_length,
               duration,
               ):
-    # print(audio_file, truncation, network)
-    # print(args)
-    # print(kwargs)
+    print(audio_file)
 
     if audio_file:
         print('\nReading audio \n')
-        # audio, sr = librosa.load(audio_file.name)
-        sr, audio = audio_file
+        audio, sr = librosa.load(audio_file, duration=duration)
     else:
         raise ValueError("you must enter an audio file name in the --song argument")
 
-    print(sr)
-    print(audio.dtype)
-    print(audio.shape)
-    if audio.shape[0] < duration * sr:
-        duration = None
-    else:
-        frames = duration * sr
-        audio = audio[:frames]
-
-    print(audio.dtype)
-    print(audio.shape)
-    if audio.dtype == np.int16:
-        audio = audio.astype(np.float32, order='C') / 2**15
-    elif audio.dtype == np.int32:
-        audio = audio.astype(np.float32, order='C') / 2**31
-    audio = audio.T
-    audio = librosa.to_mono(audio)
-    audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr, res_type="kaiser_best")
-    print(audio.dtype)
-    print(audio.shape)
+    # print(sr)
+    # print(audio.dtype)
+    # print(audio.shape)
+    # if audio.shape[0] < duration * sr:
+    #     duration = None
+    # else:
+    #     frames = duration * sr
+    #     audio = audio[:frames]
+    #
+    # print(audio.dtype)
+    # print(audio.shape)
+    # if audio.dtype == np.int16:
+    #     print(f'min: {np.min(audio)}, max: {np.max(audio)}')
+    #     audio = audio.astype(np.float32, order='C') / 2**15
+    # elif audio.dtype == np.int32:
+    #     print(f'min: {np.min(audio)}, max: {np.max(audio)}')
+    #     audio = audio.astype(np.float32, order='C') / 2**31
+    # audio = audio.T
+    # audio = librosa.to_mono(audio)
+    # audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr, res_type="kaiser_best")
+    # print(audio.dtype)
+    # print(audio.shape)
+
 
 
     # TODO:
@@ -185,13 +184,7 @@ def visualize(audio_file,
 
 
     #Save video
-    sr, audio = audio_file
-    if audio.dtype == np.int16:
-        audio = audio.astype(np.float32, order='C') / 2**15
-    elif audio.dtype == np.int32:
-        audio = audio.astype(np.float32, order='C') / 2**31
-    with AudioArrayClip(audio, sr) as aud: # from a numeric array
-        pass # Close is implicitly performed by context manager.
+    aud = mpy.AudioFileClip(audio_file)
 
     if duration is not None:
         aud.duration = duration
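For context on the "Fixes audio" part of this commit: with `type="filepath"`, Gradio's `Audio` component passes the uploaded file's path to `visualize()`, whereas the default numpy mode supplies the `(sample_rate, data)` tuple that the old `sr, audio = audio_file` unpacking consumed. A single `librosa.load` call then replaces the deleted int16/int32 scaling, transpose, `to_mono`, and `resample` steps, and moviepy can read the soundtrack straight from the file. A minimal sketch of the new path, where the black `ColorClip` stands in for the generated frames and the file names are placeholders:

import librosa
import moviepy.editor as mpy

audio_file = "examples/Maple_leaf_rag_-_played_by_Scott_Joplin_1916_V2.ogg"

# One call yields float32 mono at librosa's default 22050 Hz, truncated
# to `duration` seconds -- no manual dtype scaling or resampling needed.
audio, sr = librosa.load(audio_file, duration=45)

# Attach the original file as the soundtrack instead of rebuilding an
# AudioArrayClip from raw samples (presumably the source of the audio bug).
aud = mpy.AudioFileClip(audio_file)
video = mpy.ColorClip(size=(256, 256), color=(0, 0, 0),
                      duration=aud.duration).set_audio(aud)
video.write_videofile("out.mp4", fps=24)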