In [2]:
from audio_diffusion_pytorch import AudioDiffusionModel
import torch
from IPython.display import Audio

In [3]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [4]:
# Build the diffusion model for single-channel audio and move it to `device`.
# The per-level lists (factors, num_blocks, attentions) have six entries each;
# `multipliers` has seven (a leading 1, a 2, then five 4s).
# NOTE(review): the exact meaning of each argument is defined by
# audio_diffusion_pytorch -- confirm against the installed version's docs.
model = AudioDiffusionModel(
    in_channels=1,
    patch_size=1,
    multipliers=[1, 2] + [4] * 5,
    factors=[2] * 6,
    num_blocks=[2] * 6,
    attentions=[0] * 6,  # all zeros; presumably no attention at any level -- confirm
).to(device)

In [7]:
# Toy training run: each step feeds the model a batch of two pure tones,
# one at f Hz and one an octave above (2 * f Hz), for f = 300 .. 7999.
fs = 22050             # sample rate in Hz
num_samples = 2 ** 18  # clip length in samples; same length as the noise used for sampling
t = num_samples / fs   # clip duration in seconds

# Build the time axis from the integer sample count so the clip is exactly
# 2 ** 18 samples long. Going through the float product `t * fs` is not
# guaranteed to land exactly on an integer, which could add a stray sample.
# float64 is used because the phase 2*pi*f*t grows to ~1e6 rad, where float32
# resolution is only ~0.1 rad and would audibly distort the sine.
samples = torch.arange(num_samples, dtype=torch.float64) / fs

# An optimizer is required for the weights to change at all: on its own,
# loss.backward() merely accumulates gradients into .grad.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

for i in range(300, 8000):
    f = i
    # The octave-up tone exceeds Nyquist (fs / 2) for larger f and therefore
    # aliases; it is still a clean sinusoid, so this is acceptable for the demo.
    signal1 = torch.sin(2 * torch.pi * f * samples)
    signal2 = torch.sin(2 * torch.pi * (f * 2) * samples)
    # Stack to [batch=2, channels=1, num_samples] and cast back to float32,
    # the dtype of the model parameters.
    stacked_signal = torch.stack((signal1, signal2)).unsqueeze(1).float()
    stacked_signal = stacked_signal.to(device)

    optimizer.zero_grad()         # drop gradients left over from the previous step
    loss = model(stacked_signal)  # the model's forward pass returns the training loss
    loss.backward()
    optimizer.step()              # apply the update
    if i % 10 == 0:
        print("Step", i)

Step 300
Step 310
Step 320


In [8]:
# Generate two clips from Gaussian starting noise.
# The noise is drawn on the CPU and then moved to `device`;
# its shape is [2, 1, 2 ** 18].
noise = torch.randn(2, 1, 2 ** 18).to(device)

# num_steps: suggested range is 2-50.
sampled = model.sample(noise=noise, num_steps=10)  # [2, 1, 2 ** 18]

In [9]:
# Play back the second generated clip (batch index 1) at 22050 Hz.
# The Audio(...) expression must stay last so the notebook renders the player.
z = sampled[1]
Audio(data=z.cpu(), rate=22050)