Spaces:

descript
/

vampnet

Runtime error

App Files Files Community

Hugo Flores Garcia committed on Jun 16, 2023

Commit

09b9691

1 Parent(s): 3f6f517

settling down on the new sampling routine

Browse files

Files changed (8) hide show

conf/generated/breaks-steps/c2f.yml +15 -0
conf/generated/breaks-steps/coarse.yml +8 -0
conf/generated/breaks-steps/interface.yml +7 -0
demo-new.py +0 -518
demo.py +28 -70
scripts/exp/train.py +2 -25
vampnet/interface.py +7 -6
vampnet/modules/transformer.py +24 -24

conf/generated/breaks-steps/c2f.yml ADDED Viewed

	@@ -0,0 +1,15 @@

+$include:
+- conf/lora/lora.yml
+AudioDataset.duration: 3.0
+AudioDataset.loudness_cutoff: -40.0
+VampNet.embedding_dim: 1280
+VampNet.n_codebooks: 14
+VampNet.n_conditioning_codebooks: 4
+VampNet.n_heads: 20
+VampNet.n_layers: 16
+fine_tune: true
+fine_tune_checkpoint: ./models/spotdl/c2f.pth
+save_path: ./runs/breaks-steps/c2f
+train/AudioLoader.sources: &id001
+- /media/CHONK/hugo/breaks-steps
+val/AudioLoader.sources: *id001

conf/generated/breaks-steps/coarse.yml ADDED Viewed

	@@ -0,0 +1,8 @@

+$include:
+- conf/lora/lora.yml
+fine_tune: true
+fine_tune_checkpoint: ./models/spotdl/coarse.pth
+save_path: ./runs/breaks-steps/coarse
+train/AudioLoader.sources: &id001
+- /media/CHONK/hugo/breaks-steps
+val/AudioLoader.sources: *id001

conf/generated/breaks-steps/interface.yml ADDED Viewed

	@@ -0,0 +1,7 @@

+AudioLoader.sources:
+- - /media/CHONK/hugo/breaks-steps
+Interface.coarse2fine_ckpt: ./models/spotdl/c2f.pth
+Interface.coarse2fine_lora_ckpt: ./runs/breaks-steps/c2f/latest/lora.pth
+Interface.coarse_ckpt: ./models/spotdl/coarse.pth
+Interface.coarse_lora_ckpt: ./runs/breaks-steps/coarse/latest/lora.pth
+Interface.codec_ckpt: ./models/spotdl/codec.pth

demo-new.py DELETED Viewed

@@ -1,518 +0,0 @@
-from pathlib import Path
-from typing import Tuple
-import yaml
-import tempfile
-import uuid
-from dataclasses import dataclass, asdict
-import numpy as np
-import audiotools as at
-import argbind
-import gradio as gr
-from vampnet.interface import Interface
-from vampnet import mask as pmask
-import logging
-logger = logging.getLogger()
-logger.setLevel(logging.CRITICAL)
-Interface = argbind.bind(Interface)
-AudioLoader = argbind.bind(at.data.datasets.AudioLoader)
-conf = argbind.parse_args()
-with argbind.scope(conf):
-    interface = Interface()
-    loader = AudioLoader()
-    print(f"interface device is {interface.device}")
-dataset = at.data.datasets.AudioDataset(
-    loader,
-    sample_rate=interface.codec.sample_rate,
-    duration=interface.coarse.chunk_size_s,
-    n_examples=5000,
-    without_replacement=True,
-)
-checkpoints = {
-    "spotdl": {
-        "coarse": "./models/spotdl/coarse.pth",
-        "c2f": "./models/spotdl/c2f.pth",
-        "codec": "./models/spotdl/codec.pth",
-        "full_ckpt": True
-    },
-    "berta": {
-        "coarse": "./models/finetuned/berta-goldman-speech/coarse.pth",
-        "c2f": "./models/finetuned/berta-goldman-speech/c2f.pth",
-        "codec": "./model/spotdl/codec.pth",
-        "full_ckpt": True
-    },
-    "xeno-canto-2": {
-        "coarse": "./models/finetuned/xeno-canto-2/coarse.pth",
-        "c2f": "./models/finetuned/xeno-canto-2/c2f.pth",
-        "codec": "./models/spotdl/codec.pth",
-        "full_ckpt": True
-    },
-    "panchos": {
-        "coarse": "./models/finetuned/panchos/coarse.pth",
-        "c2f": "./models/finetuned/panchos/c2f.pth",
-        "codec": "./models/spotdl/codec.pth",
-        "full_ckpt": False
-    },
-    "tv-choir": {
-        "coarse": "./models/finetuned/tv-choir/coarse.pth",
-        "c2f": "./models/finetuned/tv-choir/c2f.pth",
-        "codec": "./models/spotdl/codec.pth",
-        "full_ckpt": False
-    },
-    "titi": {
-        "coarse": "./models/finetuned/titi/coarse.pth",
-        "c2f": "./models/finetuned/titi/c2f.pth",
-        "codec": "./models/spotdl/codec.pth",
-        "full_ckpt": False
-    },
-    "titi-clean": {
-        "coarse": "./models/finetuned/titi-clean/coarse.pth",
-        "c2f": "./models/finetuned/titi-clean/c2f.pth",
-        "codec": "./models/spotdl/codec.pth",
-        "full_ckpt": False
-    }
-}
-interface.checkpoint_key = "spotdl"
-OUT_DIR = Path("gradio-outputs")
-OUT_DIR.mkdir(exist_ok=True, parents=True)
-def load_audio(file):
-    print(file)
-    filepath = file.name
-    sig = at.AudioSignal.salient_excerpt(
-        filepath,
-        duration=interface.coarse.chunk_size_s
-    )
-    sig = interface.preprocess(sig)
-    out_dir = OUT_DIR / "tmp" / str(uuid.uuid4())
-    out_dir.mkdir(parents=True, exist_ok=True)
-    sig.write(out_dir / "input.wav")
-    return sig.path_to_file
-def load_random_audio():
-    index = np.random.randint(0, len(dataset))
-    sig = dataset[index]["signal"]
-    sig = interface.preprocess(sig)
-    out_dir = OUT_DIR / "tmp" / str(uuid.uuid4())
-    out_dir.mkdir(parents=True, exist_ok=True)
-    sig.write(out_dir / "input.wav")
-    return sig.path_to_file
-def _vamp(data, return_mask=False):
-    # if our checkpoint key is different, we need to load a new checkpoint
-    if data[checkpoint_key] != interface.checkpoint_key:
-        print(f"loading checkpoint {data[checkpoint_key]}")
-        interface.lora_load(
-            checkpoints[data[checkpoint_key]]["coarse"],
-            checkpoints[data[checkpoint_key]]["c2f"],
-            checkpoints[data[checkpoint_key]]["full_ckpt"],
-        )
-        interface.checkpoint_key = data[checkpoint_key]
-    out_dir = OUT_DIR / str(uuid.uuid4())
-    out_dir.mkdir()
-    sig = at.AudioSignal(data[input_audio])
-    #pitch shift input
-    sig = sig.shift_pitch(data[input_pitch_shift])
-    # TODO: random pitch shift of segments in the signal to prompt! window size should be a parameter, pitch shift width should be a parameter
-    z = interface.encode(sig)
-    ncc = data[n_conditioning_codebooks]
-    # build the mask
-    mask = pmask.linear_random(z, data[rand_mask_intensity])
-    mask = pmask.mask_and(
-        mask, pmask.inpaint(
-            z,
-            interface.s2t(data[prefix_s]),
-            interface.s2t(data[suffix_s])
-        )
-    )
-    mask = pmask.mask_and(
-        mask, pmask.periodic_mask(
-            z,
-            data[periodic_p],
-            data[periodic_w],
-            random_roll=True
-        )
-    )
-    if data[onset_mask_width] > 0:
-        mask = pmask.mask_or(
-            mask, pmask.onset_mask(sig, z, interface, width=data[onset_mask_width])
-        )
-    # these should be the last two mask ops
-    mask = pmask.dropout(mask, data[dropout])
-    mask = pmask.codebook_unmask(mask, ncc)
-    print(f"created mask with: linear random {data[rand_mask_intensity]}, inpaint {data[prefix_s]}:{data[suffix_s]}, periodic {data[periodic_p]}:{data[periodic_w]}, dropout {data[dropout]}, codebook unmask {ncc}, onset mask {data[onset_mask_width]}, num steps {data[num_steps]}, init temp {data[init_temp]}, final temp {data[final_temp]}, use coarse2fine {data[use_coarse2fine]}")
-    # save the mask as a txt file
-    np.savetxt(out_dir / "mask.txt", mask[:,0,:].long().cpu().numpy())
-    # if data[topk] is not None:
-    #     top_k = data[topk] if data[topk] > 0 else None
-    # else:
-    #     top_k = None
-    zv, mask_z = interface.coarse_vamp(
-        z,
-        mask=mask,
-        sampling_steps=data[num_steps],
-        temperature=(data[init_temp]*10, data[final_temp]*10),
-        return_mask=True,
-        # sample=data[sampling_strategy],
-        typical_filtering=data[typical_filtering],
-        typical_mass=data[typical_mass],
-        typical_min_tokens=data[typical_min_tokens],
-        # top_k=top_k,
-        gen_fn=interface.coarse.generate,
-    )
-    if use_coarse2fine:
-        zv = interface.coarse_to_fine(zv)
-    sig = interface.to_signal(zv).cpu()
-    print("done")
-    sig.write(out_dir / "output.wav")
-    if return_mask:
-        mask = interface.to_signal(mask_z).cpu()
-        mask.write(out_dir / "mask.wav")
-        return sig.path_to_file, mask.path_to_file
-    else:
-        return sig.path_to_file
-def vamp(data):
-    return _vamp(data, return_mask=True)
-def api_vamp(data):
-    return _vamp(data, return_mask=False)
-def save_vamp(data):
-    out_dir = OUT_DIR / "saved" / str(uuid.uuid4())
-    out_dir.mkdir(parents=True, exist_ok=True)
-    sig_in = at.AudioSignal(data[input_audio])
-    sig_out = at.AudioSignal(data[output_audio])
-    sig_in.write(out_dir / "input.wav")
-    sig_out.write(out_dir / "output.wav")
-    _data = {
-        "init_temp": data[init_temp],
-        "final_temp": data[final_temp],
-        "prefix_s": data[prefix_s],
-        "suffix_s": data[suffix_s],
-        "rand_mask_intensity": data[rand_mask_intensity],
-        "num_steps": data[num_steps],
-        "notes": data[notes_text],
-        "periodic_period": data[periodic_p],
-        "periodic_width": data[periodic_w],
-        "n_conditioning_codebooks": data[n_conditioning_codebooks],
-        "use_coarse2fine": data[use_coarse2fine],
-        "stretch_factor": data[stretch_factor],
-    }
-    # save with yaml
-    with open(out_dir / "data.yaml", "w") as f:
-        yaml.dump(_data, f)
-    import zipfile
-    zip_path = out_dir.with_suffix(".zip")
-    with zipfile.ZipFile(zip_path, "w") as zf:
-        for file in out_dir.iterdir():
-            zf.write(file, file.name)
-    return f"saved! your save code is {out_dir.stem}", zip_path
-with gr.Blocks() as demo:
-    with gr.Row():
-        with gr.Column():
-            use_coarse2fine = gr.Checkbox(
-                label="use coarse2fine",
-                value=True
-            )
-            manual_audio_upload = gr.File(
-                label=f"upload some audio (will be randomly trimmed to max of {interface.coarse.chunk_size_s:.2f}s)",
-                file_types=["audio"]
-            )
-            load_random_audio_button = gr.Button("or load random audio")
-            input_audio = gr.Audio(
-                label="input audio",
-                interactive=False,
-                type="filepath",
-            )
-            audio_mask = gr.Audio(
-                label="audio mask (listen to this to hear the mask hints)",
-                interactive=False,
-                type="filepath",
-            )
-            # connect widgets
-            load_random_audio_button.click(
-                fn=load_random_audio,
-                inputs=[],
-                outputs=[ input_audio]
-            )
-            manual_audio_upload.change(
-                fn=load_audio,
-                inputs=[manual_audio_upload],
-                outputs=[ input_audio]
-            )
-        # mask settings
-        with gr.Column():
-            input_pitch_shift = gr.Slider(
-                label="input pitch shift (semitones)",
-                minimum=-36,
-                maximum=36,
-                step=1,
-                value=0,
-            )
-            rand_mask_intensity = gr.Slider(
-                label="random mask intensity. (If this is less than 1, scatters prompts throughout the audio, should be between 0.9 and 1.0)",
-                minimum=0.0,
-                maximum=1.0,
-                value=1.0
-            )
-            periodic_p = gr.Slider(
-                label="periodic prompt  (0.0 means no hint, 2 - lots of hints, 8 - a couple of hints, 16 - occasional hint, 32 - very occasional hint, etc)",
-                minimum=0,
-                maximum=128,
-                step=1,
-                value=3,
-            )
-            periodic_w = gr.Slider(
-                label="periodic prompt width (steps, 1 step ~= 10milliseconds)",
-                minimum=1,
-                maximum=20,
-                step=1,
-                value=1,
-            )
-            onset_mask_width = gr.Slider(
-                label="onset mask width (steps, 1 step ~= 10milliseconds)",
-                minimum=0,
-                maximum=20,
-                step=1,
-                value=5,
-            )
-            with gr.Accordion("extras ", open=False):
-                n_conditioning_codebooks = gr.Number(
-                    label="number of conditioning codebooks. probably 0",
-                    value=0,
-                    precision=0,
-                )
-                stretch_factor = gr.Slider(
-                    label="time stretch factor",
-                    minimum=0,
-                    maximum=64,
-                    step=1,
-                    value=1,
-                )
-            with gr.Accordion("prefix/suffix hints", open=False):
-                prefix_s = gr.Slider(
-                    label="prefix hint length (seconds)",
-                    minimum=0.0,
-                    maximum=10.0,
-                    value=0.0
-                )
-                suffix_s = gr.Slider(
-                    label="suffix hint length (seconds)",
-                    minimum=0.0,
-                    maximum=10.0,
-                    value=0.0
-                )
-            with gr.Accordion("temperature settings", open=False):
-                init_temp = gr.Slider(
-                    label="initial temperature (should probably stay between 0.6 and 1)",
-                    minimum=0.0,
-                    maximum=1.5,
-                    value=0.8
-                )
-                final_temp = gr.Slider(
-                    label="final temperature (should probably stay between 0.7 and 2)",
-                    minimum=0.0,
-                    maximum=2.0,
-                    value=0.8
-                )
-            with gr.Accordion("sampling settings", open=False):
-                sampling_strategy = gr.Radio(
-                    label="sampling strategy",
-                    choices=["gumbel", "multinomial"],
-                    value="gumbel"
-                )
-                typical_filtering = gr.Checkbox(
-                    label="typical filtering (cannot be used with topk)",
-                    value=False
-                )
-                typical_mass = gr.Slider(
-                    label="typical mass (should probably stay between 0.1 and 0.5)",
-                    minimum=0.01,
-                    maximum=0.99,
-                    value=0.2
-                )
-                typical_min_tokens = gr.Slider(
-                    label="typical min tokens (should probably stay between 1 and 256)",
-                    minimum=1,
-                    maximum=256,
-                    step=1,
-                    value=1
-                )
-            num_steps = gr.Slider(
-                label="number of steps (should normally be between 12 and 36)",
-                minimum=1,
-                maximum=128,
-                step=1,
-                value=36
-            )
-            dropout = gr.Slider(
-                label="mask dropout",
-                minimum=0.0,
-                maximum=1.0,
-                step=0.01,
-                value=0.0
-            )
-        # mask settings
-        with gr.Column():
-            checkpoint_key = gr.Radio(
-                label="checkpoint",
-                choices=list(checkpoints.keys()),
-                value="spotdl"
-            )
-            vamp_button = gr.Button("vamp!!!")
-            output_audio = gr.Audio(
-                label="output audio",
-                interactive=False,
-                type="filepath"
-            )
-        # with gr.Column():
-        #     with gr.Accordion(label="beat unmask (how much time around the beat should be hinted?)"):
-        #         use_beats = gr.Checkbox(
-        #             label="use beat hints (helps the output stick to the beat structure of the input)",
-        #             value=False
-        #         )
-        #         snap_to_beats = gr.Checkbox(
-        #             label="trim to beat markers (uncheck if the output audio is too short.)",
-        #             value=True
-        #         )
-        #         beat_unmask_dur = gr.Slider(
-        #             label="duration",
-        #             minimum=0.0,
-        #             maximum=3.0,
-        #             value=0.07
-        #         )
-            notes_text = gr.Textbox(
-                label="type any notes about the generated audio here",
-                value="",
-                interactive=True
-            )
-            save_button = gr.Button("save vamp")
-            download_file = gr.File(
-                label="vamp to download will appear here",
-                interactive=False
-            )
-            use_as_input_button = gr.Button("use output as input")
-            thank_you = gr.Markdown("")
-    _inputs = {
-            input_audio,
-            num_steps,
-            init_temp, final_temp,
-            prefix_s, suffix_s,
-            rand_mask_intensity,
-            periodic_p, periodic_w,
-            n_conditioning_codebooks,
-            dropout,
-            use_coarse2fine,
-            stretch_factor,
-            onset_mask_width,
-            input_pitch_shift,
-            sampling_strategy,
-            typical_filtering,
-            typical_mass,
-            typical_min_tokens,
-            # topk,
-            checkpoint_key
-        }
-    # connect widgets
-    vamp_button.click(
-        fn=vamp,
-        inputs=_inputs,
-        outputs=[output_audio, audio_mask],
-    )
-    api_vamp_button = gr.Button("api vamp")
-    api_vamp_button.click(
-        fn=api_vamp,
-        inputs=_inputs,
-        outputs=[output_audio],
-        api_name="vamp"
-    )
-    use_as_input_button.click(
-        fn=lambda x: x,
-        inputs=[output_audio],
-        outputs=[input_audio]
-    )
-    save_button.click(
-        fn=save_vamp,
-        inputs=_inputs | {notes_text, output_audio},
-        outputs=[thank_you, download_file]
-    )
-demo.launch(share=True, enable_queue=False, debug=True, server_name="0.0.0.0")

demo.py CHANGED Viewed

@@ -68,7 +68,19 @@ checkpoints = {
         "c2f": "./models/finetuned/titi/c2f.pth",
         "codec": "./models/spotdl/codec.pth",
         "full_ckpt": False
-    }
 }
 interface.checkpoint_key = "spotdl"
@@ -112,10 +124,8 @@ def _vamp(data, return_mask=False):
             checkpoints[data[checkpoint_key]]["coarse"],
             checkpoints[data[checkpoint_key]]["c2f"],
             checkpoints[data[checkpoint_key]]["full_ckpt"],
-            reset=(data[checkpoint_key] == "spotdl")
         )
         interface.checkpoint_key = data[checkpoint_key]
     out_dir = OUT_DIR / str(uuid.uuid4())
     out_dir.mkdir()
@@ -154,30 +164,24 @@ def _vamp(data, return_mask=False):
     mask = pmask.dropout(mask, data[dropout])
     mask = pmask.codebook_unmask(mask, ncc)
-    print(f"created mask with: linear random {data[rand_mask_intensity]}, inpaint {data[prefix_s]}:{data[suffix_s]}, periodic {data[periodic_p]}:{data[periodic_w]}, dropout {data[dropout]}, codebook unmask {ncc}, onset mask {data[onset_mask_width]}, num steps {data[num_steps]}, init temp {data[init_temp]}, final temp {data[final_temp]}, use coarse2fine {data[use_coarse2fine]}")
     # save the mask as a txt file
     np.savetxt(out_dir / "mask.txt", mask[:,0,:].long().cpu().numpy())
-    if data[topk] is not None:
-        top_k = data[topk] if data[topk] > 0 else None
-    else:
-        top_k = None
     zv, mask_z = interface.coarse_vamp(
         z,
         mask=mask,
         sampling_steps=data[num_steps],
-        temperature=(data[init_temp], data[final_temp]),
         return_mask=True,
-        sample=data[sampling_strategy],
         typical_filtering=data[typical_filtering],
         typical_mass=data[typical_mass],
         typical_min_tokens=data[typical_min_tokens],
-        top_k=top_k,
     )
     if use_coarse2fine:
-        zv = interface.coarse_to_fine(zv)
     sig = interface.to_signal(zv).cpu()
     print("done")
@@ -210,8 +214,7 @@ def save_vamp(data):
     sig_out.write(out_dir / "output.wav")
     _data = {
-        "init_temp": data[init_temp],
-        "final_temp": data[final_temp],
         "prefix_s": data[prefix_s],
         "suffix_s": data[suffix_s],
         "rand_mask_intensity": data[rand_mask_intensity],
@@ -349,53 +352,32 @@ with gr.Blocks() as demo:
                     value=0.0
                 )
-            with gr.Accordion("temperature settings", open=False):
-                init_temp = gr.Slider(
-                    label="initial temperature (should probably stay between 0.6 and 1)",
-                    minimum=0.0,
-                    maximum=1.5,
-                    value=0.8
-                )
-                final_temp = gr.Slider(
-                    label="final temperature (should probably stay between 0.7 and 2)",
-                    minimum=0.0,
-                    maximum=2.0,
-                    value=1.0
-                )
             with gr.Accordion("sampling settings", open=False):
-                sampling_strategy = gr.Radio(
-                    label="sampling strategy",
-                    choices=["gumbel", "multinomial"],
-                    value="gumbel"
-                )
                 typical_filtering = gr.Checkbox(
-                    label="typical filtering (cannot be used with topk)",
                     value=True
                 )
                 typical_mass = gr.Slider(
                     label="typical mass (should probably stay between 0.1 and 0.5)",
                     minimum=0.01,
                     maximum=0.99,
-                    value=0.2
                 )
                 typical_min_tokens = gr.Slider(
                     label="typical min tokens (should probably stay between 1 and 256)",
                     minimum=1,
                     maximum=256,
                     step=1,
-                    value=1
-                )
-                topk = gr.Slider(
-                    label="topk (cannot be used with typical filtering). 0 = None",
-                    minimum=0,
-                    maximum=256,
-                    step=1,
-                    value=0
                 )
             num_steps = gr.Slider(
                 label="number of steps (should normally be between 12 and 36)",
                 minimum=1,
@@ -427,28 +409,6 @@ with gr.Blocks() as demo:
                 type="filepath"
             )
-        # with gr.Column():
-        #     with gr.Accordion(label="beat unmask (how much time around the beat should be hinted?)"):
-        #         use_beats = gr.Checkbox(
-        #             label="use beat hints (helps the output stick to the beat structure of the input)",
-        #             value=False
-        #         )
-        #         snap_to_beats = gr.Checkbox(
-        #             label="trim to beat markers (uncheck if the output audio is too short.)",
-        #             value=True
-        #         )
-        #         beat_unmask_dur = gr.Slider(
-        #             label="duration",
-        #             minimum=0.0,
-        #             maximum=3.0,
-        #             value=0.07
-        #         )
             notes_text = gr.Textbox(
                 label="type any notes about the generated audio here",
                 value="",
@@ -467,7 +427,7 @@ with gr.Blocks() as demo:
     _inputs = {
             input_audio,
             num_steps,
-            init_temp, final_temp,
             prefix_s, suffix_s,
             rand_mask_intensity,
             periodic_p, periodic_w,
@@ -477,11 +437,9 @@ with gr.Blocks() as demo:
             stretch_factor,
             onset_mask_width,
             input_pitch_shift,
-            sampling_strategy,
             typical_filtering,
             typical_mass,
             typical_min_tokens,
-            topk,
             checkpoint_key
         }

         "c2f": "./models/finetuned/titi/c2f.pth",
         "codec": "./models/spotdl/codec.pth",
         "full_ckpt": False
+    },
+    "titi-clean": {
+        "coarse": "./models/finetuned/titi-clean/coarse.pth",
+        "c2f": "./models/finetuned/titi-clean/c2f.pth",
+        "codec": "./models/spotdl/codec.pth",
+        "full_ckpt": False
+    },
+    "breaks-steps": {
+        "coarse": "./models/finetuned/breaks-steps/coarse.pth",
+        "c2f": None, #"./models/finetuned/breaks-steps/c2f.pth",
+        "codec": "./models/spotdl/codec.pth",
+        "full_ckpt": False
+    },
 }
 interface.checkpoint_key = "spotdl"
             checkpoints[data[checkpoint_key]]["coarse"],
             checkpoints[data[checkpoint_key]]["c2f"],
             checkpoints[data[checkpoint_key]]["full_ckpt"],
         )
         interface.checkpoint_key = data[checkpoint_key]
     out_dir = OUT_DIR / str(uuid.uuid4())
     out_dir.mkdir()
     mask = pmask.dropout(mask, data[dropout])
     mask = pmask.codebook_unmask(mask, ncc)
+    print(f"created mask with: linear random {data[rand_mask_intensity]}, inpaint {data[prefix_s]}:{data[suffix_s]}, periodic {data[periodic_p]}:{data[periodic_w]}, dropout {data[dropout]}, codebook unmask {ncc}, onset mask {data[onset_mask_width]}, num steps {data[num_steps]}, init temp {data[temp]},  use coarse2fine {data[use_coarse2fine]}")
     # save the mask as a txt file
     np.savetxt(out_dir / "mask.txt", mask[:,0,:].long().cpu().numpy())
     zv, mask_z = interface.coarse_vamp(
         z,
         mask=mask,
         sampling_steps=data[num_steps],
+        temperature=data[temp]*10,
         return_mask=True,
         typical_filtering=data[typical_filtering],
         typical_mass=data[typical_mass],
         typical_min_tokens=data[typical_min_tokens],
+        gen_fn=interface.coarse.generate,
     )
     if use_coarse2fine:
+        zv = interface.coarse_to_fine(zv, temperature=data[temp])
     sig = interface.to_signal(zv).cpu()
     print("done")
     sig_out.write(out_dir / "output.wav")
     _data = {
+        "temp": data[temp],
         "prefix_s": data[prefix_s],
         "suffix_s": data[suffix_s],
         "rand_mask_intensity": data[rand_mask_intensity],
                     value=0.0
                 )
+            temp = gr.Slider(
+                label="temperature",
+                minimum=0.0,
+                maximum=1.5,
+                value=0.8
+            )
             with gr.Accordion("sampling settings", open=False):
                 typical_filtering = gr.Checkbox(
+                    label="typical filtering ",
                     value=True
                 )
                 typical_mass = gr.Slider(
                     label="typical mass (should probably stay between 0.1 and 0.5)",
                     minimum=0.01,
                     maximum=0.99,
+                    value=0.15
                 )
                 typical_min_tokens = gr.Slider(
                     label="typical min tokens (should probably stay between 1 and 256)",
                     minimum=1,
                     maximum=256,
                     step=1,
+                    value=64
                 )
             num_steps = gr.Slider(
                 label="number of steps (should normally be between 12 and 36)",
                 minimum=1,
                 type="filepath"
             )
             notes_text = gr.Textbox(
                 label="type any notes about the generated audio here",
                 value="",
     _inputs = {
             input_audio,
             num_steps,
+            temp,
             prefix_s, suffix_s,
             rand_mask_intensity,
             periodic_p, periodic_w,
             stretch_factor,
             onset_mask_width,
             input_pitch_shift,
             typical_filtering,
             typical_mass,
             typical_min_tokens,
             checkpoint_key
         }

scripts/exp/train.py CHANGED Viewed

@@ -491,7 +491,7 @@ def train(
             num_samples = z.shape[0]
             for i in range(num_samples):
-                sampled = accel.unwrap(model).sample(
                     codec=codec,
                     time_steps=z.shape[-1],
                     start_tokens=z[i : i + 1],
@@ -503,29 +503,6 @@ def train(
                     plot_fn=None,
                 )
-                # sample in 1 step (only for coarse2fine models)
-                if accel.unwrap(model).n_conditioning_codebooks > 0:
-                    sampled_argmax = accel.unwrap(model).sample(
-                        codec=codec,
-                        time_steps=z.shape[-1],
-                        start_tokens=z[i : i + 1],
-                        sample="argmax",
-                        sampling_steps=1,
-                    )
-                    sampled_argmax.cpu().write_audio_to_tb(
-                        f"sampled_1step-argmax/{i}",
-                        self.writer,
-                        step=self.state.epoch,
-                        plot_fn=None,
-                    )
-                    conditioning = z[i:i+1, : accel.unwrap(model).n_conditioning_codebooks, :]
-                    conditioning = accel.unwrap(model).to_signal(conditioning, codec)
-                    conditioning.cpu().write_audio_to_tb(
-                        f"conditioning/{i}",
-                        self.writer,
-                        step=self.state.epoch,
-                        plot_fn=None,
-                    )
         def save_imputation(self, z: torch.Tensor):
             n_prefix = int(z.shape[-1] * 0.25)
@@ -543,7 +520,7 @@ def train(
             imputed = []
             for i in range(len(z)):
                 imputed.append(
-                    vn.sample(
                         codec=codec,
                         time_steps=z.shape[-1],
                         start_tokens=z[i][None, ...],

             num_samples = z.shape[0]
             for i in range(num_samples):
+                sampled = accel.unwrap(model).generate(
                     codec=codec,
                     time_steps=z.shape[-1],
                     start_tokens=z[i : i + 1],
                     plot_fn=None,
                 )
         def save_imputation(self, z: torch.Tensor):
             n_prefix = int(z.shape[-1] * 0.25)
             imputed = []
             for i in range(len(z)):
                 imputed.append(
+                    vn.generate(
                         codec=codec,
                         time_steps=z.shape[-1],
                         start_tokens=z[i][None, ...],

vampnet/interface.py CHANGED Viewed

@@ -292,7 +292,7 @@ class Interface(torch.nn.Module):
         fine_z = []
         for i in range(n_chunks):
             chunk = coarse_z[:, :, i * chunk_len : (i + 1) * chunk_len]
-            chunk = self.c2f.sample(
                 codec=self.codec,
                 time_steps=chunk_len,
                 start_tokens=chunk,
@@ -343,6 +343,7 @@ if __name__ == "__main__":
     logger = logging.getLogger()
     logger.setLevel(logging.INFO)
     torch.set_printoptions(threshold=10000)
     interface = Interface(
         coarse_ckpt="./models/spotdl/coarse.pth",
@@ -372,20 +373,20 @@ if __name__ == "__main__":
         z,
         mask=mask,
         sampling_steps=36,
-        temperature=6.0,
         return_mask=True,
-        # gen_fn=interface.coarse.generate
     )
-    use_coarse2fine = False
     if use_coarse2fine:
-        zv = interface.coarse_to_fine(zv)
     mask = interface.to_signal(mask_z).cpu()
     sig = interface.to_signal(zv).cpu()
     print("done")
-    sig.write("output.wav")
     mask.write("mask.wav")

         fine_z = []
         for i in range(n_chunks):
             chunk = coarse_z[:, :, i * chunk_len : (i + 1) * chunk_len]
+            chunk = self.c2f.generate(
                 codec=self.codec,
                 time_steps=chunk_len,
                 start_tokens=chunk,
     logger = logging.getLogger()
     logger.setLevel(logging.INFO)
     torch.set_printoptions(threshold=10000)
+    at.util.seed(42)
     interface = Interface(
         coarse_ckpt="./models/spotdl/coarse.pth",
         z,
         mask=mask,
         sampling_steps=36,
+        temperature=8.0,
         return_mask=True,
+        gen_fn=interface.coarse.generate
     )
+    use_coarse2fine = True
     if use_coarse2fine:
+        zv = interface.coarse_to_fine(zv, temperature=0.8)
     mask = interface.to_signal(mask_z).cpu()
     sig = interface.to_signal(zv).cpu()
     print("done")
+    sig.write("output3.wav")
     mask.write("mask.wav")

vampnet/modules/transformer.py CHANGED Viewed

@@ -741,7 +741,7 @@ class VampNet(at.ml.BaseModel):
         sampling_steps: int = 36,
         start_tokens: Optional[torch.Tensor] = None,
         mask: Optional[torch.Tensor] = None,
-        temperature: Union[float, Tuple[float, float]] = 0.8,
         typical_filtering=False,
         typical_mass=0.2,
         typical_min_tokens=1,
@@ -848,26 +848,27 @@ class VampNet(at.ml.BaseModel):
             probs = torch.softmax(logits, dim=-1)
             logging.info(f"computed probs with shape: {probs.shape}")
-            # flatten z_masked and mask, so we can deal with the sampling logic
-            # we'll unflatten them at the end of the loop for the next forward pass
-            z_masked = codebook_flatten(z_masked)
             # sample from logits with multinomial sampling
             b = probs.shape[0]
             probs = rearrange(probs, "b seq prob -> (b seq) prob")
             sampled_z =  torch.multinomial(probs, 1).squeeze(-1)
             sampled_z = rearrange(sampled_z, "(b seq)-> b seq", b=b)
             probs = rearrange(probs, "(b seq) prob -> b seq prob", b=b)
             logging.info(f"sampled z with shape: {sampled_z.shape}")
-            # update the mask
             mask = (z_masked == self.mask_token).int()
             logging.info(f"updated mask with shape: {mask.shape}")
             # add z back into sampled z where the mask was false
             sampled_z = torch.where(
                 mask.bool(), sampled_z, z_masked
@@ -902,17 +903,9 @@ class VampNet(at.ml.BaseModel):
             # get our new mask
-            # print(tmpt * (1-_gamma(r)))
             mask = mask_by_random_topk(
                 num_to_mask, selected_probs, tmpt * (1-r)
-            )
-            # print(f"most confident tokens: ")
-            # print(torch.take_along_dim(
-            #     sampled_z, selected_probs.argsort(descending=False), dim=-1)
-            # )
-            # print(sampled_z[~mask.bool()])
             # update the mask
             z_masked = torch.where(
@@ -920,22 +913,29 @@ class VampNet(at.ml.BaseModel):
             )
             logging.info(f"updated z_masked with shape: {z_masked.shape}")
-            z_masked = codebook_unflatten(z_masked, self.n_codebooks)
-            mask = codebook_unflatten(mask, self.n_codebooks)
             logging.info(f"unflattened z_masked with shape: {z_masked.shape}")
-            logging.info(f"updated z_masked with shape: {z_masked.shape}")
         logging.info(f"finished sampling")
-        z = codebook_unflatten(sampled_z, self.n_codebooks)
         if return_signal:
-            return self.to_signal(z, codec)
         else:
-            return z
 def mask_by_random_topk(num_to_mask: int, probs: torch.Tensor, temperature: float = 1.0):

         sampling_steps: int = 36,
         start_tokens: Optional[torch.Tensor] = None,
         mask: Optional[torch.Tensor] = None,
+        temperature: Union[float, Tuple[float, float]] = 8.0,
         typical_filtering=False,
         typical_mass=0.2,
         typical_min_tokens=1,
             probs = torch.softmax(logits, dim=-1)
             logging.info(f"computed probs with shape: {probs.shape}")
             # sample from logits with multinomial sampling
             b = probs.shape[0]
             probs = rearrange(probs, "b seq prob -> (b seq) prob")
             sampled_z =  torch.multinomial(probs, 1).squeeze(-1)
             sampled_z = rearrange(sampled_z, "(b seq)-> b seq", b=b)
             probs = rearrange(probs, "(b seq) prob -> b seq prob", b=b)
             logging.info(f"sampled z with shape: {sampled_z.shape}")
+            # flatten z_masked and mask, so we can deal with the sampling logic
+            # we'll unflatten them at the end of the loop for the next forward pass
+            # remove conditioning codebooks, we'll add them back at the end
+            z_masked = codebook_flatten(z_masked[:, self.n_conditioning_codebooks:, :])
             mask = (z_masked == self.mask_token).int()
+            # update the mask, remove conditioning codebooks from the mask
             logging.info(f"updated mask with shape: {mask.shape}")
             # add z back into sampled z where the mask was false
             sampled_z = torch.where(
                 mask.bool(), sampled_z, z_masked
             # get our new mask
             mask = mask_by_random_topk(
                 num_to_mask, selected_probs, tmpt * (1-r)
+            )
             # update the mask
             z_masked = torch.where(
             )
             logging.info(f"updated z_masked with shape: {z_masked.shape}")
+            z_masked = codebook_unflatten(z_masked, n_infer_codebooks)
+            mask = codebook_unflatten(mask, n_infer_codebooks)
             logging.info(f"unflattened z_masked with shape: {z_masked.shape}")
+            # add conditioning codebooks back to z_masked
+            z_masked = torch.cat(
+                (z[:, :self.n_conditioning_codebooks, :], z_masked), dim=1
+            )
+            logging.info(f"added conditioning codebooks back to z_masked with shape: {z_masked.shape}")
+        # add conditioning codebooks back to sampled_z
+        sampled_z = codebook_unflatten(sampled_z, n_infer_codebooks)
+        sampled_z = torch.cat(
+            (z[:, :self.n_conditioning_codebooks, :], sampled_z), dim=1
+        )
         logging.info(f"finished sampling")
         if return_signal:
+            return self.to_signal(sampled_z, codec)
         else:
+            return sampled_z
 def mask_by_random_topk(num_to_mask: int, probs: torch.Tensor, temperature: float = 1.0):