jeonchangbin49 committed
Commit a00b67a · 1 Parent(s): da27cbe

first commit
.gitattributes CHANGED
@@ -1,35 +1 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
  *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2023 jeonchangbin49
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,13 +1,2 @@
- ---
- title: De Limiter
- emoji: 🏃
- colorFrom: pink
- colorTo: indigo
- sdk: gradio
- sdk_version: 3.39.0
- app_file: app.py
- pinned: false
- license: mit
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # De-limiter
+ An official demo of "Music De-limiter Networks via Sample-wise Gain Inversion", to be presented at WASPAA 2023.
add.py ADDED
@@ -0,0 +1,293 @@
+ import os
+ import json
+ import argparse
+
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import torch
+ import tqdm
+ import librosa
+ import librosa.display
+ import soundfile as sf
+ import pyloudnorm as pyln
+ from dotmap import DotMap
+ import gradio as gr
+
+ from models import load_model_with_args
+ from separate_func import (
+     conv_tasnet_separate,
+ )
+ from utils import db2linear
+
+
+ tqdm.monitor_interval = 0
+
+
+ def separate_track_with_model(
+     args, model, device, track_audio, track_name, meter, augmented_gain
+ ):
+     with torch.no_grad():
+         if (
+             args.model_loss_params.architecture == "conv_tasnet_mask_on_output"
+             or args.model_loss_params.architecture == "conv_tasnet"
+         ):
+             estimates = conv_tasnet_separate(
+                 args,
+                 model,
+                 device,
+                 track_audio,
+                 track_name,
+                 meter=meter,
+                 augmented_gain=augmented_gain,
+             )
+
+         return estimates
+
+
+ def main(input, mix_coefficient):
+     parser = argparse.ArgumentParser(description="model test.py")
+     parser.add_argument("--target", type=str, default="all")
+     parser.add_argument("--weight_directory", type=str, default="weight")
+     parser.add_argument("--output_directory", type=str, default="output")
+     parser.add_argument("--use_gpu", type=bool, default=True)
+     parser.add_argument("--save_name_as_target", type=bool, default=False)
+     parser.add_argument(
+         "--loudnorm_input_lufs",
+         type=float,
+         default=None,
+         help="If you want to use loudnorm for the input",
+     )
+     parser.add_argument(
+         "--save_output_loudnorm",
+         type=float,
+         default=-14.0,
+         help="Save loudness normalized outputs or not. If you want to save, input the target loudness",
+     )
+     parser.add_argument(
+         "--save_mixed_output",
+         type=float,
+         default=None,
+         help="Save original+delimited-estimation mixed output with a ratio of default 0.5 (original) and 1 - 0.5 (estimation)",
+     )
+     parser.add_argument(
+         "--save_16k_mono",
+         type=bool,
+         default=False,
+         help="Save 16k mono wav files for FAD evaluation.",
+     )
+     parser.add_argument(
+         "--save_histogram",
+         type=bool,
+         default=False,
+         help="Save histogram of the output. Only valid when the task is 'delimit'",
+     )
+     parser.add_argument(
+         "--use_singletrackset",
+         type=bool,
+         default=False,
+         help="Use SingleTrackSet if input data is too long.",
+     )
+
+     args, _ = parser.parse_known_args()
+
+     with open(f"{args.weight_directory}/{args.target}.json", "r") as f:
+         args_dict = json.load(f)
+         args_dict = DotMap(args_dict)
+
+     for key, value in args_dict["args"].items():
+         if key in list(vars(args).keys()):
+             pass
+         else:
+             setattr(args, key, value)
+
+     args.test_output_dir = f"{args.output_directory}"
+     os.makedirs(args.test_output_dir, exist_ok=True)
+
+     device = torch.device(
+         "cuda" if torch.cuda.is_available() and args.use_gpu else "cpu"
+     )
+
+     ###################### Define Models ######################
+     our_model = load_model_with_args(args)
+     our_model = our_model.to(device)
+
+     target_model_path = f"{args.weight_directory}/{args.target}.pth"
+     checkpoint = torch.load(target_model_path, map_location=device)
+     our_model.load_state_dict(checkpoint)
+
+     our_model.eval()
+
+     meter = pyln.Meter(44100)
+
+     sr, track_audio = input
+     track_audio = track_audio.T
+     track_name = "gradio_demo"
+
+     orig_audio = track_audio.copy()
+
+     if sr != 44100:
+         raise ValueError("Sample rate should be 44100")
+     augmented_gain = None
+
+     if args.loudnorm_input_lufs:  # If you want to use a loudness-normalized input
+         track_lufs = meter.integrated_loudness(track_audio.T)
+         augmented_gain = args.loudnorm_input_lufs - track_lufs
+         track_audio = track_audio * db2linear(augmented_gain, eps=0.0)
+
+     track_audio = (
+         torch.as_tensor(track_audio, dtype=torch.float32).unsqueeze(0).to(device)
+     )
+
+     estimates = separate_track_with_model(
+         args, our_model, device, track_audio, track_name, meter, augmented_gain
+     )
+
+     if args.save_mixed_output:
+         track_lufs = meter.integrated_loudness(orig_audio.T)
+         augmented_gain = args.save_output_loudnorm - track_lufs
+         orig_audio = orig_audio * db2linear(augmented_gain, eps=0.0)
+
+         mixed_output = orig_audio * args.save_mixed_output + estimates * (
+             1 - args.save_mixed_output
+         )
+
+         sf.write(
+             f"{args.test_output_dir}/{track_name}/{track_name}_mixed.wav",
+             mixed_output.T,
+             args.data_params.sample_rate,
+         )
+
+     return (
+         (sr, estimates.T),
+         (sr, orig_audio.T),
+         (sr, orig_audio.T * mix_coefficient + estimates.T * (1 - mix_coefficient)),
+     )
+
+
+ def parallel_mix(input, output, mix_coefficient):
+     sr = 44100
+     return sr, input[1] * mix_coefficient + output[1] * (1 - mix_coefficient)
+
+
+ def int16_to_float32(wav):
+     wav = np.frombuffer(wav, dtype=np.int16)
+     X = wav / 32768
+     return X
+
+
+ def waveform_plot(input, output, prl_mix_output, figsize_x=20, figsize_y=9):
+     sr = 44100
+     fig, ax = plt.subplots(
+         nrows=3, sharex=True, sharey=True, figsize=(figsize_x, figsize_y)
+     )
+     librosa.display.waveshow(int16_to_float32(input[1]).T, sr=sr, ax=ax[0])
+     ax[0].set(title="Loudness Normalized Input")
+     ax[0].label_outer()
+     librosa.display.waveshow(int16_to_float32(output[1]).T, sr=sr, ax=ax[1])
+     ax[1].set(title="De-limiter Output")
+     ax[1].label_outer()
+     librosa.display.waveshow(int16_to_float32(prl_mix_output[1]).T, sr=sr, ax=ax[2])
+     ax[2].set(title="Parallel Mix of the Input and its De-limiter Output")
+     ax[2].label_outer()
+     return fig
+
+
+ with gr.Blocks() as demo:
+     gr.HTML(
+         """
+         <div style="text-align: center; max-width: 700px; margin: 0 auto;">
+           <div
+             style="
+               display: inline-flex;
+               align-items: center;
+               gap: 0.8rem;
+               font-size: 1.75rem;
+             "
+           >
+             <h1 style="font-weight: 900; margin-bottom: 7px;">
+               Music De-limiter
+             </h1>
+           </div>
+           <p style="margin-bottom: 10px; font-size: 94%">
+             A demo for "Music De-limiter Networks via Sample-wise Gain Inversion", to appear at WASPAA 2023.
+             First upload a music (.wav or .mp3) file, then press the "De-limit" button to apply the De-limiter. Since we use a CPU instead of a GPU, it may take a few minutes.
+             You can then apply the Parallel Mix technique, a simple linear mix of the "loudness normalized input" and the "de-limiter output".
+             You can adjust the mixing coefficient yourself.
+             If the coefficient is 0.3, the output will be "loudness_normalized_input * 0.3 + de-limiter_output * 0.7".
+           </p>
+         </div>
+         """
+     )
+     with gr.Row().style(mobile_collapse=False, equal_height=True):
+         with gr.Column():
+             with gr.Box():
+                 input_audio = gr.Audio(source="upload", label="De-limiter Input")
+                 btn = gr.Button("De-limit")
+         with gr.Column():
+             with gr.Box():
+                 loud_norm_input = gr.Audio(label="Loudness Normalized Input (-14 LUFS)")
+             with gr.Box():
+                 output_audio = gr.Audio(label="De-limiter Output")
+             with gr.Box():
+                 output_audio_parallel = gr.Audio(
+                     label="Parallel Mix of the Input and its De-limiter Output"
+                 )
+                 slider = gr.Slider(
+                     minimum=0,
+                     maximum=1,
+                     step=0.1,
+                     value=0.5,
+                     label="Parallel Mix Coefficient",
+                 )
+     btn.click(
+         main,
+         inputs=[input_audio, slider],
+         outputs=[output_audio, loud_norm_input, output_audio_parallel],
+     )
+     slider.release(
+         parallel_mix,
+         inputs=[input_audio, output_audio, slider],
+         outputs=output_audio_parallel,
+     )
+     with gr.Row().style(mobile_collapse=False, equal_height=True):
+         with gr.Column():
+             with gr.Box():
+                 plot = gr.Plot(label="Plots")
+                 btn2 = gr.Button("Show Plots")
+                 slider_plot_x = gr.Slider(
+                     minimum=1,
+                     maximum=100,
+                     step=1,
+                     value=20,
+                     label="Plot X-axis size",
+                 )
+                 slider_plot_y = gr.Slider(
+                     minimum=1,
+                     maximum=30,
+                     step=1,
+                     value=9,
+                     label="Plot Y-axis size",
+                 )
+     btn2.click(
+         waveform_plot,
+         inputs=[
+             loud_norm_input,
+             output_audio,
+             output_audio_parallel,
+             slider_plot_x,
+             slider_plot_y,
+         ],
+         outputs=plot,
+     )
+     slider.release(
+         waveform_plot,
+         inputs=[
+             loud_norm_input,
+             output_audio,
+             output_audio_parallel,
+             slider_plot_x,
+             slider_plot_y,
+         ],
+         outputs=plot,
+     )
+
+ if __name__ == "__main__":
+     demo.launch(debug=True)
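
`parallel_mix` above is a plain sample-wise linear blend of the loudness-normalized input and the de-limiter output. A minimal standalone sketch of the same arithmetic (hypothetical helper, not part of this commit; it assumes both signals are float arrays of the same shape and sample rate):

```python
import numpy as np

def parallel_mix_sketch(loudnorm_input: np.ndarray, delimited: np.ndarray, c: float = 0.5) -> np.ndarray:
    """Sample-wise blend: c * input + (1 - c) * de-limiter output."""
    assert loudnorm_input.shape == delimited.shape
    return c * loudnorm_input + (1.0 - c) * delimited

# c = 0.3 reproduces the example from the demo text:
# loudness_normalized_input * 0.3 + de-limiter_output * 0.7
```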
configs/delimit_6_s.yaml ADDED
@@ -0,0 +1,92 @@
+ # For De-limit task, Conv-TasNet.
+ # si_sdr loss
+ #
+ # ozone_train_fixed is about 6.36 hours
+ # 300,000 segments is about 333.33 hours
+ # ratio should be about 0.019
+
+ wandb_params:
+   use_wandb: true
+   entity: null # your wandb id
+   project: delimit # your wandb project
+   rerun_id: null # use when you rerun wandb.
+   sweep: false
+
+ sys_params:
+   nb_workers: 4
+   seed: 777
+   n_nodes: 1
+   port: null
+   rank: 0
+
+ task_params:
+   target: all # choices=["all"]
+   train: true
+   dataset: delimit # choices=["musdb", "delimit"]
+
+ dir_params:
+   root: /path/to/musdb18hq
+   output_directory: /path/to/results
+   exp_name: convtasnet_6_s # you MUST specify this
+   resume: null # "path of checkpoint folder"
+   continual_train: false # when we want to use a pre-trained model but do not want to use the lr_scheduler history.
+   delimit_valid_root: null
+   delimit_valid_L_root: null
+   ozone_root: /path/to/musdb-XL-train # you also have to specify data_params.use_fixed
+
+ hyperparams:
+   batch_size: 8 # with 1 GPU (we used a 2080 Ti, 11GB)
+   epochs: 200
+   optimizer: adamw
+   weight_decay: 0.01
+   lr: 0.00003
+   lr_decay_gamma: 0.5
+   lr_decay_patience: 15
+   patience: 50
+   lr_scheduler: step_lr
+   gradient_clip: 5.0
+   ema: false
+
+ data_params:
+   nfft: 4096
+   nhop: 1024
+   nb_channels: 2
+   sample_rate: 44100
+   seq_dur: 4.0
+   singleset_num_frames: null
+   samples_per_track: 128 # "Number of samples per track to use for training."
+   limitaug_method: ozone
+   limitaug_mode: null
+   limitaug_custom_target_lufs: null
+   limitaug_custom_target_lufs_std: null
+   target_loudnorm_lufs: -14.0
+   random_mix: true
+   target_limitaug_mode: null
+   target_limitaug_custom_target_lufs: null
+   target_limitaug_custom_target_lufs_std: null
+   custom_limiter_attack_range: null
+   custom_limiter_release_range: null
+   use_fixed: 0.019 # range 0.0 ~ 1.0 => 1.0 will use the fixed Ozoned_mixture training examples only.
+
+ model_loss_params:
+   architecture: conv_tasnet_mask_on_output # Sample-wise Gain Inversion (SGI)
+   train_loss_func: [si_sdr]
+   train_loss_scales: [1.]
+   valid_loss_func: [si_sdr]
+   valid_loss_scales: [1.]
+
+ conv_tasnet_params:
+   encoder_activation: relu
+   n_filters: 512
+   kernel_size: 128 # about 3 ms at 44100 Hz
+   stride: 64
+   n_blocks: 5
+   n_repeats: 2
+   bn_chan: 128
+   hid_chan: 512
+   skip_chan: 128
+   # conv_kernel_size:
+   # norm_type:
+   mask_act: relu
+   # causal:
+   decoder_activation: sigmoid
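
The training entry point that consumes this config is not part of this commit; a plausible loading pattern, mirroring how `add.py` turns its JSON args file into a `DotMap` (this sketch assumes PyYAML is installed and is not the repo's actual loader):

```python
import yaml
from dotmap import DotMap

with open("configs/delimit_6_s.yaml", "r") as f:
    args = DotMap(yaml.safe_load(f))

print(args.model_loss_params.architecture)  # conv_tasnet_mask_on_output
print(args.data_params.use_fixed)           # 0.019
```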
dataloader/__init__.py ADDED
@@ -0,0 +1,8 @@
+ from .dataset import aug_from_str, MusdbTrainDataset, MusdbValidDataset
+ from .singleset import SingleTrackSet
+ from .delimit_dataset import (
+     DelimitTrainDataset,
+     DelimitValidDataset,
+     OzoneTrainDataset,
+     OzoneValidDataset,
+ )
dataloader/dataset.py ADDED
@@ -0,0 +1,579 @@
+ # Dataloader based on https://github.com/jeonchangbin49/LimitAug
+ import os
+ from glob import glob
+ import random
+ from typing import Optional, Callable
+
+ import numpy as np
+ import torch
+ import librosa
+ from torch.utils.data import Dataset
+ import pyloudnorm as pyln
+ from pedalboard import Pedalboard, Limiter, Gain, Compressor, Clipping
+
+ from utils import load_wav_arbitrary_position_stereo, db2linear
+
+
+ # based on https://github.com/sigsep/open-unmix-pytorch
+ def aug_from_str(list_of_function_names: list):
+     if list_of_function_names:
+         return Compose([globals()["_augment_" + aug] for aug in list_of_function_names])
+     else:
+         return lambda audio: audio
+
+
+ class Compose(object):
+     """Composes several augmentation transforms.
+     Args:
+         augmentations: list of augmentations to compose.
+     """
+
+     def __init__(self, transforms):
+         self.transforms = transforms
+
+     def __call__(self, audio: torch.Tensor) -> torch.Tensor:
+         for t in self.transforms:
+             audio = t(audio)
+         return audio
+
+
+ # numpy based augmentation
+ # based on https://github.com/sigsep/open-unmix-pytorch
+ def _augment_gain(audio, low=0.25, high=1.25):
+     """Applies a random gain between `low` and `high`"""
+     g = low + random.random() * (high - low)
+     return audio * g
+
+
+ def _augment_channelswap(audio):
+     """Swap channels of stereo signals with a probability of p=0.5"""
+     if audio.shape[0] == 2 and random.random() < 0.5:
+         return np.flip(audio, axis=0)  # axis=0 must be given
+     else:
+         return audio
+
+
+ # Linear gain increasing implementation for Method (1)
+ def apply_linear_gain_increase(mixture, target, board, meter, samplerate, target_lufs):
+     mixture, target = mixture.T, target.T
+     loudness = meter.integrated_loudness(mixture)
+
+     if np.isinf(loudness):
+         augmented_gain = 0.0
+         board[0].gain_db = augmented_gain
+     else:
+         augmented_gain = target_lufs - loudness
+         board[0].gain_db = augmented_gain
+     mixture = board(mixture.T, samplerate)
+     target = board(target.T, samplerate)
+     return mixture, target
+
+
+ # LimitAug implementation for Method (2) and
+ # implementation of LimitAug then Loudness normalization for Method (4)
+ def apply_limitaug(
+     audio,
+     board,
+     meter,
+     samplerate,
+     target_lufs,
+     target_loudnorm_lufs=None,
+     loudness=None,
+ ):
+     audio = audio.T
+     if loudness is None:
+         loudness = meter.integrated_loudness(audio)
+
+     if np.isinf(loudness):
+         augmented_gain = 0.0
+         board[0].gain_db = augmented_gain
+     else:
+         augmented_gain = target_lufs - loudness
+         board[0].gain_db = augmented_gain
+     audio = board(audio.T, samplerate)
+
+     if target_loudnorm_lufs:
+         after_loudness = meter.integrated_loudness(audio.T)
+
+         if np.isinf(after_loudness):
+             pass
+         else:
+             target_gain = target_loudnorm_lufs - after_loudness
+             audio = audio * db2linear(target_gain)
+     return audio, loudness
+
+
+ """
+ This dataloader implementation is based on https://github.com/sigsep/open-unmix-pytorch
+ """
+
+
+ class MusdbTrainDataset(Dataset):
+     def __init__(
+         self,
+         target: str = "vocals",
+         root: str = None,
+         seq_duration: Optional[float] = 6.0,
+         samples_per_track: int = 64,
+         source_augmentations: Optional[Callable] = lambda audio: audio,
+         sample_rate: int = 44100,
+         seed: int = 42,
+         limitaug_method: str = "limitaug_then_loudnorm",
+         limitaug_mode: str = "normal_L",
+         limitaug_custom_target_lufs: float = None,
+         limitaug_custom_target_lufs_std: float = None,
+         target_loudnorm_lufs: float = -14.0,
+         custom_limiter_attack_range: list = [2.0, 2.0],
+         custom_limiter_release_range: list = [200.0, 200.0],
+         *args,
+         **kwargs,
+     ) -> None:
+         """
+         Parameters
+         ----------
+         limitaug_method : str
+             choose from ["linear_gain_increase", "limitaug", "limitaug_then_loudnorm", "only_loudnorm"]
+         limitaug_mode : str
+             choose from ["uniform", "normal", "normal_L", "normal_XL", "normal_short_term", "normal_L_short_term", "normal_XL_short_term", "custom"]
+         limitaug_custom_target_lufs : float
+             valid only when limitaug_mode == "custom"
+         limitaug_custom_target_lufs_std : float
+             also valid only when limitaug_mode == "custom"
+         target_loudnorm_lufs : float
+             valid only when limitaug_method == 'limitaug_then_loudnorm' or 'only_loudnorm'.
+             Defaults to -14.
+             To the best of my knowledge, Spotify and YouTube Music use -14 LUFS as their reference loudness normalization level.
+             No special reason for the choice of -14 as target_loudnorm_lufs.
+         target : str
+             target name of the source to be separated, defaults to ``vocals``.
+         root : str
+             root path of MUSDB
+         seq_duration : float
+             training is performed in chunks of ``seq_duration`` (in seconds);
+             defaults to ``None``, which loads the full audio track.
+         samples_per_track : int
+             sets the number of samples yielded from each track per epoch.
+             Defaults to 64.
+         source_augmentations : list[callables]
+             provide a list of augmentation functions that take a multi-channel
+             audio file of shape (src, samples) as input and output. Defaults to
+             no augmentations (input = output).
+         seed : int
+             controls randomness of dataset iterations.
+         args, kwargs : additional keyword arguments
+             used to add further control for the musdb dataset
+             initialization function.
+         """
+
+         self.seed = seed
+         random.seed(seed)
+         self.seq_duration = seq_duration
+         self.target = target
+         self.samples_per_track = samples_per_track
+         self.source_augmentations = source_augmentations
+         self.sample_rate = sample_rate
+
+         self.root = root
+         self.sources = ["vocals", "bass", "drums", "other"]
+         self.train_list = glob(f"{self.root}/train/*")
+         self.valid_list = [
+             "ANiMAL - Rockshow",
+             "Actions - One Minute Smile",
+             "Alexander Ross - Goodbye Bolero",
+             "Clara Berry And Wooldog - Waltz For My Victims",
+             "Fergessen - Nos Palpitants",
+             "James May - On The Line",
+             "Johnny Lokke - Promises & Lies",
+             "Leaf - Summerghost",
+             "Meaxic - Take A Step",
+             "Patrick Talbot - A Reason To Leave",
+             "Skelpolu - Human Mistakes",
+             "Traffic Experiment - Sirens",
+             "Triviul - Angelsaint",
+             "Young Griffo - Pennies",
+         ]
+
+         self.train_list = [
+             x for x in self.train_list if os.path.basename(x) not in self.valid_list
+         ]
+
+         # limitaug related
+         self.limitaug_method = limitaug_method
+         self.limitaug_mode = limitaug_mode
+         self.limitaug_custom_target_lufs = limitaug_custom_target_lufs
+         self.limitaug_custom_target_lufs_std = limitaug_custom_target_lufs_std
+         self.target_loudnorm_lufs = target_loudnorm_lufs
+         self.meter = pyln.Meter(self.sample_rate)
+
+         # Method (1) in our paper's Results section and Table 5
+         if self.limitaug_method == "linear_gain_increase":
+             print("using linear gain increasing!")
+             self.board = Pedalboard([Gain(gain_db=0.0)])
+
+         # Method (2) in our paper's Results section and Table 5
+         elif self.limitaug_method == "limitaug":
+             print("using limitaug!")
+             self.board = Pedalboard(
+                 [Gain(gain_db=0.0), Limiter(threshold_db=0.0, release_ms=100.0)]
+             )
+
+         # Method (3) in our paper's Results section and Table 5
+         elif self.limitaug_method == "only_loudnorm":
+             print("using only loudness normalized inputs")
+
+         # Method (4) in our paper's Results section and Table 5
+         elif self.limitaug_method == "limitaug_then_loudnorm":
+             print("using limitaug then loudness normalize!")
+             self.board = Pedalboard(
+                 [Gain(gain_db=0.0), Limiter(threshold_db=0.0, release_ms=100.0)]
+             )
+
+         elif self.limitaug_method == "custom_limiter_limitaug":
+             print("using Custom limiter limitaug!")
+             self.custom_limiter_attack_range = custom_limiter_attack_range
+             self.custom_limiter_release_range = custom_limiter_release_range
+             self.board = Pedalboard(
+                 [
+                     Gain(gain_db=0.0),
+                     Compressor(
+                         threshold_db=-10.0, ratio=4.0, attack_ms=2.0, release_ms=200.0
+                     ),  # attack_ms and release_ms will be changed later.
+                     Compressor(
+                         threshold_db=0.0,
+                         ratio=1000.0,
+                         attack_ms=0.001,
+                         release_ms=100.0,
+                     ),
+                     Gain(gain_db=3.75),
+                     Clipping(threshold_db=0.0),
+                 ]
+             )  # This implementation is the same as the JUCE Limiter.
+             # However, we want the first compressor to have a variable attack and release time.
+             # Therefore, we use the Custom Limiter instead of the JUCE Limiter.
+
+         self.limitaug_mode_statistics = {
+             "normal": [
+                 -15.954,
+                 1.264,
+             ],  # -15.954 is the mean LUFS of musdb-hq and 1.264 is the standard deviation
+             "normal_L": [
+                 -10.887,
+                 1.191,
+             ],  # -10.887 is the mean LUFS of musdb-L and 1.191 is the standard deviation
+             "normal_XL": [
+                 -8.608,
+                 1.165,
+             ],  # -8.608 is the mean LUFS of musdb-XL and 1.165 is the standard deviation
+             "normal_short_term": [
+                 -17.317,
+                 5.036,
+             ],  # In our experiments, short-term statistics were not helpful.
+             "normal_L_short_term": [-12.303, 5.233],
+             "normal_XL_short_term": [-9.988, 5.518],
+             "custom": [limitaug_custom_target_lufs, limitaug_custom_target_lufs_std],
+         }
+
+     def sample_target_lufs(self):
+         if (
+             self.limitaug_mode == "uniform"
+         ):  # if limitaug_mode is uniform, then choose target_lufs from a uniform distribution
+             target_lufs = random.uniform(-20, -5)
+         else:  # else, choose target_lufs from a Gaussian distribution
+             target_lufs = random.gauss(
+                 self.limitaug_mode_statistics[self.limitaug_mode][0],
+                 self.limitaug_mode_statistics[self.limitaug_mode][1],
+             )
+
+         return target_lufs
+
+     def get_limitaug_results(self, mixture, target):
+         # Apply linear gain increasing (Method (1))
+         if self.limitaug_method == "linear_gain_increase":
+             target_lufs = self.sample_target_lufs()
+             mixture, target = apply_linear_gain_increase(
+                 mixture,
+                 target,
+                 self.board,
+                 self.meter,
+                 self.sample_rate,
+                 target_lufs=target_lufs,
+             )
+
+         # Apply LimitAug (Method (2))
+         elif self.limitaug_method == "limitaug":
+             self.board[1].release_ms = random.uniform(30.0, 200.0)
+             mixture_orig = mixture.copy()
+             target_lufs = self.sample_target_lufs()
+             mixture, _ = apply_limitaug(
+                 mixture,
+                 self.board,
+                 self.meter,
+                 self.sample_rate,
+                 target_lufs=target_lufs,
+             )
+             print("mixture shape:", mixture.shape)
+             print("target shape:", target.shape)
+             target *= mixture / (mixture_orig + 1e-8)
+
+         # Apply only loudness normalization (Method (3))
+         elif self.limitaug_method == "only_loudnorm":
+             mixture_loudness = self.meter.integrated_loudness(mixture.T)
+             if np.isinf(
+                 mixture_loudness
+             ):  # if the source is silence, then mixture_loudness is -inf.
+                 pass
+             else:
+                 augmented_gain = (
+                     self.target_loudnorm_lufs - mixture_loudness
+                 )  # default target_loudnorm_lufs is -14.
+                 mixture = mixture * db2linear(augmented_gain)
+                 target = target * db2linear(augmented_gain)
+
+         # Apply LimitAug then loudness normalization (Method (4))
+         elif self.limitaug_method == "limitaug_then_loudnorm":
+             self.board[1].release_ms = random.uniform(30.0, 200.0)
+             mixture_orig = mixture.copy()
+             target_lufs = self.sample_target_lufs()
+             mixture, _ = apply_limitaug(
+                 mixture,
+                 self.board,
+                 self.meter,
+                 self.sample_rate,
+                 target_lufs=target_lufs,
+                 target_loudnorm_lufs=self.target_loudnorm_lufs,
+             )
+             target *= mixture / (mixture_orig + 1e-8)
+
+         # Apply LimitAug using the Custom Limiter
+         elif self.limitaug_method == "custom_limiter_limitaug":
+             # Change the attack time of the first compressor of the Limiter
+             self.board[1].attack_ms = random.uniform(
+                 self.custom_limiter_attack_range[0], self.custom_limiter_attack_range[1]
+             )
+             # Change the release time of the first compressor of the Limiter
+             self.board[1].release_ms = random.uniform(
+                 self.custom_limiter_release_range[0],
+                 self.custom_limiter_release_range[1],
+             )
+             # Change the release time of the second compressor of the Limiter
+             self.board[2].release_ms = random.uniform(30.0, 200.0)
+             mixture_orig = mixture.copy()
+             target_lufs = self.sample_target_lufs()
+             mixture, _ = apply_limitaug(
+                 mixture,
+                 self.board,
+                 self.meter,
+                 self.sample_rate,
+                 target_lufs=target_lufs,
+                 target_loudnorm_lufs=self.target_loudnorm_lufs,
+             )
+             target *= mixture / (mixture_orig + 1e-8)
+
+         return mixture, target
+
+     def __getitem__(self, index):
+         audio_sources = []
+         target_ind = None
+
+         for k, source in enumerate(self.sources):
+             # memorize index of target source
+             if source == self.target:  # if source is 'vocals'
+                 target_ind = k
+                 track_path = self.train_list[
+                     index // self.samples_per_track
+                 ]  # we want to use `samples_per_track` training samples per track.
+                 audio_path = f"{track_path}/{source}.wav"
+                 audio = load_wav_arbitrary_position_stereo(
+                     audio_path, self.sample_rate, self.seq_duration
+                 )
+             else:
+                 track_path = random.choice(self.train_list)
+                 audio_path = f"{track_path}/{source}.wav"
+                 audio = load_wav_arbitrary_position_stereo(
+                     audio_path, self.sample_rate, self.seq_duration
+                 )
+             audio = self.source_augmentations(audio)
+             audio_sources.append(audio)
+
+         stems = np.stack(audio_sources, axis=0)
+
+         # apply linear mix over source index=0
+         x = stems.sum(0)
+         # get the target stem
+         y = stems[target_ind]
+
+         # Apply the limitaug
+         x, y = self.get_limitaug_results(x, y)
+
+         x = torch.as_tensor(x, dtype=torch.float32)
+         y = torch.as_tensor(y, dtype=torch.float32)
+
+         return x, y
+
+     def __len__(self):
+         return len(self.train_list) * self.samples_per_track
+
+
+ class MusdbValidDataset(Dataset):
+     def __init__(
+         self,
+         target: str = "vocals",
+         root: str = None,
+         *args,
+         **kwargs,
+     ) -> None:
+         """MUSDB18 torch.data.Dataset that samples from the MUSDB tracks
+         using tracks and excerpts with replacement.
+         Parameters
+         ----------
+         target : str
+             target name of the source to be separated, defaults to ``vocals``.
+         root : str
+             root path of the MUSDB18HQ dataset, defaults to ``None``.
+         args, kwargs : additional keyword arguments
+             used to add further control for the musdb dataset
+             initialization function.
+         """
+         self.target = target
+         self.sample_rate = 44100.0  # musdb has a fixed sample rate
+
+         self.root = root
+         self.sources = ["vocals", "bass", "drums", "other"]
+         self.train_list = glob(f"{self.root}/train/*")
+
+         self.valid_list = [
+             "ANiMAL - Rockshow",
+             "Actions - One Minute Smile",
+             "Alexander Ross - Goodbye Bolero",
+             "Clara Berry And Wooldog - Waltz For My Victims",
+             "Fergessen - Nos Palpitants",
+             "James May - On The Line",
+             "Johnny Lokke - Promises & Lies",
+             "Leaf - Summerghost",
+             "Meaxic - Take A Step",
+             "Patrick Talbot - A Reason To Leave",
+             "Skelpolu - Human Mistakes",
+             "Traffic Experiment - Sirens",
+             "Triviul - Angelsaint",
+             "Young Griffo - Pennies",
+         ]
+         self.valid_list = [
+             x for x in self.train_list if os.path.basename(x) in self.valid_list
+         ]
+
+     def __getitem__(self, index):
+         audio_sources = []
+         target_ind = None
+
+         for k, source in enumerate(self.sources):
+             # memorize index of target source
+             if source == self.target:  # if source is 'vocals'
+                 target_ind = k
+                 track_path = self.valid_list[index]
+                 song_name = os.path.basename(track_path)
+                 audio_path = f"{track_path}/{source}.wav"
+                 # audio = utils.load_wav_stereo(audio_path, self.sample_rate)
+                 audio = librosa.load(audio_path, mono=False, sr=self.sample_rate)[0]
+             else:
+                 track_path = self.valid_list[index]
+                 song_name = os.path.basename(track_path)
+                 audio_path = f"{track_path}/{source}.wav"
+                 # audio = utils.load_wav_stereo(audio_path, self.sample_rate)
+                 audio = librosa.load(audio_path, mono=False, sr=self.sample_rate)[0]
+
+             audio = torch.as_tensor(audio, dtype=torch.float32)
+             audio_sources.append(audio)
+
+         stems = torch.stack(audio_sources, dim=0)
+         # apply linear mix over source index=0
+         x = stems.sum(0)
+         # get the target stem
+         y = stems[target_ind]
+
+         return x, y, song_name
+
+     def __len__(self):
+         return len(self.valid_list)
+
+
+ # If you want to check the LUFS values of training examples, run this.
+ if __name__ == "__main__":
+     import argparse
+
+     parser = argparse.ArgumentParser(
+         description="Check the LUFS values of LimitAug training examples"
+     )
+
+     parser.add_argument(
+         "--musdb_root",
+         type=str,
+         default="/path/to/musdb",
+         help="root path of musdb-hq dataset",
+     )
+     parser.add_argument(
+         "--limitaug_method",
+         type=str,
+         default="limitaug",
+         choices=[
+             "linear_gain_increase",
+             "limitaug",
+             "limitaug_then_loudnorm",
+             "only_loudnorm",
+             None,
+         ],
+         help="choose limitaug method",
+     )
+     parser.add_argument(
+         "--limitaug_mode",
+         type=str,
+         default="normal_L",
+         choices=[
+             "uniform",
+             "normal",
+             "normal_L",
+             "normal_XL",
+             "normal_short_term",
+             "normal_L_short_term",
+             "normal_XL_short_term",
+             "custom",
+         ],
+         help="if you use LimitAug, what lufs distribution to target",
+     )
+     parser.add_argument(
+         "--limitaug_custom_target_lufs",
+         type=float,
+         default=None,
+         help="if limitaug_mode is custom, set custom target lufs for LimitAug",
+     )
+
+     args, _ = parser.parse_known_args()
+
+     source_augmentations_ = aug_from_str(["gain", "channelswap"])
+
+     train_dataset = MusdbTrainDataset(
+         target="vocals",
+         root=args.musdb_root,
+         seq_duration=6.0,
+         source_augmentations=source_augmentations_,
+         limitaug_method=args.limitaug_method,
+         limitaug_mode=args.limitaug_mode,
+         limitaug_custom_target_lufs=args.limitaug_custom_target_lufs,
+     )
+
+     dataloader = torch.utils.data.DataLoader(
+         train_dataset,
+         batch_size=1,
+         shuffle=True,
+         num_workers=4,
+         pin_memory=True,
+         drop_last=False,
+     )
+
+     meter = pyln.Meter(44100)
+     for i in range(5):
+         for x, y in dataloader:
+             loudness = meter.integrated_loudness(x[0].numpy().T)
+             print(f"mixture loudness : {loudness} LUFS")
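
`db2linear` and the wav loaders are imported from a `utils` module that is not included in this commit. Given how it is used here (a gain in dB multiplied onto a waveform, with an optional `eps`), it presumably implements the standard dB-to-amplitude conversion; a sketch under that assumption:

```python
def db2linear(gain_db: float, eps: float = 0.0) -> float:
    # Assumed behaviour: 10 ** (dB / 20), with eps as an additive guard term.
    return 10.0 ** (gain_db / 20.0) + eps

# Worked example: loudness-normalizing a -20.3 LUFS mixture to -14.0 LUFS
# requires a gain of -14.0 - (-20.3) = 6.3 dB, i.e. a factor of about 2.07.
print(db2linear(6.3))  # ~2.065
```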
dataloader/delimit_dataset.py ADDED
@@ -0,0 +1,573 @@
1
+ import os
2
+ import random
3
+ from typing import Optional, Callable
4
+ import json
5
+ import glob
6
+ import csv
7
+
8
+ import numpy as np
9
+ import torch
10
+ import librosa
11
+ import pyloudnorm as pyln
12
+ from pedalboard import Pedalboard, Limiter, Gain, Compressor, Clipping
13
+
14
+ from .dataset import (
15
+ MusdbTrainDataset,
16
+ MusdbValidDataset,
17
+ apply_limitaug,
18
+ )
19
+ from utils import (
20
+ load_wav_arbitrary_position_stereo,
21
+ load_wav_specific_position_stereo,
22
+ db2linear,
23
+ )
24
+
25
+
26
+ class DelimitTrainDataset(MusdbTrainDataset):
27
+ def __init__(
28
+ self,
29
+ target: str = "all",
30
+ root: str = None,
31
+ seq_duration: Optional[float] = 6.0,
32
+ samples_per_track: int = 64,
33
+ source_augmentations: Optional[Callable] = lambda audio: audio,
34
+ sample_rate: int = 44100,
35
+ seed: int = 42,
36
+ limitaug_method: str = "limitaug",
37
+ limitaug_mode: str = "normal_L",
38
+ limitaug_custom_target_lufs: float = None,
39
+ limitaug_custom_target_lufs_std: float = None,
40
+ target_loudnorm_lufs: float = -14.0,
41
+ target_limitaug_mode: str = None,
42
+ target_limitaug_custom_target_lufs: float = None,
43
+ target_limitaug_custom_target_lufs_std: float = None,
44
+ custom_limiter_attack_range: list = [2.0, 2.0],
45
+ custom_limiter_release_range: list = [200.0, 200.0],
46
+ *args,
47
+ **kwargs,
48
+ ) -> None:
49
+ super().__init__(
50
+ target=target,
51
+ root=root,
52
+ seq_duration=seq_duration,
53
+ samples_per_track=samples_per_track,
54
+ source_augmentations=source_augmentations,
55
+ sample_rate=sample_rate,
56
+ seed=seed,
57
+ limitaug_method=limitaug_method,
58
+ limitaug_mode=limitaug_mode,
59
+ limitaug_custom_target_lufs=limitaug_custom_target_lufs,
60
+ limitaug_custom_target_lufs_std=limitaug_custom_target_lufs_std,
61
+ target_loudnorm_lufs=target_loudnorm_lufs,
62
+ custom_limiter_attack_range=custom_limiter_attack_range,
63
+ custom_limiter_release_range=custom_limiter_release_range,
64
+ *args,
65
+ **kwargs,
66
+ )
67
+
68
+ self.target_limitaug_mode = target_limitaug_mode
69
+
70
+ self.target_limitaug_custom_target_lufs = (target_limitaug_custom_target_lufs,)
71
+ self.target_limitaug_custom_target_lufs_std = (
72
+ target_limitaug_custom_target_lufs_std,
73
+ )
74
+ self.limitaug_mode_statistics["target_custom"] = [
75
+ target_limitaug_custom_target_lufs,
76
+ target_limitaug_custom_target_lufs_std,
77
+ ]
78
+
79
+ """
80
+ Parameters
81
+ ----------
82
+ limitaug_method : str
83
+ choose from ["linear_gain_increase", "limitaug", "limitaug_then_loudnorm", "only_loudnorm"]
84
+ limitaug_mode : str
85
+ choose from ["uniform", "normal", "normal_L", "normal_XL", "normal_short_term", "normal_L_short_term", "normal_XL_short_term", "custom"]
86
+ limitaug_custom_target_lufs : float
87
+ valid only when
88
+ limitaug_mode == "custom"
89
+ target_loudnorm_lufs : float
90
+ valid only when
91
+ limitaug_method == 'limitaug_then_loudnorm' or 'only_loudnorm'
92
+ default is -14.
93
+ To the best of my knowledge, Spotify and Youtube music is using -14 as a reference loudness normalization level.
94
+ No special reason for the choice of -14 as target_loudnorm_lufs.
95
+ target : str
96
+ target name of the source to be separated, defaults to ``vocals``.
97
+ root : str
98
+ root path of MUSDB
99
+ seq_duration : float
100
+ training is performed in chunks of ``seq_duration`` (in seconds,
101
+ defaults to ``None`` which loads the full audio track
102
+ samples_per_track : int
103
+ sets the number of samples, yielded from each track per epoch.
104
+ Defaults to 64
105
+ source_augmentations : list[callables]
106
+ provide list of augmentation function that take a multi-channel
107
+ audio file of shape (src, samples) as input and output. Defaults to
108
+ no-augmentations (input = output)
109
+ seed : int
110
+ control randomness of dataset iterations
111
+ args, kwargs : additional keyword arguments
112
+ used to add further control for the musdb dataset
113
+ initialization function.
114
+ """
115
+
116
+ # Get a limitaug result without target (individual stem source)
117
+ def get_limitaug_mixture(self, mixture):
118
+ if self.limitaug_method == "limitaug":
119
+ self.board[1].release_ms = random.uniform(30.0, 200.0)
120
+ target_lufs = self.sample_target_lufs()
121
+ mixture_limited, mixture_lufs = apply_limitaug(
122
+ mixture,
123
+ self.board,
124
+ self.meter,
125
+ self.sample_rate,
126
+ target_lufs=target_lufs,
127
+ )
128
+
129
+ elif self.limitaug_method == "limitaug_then_loudnorm":
130
+ self.board[1].release_ms = random.uniform(30.0, 200.0)
131
+ target_lufs = self.sample_target_lufs()
132
+ mixture_limited, mixture_lufs = (
133
+ apply_limitaug(
134
+ mixture,
135
+ self.board,
136
+ self.meter,
137
+ self.sample_rate,
138
+ target_lufs=target_lufs,
139
+ target_loudnorm_lufs=self.target_loudnorm_lufs,
140
+ ),
141
+ )
142
+
143
+ # Apply LimitAug using Custom Limiter
144
+ elif self.limitaug_method == "custom_limiter_limitaug":
145
+ # Change attack time of First compressor of the Limiter
146
+ self.board[1].attack_ms = random.uniform(
147
+ self.custom_limiter_attack_range[0], self.custom_limiter_attack_range[1]
148
+ )
149
+ # Change release time of First compressor of the Limiter
150
+ self.board[1].release_ms = random.uniform(
151
+ self.custom_limiter_release_range[0],
152
+ self.custom_limiter_release_range[1],
153
+ )
154
+ # Change release time of Second compressor of the Limiter
155
+ self.board[2].release_ms = random.uniform(30.0, 200.0)
156
+ target_lufs = self.sample_target_lufs()
157
+ mixture_limited, mixture_lufs = apply_limitaug(
158
+ mixture,
159
+ self.board,
160
+ self.meter,
161
+ self.sample_rate,
162
+ target_lufs=target_lufs,
163
+ target_loudnorm_lufs=self.target_loudnorm_lufs,
164
+ )
165
+
166
+ # When we want to force NN to output an appropriately compressed target output
167
+ if self.target_limitaug_mode:
168
+ mixture_target_lufs = random.gauss(
169
+ self.limitaug_mode_statistics[self.target_limitaug_mode][0],
170
+ self.limitaug_mode_statistics[self.target_limitaug_mode][1],
171
+ )
172
+ mixture, target_lufs = apply_limitaug(
173
+ mixture,
174
+ self.board,
175
+ self.meter,
176
+ self.sample_rate,
177
+ target_lufs=mixture_target_lufs,
178
+ loudness=mixture_lufs,
179
+ )
180
+
181
+ if np.isinf(mixture_lufs):
182
+ mixture_loudnorm = mixture
183
+ else:
184
+ augmented_gain = self.target_loudnorm_lufs - mixture_lufs
185
+ mixture_loudnorm = mixture * db2linear(augmented_gain, eps=0.0)
186
+
187
+ return mixture_limited, mixture_loudnorm
188
+
189
+ def __getitem__(self, index):
190
+ audio_sources = []
191
+
192
+ for k, source in enumerate(self.sources):
193
+ # memorize index of target source
194
+ if source == self.target: # if source is 'vocals'
195
+ track_path = self.train_list[
196
+ index // self.samples_per_track
197
+ ] # we want to use # training samples per each track.
198
+ audio_path = f"{track_path}/{source}.wav"
199
+ audio = load_wav_arbitrary_position_stereo(
200
+ audio_path, self.sample_rate, self.seq_duration
201
+ )
202
+ else:
203
+ track_path = random.choice(self.train_list)
204
+ audio_path = f"{track_path}/{source}.wav"
205
+ audio = load_wav_arbitrary_position_stereo(
206
+ audio_path, self.sample_rate, self.seq_duration
207
+ )
208
+ audio = self.source_augmentations(audio)
209
+ audio_sources.append(audio)
210
+
211
+ stems = np.stack(audio_sources, axis=0)
212
+
213
+ # apply linear mix over source index=0
214
+ # and here, linear mixture is a target unlike in MusdbTrainDataset
215
+ mixture = stems.sum(0)
216
+ mixture_limited, mixture_loudnorm = self.get_limitaug_mixture(mixture)
217
+ # We will give mixture_limited as an input and mixture_loudnorm as a target to the model.
218
+
219
+ mixture_limited = np.clip(mixture_limited, -1.0, 1.0)
220
+ mixture_limited = torch.as_tensor(mixture_limited, dtype=torch.float32)
221
+ mixture_loudnorm = torch.as_tensor(mixture_loudnorm, dtype=torch.float32)
222
+
223
+ return mixture_limited, mixture_loudnorm
224
+
225
+
226
+ class OzoneTrainDataset(DelimitTrainDataset):
227
+ def __init__(
228
+ self,
229
+ target: str = "all",
230
+ root: str = None,
231
+ ozone_root: str = None,
232
+ use_fixed: float = 0.1, # ratio of fixed samples
233
+ seq_duration: Optional[float] = 6.0,
234
+ samples_per_track: int = 64,
235
+ source_augmentations: Optional[Callable] = lambda audio: audio,
236
+ sample_rate: int = 44100,
237
+ seed: int = 42,
238
+ limitaug_method: str = "limitaug",
239
+ limitaug_mode: str = "normal_L",
240
+ limitaug_custom_target_lufs: float = None,
241
+ limitaug_custom_target_lufs_std: float = None,
242
+ target_loudnorm_lufs: float = -14.0,
243
+ target_limitaug_mode: str = None,
244
+ target_limitaug_custom_target_lufs: float = None,
245
+ target_limitaug_custom_target_lufs_std: float = None,
246
+ custom_limiter_attack_range: list = [2.0, 2.0],
247
+ custom_limiter_release_range: list = [200.0, 200.0],
248
+ *args,
249
+ **kwargs,
250
+ ) -> None:
251
+ super().__init__(
252
+ target,
253
+ root,
254
+ seq_duration,
255
+ samples_per_track,
256
+ source_augmentations,
257
+ sample_rate,
258
+ seed,
259
+ limitaug_method,
260
+ limitaug_mode,
261
+ limitaug_custom_target_lufs,
262
+ limitaug_custom_target_lufs_std,
263
+ target_loudnorm_lufs,
264
+ target_limitaug_mode,
265
+ target_limitaug_custom_target_lufs,
266
+ target_limitaug_custom_target_lufs_std,
267
+ custom_limiter_attack_range,
268
+ custom_limiter_release_range,
269
+ *args,
270
+ **kwargs,
271
+ )
272
+
273
+ self.ozone_root = ozone_root
274
+ self.use_fixed = use_fixed
275
+ self.list_train_fixed = glob.glob(f"{self.ozone_root}/ozone_train_fixed/*.wav")
276
+ self.list_train_random = glob.glob(
277
+ f"{self.ozone_root}/ozone_train_random/*.wav"
278
+ )
279
+ self.dict_train_random = {}
280
+
281
+ # Load information of pre-generated random training examples
282
+ list_csv_files = glob.glob(f"{self.ozone_root}/ozone_train_random_*.csv")
283
+ list_csv_files.sort()
284
+ for csv_file in list_csv_files:
285
+ with open(csv_file, "r") as f:
286
+ reader = csv.reader(f)
287
+ next(reader)
288
+ for row in reader:
289
+ self.dict_train_random[row[0]] = {
290
+ "max_threshold": float(row[1]),
291
+ "max_character": float(row[2]),
292
+ "vocals": {
293
+ "name": row[3],
294
+ "start_sec": float(row[4]),
295
+ "gain": float(row[5]),
296
+ "channelswap": bool(row[6]),
297
+ },
298
+ "bass": {
299
+ "name": row[7],
300
+ "start_sec": float(row[8]),
301
+ "gain": float(row[9]),
302
+ "channelswap": bool(row[10]),
303
+ },
304
+ "drums": {
305
+ "name": row[11],
306
+ "start_sec": float(row[12]),
307
+ "gain": float(row[13]),
308
+ "channelswap": bool(row[14]),
309
+ },
310
+ "other": {
311
+ "name": row[15],
312
+ "start_sec": float(row[16]),
313
+ "gain": float(row[17]),
314
+ "channelswap": bool(row[18]),
315
+ },
316
+ }
317
+
318
+ def __getitem__(self, idx):
319
+ use_fixed_prob = random.random()
320
+
321
+ if use_fixed_prob <= self.use_fixed:
322
+ # Fixed examples
323
+ audio_path = random.choice(self.list_train_fixed)
324
+ song_name = os.path.basename(audio_path).replace(".wav", "")
325
+ mixture_limited, start_pos_sec = load_wav_arbitrary_position_stereo(
326
+ audio_path, self.sample_rate, self.seq_duration, return_pos=True
327
+ )
328
+
329
+ audio_sources = []
330
+ track_path = f"{self.root}/train/{song_name}"
331
+ for source in self.sources:
332
+ audio_path = f"{track_path}/{source}.wav"
333
+ audio = load_wav_specific_position_stereo(
334
+ audio_path,
335
+ self.sample_rate,
336
+ self.seq_duration,
337
+ start_position=start_pos_sec,
338
+ )
339
+ audio_sources.append(audio)
340
+
341
+ else:
342
+ # Random examples
343
+ # Load mixture_limited (pre-generated)
344
+ audio_path = random.choice(self.list_train_random)
345
+ seg_name = os.path.basename(audio_path).replace(".wav", "")
346
+ mixture_limited, sr = librosa.load(
347
+ audio_path, sr=self.sample_rate, mono=False
348
+ )
349
+
350
+ # Load mixture_unlimited (from the original musdb18, using metadata)
351
+ audio_sources = []
352
+ for source in self.sources:
353
+ dict_seg_info = self.dict_train_random[seg_name]
354
+ dict_seg_source_info = dict_seg_info[source]
355
+ audio_path = (
356
+ f"{self.root}/train/{dict_seg_source_info['name']}/{source}.wav"
357
+ )
358
+ audio = load_wav_specific_position_stereo(
359
+ audio_path,
360
+ self.sample_rate,
361
+ self.seq_duration,
362
+ start_position=dict_seg_source_info["start_sec"],
363
+ )
364
+
365
+ # apply augmentations
366
+ audio = audio * dict_seg_source_info["gain"]
367
+ if dict_seg_source_info["channelswap"]:
368
+ audio = np.flip(audio, axis=0)
369
+
370
+ audio_sources.append(audio)
371
+
372
+ stems = np.stack(audio_sources, axis=0)
373
+ mixture = stems.sum(axis=0)
374
+ mixture_lufs = self.meter.integrated_loudness(mixture.T)
375
+ if np.isinf(mixture_lufs):
376
+ mixture_loudnorm = mixture
377
+ else:
378
+ augmented_gain = self.target_loudnorm_lufs - mixture_lufs
379
+ mixture_loudnorm = mixture * db2linear(augmented_gain, eps=0.0)
380
+
381
+ return mixture_limited, mixture_loudnorm
382
+
383
+
384
+ class DelimitValidDataset(MusdbValidDataset):
385
+ def __init__(
386
+ self,
387
+ target: str = "vocals",
388
+ root: str = None,
389
+ delimit_valid_root: str = None,
390
+ valid_target_lufs: float = -8.05, # From the Table 1 of the "Towards robust music source separation on loud commercial music" paper, the average loudness of commerical music.
391
+ target_loudnorm_lufs: float = -14.0,
392
+ delimit_valid_L_root: str = None, # This will be used when using the target as compressed (normal_L) mixture.
393
+ use_custom_limiter: bool = False,
394
+ custom_limiter_attack_range: list = [0.1, 10.0],
395
+ custom_limiter_release_range: list = [30.0, 200.0],
396
+ *args,
397
+ **kwargs,
398
+ ) -> None:
399
+ super().__init__(target=target, root=root, *args, **kwargs)
400
+ self.delimit_valid_root = delimit_valid_root
401
+ if self.delimit_valid_root:
402
+ with open(f"{self.delimit_valid_root}/valid_loudness.json", "r") as f:
403
+ self.dict_valid_loudness = json.load(f)
404
+ self.delimit_valid_L_root = delimit_valid_L_root
405
+ if self.delimit_valid_L_root:
406
+ with open(f"{self.delimit_valid_L_root}/valid_loudness.json", "r") as f:
407
+ self.dict_valid_L_loudness = json.load(f)
408
+
409
+ self.valid_target_lufs = valid_target_lufs
410
+ self.target_loudnorm_lufs = target_loudnorm_lufs
411
+ self.meter = pyln.Meter(self.sample_rate)
412
+ self.use_custom_limiter = use_custom_limiter
413
+
414
+ if self.use_custom_limiter:
415
+ print("using Custom limiter limitaug for validation!!")
416
+ self.custom_limiter_attack_range = custom_limiter_attack_range
417
+ self.custom_limiter_release_range = custom_limiter_release_range
418
+ self.board = Pedalboard(
419
+ [
420
+ Gain(gain_db=0.0),
421
+ Compressor(
422
+ threshold_db=-10.0, ratio=4.0, attack_ms=2.0, release_ms=200.0
423
+ ), # attack_ms and release_ms will be changed later.
424
+ Compressor(
425
+ threshold_db=0.0,
426
+ ratio=1000.0,
427
+ attack_ms=0.001,
428
+ release_ms=100.0,
429
+ ),
430
+ Gain(gain_db=3.75),
431
+ Clipping(threshold_db=0.0),
432
+ ]
433
+ ) # This implementation is the same as JUCE Limiter.
434
+ # However, we want the first compressor to have a variable attack and release time.
435
+ # Therefore, we use the Custom Limiter instead of the JUCE Limiter.
436
+ else:
437
+ self.board = Pedalboard(
438
+ [Gain(gain_db=0.0), Limiter(threshold_db=0.0, release_ms=100.0)]
439
+ ) # Currently, we are using a limiter with a release time of 100ms.
440
+
441
+ def __getitem__(self, index):
442
+ audio_sources = []
443
+ target_ind = None
444
+
445
+ for k, source in enumerate(self.sources):
446
+ # memorize index of target source
447
+ if source == self.target: # if source is 'vocals'
448
+ target_ind = k
449
+ track_path = self.valid_list[index]
450
+ song_name = os.path.basename(track_path)
451
+ audio_path = f"{track_path}/{source}.wav"
452
+ # audio = utils.load_wav_stereo(audio_path, self.sample_rate)
453
+ audio = librosa.load(audio_path, mono=False, sr=self.sample_rate)[0]
454
+ else:
455
+ track_path = self.valid_list[index]
456
+ song_name = os.path.basename(track_path)
457
+ audio_path = f"{track_path}/{source}.wav"
458
+ # audio = utils.load_wav_stereo(audio_path, self.sample_rate)
459
+ audio = librosa.load(audio_path, mono=False, sr=self.sample_rate)[0]
460
+
461
+ audio = torch.as_tensor(audio, dtype=torch.float32)
462
+ audio_sources.append(audio)
463
+
464
+ stems = np.stack(audio_sources, axis=0)
465
+
466
+ # apply linear mix over source index=0
467
+ # and here, linear mixture is a target unlike in MusdbTrainDataset
468
+ mixture = stems.sum(0)
469
+ if (
470
+ self.delimit_valid_root
471
+ ): # If there exists a pre-processed delimit valid dataset
472
+ audio_path = f"{self.delimit_valid_root}/valid/{song_name}.wav"
473
+ mixture_limited = librosa.load(audio_path, mono=False, sr=self.sample_rate)[
474
+ 0
475
+ ]
476
+ mixture_lufs = self.dict_valid_loudness[song_name]
477
+
478
+ else:
479
+ if self.use_custom_limiter:
480
+ custom_limiter_attack = random.uniform(
481
+ self.custom_limiter_attack_range[0],
482
+ self.custom_limiter_attack_range[1],
483
+ )
484
+ self.board[1].attack_ms = custom_limiter_attack
485
+
486
+ custom_limiter_release = random.uniform(
487
+ self.custom_limiter_release_range[0],
488
+ self.custom_limiter_release_range[1],
489
+ )
490
+ self.board[1].release_ms = custom_limiter_release
491
+
492
+ mixture_limited, mixture_lufs = apply_limitaug(
493
+ mixture,
494
+ self.board,
495
+ self.meter,
496
+ self.sample_rate,
497
+ target_lufs=self.valid_target_lufs,
498
+ )
499
+ else:
500
+ mixture_limited, mixture_lufs = apply_limitaug(
501
+ mixture,
502
+ self.board,
503
+ self.meter,
504
+ self.sample_rate,
505
+ target_lufs=self.valid_target_lufs,
506
+ # target_loudnorm_lufs=self.target_loudnorm_lufs,
507
+ ) # mixture_limited is a limiter applied mixture
508
+ # We will give mixture_limited as an input and mixture_loudnorm as a target to the model.
509
+
510
+ if self.delimit_valid_L_root:
511
+ audio_L_path = f"{self.delimit_valid_L_root}/valid/{song_name}.wav"
512
+ mixture_loudnorm = librosa.load(
513
+ audio_L_path, mono=False, sr=self.sample_rate
514
+ )[0]
515
+ mixture_lufs = self.dict_valid_L_loudness[song_name]
516
+ mixture = mixture_loudnorm
517
+
518
+ augmented_gain = self.target_loudnorm_lufs - mixture_lufs
519
+ mixture_loudnorm = mixture * db2linear(augmented_gain)
520
+
521
+ if self.use_custom_limiter:
522
+ return (
523
+ mixture_limited,
524
+ mixture_loudnorm,
525
+ song_name,
526
+ mixture_lufs,
527
+ custom_limiter_attack,
528
+ custom_limiter_release,
529
+ )
530
+ else:
531
+ return mixture_limited, mixture_loudnorm, song_name, mixture_lufs
532
+
533
+
534
+ class OzoneValidDataset(MusdbValidDataset):
535
+ def __init__(
536
+ self,
537
+ target: str = "all",
538
+ root: str = None,
539
+ ozone_root: str = None,
540
+ target_loudnorm_lufs: float = -14.0,
541
+ *args,
542
+ **kwargs,
543
+ ) -> None:
544
+ super().__init__(target=target, root=root, *args, **kwargs)
545
+
546
+ self.ozone_root = ozone_root
547
+ self.target_loudnorm_lufs = target_loudnorm_lufs
548
+
549
+ with open(f"{self.ozone_root}/valid_loudness.json", "r") as f:
550
+ self.dict_valid_loudness = json.load(f)
551
+
552
+ def __getitem__(self, index):
553
+ audio_sources = []
554
+
555
+ track_path = self.valid_list[index]
556
+ song_name = os.path.basename(track_path)
557
+ for k, source in enumerate(self.sources):
558
+ audio_path = f"{track_path}/{source}.wav"
559
+ audio = librosa.load(audio_path, mono=False, sr=self.sample_rate)[0]
560
+ audio_sources.append(audio)
561
+
562
+ stems = np.stack(audio_sources, axis=0)
563
+
564
+ mixture = stems.sum(0)
565
+
566
+ audio_path = f"{self.ozone_root}/ozone_train_fixed/{song_name}.wav"
567
+ mixture_limited = librosa.load(audio_path, mono=False, sr=self.sample_rate)[0]
568
+
569
+ mixture_lufs = self.dict_valid_loudness[song_name]
570
+ augmented_gain = self.target_loudnorm_lufs - mixture_lufs
571
+ mixture_loudnorm = mixture * db2linear(augmented_gain)
572
+
573
+ return mixture_limited, mixture_loudnorm, song_name, mixture_lufs
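
For reference, the loudness normalization in both validation datasets reduces to a dB-difference gain. A minimal sketch, assuming `db2linear` in utils.py (not shown here) follows the usual 10 ** (dB / 20) convention:

import numpy as np

def db2linear(x_db, eps=0.0):
    # 10 ** (dB / 20) maps a decibel gain to a linear amplitude factor
    return 10 ** (x_db / 20) + eps

measured_lufs = -9.5              # hypothetical integrated loudness of a mixture
target_loudnorm_lufs = -14.0      # the target used by the datasets above
augmented_gain = target_loudnorm_lufs - measured_lufs   # -4.5 dB
mixture_loudnorm = np.ones(4) * db2linear(augmented_gain)  # scales samples by ~0.596
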
dataloader/singleset.py ADDED
@@ -0,0 +1,95 @@
1
+ import math
2
+
3
+ import torch
4
+ from torch.utils.data import Dataset
5
+ import torch.nn.functional as F
6
+
7
+ # Modified version from woosungchoi's original implementation
8
+ class SingleTrackSet(Dataset):
9
+ def __init__(self, track, hop_length, num_frame=128, target_name="vocals"):
10
+
11
+ assert len(track.shape) == 2
12
+ assert track.shape[0] == 2 # check stereo audio
13
+
14
+ self.hop_length = hop_length
15
+ self.window_length = hop_length * (num_frame - 1) # 130048
16
+ self.trim_length = self.get_trim_length(self.hop_length) # 5120
17
+
18
+ self.true_samples = self.window_length - 2 * self.trim_length # 119808
19
+
20
+ self.lengths = [track.shape[1]] # track lengths (in sample level)
21
+ self.source_names = [
22
+ "vocals",
23
+ "drums",
24
+ "bass",
25
+ "other",
26
+ ] # == self.musdb_train.targets_names[:-2]
27
+
28
+ self.target_names = [target_name]
29
+
30
+ self.num_tracks = 1
31
+
32
+ # (math is already imported at the top of this file; duplicate import removed)
33
+
34
+ num_chunks = [
35
+ math.ceil(length / self.true_samples) for length in self.lengths
36
+ ] # example : 44.1khz 180sec audio, => [67]
37
+ self.acc_chunk_final_ids = [
38
+ sum(num_chunks[: i + 1]) for i in range(self.num_tracks)
39
+ ] # [67]
40
+
41
+ self.cache_mode = True
42
+ self.cache = {}
43
+ self.cache[0] = {}
44
+ self.cache[0]["linear_mixture"] = track
45
+
46
+ def __len__(self):
47
+ return self.acc_chunk_final_ids[-1] * len(self.target_names) # 67
48
+
49
+ def __getitem__(self, idx):
50
+
51
+ target_offset = idx % len(self.target_names) # 0
52
+ idx = idx // len(self.target_names) # idx
53
+
54
+ target_name = self.target_names[target_offset] # 'vocals'
55
+ mixture_idx, start_pos = self.idx_to_track_offset(
56
+ idx
57
+ ) # idx * self.true_samples
58
+
59
+ length = self.true_samples
60
+ left_padding_num = right_padding_num = self.trim_length # 5120
61
+ if mixture_idx is None:
62
+ raise IndexError(idx)  # out-of-range chunk index; __len__ keeps valid indices below this
63
+ mixture_length = self.lengths[mixture_idx]
64
+ if start_pos + length > mixture_length: # last
65
+ right_padding_num += self.true_samples - (mixture_length - start_pos)
66
+ length = None
67
+
68
+ mixture = self.get_audio(mixture_idx, "linear_mixture", start_pos, length)
69
+ mixture = F.pad(mixture, (left_padding_num, right_padding_num), "constant", 0)
70
+
71
+ return mixture
72
+
73
+ def idx_to_track_offset(self, idx):
74
+
75
+ for i, last_chunk in enumerate(self.acc_chunk_final_ids):
76
+ if idx < last_chunk:
77
+ if i != 0:
78
+ offset = (idx - self.acc_chunk_final_ids[i - 1]) * self.true_samples
79
+ else:
80
+ offset = idx * self.true_samples
81
+ return i, offset
82
+
83
+ return None, None
84
+
85
+ def get_audio(self, idx, target_name, pos=0, length=None):
86
+ track = self.cache[idx][target_name]
87
+ return track[:, pos : pos + length] if length is not None else track[:, pos:]
88
+
89
+ def get_trim_length(self, hop_length, min_trim=5000):
90
+ trim_per_hop = math.ceil(min_trim / hop_length)
91
+
92
+ trim_length = trim_per_hop * hop_length
93
+ assert trim_per_hop > 1
94
+ return trim_length
95
+
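
A usage sketch of SingleTrackSet under hop_length=1024 (so window_length=130048, trim_length=5120, true_samples=119808); the identity assignment stands in for a real model call:

import torch
from torch.utils.data import DataLoader

track = torch.randn(2, 44100 * 30)                  # 30 s of stereo audio
dataset = SingleTrackSet(track, hop_length=1024, num_frame=128)
loader = DataLoader(dataset, batch_size=4, shuffle=False)

outs = []
for chunk in loader:                                # (batch, 2, window_length)
    with torch.no_grad():
        est = chunk                                 # stand-in for model(chunk)
    # discard the padded borders; each chunk contributes true_samples samples
    outs.append(est[..., dataset.trim_length : -dataset.trim_length])
est_track = torch.cat(outs, dim=0).permute(1, 0, 2).reshape(2, -1)[:, : track.shape[1]]
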
eval_delimit/calc_flops.py ADDED
@@ -0,0 +1,44 @@
1
+ import os
2
+ import argparse
3
+ import random
4
+
5
+ import torch
6
+ from deepspeed.profiling.flops_profiler import get_model_profile
7
+
8
+ from utils import get_config
9
+ from models import load_model_with_args
10
+
11
+
12
+ # def main():
13
+ parser = argparse.ArgumentParser(description="FLOPs calculation")
14
+
15
+ parser.add_argument(
16
+ "-c", "--config", default="delimit_6_s", type=str, help="Name of the setting file."
17
+ )
18
+
19
+ config_args = parser.parse_args()
20
+
21
+ args = get_config(config_args.config)
22
+ print(args)
23
+
24
+ with torch.cuda.device(0):
25
+ model = load_model_with_args(args)
26
+ batch_size = 1
27
+ flops, macs, params = get_model_profile(
28
+ model=model, # model
29
+ input_shape=(batch_size, 2, 44100 * 60), # input shape to the model. If specified, the model takes a tensor with this shape as the only positional argument.
30
+ args=[], # list of positional arguments to the model.
31
+ kwargs={}, # dictionary of keyword arguments to the model.
32
+ print_profile=True, # prints the model graph with the measured profile attached to each module
33
+ detailed=True, # print the detailed profile
34
+ module_depth=-1, # depth into the nested modules, with -1 being the inner most modules
35
+ top_modules=1, # the number of top modules to print aggregated profile
36
+ warm_up=1, # the number of warm-ups before measuring the time of each module
37
+ as_string=True, # print raw numbers (e.g. 1000) or as human-readable strings (e.g. 1k)
38
+ output_file=None, # path to the output file. If None, the profiler prints to stdout.
39
+ ignore_modules=None,
40
+ ) # the list of modules to ignore in the profiling
41
+ print(args.dir_params.exp_name)
42
+ print('flops: ', flops)
43
+ print('macs: ', macs)
44
+ print('params: ', params)
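
The profiled input (1, 2, 44100 * 60) is one minute of 44.1 kHz stereo audio, i.e. 2,646,000 samples per channel, so dividing the measured FLOPs by 60 gives the cost per second of audio. A sketch, assuming as_string=False so the profiler returns raw numbers instead of formatted strings:

flops, macs, params = get_model_profile(
    model=model, input_shape=(1, 2, 44100 * 60), args=[], kwargs={}, as_string=False
)
print(f"GFLOPs per second of audio: {flops / 60 / 1e9:.2f}")
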
eval_delimit/score_calc_delimit.py ADDED
@@ -0,0 +1,145 @@
1
+ # Calculate SI-SDR and multi-resolution spectrogram MSE scores of the pre-inferred sources
2
+ import os
3
+ import argparse
4
+ import csv
5
+ import json
6
+ import glob
7
+
8
+ import tqdm
9
+ import numpy as np
10
+ import librosa
11
+ import pyloudnorm as pyln
12
+ from asteroid.metrics import get_metrics
13
+
14
+ from utils import str2bool
15
+
16
+
17
+ def multi_resolution_spectrogram_mse(
18
+ gt, est, n_fft=[2048, 1024, 512], n_hop=[512, 256, 128]
19
+ ):
20
+ assert gt.shape == est.shape
21
+ assert len(n_fft) == len(n_hop)
22
+
23
+ score = 0.0
24
+ for i in range(len(n_fft)):
25
+ gt_spec = librosa.magphase(
26
+ librosa.stft(gt, n_fft=n_fft[i], hop_length=n_hop[i])
27
+ )[0]
28
+ est_spec = librosa.magphase(
29
+ librosa.stft(est, n_fft=n_fft[i], hop_length=n_hop[i])
30
+ )[0]
31
+ score = score + np.mean((gt_spec - est_spec) ** 2)
32
+
33
+ return score
34
+
35
+
36
+ parser = argparse.ArgumentParser(description="model test.py")
37
+
38
+ parser.add_argument(
39
+ "--target",
40
+ type=str,
41
+ default="all",
42
+ help="target source. all, vocals, drums, bass, other, 0.5_mixed",
43
+ )
44
+ parser.add_argument(
45
+ "--root", type=str, default="/path/to/musdb18hq_loudnorm"
46
+ )
47
+ parser.add_argument("--exp_name", type=str, default="convtasnet_6_s")
48
+ parser.add_argument(
49
+ "--output_directory",
50
+ type=str,
51
+ default="/path/to/results",
52
+ )
53
+ parser.add_argument("--loudnorm_lufs", type=float, default=-14.0)
54
+ parser.add_argument(
55
+ "--calc_mse",
56
+ type=str2bool,
57
+ default=True,
58
+ help="calculate multi-resolution spectrogram mse",
59
+ )
60
+
61
+ parser.add_argument(
62
+ "--calc_results",
63
+ type=str2bool,
64
+ default=True,
65
+ help="Set this True when you want to calculate the results of the test set. Set this False when calculating musdb-hq vs musdb-XL. (top row in Table 1.)",
66
+ )
67
+
68
+ args, _ = parser.parse_known_args()
69
+
70
+ args.sample_rate = 44100
71
+
72
+ meter = pyln.Meter(args.sample_rate)
73
+
74
+ if args.calc_results:
75
+ args.test_output_dir = f"{args.output_directory}/test/{args.exp_name}"
76
+ else:
77
+ args.test_output_dir = f"{args.output_directory}/{args.exp_name}"
78
+
79
+ if args.target == "all" or args.target == "0.5_mixed":
80
+ test_tracks = glob.glob(f"{args.root}/*/mixture.wav")
81
+ else:
82
+ test_tracks = glob.glob(f"{args.root}/*/{args.target}.wav")
83
+ i = 0
84
+
85
+ dict_song_score = {}
86
+ list_si_sdr = []
87
+ list_multi_mse = []
88
+ for track in tqdm.tqdm(test_tracks):
89
+ if args.target == "all": # for standard de-limiter estimation
90
+ audio_name = os.path.basename(os.path.dirname(track))
91
+ gt_source = librosa.load(track, sr=args.sample_rate, mono=False)[0]
92
+
93
+ est_delimiter = librosa.load(
94
+ f"{args.test_output_dir}/{audio_name}/all.wav",
95
+ sr=args.sample_rate,
96
+ mono=False,
97
+ )[0]
98
+
99
+ else: # for source-separated de-limiter estimation
100
+ audio_name = os.path.basename(os.path.dirname(track))
101
+ gt_source = librosa.load(track, sr=args.sample_rate, mono=False)[0]
102
+ est_delimiter = librosa.load(
103
+ f"{args.test_output_dir}/{audio_name}/{args.target}.wav",
104
+ sr=args.sample_rate,
105
+ mono=False,
106
+ )[0]
107
+
108
+
109
+ metrics_dict = get_metrics(
110
+ gt_source + est_delimiter,
111
+ gt_source,
112
+ est_delimiter,
113
+ sample_rate=args.sample_rate,
114
+ metrics_list=["si_sdr"],
115
+ )
116
+
117
+ if args.calc_mse:
118
+ multi_resolution_spectrogram_mse_score = multi_resolution_spectrogram_mse(
119
+ gt_source, est_delimiter
120
+ )
121
+ else:
122
+ multi_resolution_spectrogram_mse_score = None
123
+
124
+ dict_song_score[audio_name] = {
125
+ "si_sdr": metrics_dict["si_sdr"],
126
+ "multi_mse": multi_resolution_spectrogram_mse_score,
127
+ }
128
+ list_si_sdr.append(metrics_dict["si_sdr"])
129
+ list_multi_mse.append(multi_resolution_spectrogram_mse_score)
130
+
131
+ i += 1
132
+
133
+ print(f"{args.exp_name} on {args.target}")
134
+ print(f"SI-SDR score: {sum(list_si_sdr) / len(list_si_sdr)}")
135
+ if args.calc_mse:
136
+ print(f"multi-mse score: {sum(list_multi_mse) / len(list_multi_mse)}")
137
+
138
+ if args.target != "all":
139
+ # save dict_song_score to json file
140
+ with open(f"{args.test_output_dir}/score_{args.target}.json", "w") as f:
141
+ json.dump(dict_song_score, f, indent=4)
142
+ else:
143
+ # save dict_song_score to json file
144
+ with open(f"{args.test_output_dir}/score.json", "w") as f:
145
+ json.dump(dict_song_score, f, indent=4)
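
The SI-SDR above comes from asteroid's get_metrics; for reference, the textbook definition on mono signals looks like this (a sketch, not the asteroid implementation):

import numpy as np

def si_sdr(reference: np.ndarray, estimate: np.ndarray) -> float:
    # scale-invariant SDR: project the estimate onto the reference first,
    # so a global gain on the estimate does not change the score
    alpha = np.dot(estimate, reference) / np.dot(reference, reference)
    target = alpha * reference
    noise = estimate - target
    return 10 * np.log10(np.sum(target**2) / np.sum(noise**2))
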
eval_delimit/score_diff_dyn_complexity.py ADDED
@@ -0,0 +1,87 @@
1
+ import os
2
+ import argparse
3
+ import csv
4
+ import json
5
+ import glob
6
+
7
+ import tqdm
8
+ import numpy as np
9
+ import librosa
10
+ import musdb
11
+ import pyloudnorm as pyln
12
+
13
+ from utils import str2bool, db2linear
14
+
15
+ parser = argparse.ArgumentParser(description="model test.py")
16
+
17
+ parser.add_argument(
18
+ "--target",
19
+ type=str,
20
+ default="all",
21
+ help="target source. all, vocals, bass, drums, other.",
22
+ )
23
+ parser.add_argument(
24
+ "--root",
25
+ type=str,
26
+ default="/path/to/musdb18hq_loudnorm",
27
+ )
28
+ parser.add_argument(
29
+ "--output_directory",
30
+ type=str,
31
+ default="/path/to/results",
32
+ )
33
+ parser.add_argument("--exp_name", type=str, default="convtasnet_6_s")
34
+ parser.add_argument(
35
+ "--calc_results",
36
+ type=str2bool,
37
+ default=True,
38
+ help="Set this True when you want to calculate the results of the test set. Set this False when calculating musdb-hq vs musdb-XL. (top row in Table 1.)",
39
+ )
40
+
41
+ args, _ = parser.parse_known_args()
42
+
43
+ args.sample_rate = 44100
44
+ meter = pyln.Meter(args.sample_rate)
45
+
46
+ if args.calc_results:
47
+ args.test_output_dir = f"{args.output_directory}/test/{args.exp_name}"
48
+ else:
49
+ args.test_output_dir = f"{args.output_directory}/{args.exp_name}"
50
+
51
+
52
+ est_track_list = glob.glob(f"{args.test_output_dir}/*/{args.target}.wav")
53
+ f = open(
54
+ f"{args.test_output_dir}/score_feature{'' if args.target == 'all' else '_' + args.target}.json",  # match the file names written by score_features.py
55
+ encoding="UTF-8",
56
+ )
57
+ dict_song_score_est = json.loads(f.read())
58
+
59
+ if args.target == "all":
60
+ ref_track_list = glob.glob(f"{args.root}/*/mixture.wav")
61
+ f = open(f"{args.root}/score_feature.json", encoding="UTF-8")
62
+ dict_song_score_ref = json.loads(f.read())
63
+ else:
64
+ ref_track_list = glob.glob(f"{args.root}/*/{args.target}.wav")
65
+ f = open(f"{args.root}/score_feature_{args.target}.json", encoding="UTF-8")
66
+ dict_song_score_ref = json.loads(f.read())
67
+
68
+ i = 0
69
+
70
+ dict_song_score = {}
71
+ list_diff_dynamic_complexity = []
72
+
73
+ for track in tqdm.tqdm(ref_track_list):
74
+ audio_name = os.path.basename(os.path.dirname(track))
75
+ ref_dyn_complexity = dict_song_score_ref[audio_name]["dynamic_complexity_score"]
76
+ est_dyn_complexity = dict_song_score_est[audio_name]["dynamic_complexity_score"]
77
+
78
+ list_diff_dynamic_complexity.append(est_dyn_complexity - ref_dyn_complexity)
79
+
80
+ i += 1
81
+
82
+ print(
83
+ f"Dynamic complexity difference {args.exp_name} vs {os.path.basename(args.root)} on {args.target}"
84
+ )
85
+ print("mean: ", np.mean(list_diff_dynamic_complexity))
86
+ print("median: ", np.median(list_diff_dynamic_complexity))
87
+ print("std: ", np.std(list_diff_dynamic_complexity))
eval_delimit/score_fad.py ADDED
@@ -0,0 +1,75 @@
1
+ # We are going to use FAD based on https://github.com/gudgud96/frechet-audio-distance
2
+ import os
3
+ import subprocess
4
+ import glob
5
+ import argparse
6
+
7
+ from frechet_audio_distance import FrechetAudioDistance
8
+
9
+ from utils import str2bool
10
+
11
+
12
+ parser = argparse.ArgumentParser(description="model test.py")
13
+
14
+ parser.add_argument(
15
+ "--target",
16
+ type=str,
17
+ default="all",
18
+ help="target source. all, vocals, drums, bass, other",
19
+ )
20
+ parser.add_argument(
21
+ "--root",
22
+ type=str,
23
+ default="/path/to/musdb18hq_loudnorm",
24
+ )
25
+ parser.add_argument(
26
+ "--output_directory",
27
+ type=str,
28
+ default="/path/to/results",
29
+ )
30
+ parser.add_argument("--exp_name", type=str, default="delimit_6_s")
31
+ parser.add_argument(
32
+ "--calc_results",
33
+ type=str2bool,
34
+ default=True,
35
+ help="Set this True when you want to calculate the results of the test set. Set this False when calculating musdb-hq vs musdb-XL. (top row in Table 1.)",
36
+ )
37
+
38
+ args, _ = parser.parse_known_args()
39
+
40
+ os.makedirs(f"{args.root}/musdb_hq_loudnorm_16k_mono_link", exist_ok=True)
41
+
42
+ song_list = glob.glob(f"{args.root}/musdb_hq_loudnorm_16k_mono/*/mixture.wav")
43
+ for song in song_list:
44
+ song_name = os.path.basename(os.path.dirname(song))
45
+ subprocess.run(
46
+ f'ln --symbolic "{song}" "{args.root}/musdb_hq_loudnorm_16k_mono_link/{song_name}.wav"',
47
+ shell=True,
48
+ )
49
+
50
+
51
+ if args.calc_results:
52
+ args.test_output_dir = f"{args.output_directory}/test/{args.exp_name}"
53
+ else:
54
+ args.test_output_dir = f"{args.output_directory}/{args.exp_name}"
55
+
56
+ os.makedirs(f"{args.test_output_dir}_16k_mono_link", exist_ok=True)
57
+
58
+ song_list = glob.glob(f"{args.test_output_dir}_16k_mono/*/{args.target}.wav")
59
+ for song in song_list:
60
+ song_name = os.path.basename(os.path.dirname(song))
61
+ subprocess.run(
62
+ f'ln --symbolic "{song}" "{args.test_output_dir}_16k_mono_link/{song_name}.wav"',
63
+ shell=True,
64
+ )
65
+
66
+
67
+ frechet = FrechetAudioDistance()
68
+
69
+ fad_score = frechet.score(
70
+ f"{args.root}/musdb_hq_loudnorm_16k_mono_link",
71
+ f"{args.test_output_dir}_16k_mono_link",
72
+ )
73
+
74
+ print(f"{args.exp_name}")
75
+ print(f"FAD score: {fad_score}")
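
For reference, FAD is the Fréchet distance between two Gaussians fitted to audio embeddings (VGGish by default in this library) of the reference and estimated sets; a sketch of the closed form that the library computes internally:

import numpy as np
from scipy import linalg

def frechet_distance(mu1, sigma1, mu2, sigma2):
    # ||mu1 - mu2||^2 + Tr(S1 + S2 - 2 (S1 S2)^(1/2))
    covmean = linalg.sqrtm(sigma1 @ sigma2)
    if np.iscomplexobj(covmean):
        covmean = covmean.real  # discard numerical noise from sqrtm
    diff = mu1 - mu2
    return diff @ diff + np.trace(sigma1 + sigma2 - 2 * covmean)
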
eval_delimit/score_features.py ADDED
@@ -0,0 +1,233 @@
1
+ import os
2
+ import argparse
3
+ import csv
4
+ import json
5
+ import glob
6
+ from typing import Any, Optional, Union, Collection
7
+
8
+ import tqdm
9
+ import numpy as np
10
+ import librosa
11
+ from librosa.core.spectrum import _spectrogram
12
+ import musdb
13
+ import essentia
14
+ import essentia.standard
15
+ import pyloudnorm as pyln
16
+
17
+ from utils import str2bool, db2linear
18
+
19
+
20
+ def spectral_crest(
21
+ *,
22
+ y: Optional[np.ndarray] = None,
23
+ S: Optional[np.ndarray] = None,
24
+ n_fft: int = 2048,
25
+ hop_length: int = 512,
26
+ win_length: Optional[int] = None,
27
+ window: str = "hann",
28
+ center: bool = True,
29
+ pad_mode: str = "constant",
30
+ amin: float = 1e-10,
31
+ power: float = 2.0,
32
+ ) -> np.ndarray:
33
+ """Compute spectral crest
34
+
35
+ Spectral crest (or tonality coefficient) is a measure of
36
+ the ratio of the maximum of the spectrum to the arithmetic mean of the spectrum
37
+
38
+ A higher spectral crest => a more tonal signal;
39
+ a lower spectral crest => a noisier signal.
40
+
41
+
42
+ Parameters
43
+ ----------
44
+ y : np.ndarray [shape=(..., n)] or None
45
+ audio time series. Multi-channel is supported.
46
+ S : np.ndarray [shape=(..., d, t)] or None
47
+ (optional) pre-computed spectrogram magnitude
48
+ n_fft : int > 0 [scalar]
49
+ FFT window size
50
+ hop_length : int > 0 [scalar]
51
+ hop length for STFT. See `librosa.stft` for details.
52
+ win_length : int <= n_fft [scalar]
53
+ Each frame of audio is windowed by `window()`.
54
+ The window will be of length `win_length` and then padded
55
+ with zeros to match ``n_fft``.
56
+ If unspecified, defaults to ``win_length = n_fft``.
57
+ window : string, tuple, number, function, or np.ndarray [shape=(n_fft,)]
58
+ - a window specification (string, tuple, or number);
59
+ see `scipy.signal.get_window`
60
+ - a window function, such as `scipy.signal.windows.hann`
61
+ - a vector or array of length ``n_fft``
62
+ .. see also:: `librosa.filters.get_window`
63
+ center : boolean
64
+ - If `True`, the signal ``y`` is padded so that frame
65
+ ``t`` is centered at ``y[t * hop_length]``.
66
+ - If `False`, then frame `t` begins at ``y[t * hop_length]``
67
+ pad_mode : string
68
+ If ``center=True``, the padding mode to use at the edges of the signal.
69
+ By default, STFT uses zero padding.
70
+ amin : float > 0 [scalar]
71
+ minimum threshold for ``S`` (=added noise floor for numerical stability)
72
+ power : float > 0 [scalar]
73
+ Exponent for the magnitude spectrogram.
74
+ e.g., 1 for energy, 2 for power, etc.
75
+ Power spectrogram is usually used for computing spectral flatness.
76
+
77
+ Returns
78
+ -------
79
+ crest : np.ndarray [shape=(..., 1, t)]
80
+ spectral crest for each frame.
81
+
82
+
83
+ """
84
+
85
+ S, n_fft = _spectrogram(
86
+ y=y,
87
+ S=S,
88
+ n_fft=n_fft,
89
+ hop_length=hop_length,
90
+ power=1.0,
91
+ win_length=win_length,
92
+ window=window,
93
+ center=center,
94
+ pad_mode=pad_mode,
95
+ )
96
+
97
+ S_thresh = np.maximum(amin, S**power)
98
+ # gmean = np.exp(np.mean(np.log(S_thresh), axis=-2, keepdims=True))
99
+ gmax = np.max(S_thresh, axis=-2, keepdims=True)
100
+ amean = np.mean(S_thresh, axis=-2, keepdims=True)
101
+ crest: np.ndarray = gmax / amean
102
+ return crest
103
+
104
+
105
+ parser = argparse.ArgumentParser(description="model test.py")
106
+
107
+ parser.add_argument(
108
+ "--target",
109
+ type=str,
110
+ default="all",
111
+ help="target source. all, vocals, drums, bass, other",
112
+ )
113
+ parser.add_argument(
114
+ "--root", type=str, default="/path/to/musdb18hq_loudnorm"
115
+ )
116
+ parser.add_argument("--exp_name", type=str, default="delimit_6_s")
117
+ parser.add_argument(
118
+ "--output_directory",
119
+ type=str,
120
+ default="/path/to/results",
121
+ )
122
+ parser.add_argument(
123
+ "--calc_results",
124
+ type=str2bool,
125
+ default=True,
126
+ help="calculate results or musdb-hq or musdb-XL test dataset",
127
+ )
128
+
129
+
130
+ args, _ = parser.parse_known_args()
131
+
132
+ args.sample_rate = 44100
133
+
134
+ args.test_output_dir = f"{args.output_directory}/test/{args.exp_name}"
135
+
136
+ if args.calc_results:
137
+ track_list = glob.glob(
138
+ f"{args.output_directory}/test/{args.exp_name}/*/{args.target}.wav"
139
+ )
140
+ else:
141
+ if args.target == "all":
142
+ track_list = glob.glob(f"{args.root}/*/mixture.wav")
143
+ else:
144
+ track_list = glob.glob(f"{args.root}/*/{args.target}.wav")
145
+
146
+ i = 0
147
+
148
+
149
+ dynamic_complexity = essentia.standard.DynamicComplexity()
150
+ loudness_range = essentia.standard.LoudnessEBUR128()
151
+ spectral_centroid = essentia.standard.SpectralCentroidTime()
152
+ crest = essentia.standard.Crest()
153
+ dynamic_spread = essentia.standard.DistributionShape()
154
+ central_moments = essentia.standard.CentralMoments()
155
+
156
+ dict_song_score = {}
157
+ list_rms = []
158
+ list_crest_factor = []
159
+ list_dc_score = []
160
+ list_lra_score = []
161
+ list_sc_hertz = []
162
+ list_sf_score = []
163
+ list_spectral_crest_score = []
164
+
165
+ for track in tqdm.tqdm(track_list):
166
+ audio_name = os.path.basename(os.path.dirname(track))
167
+ gt_source_librosa = librosa.load(f"{track}", sr=args.sample_rate, mono=False)[
168
+ 0
169
+ ] # (nb_channels, nb_samples)
170
+ gt_source_librosa_mono = librosa.to_mono(gt_source_librosa) # (nb_samples)
171
+
172
+ gt_source_essentia = essentia.standard.AudioLoader(filename=f"{track}")()[
173
+ 0
174
+ ] # (nb_samples, nb_channels)
175
+ gt_source_essentia_cat = np.concatenate(
176
+ [gt_source_essentia[:, 0], gt_source_essentia[:, 1]]
177
+ ) # (nb_samples * nb_channels)
178
+ gt_source_essentia_mono = np.mean(gt_source_essentia, axis=1) # (nb_samples)
179
+
180
+ rms = np.sqrt(np.mean(gt_source_essentia_cat**2))
181
+ crest_factor = np.max(np.abs(gt_source_essentia_cat)) / rms
182
+
183
+ dc_score, _ = dynamic_complexity(gt_source_essentia_mono)
184
+ _, _, _, lra_score = loudness_range(gt_source_essentia)
185
+ sc_hertz = spectral_centroid(gt_source_essentia_mono)
186
+ sf_score = np.mean(librosa.feature.spectral_flatness(y=gt_source_librosa_mono))
187
+ spectral_crest_score = np.mean(spectral_crest(y=gt_source_librosa_mono))
188
+
189
+ dict_song_score[audio_name] = {
190
+ "rms": float(rms),
191
+ "crest_factor": float(crest_factor),
192
+ "dynamic_complexity_score": float(dc_score),
193
+ "lra_score": float(lra_score),
194
+ "spectral_centroid_hertz": float(sc_hertz),
195
+ "spectral_flatness_score": float(sf_score),
196
+ "spectral_crest_score": float(spectral_crest_score),
197
+ }
198
+ list_rms.append(rms)
199
+ list_crest_factor.append(crest_factor)
200
+ list_dc_score.append(dc_score)
201
+ list_lra_score.append(lra_score)
202
+ list_sc_hertz.append(sc_hertz)
203
+ list_sf_score.append(sf_score)
204
+ list_spectral_crest_score.append(spectral_crest_score)
205
+
206
+ i += 1
207
+
208
+ if args.calc_results:
209
+ print(f"{args.exp_name} on {args.target}")
210
+ else:
211
+ print(f"{os.path.basename(args.root)} on {args.target}")
212
+ print(f"rms: {np.mean(list_rms)}")
213
+ print(f"crest_factor: {np.mean(list_crest_factor)}")
214
+ print(f"dynamic_complexity_score: {np.mean(list_dc_score)}")
215
+ print(f"lra_score: {np.mean(list_lra_score)}")
216
+ print(f"sc_hertz: {np.mean(list_sc_hertz)}")
217
+ print(f"sf_score: {np.mean(list_sf_score)}")
218
+ print(f"spectral_crest_score: {np.mean(list_spectral_crest_score)}")
219
+
220
+
221
+ # save dict_song_score to json file
222
+ if args.target == "all":
223
+ file_name = "score_feature"  # singular, matching what score_diff_dyn_complexity.py reads
224
+ else:
225
+ file_name = f"score_feature_{args.target}"
226
+ if args.calc_results:
227
+ with open(
228
+ f"{args.output_directory}/test/{args.exp_name}/{file_name}.json", "w"
229
+ ) as f:
230
+ json.dump(dict_song_score, f, indent=4)
231
+ else:
232
+ with open(f"{args.root}/{file_name}.json", "w") as f:
233
+ json.dump(dict_song_score, f, indent=4)
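
A quick sanity check of spectral_crest(): a pure tone concentrates its energy in one bin and should score far higher than white noise (illustrative values only, not from the paper):

import numpy as np

sr = 44100
t = np.arange(sr) / sr
tone = np.sin(2 * np.pi * 440 * t)        # 1 s of a 440 Hz sine
noise = np.random.randn(sr)               # 1 s of white noise
print(np.mean(spectral_crest(y=tone)), np.mean(spectral_crest(y=noise)))
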
eval_delimit/score_peaq.py ADDED
@@ -0,0 +1,77 @@
1
+ # We are going to use PEAQ based on https://github.com/HSU-ANT/gstpeaq
2
+
3
+ """
4
+ python3 score_peaq.py --exp_name=delimit_6_s | tee /path/to/results/delimit_6_s/score_peaq.txt
5
+ """
6
+
7
+
8
+
9
+ import os
10
+ import subprocess
11
+ import glob
12
+ import argparse
13
+
14
+
15
+ def str2bool(v):
16
+ if v.lower() in ("yes", "true", "t", "y", "1"):
17
+ return True
18
+ elif v.lower() in ("no", "false", "f", "n", "0"):
19
+ return False
20
+ else:
21
+ raise argparse.ArgumentTypeError("Boolean value expected.")
22
+
23
+
24
+ parser = argparse.ArgumentParser(description="model test.py")
25
+
26
+ parser.add_argument(
27
+ "--target",
28
+ type=str,
29
+ default="all",
30
+ help="target source. all, vocals, drums, bass, other",
31
+ )
32
+ parser.add_argument(
33
+ "--root",
34
+ type=str,
35
+ default="/path/to/musdb_XL_loudnorm",
36
+ )
37
+ parser.add_argument(
38
+ "--output_directory",
39
+ type=str,
40
+ default="/path/to/results/",
41
+ )
42
+ parser.add_argument("--exp_name", type=str, default="delimit_6_s")
43
+ parser.add_argument(
44
+ "--calc_results",
45
+ type=str2bool,
46
+ default=True,
47
+ help="Set this True when you want to calculate the results of the test set. Set this False when calculating musdb-hq vs musdb-XL. (top row in Table 1.)",
48
+ )
49
+
50
+ args, _ = parser.parse_known_args()
51
+
52
+ if args.calc_results:
53
+ args.test_output_dir = f"{args.output_directory}/test/{args.exp_name}"
54
+ else:
55
+ args.test_output_dir = f"{args.output_directory}/{args.exp_name}"
56
+
57
+ if args.target == "all":
58
+ song_list = sorted(glob.glob(f"{args.root}/*/mixture.wav"))
59
+
60
+ for song in song_list:
61
+ song_name = os.path.basename(os.path.dirname(song))
62
+ est_path = f"{args.test_output_dir}/{song_name}/{args.target}.wav"
63
+ subprocess.run(
64
+ f'peaq --gst-plugin-load=/usr/local/lib/gstreamer-1.0/libgstpeaq.so "{song}" "{est_path}"',
65
+ shell=True,
66
+ )
67
+
68
+ else:
69
+ song_list = sorted(glob.glob(f"{args.root}/*/{args.target}.wav"))
70
+
71
+ for song in song_list:
72
+ song_name = os.path.basename(os.path.dirname(song))
73
+ est_path = f"{args.test_output_dir}/{song_name}/{args.target}.wav"
74
+ subprocess.run(
75
+ f'peaq --gst-plugin-load=/usr/local/lib/gstreamer-1.0/libgstpeaq.so "{song}" "{est_path}"',
76
+ shell=True,
77
+ )
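
Rather than tee-ing stdout to a text file, the ODG could be captured per song inside the loop above; a sketch, assuming peaq prints an "Objective Difference Grade: x" line, which is the format score_peaq_aggregate.py parses below:

result = subprocess.run(
    f'peaq --gst-plugin-load=/usr/local/lib/gstreamer-1.0/libgstpeaq.so "{song}" "{est_path}"',
    shell=True, capture_output=True, text=True,
)
odg_line = next(l for l in result.stdout.splitlines() if l.startswith("Objective Difference Grade"))
odg = float(odg_line.replace("Objective Difference Grade: ", ""))
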
eval_delimit/score_peaq_aggregate.py ADDED
@@ -0,0 +1,88 @@
1
+ # PEAQ aggregate score
2
+ """
3
+ /path/to/results/delimit_6_s/score_peaq.txt
4
+ """
5
+
6
+ import os
7
+ import glob
8
+ import argparse
9
+ import json
10
+
11
+
12
+ def str2bool(v):
13
+ if v.lower() in ("yes", "true", "t", "y", "1"):
14
+ return True
15
+ elif v.lower() in ("no", "false", "f", "n", "0"):
16
+ return False
17
+ else:
18
+ raise argparse.ArgumentTypeError("Boolean value expected.")
19
+
20
+
21
+ parser = argparse.ArgumentParser(description="model test.py")
22
+
23
+ parser.add_argument(
24
+ "--target",
25
+ type=str,
26
+ default="all",
27
+ help="target source. all, vocals, drums, bass, other",
28
+ )
29
+ parser.add_argument(
30
+ "--root",
31
+ type=str,
32
+ default="/path/to/musdb18hq_loudnorm",
33
+ )
34
+ parser.add_argument(
35
+ "--output_directory",
36
+ type=str,
37
+ default="/path/to/results",
38
+ )
39
+ parser.add_argument("--exp_name", type=str, default="delimit_6_s")
40
+ parser.add_argument(
41
+ "--calc_results",
42
+ type=str2bool,
43
+ default=True,
44
+ help="Set this True when you want to calculate the results of the test set. Set this False when calculating musdb-hq vs musdb-XL. (top row in Table 1.)",
45
+ )
46
+
47
+ args, _ = parser.parse_known_args()
48
+
49
+
50
+ if args.calc_results:
51
+ args.test_output_dir = f"{args.output_directory}/test/{args.exp_name}"
52
+ else:
53
+ args.test_output_dir = f"{args.output_directory}/{args.exp_name}"
54
+
55
+
56
+ if args.target == "all":
57
+ score_path = f"{args.test_output_dir}/score_peaq.txt"
58
+ else:
59
+ score_path = f"{args.test_output_dir}/score_peaq_{args.target}.txt"
60
+
61
+ # write the code to load score_peaq.txt
62
+ with open(score_path, "r") as f:
63
+ score_txt = f.readlines()
64
+
65
+ song_list = sorted(glob.glob(f"{args.root}/*"))  # score_peaq.py iterated a sorted list, so keep the same order here
66
+
67
+ dict_song_peaq = {}
68
+ list_peaq = []
69
+ for idx, song in enumerate(song_list):
70
+ song_name = os.path.basename(song)
71
+ peaq = float(score_txt[idx * 2].replace("Objective Difference Grade: ", ""))
72
+ dict_song_peaq[song_name] = peaq
73
+ list_peaq.append(peaq)
74
+
75
+ print(f"{args.exp_name} on {args.target}")
76
+ print(f"PEAQ score: {sum(list_peaq) / len(list_peaq)}")
77
+
78
+ if args.target == "all":
79
+ # save dict_song_peaq to json file
80
+ with open(f"{args.test_output_dir}/score_peaq.json", "w") as f:
81
+ json.dump(dict_song_peaq, f, indent=4)
82
+ else:
83
+ # save dict_song_peaq to json file
84
+ with open(
85
+ f"{args.test_output_dir}/score_peaq_{args.target}.json",
86
+ "w",
87
+ ) as f:
88
+ json.dump(dict_song_peaq, f, indent=4)
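
Indexing the text file by idx * 2 assumes exactly two output lines per song; a slightly more defensive parse of the same file, keyed on the line prefix instead (a sketch against the variables of the script above):

odg_lines = [l for l in score_txt if l.startswith("Objective Difference Grade")]
assert len(odg_lines) == len(song_list), "expected one ODG line per song"
for song, line in zip(song_list, odg_lines):
    dict_song_peaq[os.path.basename(song)] = float(line.split(": ")[1])
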
inference.py ADDED
@@ -0,0 +1,165 @@
1
+ import os
2
+ import json
3
+ import argparse
4
+ import glob
5
+
6
+ import torch
7
+ import tqdm
8
+ import librosa
9
+ import soundfile as sf
10
+ import pyloudnorm as pyln
11
+ from dotmap import DotMap
12
+
13
+ from models import load_model_with_args
14
+ from separate_func import (
15
+ conv_tasnet_separate,
16
+ )
17
+ from utils import str2bool, db2linear
18
+
19
+
20
+ tqdm.monitor_interval = 0
21
+
22
+
23
+ def separate_track_with_model(
24
+ args, model, device, track_audio, track_name, meter, augmented_gain
25
+ ):
26
+ with torch.no_grad():
27
+ if (
28
+ args.model_loss_params.architecture == "conv_tasnet_mask_on_output"
29
+ or args.model_loss_params.architecture == "conv_tasnet"
30
+ ):
31
+ estimates = conv_tasnet_separate(
32
+ args,
33
+ model,
34
+ device,
35
+ track_audio,
36
+ track_name,
37
+ meter=meter,
38
+ augmented_gain=augmented_gain,
39
+ )
40
+
41
+ return estimates
42
+
43
+
44
+ def main():
45
+ parser = argparse.ArgumentParser(description="model test.py")
46
+ parser.add_argument("--target", type=str, default="all")
47
+ parser.add_argument("--data_root", type=str, default="./input_data")
48
+ parser.add_argument("--weight_directory", type=str, default="./weight")
49
+ parser.add_argument("--output_directory", type=str, default="./output")
50
+ parser.add_argument("--use_gpu", type=str2bool, default=True)
51
+ parser.add_argument("--save_name_as_target", type=str2bool, default=False)
52
+ parser.add_argument(
53
+ "--loudnorm_input_lufs",
54
+ type=float,
55
+ default=None,
56
+ help="If you want to use loudnorm for input",
57
+ )
58
+ parser.add_argument(
59
+ "--save_output_loudnorm",
60
+ type=float,
61
+ default=-14.0,
62
+ help="Save loudness-normalized outputs. Give the target loudness (LUFS) to enable.",
63
+ )
64
+ parser.add_argument(
65
+ "--save_mixed_output",
66
+ type=float,
67
+ default=None,
68
+ help="Save the original and the de-limited estimation mixed with a ratio of 0.5 (original) to 1 - 0.5 (estimation) by default",
69
+ )
70
+ parser.add_argument(
71
+ "--save_16k_mono",
72
+ type=str2bool,
73
+ default=False,
74
+ help="Save 16k mono wav files for FAD evaluation.",
75
+ )
76
+ parser.add_argument(
77
+ "--save_histogram",
78
+ type=str2bool,
79
+ default=False,
80
+ help="Save histogram of the output. Only valid when the task is 'delimit'",
81
+ )
82
+ parser.add_argument(
83
+ "--use_singletrackset",
84
+ type=str2bool,
85
+ default=False,
86
+ help="Use SingleTrackSet if input data is too long.",
87
+ )
88
+
89
+ args, _ = parser.parse_known_args()
90
+
91
+ with open(f"{args.weight_directory}/{args.target}.json", "r") as f:
92
+ args_dict = json.load(f)
93
+ args_dict = DotMap(args_dict)
94
+
95
+ for key, value in args_dict["args"].items():
96
+ if key in list(vars(args).keys()):
97
+ pass
98
+ else:
99
+ setattr(args, key, value)
100
+
101
+ args.test_output_dir = f"{args.output_directory}"
102
+ os.makedirs(args.test_output_dir, exist_ok=True)
103
+
104
+ device = torch.device(
105
+ "cuda" if torch.cuda.is_available() and args.use_gpu else "cpu"
106
+ )
107
+
108
+ ###################### Define Models ######################
109
+ our_model = load_model_with_args(args)
110
+ our_model = our_model.to(device)
111
+
112
+ target_model_path = f"{args.weight_directory}/{args.target}.pth"
113
+ checkpoint = torch.load(target_model_path, map_location=device)
114
+ our_model.load_state_dict(checkpoint)
115
+
116
+ our_model.eval()
117
+
118
+ meter = pyln.Meter(44100)
119
+
120
+ test_tracks = glob.glob(f"{args.data_root}/*.wav") + glob.glob(
121
+ f"{args.data_root}/*.mp3"
122
+ )
123
+
124
+ for track in tqdm.tqdm(test_tracks):
125
+ track_name = os.path.basename(track).replace(".wav", "").replace(".mp3", "")
126
+ track_audio, sr = librosa.load(track, sr=None, mono=False) # sr should be 44100
127
+
128
+ orig_audio = track_audio.copy()
129
+
130
+ if sr != 44100:
131
+ raise ValueError("Sample rate should be 44100")
132
+ augmented_gain = None
133
+ print("Now De-limiting : ", track_name)
134
+
135
+ if args.loudnorm_input_lufs: # If you want to use loud-normalized input
136
+ track_lufs = meter.integrated_loudness(track_audio.T)
137
+ augmented_gain = args.loudnorm_input_lufs - track_lufs
138
+ track_audio = track_audio * db2linear(augmented_gain, eps=0.0)
139
+
140
+ track_audio = (
141
+ torch.as_tensor(track_audio, dtype=torch.float32).unsqueeze(0).to(device)
142
+ )
143
+
144
+ estimates = separate_track_with_model(
145
+ args, our_model, device, track_audio, track_name, meter, augmented_gain
146
+ )
147
+
148
+ if args.save_mixed_output:
149
+ track_lufs = meter.integrated_loudness(orig_audio.T)
150
+ augmented_gain = args.save_output_loudnorm - track_lufs
151
+ orig_audio = orig_audio * db2linear(augmented_gain, eps=0.0)
152
+
153
+ mixed_output = orig_audio * args.save_mixed_output + estimates * (
154
+ 1 - args.save_mixed_output
155
+ )
156
+
157
+ sf.write(
158
+ f"{args.test_output_dir}/{track_name}/{track_name}_mixed.wav",
159
+ mixed_output.T,
160
+ args.data_params.sample_rate,
161
+ )
162
+
163
+
164
+ if __name__ == "__main__":
165
+ main()
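
A typical invocation (paths are placeholders; the weights and the matching JSON config must exist under --weight_directory):

python3 inference.py --data_root=./input_data --output_directory=./output --save_output_loudnorm=-14.0
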
main_ddp.py ADDED
@@ -0,0 +1,49 @@
1
+ import os
2
+ import argparse
3
+ import random
4
+
5
+ import torch
6
+
7
+ from train_ddp import train
8
+ from utils import get_config
9
+
10
+
11
+ def main():
12
+ parser = argparse.ArgumentParser(description="Trainer")
13
+
14
+ # Put every argument in './configs/yymmdd_architecture_number.yaml' and load it.
15
+ parser.add_argument(
16
+ "-c",
17
+ "--config",
18
+ default="delimit_6_s",
19
+ type=str,
20
+ help="Name of the setting file.",
21
+ )
22
+
23
+ config_args = parser.parse_args()
24
+
25
+ args = get_config(config_args.config)
26
+
27
+ args.img_check = (
28
+ f"{args.dir_params.output_directory}/img_check/{args.dir_params.exp_name}"
29
+ )
30
+ args.output = (
31
+ f"{args.dir_params.output_directory}/checkpoint/{args.dir_params.exp_name}"
32
+ )
33
+
34
+ # Set which devices to use
35
+ os.environ["MASTER_ADDR"] = "127.0.0.1"
36
+ os.environ["MASTER_PORT"] = str(random.randint(10000, 60000))  # ports below 1024 are privileged, so pick from the ephemeral range
37
+
38
+ os.makedirs(args.img_check, exist_ok=True)
39
+ os.makedirs(args.output, exist_ok=True)
40
+
41
+ torch.manual_seed(args.sys_params.seed)
42
+ random.seed(args.sys_params.seed)
43
+
44
+ print(args)
45
+ train(args)
46
+
47
+
48
+ if __name__ == "__main__":
49
+ main()
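
Launched with a config name that resolves to ./configs/<name>.yaml, e.g.:

python3 main_ddp.py -c delimit_6_s
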
models/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .load_models import load_model_with_args
models/base_models.py ADDED
@@ -0,0 +1,239 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from asteroid.models.base_models import (
4
+ BaseEncoderMaskerDecoder,
5
+ _unsqueeze_to_3d,
6
+ _shape_reconstructed,
7
+ )
8
+ from asteroid.utils.torch_utils import pad_x_to_y, jitable_shape
9
+ from einops import rearrange
10
+
11
+
12
+ class BaseEncoderMaskerDecoderWithConfigs(BaseEncoderMaskerDecoder):
13
+ def __init__(self, encoder, masker, decoder, encoder_activation=None, **kwargs):
14
+ super().__init__(encoder, masker, decoder, encoder_activation)
15
+ self.use_encoder = kwargs.get("use_encoder", True)
16
+ self.apply_mask = kwargs.get("apply_mask", True)
17
+ self.use_decoder = kwargs.get("use_decoder", True)
18
+
19
+ def forward(self, wav):
20
+ """
21
+ Enc/Mask/Dec model forward with some additional options.
22
+ Some of the models we use, like TFC-TDF-UNet, have no masker.
23
+ In UMX or X-UMX, they already use masking in their model implementation.
24
+ Since we do not want to modify the model code itself, we use this wrapper.
25
+
26
+ Args:
27
+ wav (torch.Tensor): waveform tensor. 1D, 2D or 3D tensor, time last.
28
+
29
+ Returns:
30
+ torch.Tensor, of shape (batch, n_src, time) or (n_src, time).
31
+ """
32
+ # Remember shape to shape reconstruction, cast to Tensor for torchscript
33
+ shape = jitable_shape(wav)
34
+ # Reshape to (batch, n_mix, time)
35
+ wav = _unsqueeze_to_3d(wav)
36
+
37
+ # Real forward
38
+ if self.use_encoder:
39
+ tf_rep = self.forward_encoder(wav)
40
+ else:
41
+ tf_rep = wav
42
+
43
+ est_masks = self.forward_masker(tf_rep)
44
+
45
+ if self.apply_mask:
46
+ masked_tf_rep = self.apply_masks(tf_rep, est_masks)
47
+ else: # model already used masking
48
+ masked_tf_rep = est_masks
49
+
50
+ if self.use_decoder:
51
+ decoded = self.forward_decoder(masked_tf_rep)
52
+ reconstructed = pad_x_to_y(decoded, wav)
53
+
54
+ return masked_tf_rep, _shape_reconstructed(reconstructed, shape)
55
+
56
+ else: # In UMX or X-UMX, decoder is not used
57
+ decoded = masked_tf_rep
58
+
59
+ return decoded
60
+
61
+
62
+ class BaseEncoderMaskerDecoder_mixture_consistency(BaseEncoderMaskerDecoder):
63
+ def __init__(self, encoder, masker, decoder, encoder_activation=None):
64
+ super().__init__(encoder, masker, decoder, encoder_activation)
65
+
66
+ def forward(self, wav):
67
+ """Enc/Mask/Dec model forward with mixture consistent output
68
+
69
+ References:
70
+ [1] : Wisdom, Scott, et al. "Differentiable consistency constraints for improved deep speech enhancement." ICASSP 2019.
71
+ [2] : Wisdom, Scott, et al. "Unsupervised sound separation using mixture invariant training." NeurIPS 2020.
72
+
73
+ Args:
74
+ wav (torch.Tensor): waveform tensor. 1D, 2D or 3D tensor, time last.
75
+
76
+ Returns:
77
+ torch.Tensor, of shape (batch, n_src, time) or (n_src, time).
78
+ """
79
+ # Remember shape to shape reconstruction, cast to Tensor for torchscript
80
+ shape = jitable_shape(wav)
81
+ # Reshape to (batch, n_mix, time)
82
+ wav = _unsqueeze_to_3d(wav)
83
+
84
+ # Real forward
85
+ tf_rep = self.forward_encoder(wav)
86
+ est_masks = self.forward_masker(tf_rep)
87
+ masked_tf_rep = self.apply_masks(tf_rep, est_masks)
88
+ decoded = self.forward_decoder(masked_tf_rep)
89
+
90
+ reconstructed = _shape_reconstructed(pad_x_to_y(decoded, wav), shape)
91
+
92
+ reconstructed = reconstructed + 1 / reconstructed.shape[1] * (
93
+ wav - reconstructed.sum(dim=1, keepdim=True)
94
+ )
95
+
96
+ return reconstructed
97
+
98
+
99
+ class BaseEncoderMaskerDecoderWithConfigsMaskOnOutput(BaseEncoderMaskerDecoder):
100
+ def __init__(self, encoder, masker, decoder, encoder_activation=None, **kwargs):
101
+ super().__init__(encoder, masker, decoder, encoder_activation)
102
+ self.use_encoder = kwargs.get("use_encoder", True)
103
+ self.apply_mask = kwargs.get("apply_mask", True)
104
+ self.use_decoder = kwargs.get("use_decoder", True)
105
+ self.nb_channels = kwargs.get("nb_channels", 2)
106
+ self.decoder_activation = kwargs.get("decoder_activation", "sigmoid")
107
+ if self.decoder_activation == "sigmoid":
108
+ self.act_after_dec = nn.Sigmoid()
109
+ elif self.decoder_activation == "relu":
110
+ self.act_after_dec = nn.ReLU()
111
+ elif self.decoder_activation == "relu6":
112
+ self.act_after_dec = nn.ReLU6()
113
+ elif self.decoder_activation == "tanh":
114
+ self.act_after_dec = nn.Tanh()
115
+ elif self.decoder_activation == "none":
116
+ self.act_after_dec = nn.Identity()
117
+ else:
118
+ self.act_after_dec = nn.Sigmoid()
119
+
120
+ def forward(self, wav):
121
+ """
122
+ For the De-limit task, we will apply the mask on the output of the decoder.
123
+ We want the decoder to learn the sample-wise ratio of the sources.
124
+
125
+ Args:
126
+ wav (torch.Tensor): waveform tensor. 1D, 2D or 3D tensor, time last.
127
+
128
+ Returns:
129
+ torch.Tensor, of shape (batch, n_src, time) or (n_src, time).
130
+ """
131
+ # Remember shape to shape reconstruction, cast to Tensor for torchscript
132
+ shape = jitable_shape(wav)
133
+ # Reshape to (batch, n_mix, time)
134
+ wav = _unsqueeze_to_3d(wav) # (batch, n_channels, time)
135
+
136
+ # Real forward
137
+ if self.use_encoder:
138
+ tf_rep = self.forward_encoder(wav) # (batch, n_channels, freq, time)
139
+ else:
140
+ tf_rep = wav
141
+
142
+ if self.nb_channels == 2:
143
+ tf_rep = rearrange(
144
+ tf_rep, "b c f t -> b (c f) t"
145
+ ) # c == 2 when stereo input.
146
+ est_masks = self.forward_masker(tf_rep) # (batch, 1, freq, time)
147
+
148
+ # we are going to apply the mask on the output of the decoder
149
+ if self.use_decoder:
150
+ if self.nb_channels == 2:
151
+ est_masks = rearrange(est_masks, "b 1 f t -> b f t")
152
+ est_masks_decoded = self.forward_decoder(est_masks)
153
+ est_masks_decoded = pad_x_to_y(est_masks_decoded, wav) # (batch, 1, time)
154
+ est_masks_decoded = self.act_after_dec(
155
+ est_masks_decoded
156
+ ) # (batch, 1, time)
157
+ decoded = wav * est_masks_decoded # (batch, n_channels, time)
158
+
159
+ return (
160
+ est_masks_decoded,
161
+ decoded,
162
+ )
163
+
164
+ else:
165
+ decoded = est_masks
166
+
167
+ return (decoded,)
168
+
169
+
170
+ class BaseEncoderMaskerDecoderWithConfigsMultiChannelAsteroid(BaseEncoderMaskerDecoder):
171
+ def __init__(self, encoder, masker, decoder, encoder_activation=None, **kwargs):
172
+ super().__init__(encoder, masker, decoder, encoder_activation)
173
+ self.use_encoder = kwargs.get("use_encoder", True)
174
+ self.apply_mask = kwargs.get("apply_mask", True)
175
+ self.use_decoder = kwargs.get("use_decoder", True)
176
+ self.nb_channels = kwargs.get("nb_channels", 2)
177
+ self.decoder_activation = kwargs.get("decoder_activation", "none")
178
+ if self.decoder_activation == "sigmoid":
179
+ self.act_after_dec = nn.Sigmoid()
180
+ elif self.decoder_activation == "relu":
181
+ self.act_after_dec = nn.ReLU()
182
+ elif self.decoder_activation == "relu6":
183
+ self.act_after_dec = nn.ReLU6()
184
+ elif self.decoder_activation == "tanh":
185
+ self.act_after_dec = nn.Tanh()
186
+ elif self.decoder_activation == "none":
187
+ self.act_after_dec = nn.Identity()
188
+ else:
189
+ self.act_after_dec = nn.Sigmoid()
190
+
191
+ def forward(self, wav):
192
+ """
193
+ Enc/Mask/Dec model forward with some additional options.
194
+ For MultiChannel usage of asteroid-based models. (e.g. ConvTasNet)
195
+
196
+
197
+ Args:
198
+ wav (torch.Tensor): waveform tensor. 1D, 2D or 3D tensor, time last.
199
+
200
+ Returns:
201
+ torch.Tensor, of shape (batch, n_src, time) or (n_src, time).
202
+ """
203
+ # Remember shape to shape reconstruction, cast to Tensor for torchscript
204
+ shape = jitable_shape(wav)
205
+ # Reshape to (batch, n_mix, time)
206
+ wav = _unsqueeze_to_3d(wav)
207
+
208
+ # Real forward
209
+ if self.use_encoder:
210
+ tf_rep = self.forward_encoder(wav)
211
+ else:
212
+ tf_rep = wav
213
+
214
+ if self.nb_channels == 2:
215
+ tf_rep = rearrange(
216
+ tf_rep, "b c f t -> b (c f) t"
217
+ ) # c == 2 when stereo input.
218
+ est_masks = self.forward_masker(tf_rep)
219
+
220
+ if self.nb_channels == 2:
221
+ tf_rep = rearrange(tf_rep, "b (c f) t -> b c f t", c=self.nb_channels)
222
+
223
+ if self.apply_mask:
224
+ # Since original asteroid implementation of masking includes unnecessary unsqueeze operation, we will do it manually.
225
+ masked_tf_rep = est_masks * tf_rep
226
+ else:
227
+ masked_tf_rep = est_masks
228
+
229
+ if self.use_decoder:
230
+ decoded = self.forward_decoder(masked_tf_rep)
231
+ reconstructed = pad_x_to_y(decoded, wav)
232
+ reconstructed = self.act_after_dec(reconstructed)
233
+
234
+ return masked_tf_rep, _shape_reconstructed(reconstructed, shape)
235
+
236
+ else:
237
+ decoded = masked_tf_rep
238
+
239
+ return decoded
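
A numeric check of the mixture-consistency projection used in BaseEncoderMaskerDecoder_mixture_consistency above: redistributing the residual equally across sources guarantees that the corrected estimates sum back to the mixture (a sketch):

import torch

mix = torch.tensor([[1.0, 2.0]])                   # (batch=1, time=2)
est = torch.tensor([[[0.4, 0.9], [0.5, 0.8]]])     # (batch, n_src=2, time)
# est <- est + (mix - sum(est)) / n_src, matching the forward pass above
est = est + (mix.unsqueeze(1) - est.sum(dim=1, keepdim=True)) / est.shape[1]
print(est.sum(dim=1))                              # tensor([[1., 2.]])
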
models/load_models.py ADDED
@@ -0,0 +1,87 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn as nn
4
+ from asteroid_filterbanks import make_enc_dec
5
+
6
+ from asteroid.masknn import TDConvNet
7
+
8
+ import utils
9
+ from .base_models import (
10
+ BaseEncoderMaskerDecoderWithConfigs,
11
+ BaseEncoderMaskerDecoderWithConfigsMaskOnOutput,
12
+ BaseEncoderMaskerDecoderWithConfigsMultiChannelAsteroid,
13
+ )
14
+
15
+
16
+ def load_model_with_args(args):
17
+ if args.model_loss_params.architecture == "conv_tasnet_mask_on_output":
18
+ encoder, decoder = make_enc_dec(
19
+ "free",
20
+ n_filters=args.conv_tasnet_params.n_filters,
21
+ kernel_size=args.conv_tasnet_params.kernel_size,
22
+ stride=args.conv_tasnet_params.stride,
23
+ sample_rate=args.sample_rate,
24
+ )
25
+ masker = TDConvNet(
26
+ in_chan=encoder.n_feats_out * args.data_params.nb_channels, # stereo
27
+ n_src=1, # for de-limit task.
28
+ out_chan=encoder.n_feats_out,
29
+ n_blocks=args.conv_tasnet_params.n_blocks,
30
+ n_repeats=args.conv_tasnet_params.n_repeats,
31
+ bn_chan=args.conv_tasnet_params.bn_chan,
32
+ hid_chan=args.conv_tasnet_params.hid_chan,
33
+ skip_chan=args.conv_tasnet_params.skip_chan,
34
+ # conv_kernel_size=args.conv_tasnet_params.conv_kernel_size,
35
+ norm_type=args.conv_tasnet_params.norm_type if args.conv_tasnet_params.norm_type else 'gLN',
36
+ mask_act=args.conv_tasnet_params.mask_act,
37
+ # causal=args.conv_tasnet_params.causal,
38
+ )
39
+
40
+ model = BaseEncoderMaskerDecoderWithConfigsMaskOnOutput(
41
+ encoder,
42
+ masker,
43
+ decoder,
44
+ encoder_activation=args.conv_tasnet_params.encoder_activation,
45
+ use_encoder=True,
46
+ apply_mask=True,
47
+ use_decoder=True,
48
+ decoder_activation=args.conv_tasnet_params.decoder_activation,
49
+ )
50
+ model.use_encoder_to_target = False
51
+
52
+ elif args.model_loss_params.architecture == "conv_tasnet":
53
+ encoder, decoder = make_enc_dec(
54
+ "free",
55
+ n_filters=args.conv_tasnet_params.n_filters,
56
+ kernel_size=args.conv_tasnet_params.kernel_size,
57
+ stride=args.conv_tasnet_params.stride,
58
+ sample_rate=args.sample_rate,
59
+ )
60
+ masker = TDConvNet(
61
+ in_chan=encoder.n_feats_out * args.data_params.nb_channels, # stereo
62
+ n_src=args.conv_tasnet_params.n_src, # for de-limit task with the standard conv-tasnet setting.
63
+ out_chan=encoder.n_feats_out,
64
+ n_blocks=args.conv_tasnet_params.n_blocks,
65
+ n_repeats=args.conv_tasnet_params.n_repeats,
66
+ bn_chan=args.conv_tasnet_params.bn_chan,
67
+ hid_chan=args.conv_tasnet_params.hid_chan,
68
+ skip_chan=args.conv_tasnet_params.skip_chan,
69
+ # conv_kernel_size=args.conv_tasnet_params.conv_kernel_size,
70
+ norm_type=args.conv_tasnet_params.norm_type if args.conv_tasnet_params.norm_type else 'gLN',
71
+ mask_act=args.conv_tasnet_params.mask_act,
72
+ # causal=args.conv_tasnet_params.causal,
73
+ )
74
+
75
+ model = BaseEncoderMaskerDecoderWithConfigsMultiChannelAsteroid(
76
+ encoder,
77
+ masker,
78
+ decoder,
79
+ encoder_activation=args.conv_tasnet_params.encoder_activation,
80
+ use_encoder=True,
81
+ apply_mask=False if args.conv_tasnet_params.synthesis else True,
82
+ use_decoder=True,
83
+ decoder_activation=args.conv_tasnet_params.decoder_activation,
84
+ )
85
+ model.use_encoder_to_target = False
86
+
87
+ return model
prepro/delimit_save_delimiter_stems.py ADDED
@@ -0,0 +1,93 @@
1
+ # Save per-stem outputs of the de-limiter, reconstructed via sample-wise gain inversion, for evaluation
2
+
3
+ import os
4
+ import argparse
5
+
6
+ import tqdm
7
+ import musdb
8
+ import soundfile as sf
9
+ import librosa
10
+ import pyloudnorm as pyln
11
+
12
+ from utils import db2linear, str2bool
13
+
14
+
15
+ tqdm.monitor_interval = 0
16
+
17
+
18
+ def main():
19
+ parser = argparse.ArgumentParser(description="model test.py")
20
+
21
+ parser.add_argument(
22
+ "--target",
23
+ type=str,
24
+ default="vocals",
25
+ help="target source. all, vocals, drums, bass, other",
26
+ )
27
+ parser.add_argument("--data_root", type=str, default="/path/to/musdb_XL")
28
+ parser.add_argument(
29
+ "--data_root_hq",
30
+ type=str,
31
+ default="/data1/Music/musdb18hq",
32
+ help="this is used when saving loud-norm stem of musdb-XL")
33
+ parser.add_argument(
34
+ "--output_directory",
35
+ type=str,
36
+ default="/path/to/results",
37
+ )
38
+ parser.add_argument("--exp_name", type=str, default="delimit_6_s")
39
+ parser.add_argument(
40
+ "--save_16k_mono",
41
+ type=str2bool,
42
+ default=False,
43
+ help="Save 16k mono wav files for FAD evaluation.",
44
+ )
45
+
46
+
47
+ args, _ = parser.parse_known_args()
48
+
49
+ os.makedirs(args.output_directory, exist_ok=True)
50
+
51
+ meter = pyln.Meter(44100)
52
+ args.test_output_dir = f"{args.output_directory}/test/{args.exp_name}"
53
+
54
+ test_tracks = musdb.DB(root=args.data_root, subsets="test", is_wav=True)
55
+ if args.target != "mixture": # In this file, args.target should not be "mixture"
56
+ hq_tracks = musdb.DB(root=args.data_root_hq, subsets='test', is_wav=True)
57
+
58
+ for idx, track in tqdm.tqdm(enumerate(test_tracks)):
59
+ track_name = track.name
60
+ if (
61
+ os.path.basename(args.data_root) == "musdb18hq"
62
+ and track_name == "PR - Oh No"
63
+ ): # We have to consider this exception because 'PR - Oh No' mixture.wav is left-panned. We will use the linear mixture instead.
64
+ # Please refer https://github.com/jeonchangbin49/musdb-XL/blob/main/make_L_and_XL.py
65
+ track_audio = (
66
+ track.targets["vocals"].audio
67
+ + track.targets["drums"].audio
68
+ + track.targets["bass"].audio
69
+ + track.targets["other"].audio
70
+ )
71
+ else:
72
+ track_audio = track.audio
73
+
74
+ delimiter_track = librosa.load(f"{args.test_output_dir}/{track_name}/all.wav", sr=44100, mono=False)[0].T
75
+
76
+ print(track_name)
77
+
78
+ if args.target != "mixture":
79
+ hq_track = hq_tracks[idx]
80
+ hq_audio = hq_track.audio
81
+ hq_stem = hq_track.targets[args.target].audio
82
+ hq_samplewise_gain = track_audio / (hq_audio + 1e-8)
83
+ XL_stem = hq_samplewise_gain * hq_stem
84
+ XL_samplewise_gain = delimiter_track / (track_audio + 1e-8)
85
+ delimiter_stem = XL_samplewise_gain * XL_stem
86
+
87
+ sf.write(
88
+ f"{args.test_output_dir}/{track_name}/{args.target}.wav", delimiter_stem, 44100
89
+ )
90
+
91
+
92
+ if __name__ == "__main__":
93
+ main()
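
A toy check of the sample-wise gain trick used above: dividing the limited mixture by the unlimited one recovers the per-sample limiter gain, which can then be applied to any stem (the values are made up):

import numpy as np

mixture = np.array([0.5, 1.6, -1.2])                 # unlimited mixture samples
g = np.array([1.0, 0.55, 0.7])                       # hypothetical limiter gains
limited = g * mixture                                # limiter output
stem = np.array([0.2, 0.9, -0.4])                    # one source inside the mixture
limited_stem = (limited / (mixture + 1e-8)) * stem   # recovers g * stem
print(np.allclose(limited_stem, g * stem))           # True
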
prepro/delimit_save_musdb_loudnorm.py ADDED
@@ -0,0 +1,118 @@
1
+ # Save loudness-normalized (-14 LUFS) musdb-XL audio files for evaluation of the de-limiter
2
+
3
+ import os
4
+ import argparse
5
+
6
+ import tqdm
7
+ import musdb
8
+ import soundfile as sf
9
+ import librosa
10
+ import pyloudnorm as pyln
11
+
12
+ from utils import db2linear, str2bool
13
+
14
+
15
+ tqdm.monitor_interval = 0
16
+
17
+
18
+ def main():
19
+ parser = argparse.ArgumentParser(description="model test.py")
20
+
21
+ parser.add_argument(
22
+ "--target",
23
+ type=str,
24
+ default="mixture",
25
+ help="target source. all, vocals, drums, bass, other",
26
+ )
27
+ parser.add_argument("--data_root", type=str, default="/path/to/musdb_XL")
28
+ parser.add_argument(
29
+ "--data_root_hq",
30
+ type=str,
31
+ default="/path/to/musdb18hq",
32
+ help="this is used when saving loud-norm stem of musdb-XL")
33
+ parser.add_argument(
34
+ "--output_directory",
35
+ type=str,
36
+ default="/path/to/musdb_XL_loudnorm",
37
+ )
38
+ parser.add_argument(
39
+ "--loudnorm_input_lufs",
40
+ type=float,
41
+ default=-14.0,
42
+ help="If you want to use loudnorm, input target lufs",
43
+ )
44
+ parser.add_argument(
45
+ "--save_16k_mono",
46
+ type=str2bool,
47
+ default=True,
48
+ help="Save 16k mono wav files for FAD evaluation.",
49
+ )
50
+
51
+
52
+ args, _ = parser.parse_known_args()
53
+
54
+ os.makedirs(args.output_directory, exist_ok=True)
55
+
56
+ meter = pyln.Meter(44100)
57
+
58
+ test_tracks = musdb.DB(root=args.data_root, subsets="test", is_wav=True)
59
+ if args.target != "mixture":
60
+ hq_tracks = musdb.DB(root=args.data_root_hq, subsets='test', is_wav=True)
61
+
62
+ for idx, track in tqdm.tqdm(enumerate(test_tracks)):
63
+ track_name = track.name
64
+ if (
65
+ os.path.basename(args.data_root) == "musdb18hq"
66
+ and track_name == "PR - Oh No"
67
+ ): # We have to consider this exception because 'PR - Oh No' mixture.wav is left-panned. We will use the linear mixture instead.
68
+ # Please refer https://github.com/jeonchangbin49/musdb-XL/blob/main/make_L_and_XL.py
69
+ track_audio = (
70
+ track.targets["vocals"].audio
71
+ + track.targets["drums"].audio
72
+ + track.targets["bass"].audio
73
+ + track.targets["other"].audio
74
+ )
75
+ else:
76
+ track_audio = track.audio
77
+
78
+ print(track_name)
79
+
80
+ augmented_gain = None
81
+
82
+ track_lufs = meter.integrated_loudness(track_audio)
83
+ augmented_gain = args.loudnorm_input_lufs - track_lufs
84
+ if os.path.basename(args.data_root) == "musdb18hq":
85
+ if args.target != "mixture":
86
+ track_audio = track.targets[args.target].audio
87
+ track_audio = track_audio * db2linear(augmented_gain, eps=0.0)
88
+ elif os.path.basename(args.data_root) == "musdb_XL":
89
+ track_audio = track_audio * db2linear(augmented_gain, eps=0.0)
90
+ if args.target != "mixture":
91
+ hq_track = hq_tracks[idx]
92
+ hq_audio = hq_track.audio
93
+ hq_stem = hq_track.targets[args.target].audio
94
+ samplewise_gain = track_audio / (hq_audio + 1e-8)
95
+ track_audio = samplewise_gain * hq_stem
96
+
97
+ os.makedirs(f"{args.output_directory}/{track_name}", exist_ok=True)
98
+ sf.write(
99
+ f"{args.output_directory}/{track_name}/{args.target}.wav", track_audio, 44100
100
+ )
101
+
102
+ if args.save_16k_mono:
103
+ track_audio_16k_mono = librosa.to_mono(track_audio.T)
104
+ track_audio_16k_mono = librosa.resample(
105
+ track_audio_16k_mono,
106
+ orig_sr=44100,
107
+ target_sr=16000,
108
+ )
109
+ os.makedirs(f"{args.output_directory}_16k_mono/{track_name}", exist_ok=True)
110
+ sf.write(
111
+ f"{args.output_directory}_16k_mono/{track_name}/{args.target}.wav",
112
+ track_audio_16k_mono,
113
+ samplerate=16000,
114
+ )
115
+
116
+
117
+ if __name__ == "__main__":
118
+ main()
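The loudness normalization in this script reduces to measuring integrated loudness once and applying a single uniform gain. A minimal sketch with pyloudnorm (the signal here is illustrative noise):

    import numpy as np
    import pyloudnorm as pyln

    rate = 44100
    audio = 0.1 * np.random.randn(rate * 5, 2)  # 5 s of stereo noise, shape (samples, channels)

    meter = pyln.Meter(rate)
    lufs = meter.integrated_loudness(audio)
    gain_db = -14.0 - lufs                       # gain needed to reach -14 LUFS
    normalized = audio * 10 ** (gain_db / 20.0)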
prepro/delimit_train_ozone_prepro.py ADDED
@@ -0,0 +1,293 @@
+ import os
+ import json
+ import csv
+ import glob
+ import argparse
+ import random
+ import math
+
+ import librosa
+ import soundfile as sf
+ import pedalboard
+ import numpy as np
+ import pyloudnorm as pyln
+ from scipy.stats import gamma
+ import torchaudio
+
+
+ def str2bool(v):
+     if v.lower() in ("yes", "true", "t", "y", "1"):
+         return True
+     elif v.lower() in ("no", "false", "f", "n", "0"):
+         return False
+     else:
+         raise argparse.ArgumentTypeError("Boolean value expected.")
+
+
+ def _augment_gain_ozone(audio, low=0.25, high=1.25):
+     """Applies a random gain between `low` and `high`"""
+     g = low + random.random() * (high - low)
+     return audio * g, g
+
+
+ def _augment_channelswap_ozone(audio):
+     """Swap channels of stereo signals with a probability of p=0.5"""
+     if audio.shape[0] == 2 and random.random() < 0.5:
+         return np.flip(audio, axis=0), True  # axis=0 must be given
+     else:
+         return audio, False
+
+
+ # load wav file from arbitrary positions of 16bit stereo wav file
+ def load_wav_arbitrary_position_stereo(
+     filename, sample_rate, seq_duration, return_pos=False
+ ):
+     # stereo
+     # seq_duration[second]
+     length = torchaudio.info(filename).num_frames
+
+     random_start = random.randint(
+         0, int(length - math.ceil(seq_duration * sample_rate) - 1)
+     )
+     random_start_sec = librosa.samples_to_time(random_start, sr=sample_rate)
+     X, sr = librosa.load(
+         filename, sr=None, mono=False, offset=random_start_sec, duration=seq_duration
+     )
+
+     if return_pos:
+         return X, random_start_sec
+     else:
+         return X
+
+
+ # def main():
+ parser = argparse.ArgumentParser(description="Preprocess audio files for training")
+ parser.add_argument(
+     "--root",
+     type=str,
+     default="/path/to/musdb18hq",
+     help="Root directory",
+ )
+ parser.add_argument(
+     "--output",
+     type=str,
+     default="/path/to/musdb-XL-train",
+     help="Where to save output files",
+ )
+ parser.add_argument(
+     "--n_samples", type=int, default=300000, help="Number of samples to save"
+ )
+ parser.add_argument("--seq_duration", type=float, default=4.0, help="Sequence duration")
+ parser.add_argument(
+     "--save_fixed", type=str2bool, default=False, help="Save fixed mixture audio"
+ )
+ parser.add_argument(
+     "--target_lufs_mean", type=float, default=-8.0, help="Target LUFS mean"
+ )
+ parser.add_argument(
+     "--target_lufs_std", type=float, default=-1.0, help="Target LUFS std"
+ )
+ parser.add_argument("--sample_rate", type=int, default=44100, help="Sample rate")
+ parser.add_argument("--seed", type=int, default=46, help="Random seed")
+ args = parser.parse_args()
+ random.seed(args.seed)
+
+ valid_list = [
+     "ANiMAL - Rockshow",
+     "Actions - One Minute Smile",
+     "Alexander Ross - Goodbye Bolero",
+     "Clara Berry And Wooldog - Waltz For My Victims",
+     "Fergessen - Nos Palpitants",
+     "James May - On The Line",
+     "Johnny Lokke - Promises & Lies",
+     "Leaf - Summerghost",
+     "Meaxic - Take A Step",
+     "Patrick Talbot - A Reason To Leave",
+     "Skelpolu - Human Mistakes",
+     "Traffic Experiment - Sirens",
+     "Triviul - Angelsaint",
+     "Young Griffo - Pennies",
+ ]
+
+ meter = pyln.Meter(args.sample_rate)
+
+ sources = ["vocals", "bass", "drums", "other"]
+ song_list = glob.glob(f"{args.root}/train/*")
+
+ vst = pedalboard.load_plugin(
+     "/Library/Audio/Plug-Ins/Components/iZOzone9ElementsAUHook.component"
+ )
+
+ if args.save_fixed:
+     vst_params = []
+
+     os.makedirs(f"{args.output}/ozone_train_fixed", exist_ok=True)
+
+     for song in song_list:
+         print(f"Processing {song}...")
+         song_name = os.path.basename(song)
+         audio_sources = []
+         for source in sources:
+             audio_path = f"{song}/{source}.wav"
+             audio, sr = librosa.load(audio_path, sr=args.sample_rate, mono=False)
+             audio_sources.append(audio)
+         stems = np.stack(audio_sources, axis=0)
+         mixture = stems.sum(0)
+         lufs = meter.integrated_loudness(mixture.T)
+         target_lufs = random.gauss(args.target_lufs_mean, args.target_lufs_std)
+         adjusted_loudness = target_lufs - lufs
+
+         vst.reset()
+         vst.eq_bypass = True
+         vst.img_bypass = True
+         vst.max_mode = 1.0  # Set IRC2 mode
+         vst.max_threshold = min(-adjusted_loudness, 0.0)
+         vst.max_character = min(gamma.rvs(2), 10.0)
+
+         print(
+             f"Applying Ozone 9 Elements IRC2 with threshold {vst.max_threshold} and character {vst.max_character}..."
+         )
+         limited_mixture = vst(mixture, args.sample_rate)
+
+         sf.write(
+             f"{args.output}/ozone_train_fixed/{song_name}.wav",
+             limited_mixture.T,
+             args.sample_rate,
+         )
+         vst_params.append([song_name, vst.max_threshold, vst.max_character])
+     # Save the song name and vst parameters (vst.max_threshold and vst.max_character) to a csv file
+     with open(f"{args.output}/ozone_train_fixed.csv", "w") as f:
+         writer = csv.writer(f)
+         writer.writerow(["song_name", "max_threshold", "max_character"])
+         for list_vst_param in vst_params:
+             writer.writerow(list_vst_param)
+
+ else:
+     if os.path.exists(f"{args.output}/ozone_train_random_0.csv"):
+         vst_params = []
+         list_csv_files = glob.glob(f"{args.output}/ozone_train_random_*.csv")
+         list_csv_files.sort()
+         for csv_file in list_csv_files:
+             with open(csv_file, "r") as f:
+                 reader = csv.reader(f)
+                 next(reader)
+                 vst_params.extend([row for row in reader])
+
+     else:
+         vst_params = []
+
+     song_list = [x for x in song_list if os.path.basename(x) not in valid_list]
+
+     os.makedirs(f"{args.output}/ozone_train_random", exist_ok=True)
+
+     for n in range(len(vst_params), args.n_samples):
+         print(f"Processing {n} / {args.n_samples}...")
+         seg_name = f"ozone_seg_{n}"
+
+         lufs_is_inf = True
+         while lufs_is_inf:
+             audio_sources = []
+             source_song_names = {}
+             source_start_secs = {}
+             source_gains = {}
+             source_channelswaps = {}
+             for source in sources:
+                 track_path = random.choice(song_list)
+                 song_name = os.path.basename(track_path)
+                 audio_path = f"{track_path}/{source}.wav"
+                 audio, start_sec = load_wav_arbitrary_position_stereo(
+                     audio_path, args.sample_rate, args.seq_duration, return_pos=True
+                 )
+                 audio, gain = _augment_gain_ozone(audio)
+                 audio, channelswap = _augment_channelswap_ozone(audio)
+                 audio_sources.append(audio)
+                 source_song_names[source] = song_name
+                 source_start_secs[source] = start_sec
+                 source_gains[source] = gain
+                 source_channelswaps[source] = channelswap
+
+             stems = np.stack(audio_sources, axis=0)
+             mixture = stems.sum(0)
+             lufs = meter.integrated_loudness(mixture.T)
+
+             # if lufs is inf, the mixture is silent, so we need to generate a new mixture
+             lufs_is_inf = np.isinf(lufs)
+
+         target_lufs = random.gauss(args.target_lufs_mean, args.target_lufs_std)
+         adjusted_loudness = target_lufs - lufs
+
+         vst.reset()
+         vst.eq_bypass = True
+         vst.img_bypass = True
+         vst.max_mode = 1.0  # Set IRC2 mode
+         vst.max_threshold = min(max(-20, -adjusted_loudness), 0.0)
+         vst.max_character = min(gamma.rvs(2), 10.0)
+
+         print(
+             f"Applying Ozone 9 Elements IRC2 with threshold {vst.max_threshold} and character {vst.max_character}..."
+         )
+         limited_mixture = vst(mixture, args.sample_rate)
+
+         sf.write(
+             f"{args.output}/ozone_train_random/{seg_name}.wav",
+             limited_mixture.T,
+             args.sample_rate,
+         )
+         vst_params.append(
+             [
+                 seg_name,
+                 vst.max_threshold,
+                 vst.max_character,
+                 source_song_names["vocals"],
+                 source_start_secs["vocals"],
+                 source_gains["vocals"],
+                 source_channelswaps["vocals"],
+                 source_song_names["bass"],
+                 source_start_secs["bass"],
+                 source_gains["bass"],
+                 source_channelswaps["bass"],
+                 source_song_names["drums"],
+                 source_start_secs["drums"],
+                 source_gains["drums"],
+                 source_channelswaps["drums"],
+                 source_song_names["other"],
+                 source_start_secs["other"],
+                 source_gains["other"],
+                 source_channelswaps["other"],
+             ]
+         )
+
+         if (n + 1) % 20000 == 0 or n == args.n_samples - 1:
+             # We will separate the csv file into multiple files to avoid memory errors
+             # Save the song name and vst parameters (vst.max_threshold and vst.max_character) to a csv file
+             number = int(n // 20000)
+             with open(f"{args.output}/ozone_train_random_{number}.csv", "w") as f:
+                 writer = csv.writer(f)
+                 writer.writerow(
+                     [
+                         "song_name",
+                         "max_threshold",
+                         "max_character",
+                         "vocals_name",
+                         "vocals_start_sec",
+                         "vocals_gain",
+                         "vocals_channelswap",
+                         "bass_name",
+                         "bass_start_sec",
+                         "bass_gain",
+                         "bass_channelswap",
+                         "drums_name",
+                         "drums_start_sec",
+                         "drums_gain",
+                         "drums_channelswap",
+                         "other_name",
+                         "other_start_sec",
+                         "other_gain",
+                         "other_channelswap",
+                     ]
+                 )
+                 for list_vst_param in vst_params[number * 20000 : (number + 1) * 20000]:
+                     writer.writerow(list_vst_param)
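Since every random segment is fully described by a CSV row (source songs, start positions, gains, channel swaps), the unlimited target mixture can be re-synthesized on the fly at training time instead of being stored. A sketch of such a reader; the helper name and dict-style row access are illustrative, not part of the repository:

    import librosa
    import numpy as np

    def rebuild_target_mixture(root, row, sample_rate=44100, seq_duration=4.0):
        mixture = None
        for source in ["vocals", "bass", "drums", "other"]:
            audio, _ = librosa.load(
                f"{root}/train/{row[source + '_name']}/{source}.wav",
                sr=sample_rate,
                mono=False,
                offset=float(row[source + "_start_sec"]),
                duration=seq_duration,
            )
            audio = audio * float(row[source + "_gain"])
            if row[source + "_channelswap"] == "True":  # csv stores booleans as strings
                audio = np.flip(audio, axis=0)
            mixture = audio if mixture is None else mixture + audio
        return mixture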
prepro/delimit_valid_L_prepro.py ADDED
@@ -0,0 +1,41 @@
+ import os
+ import json
+
+ from torch.utils.data import DataLoader
+ import soundfile as sf
+ import tqdm
+
+ from dataloader import DelimitValidDataset
+
+
+ def main():
+     # Parameters
+     data_path = "/path/to/musdb18hq"
+     save_path = "/path/to/musdb18hq_limited_L"
+     batch_size = 1
+     num_workers = 1
+     sr = 44100
+
+     # Dataset
+     dataset = DelimitValidDataset(root=data_path, valid_target_lufs=-14.39)
+     data_loader = DataLoader(
+         dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False
+     )
+     dict_valid_loudness = {}
+     # Preprocessing
+     for limited_audio, orig_audio, audio_name, loudness in tqdm.tqdm(data_loader):
+         audio_name = audio_name[0]
+         limited_audio = limited_audio[0].numpy()
+         loudness = float(loudness[0].numpy())
+         dict_valid_loudness[audio_name] = loudness
+         # Save audio
+         os.makedirs(os.path.join(save_path, "valid"), exist_ok=True)
+         audio_path = os.path.join(save_path, "valid", audio_name)
+         sf.write(f"{audio_path}.wav", limited_audio.T, sr)
+     # Write the measured loudness values to a json file
+     with open(os.path.join(save_path, "valid_loudness.json"), "w") as f:
+         json.dump(dict_valid_loudness, f, indent=4)
+
+
+ if __name__ == "__main__":
+     main()
prepro/delimit_valid_custom_limiter_prepro.py ADDED
@@ -0,0 +1,59 @@
+ import os
+ import json
+
+ from torch.utils.data import DataLoader
+ import soundfile as sf
+ import tqdm
+
+ from dataloader import DelimitValidDataset
+
+
+ def main():
+     # Parameters
+     data_path = "/path/to/musdb18hq"
+     save_path = "/path/to/musdb18hq_custom_limiter_fixed_attack"
+     batch_size = 1
+     num_workers = 1
+     sr = 44100
+
+     # Dataset
+     dataset = DelimitValidDataset(
+         root=data_path, use_custom_limiter=True, custom_limiter_attack_range=[2.0, 2.0]
+     )
+     data_loader = DataLoader(
+         dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False
+     )
+     dict_valid_loudness = {}
+     dict_limiter_params = {}
+     # Preprocessing
+     for (
+         limited_audio,
+         orig_audio,
+         audio_name,
+         loudness,
+         custom_attack,
+         custom_release,
+     ) in tqdm.tqdm(data_loader):
+         audio_name = audio_name[0]
+         limited_audio = limited_audio[0].numpy()
+         loudness = float(loudness[0].numpy())
+         dict_valid_loudness[audio_name] = loudness
+         dict_limiter_params[audio_name] = {
+             "attack_ms": float(custom_attack[0].numpy()),
+             "release_ms": float(custom_release[0].numpy()),
+         }
+         # Save audio
+         os.makedirs(os.path.join(save_path, "valid"), exist_ok=True)
+         audio_path = os.path.join(save_path, "valid", audio_name)
+         sf.write(f"{audio_path}.wav", limited_audio.T, sr)
+     # Write the measured loudness values and limiter parameters to json files
+     with open(os.path.join(save_path, "valid_loudness.json"), "w") as f:
+         json.dump(dict_valid_loudness, f, indent=4)
+     with open(os.path.join(save_path, "valid_limiter_params.json"), "w") as f:
+         json.dump(dict_limiter_params, f, indent=4)
+
+
+ if __name__ == "__main__":
+     main()
prepro/delimit_valid_prepro.py ADDED
@@ -0,0 +1,41 @@
+ import os
+ import json
+
+ from torch.utils.data import DataLoader
+ import soundfile as sf
+ import tqdm
+
+ from dataloader import DelimitValidDataset
+
+
+ def main():
+     # Parameters
+     data_path = "/path/to/musdb18hq"
+     save_path = "/path/to/musdb18hq_limited"
+     batch_size = 1
+     num_workers = 1
+     sr = 44100
+
+     # Dataset
+     dataset = DelimitValidDataset(root=data_path)
+     data_loader = DataLoader(
+         dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False
+     )
+     dict_valid_loudness = {}
+     # Preprocessing
+     for limited_audio, orig_audio, audio_name, loudness in tqdm.tqdm(data_loader):
+         audio_name = audio_name[0]
+         limited_audio = limited_audio[0].numpy()
+         loudness = float(loudness[0].numpy())
+         dict_valid_loudness[audio_name] = loudness
+         # Save audio
+         os.makedirs(os.path.join(save_path, "valid"), exist_ok=True)
+         audio_path = os.path.join(save_path, "valid", audio_name)
+         sf.write(f"{audio_path}.wav", limited_audio.T, sr)
+     # Write the measured loudness values to a json file
+     with open(os.path.join(save_path, "valid_loudness.json"), "w") as f:
+         json.dump(dict_valid_loudness, f, indent=4)
+
+
+ if __name__ == "__main__":
+     main()
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ git+https://github.com/asteroid-team/asteroid.git@master
+ numpy
+ librosa
+ soundfile
+ torch
+ torchaudio
+ matplotlib
+ wandb
+ musdb
+ dotmap
+ ema-pytorch
+ pedalboard
+ einops
separate_func/__init__.py ADDED
@@ -0,0 +1 @@
+ from .conv_tasnet_separate import conv_tasnet_separate
separate_func/conv_tasnet_separate.py ADDED
@@ -0,0 +1,89 @@
+ import os
+
+ import soundfile as sf
+ import torch
+ import pyloudnorm as pyln
+ import librosa
+ import matplotlib
+ import matplotlib.pyplot as plt
+
+ from dataloader import SingleTrackSet
+ from utils import db2linear
+
+
+ def conv_tasnet_separate(
+     args, our_model, device, track_audio, track_name, meter=None, augmented_gain=None
+ ):
+
+     if args.use_singletrackset:
+         db = SingleTrackSet(
+             track_audio.squeeze(dim=0),
+             hop_length=args.data_params.nhop,
+             num_frame=128,
+             target_name=args.target,
+         )
+         separated = []
+
+         for item in db:
+             item = item.unsqueeze(0).to(device)
+             estimates, *estimates_vars = our_model(item)
+             if args.task_params.dataset == "delimit":
+                 estimates = estimates_vars[0]
+
+             estimates = estimates.cpu().detach()
+             separated.append(
+                 estimates[..., db.trim_length : -db.trim_length].clone()
+             )
+
+         estimates = torch.cat(separated, dim=-1)
+         estimates = estimates[0, :, : track_audio.shape[-1]].numpy()
+     else:
+         estimates, *estimates_vars = our_model(track_audio)
+         if args.save_histogram and args.task_params.dataset == "delimit":
+             plt.figure(figsize=(10, 10))
+             plt.hist(estimates.cpu().detach().numpy().flatten(), bins=100)
+             os.makedirs(f"{args.test_output_dir}/{track_name}", exist_ok=True)
+             plt.savefig(
+                 f"{args.test_output_dir}/{track_name}/{args.target}_histogram.png"
+             )
+         if args.task_params.dataset == "delimit":
+             estimates = estimates_vars[0]
+
+         estimates = estimates.cpu().detach().numpy()
+         estimates = estimates[0, :, : track_audio.shape[-1]]
+
+     if args.save_name_as_target:
+         os.makedirs(f"{args.test_output_dir}/{track_name}", exist_ok=True)
+
+     if args.save_output_loudnorm:
+         print("Saving loudness-normalized output.")
+         loudness = meter.integrated_loudness(estimates.T)
+         estimates = estimates * db2linear(args.save_output_loudnorm - loudness, eps=0.0)
+     elif augmented_gain is not None and args.save_output_loudnorm is None:
+         estimates = estimates * db2linear(-augmented_gain, eps=0.0)
+
+     sf.write(
+         f"{args.test_output_dir}/{track_name}/{args.target}.wav"
+         if args.save_name_as_target
+         else f"{args.test_output_dir}/{track_name}.wav",
+         estimates.T,
+         samplerate=args.data_params.sample_rate,
+     )
+
+     if args.save_16k_mono:
+         estimates_16k_mono = librosa.to_mono(estimates)
+         estimates_16k_mono = librosa.resample(
+             estimates_16k_mono,
+             orig_sr=args.data_params.sample_rate,
+             target_sr=16000,
+         )
+         os.makedirs(f"{args.test_output_dir}_16k_mono/{track_name}", exist_ok=True)
+         sf.write(
+             f"{args.test_output_dir}_16k_mono/{track_name}/{args.target}.wav"
+             if args.save_name_as_target
+             else f"{args.test_output_dir}_16k_mono/{track_name}.wav",
+             estimates_16k_mono,
+             samplerate=16000,
+         )
+
+     return estimates
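When a track is too long for one forward pass, the SingleTrackSet path above runs overlapping windows and trims `trim_length` samples of padded context from each side before concatenating. A self-contained sketch of that trim-and-concatenate pattern (helper and parameter values are illustrative; the repository's SingleTrackSet handles the windowing itself):

    import torch
    import torch.nn.functional as F

    def chunked_forward(model, audio, chunk_len=44100 * 4, trim=4410):
        # audio: (channels, samples); each window carries `trim` samples of
        # extra context on both sides, which is cut away again after the model.
        hop = chunk_len - 2 * trim
        padded = F.pad(audio, (trim, chunk_len))
        outputs = []
        for start in range(0, audio.shape[-1], hop):
            window = padded[..., start : start + chunk_len]
            with torch.no_grad():
                out = model(window.unsqueeze(0))[0]
            outputs.append(out[..., trim : chunk_len - trim])
        return torch.cat(outputs, dim=-1)[..., : audio.shape[-1]]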
solver_ddp.py ADDED
@@ -0,0 +1,643 @@
+ import time
+ import json
+
+ import torch
+ import torch.nn as nn
+ import wandb
+ import matplotlib
+
+ matplotlib.use("Agg")
+ import matplotlib.pyplot as plt
+ import torch.distributed as dist
+ from torch.utils.data.distributed import DistributedSampler
+ from torch.nn.parallel.distributed import DistributedDataParallel as DDP
+ from asteroid.losses import (
+     pairwise_neg_sisdr,
+     PairwiseNegSDR,
+ )
+ from einops import rearrange, reduce
+ from ema_pytorch import EMA
+
+ from models import load_model_with_args
+ import utils
+ from dataloader import (
+     MusdbTrainDataset,
+     MusdbValidDataset,
+     DelimitTrainDataset,
+     DelimitValidDataset,
+     OzoneTrainDataset,
+     OzoneValidDataset,
+     aug_from_str,
+     SingleTrackSet,
+ )
+
+
+ class Solver(object):
+     def __init__(self):
+         pass
+
+     def set_gpu(self, args):
+
+         if args.wandb_params.use_wandb and args.gpu == 0:
+             if args.wandb_params.sweep:
+                 wandb.init(
+                     entity=args.wandb_params.entity,
+                     project=args.wandb_params.project,
+                     config=args,
+                     resume=True
+                     if args.dir_params.resume is not None and args.gpu == 0
+                     else False,
+                 )
+             else:
+                 wandb.init(
+                     entity=args.wandb_params.entity,
+                     project=args.wandb_params.project,
+                     name=f"{args.dir_params.exp_name}",
+                     config=args,
+                     resume="must"
+                     if args.dir_params.resume is not None
+                     and not args.dir_params.continual_train
+                     else False,
+                     id=args.wandb_params.rerun_id
+                     if args.wandb_params.rerun_id
+                     else None,
+                     settings=wandb.Settings(start_method="fork"),
+                 )
+
+         ###################### Define Models ######################
+         self.model = load_model_with_args(args)
+
+         trainable_params = list(self.model.parameters())
+
+         if args.hyperparams.optimizer == "sgd":
+             print("Use SGD optimizer.")
+             self.optimizer = torch.optim.SGD(
+                 params=trainable_params,
+                 lr=args.hyperparams.lr,
+                 momentum=0.9,
+                 weight_decay=args.hyperparams.weight_decay,
+             )
+         elif args.hyperparams.optimizer == "adamw":
+             print("Use AdamW optimizer.")
+             self.optimizer = torch.optim.AdamW(
+                 params=trainable_params,
+                 lr=args.hyperparams.lr,
+                 betas=(0.9, 0.999),
+                 amsgrad=False,
+                 weight_decay=args.hyperparams.weight_decay,
+             )
+         elif args.hyperparams.optimizer == "radam":
+             print("Use RAdam optimizer.")
+             self.optimizer = torch.optim.RAdam(
+                 params=trainable_params,
+                 lr=args.hyperparams.lr,
+                 betas=(0.9, 0.999),
+                 eps=1e-08,
+                 weight_decay=args.hyperparams.weight_decay,
+             )
+         elif args.hyperparams.optimizer == "adam":
+             print("Use Adam optimizer.")
+             self.optimizer = torch.optim.Adam(
+                 params=trainable_params,
+                 lr=args.hyperparams.lr,
+                 betas=(0.9, 0.999),
+                 weight_decay=args.hyperparams.weight_decay,
+             )
+         else:
+             print("no optimizer loaded")
+             raise NotImplementedError
+
+         if args.hyperparams.lr_scheduler == "step_lr":
+             if args.model_loss_params.architecture == "umx":
+                 self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
+                     self.optimizer,
+                     mode="min",
+                     factor=args.hyperparams.lr_decay_gamma,
+                     patience=args.hyperparams.lr_decay_patience,
+                     cooldown=10,
+                     verbose=True,
+                 )
+             else:
+                 self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
+                     self.optimizer,
+                     mode="min",
+                     factor=args.hyperparams.lr_decay_gamma,
+                     patience=args.hyperparams.lr_decay_patience,
+                     cooldown=0,
+                     min_lr=5e-5,
+                     verbose=True,
+                 )
+         elif args.hyperparams.lr_scheduler == "cos_warmup":
+             self.scheduler = utils.CosineAnnealingWarmUpRestarts(
+                 self.optimizer,
+                 T_0=40,
+                 T_mult=1,
+                 eta_max=args.hyperparams.lr,
+                 T_up=10,
+                 gamma=0.5,
+             )
+
+         torch.cuda.set_device(args.gpu)
+
+         self.model = self.model.to(f"cuda:{args.gpu}")
+
+         ############################################################
+         # Define Losses
+         self.criterion = {}
+
+         self.criterion["l1"] = nn.L1Loss().to(args.gpu)
+         self.criterion["mse"] = nn.MSELoss().to(args.gpu)
+         self.criterion["si_sdr"] = pairwise_neg_sisdr.to(args.gpu)
+         self.criterion["snr"] = PairwiseNegSDR("snr").to(args.gpu)
+         self.criterion["bcewithlogits"] = nn.BCEWithLogitsLoss().to(args.gpu)
+         self.criterion["bce"] = nn.BCELoss().to(args.gpu)
+         self.criterion["kl"] = nn.KLDivLoss(log_target=True).to(args.gpu)
+
+         print("Loss functions we use in this training:")
+         print(args.model_loss_params.train_loss_func)
+
+         # Early stopping utils
+         self.es = utils.EarlyStopping(patience=args.hyperparams.patience)
+         self.stop = False
+
+         if args.wandb_params.use_wandb and args.gpu == 0:
+             wandb.watch(self.model, log="all")
+
+         self.start_epoch = 1
+         self.train_losses = []
+         self.valid_losses = []
+         self.train_times = []
+         self.best_epoch = 0
+
+         if args.dir_params.resume and not args.hyperparams.ema:
+             self.resume(args)
+
+         # Distribute models to machine
+         self.model = DDP(
+             self.model,
+             device_ids=[args.gpu],
+             output_device=args.gpu,
+             find_unused_parameters=True,
+         )
+
+         if args.hyperparams.ema:
+             self.model_ema = EMA(
+                 self.model,
+                 beta=0.999,
+                 update_after_step=100,
+                 update_every=10,
+             )
+
+         if args.dir_params.resume and args.hyperparams.ema:
+             self.resume(args)
+
+         ###################### Define data pipeline ######################
+         args.hyperparams.batch_size = int(
+             args.hyperparams.batch_size / args.ngpus_per_node
+         )
+         self.mp_context = torch.multiprocessing.get_context("fork")
+
+         if args.task_params.dataset == "musdb":
+             self.train_dataset = MusdbTrainDataset(
+                 target=args.task_params.target,
+                 root=args.dir_params.root,
+                 seq_duration=args.data_params.seq_dur,
+                 samples_per_track=args.data_params.samples_per_track,
+                 source_augmentations=aug_from_str(
+                     ["gain", "channelswap"],
+                 ),
+                 sample_rate=args.data_params.sample_rate,
+                 seed=args.sys_params.seed,
+                 limitaug_method=args.data_params.limitaug_method,
+                 limitaug_mode=args.data_params.limitaug_mode,
+                 limitaug_custom_target_lufs=args.data_params.limitaug_custom_target_lufs,
+                 limitaug_custom_target_lufs_std=args.data_params.limitaug_custom_target_lufs_std,
+                 target_loudnorm_lufs=args.data_params.target_loudnorm_lufs,
+                 custom_limiter_attack_range=args.data_params.custom_limiter_attack_range,
+                 custom_limiter_release_range=args.data_params.custom_limiter_release_range,
+             )
+             self.valid_dataset = MusdbValidDataset(
+                 target=args.task_params.target, root=args.dir_params.root
+             )
+         elif args.task_params.dataset == "delimit":
+             if args.data_params.limitaug_method == "ozone":
+                 self.train_dataset = OzoneTrainDataset(
+                     target=args.task_params.target,
+                     root=args.dir_params.root,
+                     ozone_root=args.dir_params.ozone_root,
+                     use_fixed=args.data_params.use_fixed,
+                     seq_duration=args.data_params.seq_dur,
+                     samples_per_track=args.data_params.samples_per_track,
+                     source_augmentations=aug_from_str(
+                         ["gain", "channelswap"],
+                     ),
+                     sample_rate=args.data_params.sample_rate,
+                     seed=args.sys_params.seed,
+                     limitaug_method=args.data_params.limitaug_method,
+                     limitaug_mode=args.data_params.limitaug_mode,
+                     limitaug_custom_target_lufs=args.data_params.limitaug_custom_target_lufs,
+                     limitaug_custom_target_lufs_std=args.data_params.limitaug_custom_target_lufs_std,
+                     target_loudnorm_lufs=args.data_params.target_loudnorm_lufs,
+                     target_limitaug_mode=args.data_params.target_limitaug_mode,
+                     target_limitaug_custom_target_lufs=args.data_params.target_limitaug_custom_target_lufs,
+                     target_limitaug_custom_target_lufs_std=args.data_params.target_limitaug_custom_target_lufs_std,
+                     custom_limiter_attack_range=args.data_params.custom_limiter_attack_range,
+                     custom_limiter_release_range=args.data_params.custom_limiter_release_range,
+                 )
+                 self.valid_dataset = OzoneValidDataset(
+                     target=args.task_params.target,
+                     root=args.dir_params.root,
+                     ozone_root=args.dir_params.ozone_root,
+                     target_loudnorm_lufs=args.data_params.target_loudnorm_lufs,
+                 )
+             else:
+                 self.train_dataset = DelimitTrainDataset(
+                     target=args.task_params.target,
+                     root=args.dir_params.root,
+                     seq_duration=args.data_params.seq_dur,
+                     samples_per_track=args.data_params.samples_per_track,
+                     source_augmentations=aug_from_str(
+                         ["gain", "channelswap"],
+                     ),
+                     sample_rate=args.data_params.sample_rate,
+                     seed=args.sys_params.seed,
+                     limitaug_method=args.data_params.limitaug_method,
+                     limitaug_mode=args.data_params.limitaug_mode,
+                     limitaug_custom_target_lufs=args.data_params.limitaug_custom_target_lufs,
+                     limitaug_custom_target_lufs_std=args.data_params.limitaug_custom_target_lufs_std,
+                     target_loudnorm_lufs=args.data_params.target_loudnorm_lufs,
+                     target_limitaug_mode=args.data_params.target_limitaug_mode,
+                     target_limitaug_custom_target_lufs=args.data_params.target_limitaug_custom_target_lufs,
+                     target_limitaug_custom_target_lufs_std=args.data_params.target_limitaug_custom_target_lufs_std,
+                     custom_limiter_attack_range=args.data_params.custom_limiter_attack_range,
+                     custom_limiter_release_range=args.data_params.custom_limiter_release_range,
+                 )
+                 self.valid_dataset = DelimitValidDataset(
+                     target=args.task_params.target,
+                     root=args.dir_params.root,
+                     delimit_valid_root=args.dir_params.delimit_valid_root,
+                     valid_target_lufs=args.data_params.valid_target_lufs,
+                     target_loudnorm_lufs=args.data_params.target_loudnorm_lufs,
+                     delimit_valid_L_root=args.dir_params.delimit_valid_L_root,
+                 )
+
+         self.train_sampler = DistributedSampler(
+             self.train_dataset, shuffle=True, rank=args.gpu
+         )
+         self.train_loader = torch.utils.data.DataLoader(
+             self.train_dataset,
+             batch_size=args.hyperparams.batch_size,
+             shuffle=False,
+             num_workers=args.sys_params.nb_workers,
+             multiprocessing_context=self.mp_context,
+             pin_memory=True,
+             sampler=self.train_sampler,
+             drop_last=False,
+         )
+
+         self.valid_sampler = DistributedSampler(
+             self.valid_dataset, shuffle=False, rank=args.gpu
+         )
+         self.valid_loader = torch.utils.data.DataLoader(
+             self.valid_dataset,
+             batch_size=1,
+             shuffle=False,
+             num_workers=args.sys_params.nb_workers,
+             multiprocessing_context=self.mp_context,
+             pin_memory=False,
+             sampler=self.valid_sampler,
+             drop_last=False,
+         )
+
+     def train(self, args, epoch):
+         self.end = time.time()
+         self.model.train()
+
+         # get current learning rate
+         for param_group in self.optimizer.param_groups:
+             current_lr = param_group["lr"]
+
+         if (
+             args.sys_params.rank % args.ngpus_per_node == 0
+         ):  # when the last rank process is finished
+             print(f"Epoch {epoch}, Learning rate: {current_lr}")
+
+         losses = utils.AverageMeter()
+         loss_logger = {}
+
+         loss_logger["train/train loss"] = 0
+         # with torch.autograd.detect_anomaly():  # use this if you want to detect anomalous behavior while training.
+         for i, values in enumerate(self.train_loader):
+             mixture, clean, *train_vars = values
+
+             mixture = mixture.cuda(args.gpu, non_blocking=True)
+             clean = clean.cuda(args.gpu, non_blocking=True)
+             target = clean  # target_shape = [batch_size, n_srcs, nb_channels (if stereo: 2), wave_length]
+
+             estimates, *estimates_vars = self.model(mixture)
+
+             dict_loss = {}
+
+             if args.task_params.dataset == "delimit":
+                 estimates = estimates_vars[0]
+
+             for train_loss_idx, single_train_loss_func in enumerate(
+                 args.model_loss_params.train_loss_func
+             ):
+                 if self.model.module.use_encoder_to_target:
+                     target_spec = self.model.module.encoder(
+                         rearrange(target, "b s c t -> (b s) c t")
+                     )
+                     target_spec = rearrange(
+                         target_spec,
+                         "(b s) c f t -> b s c f t",
+                         s=args.task_params.bleeding_nsrcs,
+                     )
+                 loss_else = self.criterion[single_train_loss_func](
+                     estimates,
+                     target_spec
+                     if self.model.module.use_encoder_to_target
+                     else target,
+                 )
+                 dict_loss[single_train_loss_func] = (
+                     loss_else.mean()
+                     * args.model_loss_params.train_loss_scales[train_loss_idx]
+                 )
+
+             loss = sum([value for key, value in dict_loss.items()])
+
+             ############################################################
+
+             #################### 5. Back propagation ####################
+             loss.backward()
+             if args.hyperparams.gradient_clip:
+                 nn.utils.clip_grad_norm_(
+                     self.model.parameters(), max_norm=args.hyperparams.gradient_clip
+                 )
+
+             losses.update(loss.item(), clean.size(0))
+
+             loss_logger["train/train loss"] = losses.avg
+             for key, value in dict_loss.items():
+                 loss_logger[f"train/{key}"] = value.item()
+
+             self.optimizer.step()
+
+             self.model.zero_grad(
+                 set_to_none=True
+             )  # set_to_none=True is for memory saving
+
+             if args.hyperparams.ema:
+                 self.model_ema.update()
+             ############################################################
+
+             # ###################### 6. Plot ######################
+             if i % 30 == 0:
+                 # loss print for multiple loss functions
+                 multiple_score = torch.Tensor(
+                     [value for key, value in loss_logger.items()]
+                 ).to(args.gpu)
+                 gathered_score_list = [
+                     torch.ones_like(multiple_score)
+                     for _ in range(dist.get_world_size())
+                 ]
+                 dist.all_gather(gathered_score_list, multiple_score)
+                 gathered_score = torch.mean(
+                     torch.stack(gathered_score_list, dim=0), dim=0
+                 )
+                 if args.gpu == 0:
+                     print(f"Epoch {epoch}, step {i} / {len(self.train_loader)}")
+                     temp_loss_logger = {}
+                     for index, (key, value) in enumerate(loss_logger.items()):
+                         temp_key = key.replace("train/", "iter-wise/")
+                         temp_loss_logger[temp_key] = round(
+                             gathered_score[index].item(), 6
+                         )
+                         print(f"{key} : {round(gathered_score[index].item(), 6)}")
+
+         single_score = torch.Tensor([losses.avg]).to(args.gpu)
+
+         gathered_score_list = [
+             torch.ones_like(single_score) for _ in range(dist.get_world_size())
+         ]
+         dist.all_gather(gathered_score_list, single_score)
+         gathered_score = torch.mean(torch.cat(gathered_score_list)).item()
+         if args.gpu == 0:
+             self.train_losses.append(gathered_score)
+             if args.wandb_params.use_wandb:
+                 loss_logger["train/train loss"] = single_score
+                 loss_logger["train/epoch"] = epoch
+                 wandb.log(loss_logger)
+         ############################################################
+
+     def multi_validate(self, args, epoch):
+         if args.gpu == 0:
+             print(f"Epoch {epoch} Validation session!")
+
+         losses = utils.AverageMeter()
+
+         loss_logger = {}
+
+         self.model.eval()
+
+         with torch.no_grad():
+             for i, values in enumerate(self.valid_loader, start=1):
+                 mixture, clean, song_name, *valid_vars = values
+
+                 mixture = mixture.cuda(args.gpu, non_blocking=True)
+                 clean = clean.cuda(args.gpu, non_blocking=True)
+                 target = clean
+
+                 dict_loss = {}
+                 if not args.data_params.singleset_num_frames:
+                     if args.hyperparams.ema:
+                         estimates, *estimates_vars = self.model_ema(mixture)
+                     else:
+                         estimates, *estimates_vars = self.model(mixture)
+                     if args.task_params.dataset == "delimit":
+                         estimates = estimates_vars[0]
+
+                     estimates = estimates[..., : clean.size(-1)]
+
+                 else:  # use SingleTrackSet
+                     db = SingleTrackSet(
+                         mixture[0],
+                         hop_length=args.data_params.nhop,
+                         num_frame=args.data_params.singleset_num_frames,
+                         target_name=args.task_params.target,
+                     )
+                     separated = []
+
+                     for item in db:
+                         if args.hyperparams.ema:
+                             estimates, *estimates_vars = self.model_ema(
+                                 item.unsqueeze(0).to(args.gpu)
+                             )
+                         else:
+                             estimates, *estimates_vars = self.model(
+                                 item.unsqueeze(0).to(args.gpu)
+                             )
+
+                         if args.task_params.dataset == "delimit":
+                             estimates = estimates_vars[0]
+
+                         separated.append(
+                             estimates_vars[0][
+                                 ..., db.trim_length : -db.trim_length
+                             ].clone()
+                         )
+
+                     estimates = torch.cat(separated, dim=-1)
+                     estimates = estimates[..., : target.shape[-1]]
+
+                 for valid_loss_idx, single_valid_loss_func in enumerate(
+                     args.model_loss_params.valid_loss_func
+                 ):
+                     loss_else = self.criterion[single_valid_loss_func](
+                         estimates,
+                         target,
+                     )
+                     dict_loss[single_valid_loss_func] = (
+                         loss_else.mean()
+                         * args.model_loss_params.valid_loss_scales[valid_loss_idx]
+                     )
+
+                 loss = sum([value for key, value in dict_loss.items()])
+
+                 losses.update(loss.item(), clean.size(0))
+
+         list_sum_count = torch.Tensor([losses.sum, losses.count]).to(args.gpu)
+         list_gathered_sum_count = [
+             torch.ones_like(list_sum_count) for _ in range(dist.get_world_size())
+         ]
+         dist.all_gather(list_gathered_sum_count, list_sum_count)
+         gathered_score = reduce(
+             torch.stack(list_gathered_sum_count), "s c -> c", "sum"
+         )  # s: sum of losses.sum, c: sum of losses.count
+         gathered_score = (gathered_score[0] / gathered_score[1]).item()
+
+         loss_logger["valid/valid loss"] = gathered_score
+         for key, value in dict_loss.items():
+             loss_logger[f"valid/{key}"] = value.item()
+
+         if args.hyperparams.lr_scheduler == "step_lr":
+             self.scheduler.step(gathered_score)
+         elif args.hyperparams.lr_scheduler == "cos_warmup":
+             self.scheduler.step(epoch)
+         else:
+             self.scheduler.step(gathered_score)
+
+         if args.wandb_params.use_wandb and args.gpu == 0:
+             loss_logger["valid/epoch"] = epoch
+             wandb.log(loss_logger)
+
+         if args.gpu == 0:
+             self.valid_losses.append(gathered_score)
+
+             self.stop = self.es.step(gathered_score)
+
+             print(f"Epoch {epoch}, validation loss : {round(gathered_score, 6)}")
+
+             plt.plot(self.train_losses, label="train loss")
+             plt.plot(self.valid_losses, label="valid loss")
+             plt.legend(loc="upper right")
+             plt.savefig(f"{args.output}/loss_graph_{args.task_params.target}.png")
+             plt.close()
+
+             save_states = {
+                 "epoch": epoch,
+                 "state_dict": self.model.module.state_dict()
+                 if not args.hyperparams.ema
+                 else self.model_ema.state_dict(),
+                 "best_loss": self.es.best,
+                 "optimizer": self.optimizer.state_dict(),
+                 "scheduler": self.scheduler.state_dict(),
+             }
+
+             utils.save_checkpoint(
+                 save_states,
+                 state_dict_only=gathered_score == self.es.best,
+                 path=args.output,
+                 target=args.task_params.target,
+             )
+
+             self.train_times.append(time.time() - self.end)
+
+             if gathered_score == self.es.best:
+                 self.best_epoch = epoch
+
+             # save params
+             params = {
+                 "epochs_trained": epoch,
+                 "args": args.toDict(),
+                 "best_loss": self.es.best,
+                 "best_epoch": self.best_epoch,
+                 "train_loss_history": self.train_losses,
+                 "valid_loss_history": self.valid_losses,
+                 "train_time_history": self.train_times,
+                 "num_bad_epochs": self.es.num_bad_epochs,
+             }
+
+             with open(
+                 f"{args.output}/{args.task_params.target}.json", "w"
+             ) as outfile:
+                 outfile.write(json.dumps(params, indent=4, sort_keys=True))
+
+             print(
+                 f"Epoch {epoch} train completed. Took {round(self.train_times[-1], 3)} seconds"
+             )
+
+     def resume(self, args):
+         print(f"Resume checkpoint from: {args.dir_params.resume}:")
+         loc = f"cuda:{args.gpu}"
+         checkpoint_path = f"{args.dir_params.resume}/{args.task_params.target}"
+         with open(f"{checkpoint_path}.json", "r") as stream:
+             results = json.load(stream)
+         checkpoint = torch.load(f"{checkpoint_path}.chkpnt", map_location=loc)
+
+         if args.hyperparams.ema:
+             self.model_ema.load_state_dict(checkpoint["state_dict"])
+         else:
+             self.model.load_state_dict(checkpoint["state_dict"])
+         self.optimizer.load_state_dict(checkpoint["optimizer"])
+
+         if (
+             args.dir_params.continual_train
+         ):  # we want to use a pre-trained model but do not want to reuse the lr_scheduler history.
+             for param_group in self.optimizer.param_groups:
+                 param_group["lr"] = args.hyperparams.lr
+         else:
+             self.scheduler.load_state_dict(checkpoint["scheduler"])
+             self.es.best = results["best_loss"]
+             self.es.num_bad_epochs = results["num_bad_epochs"]
+
+         self.start_epoch = results["epochs_trained"]
+         self.train_losses = results["train_loss_history"]
+         self.valid_losses = results["valid_loss_history"]
+         self.train_times = results["train_time_history"]
+         self.best_epoch = results["best_epoch"]
+         if args.sys_params.rank % args.ngpus_per_node == 0:
+             print(
+                 f"=> loaded checkpoint {checkpoint_path} (epoch {results['epochs_trained']})"
+             )
+
+     def cal_loss(self, args, loss_input):
+         loss_dict = {}
+         for key, value in loss_input.items():
+             loss_dict[key] = self.criterion[key](*value)
+
+         return loss_dict
+
+     def cal_multiple_losses(self, args, dict_loss_name_input):
+         loss_dict = {}
+         for loss_name, loss_input in dict_loss_name_input.items():
+             loss_dict[loss_name] = self.cal_loss(args, loss_input)
+
+         return loss_dict
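multi_validate gathers (sum, count) pairs across ranks and divides the totals, rather than averaging per-rank means; with uneven sampler splits the two are not the same. A quick plain-Python check:

    # (sum of losses, sample count) per rank; values are illustrative
    ranks = [(12.0, 4), (8.0, 1)]

    global_avg = sum(s for s, _ in ranks) / sum(c for _, c in ranks)  # 20 / 5 = 4.0
    naive_avg = sum(s / c for s, c in ranks) / len(ranks)             # (3 + 8) / 2 = 5.5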
test_ddp.py ADDED
@@ -0,0 +1,245 @@
+ # To be honest... this is not ddp.
+ import os
+ import json
+ import argparse
+ import glob
+
+ import torch
+ import tqdm
+ import musdb
+ import librosa
+ import soundfile as sf
+ import pyloudnorm as pyln
+ from dotmap import DotMap
+
+ from models import load_model_with_args
+ from separate_func import (
+     conv_tasnet_separate,
+ )
+ from utils import str2bool, db2linear
+
+
+ tqdm.monitor_interval = 0
+
+
+ def separate_track_with_model(
+     args, model, device, track_audio, track_name, meter, augmented_gain
+ ):
+     with torch.no_grad():
+         if (
+             args.model_loss_params.architecture == "conv_tasnet_mask_on_output"
+             or args.model_loss_params.architecture == "conv_tasnet"
+         ):
+             estimates = conv_tasnet_separate(
+                 args,
+                 model,
+                 device,
+                 track_audio,
+                 track_name,
+                 meter=meter,
+                 augmented_gain=augmented_gain,
+             )
+
+             return estimates
+
+
+ def main():
+     parser = argparse.ArgumentParser(description="model test.py")
+
+     parser.add_argument("--target", type=str, default="all")
+     parser.add_argument("--data_root", type=str, default="/path/to/musdb_XL")
+     parser.add_argument(
+         "--use_musdb",
+         type=str2bool,
+         default=True,
+         help="Use musdb test data, or just run inference on other samples?",
+     )
+     parser.add_argument("--exp_name", type=str, default="delimit_6_s")
+     parser.add_argument("--manual_output_name", type=str, default=None)
+     parser.add_argument(
+         "--output_directory", type=str, default="/path/to/results"
+     )
+     parser.add_argument("--use_gpu", type=str2bool, default=True)
+     parser.add_argument("--save_name_as_target", type=str2bool, default=True)
+     parser.add_argument(
+         "--loudnorm_input_lufs",
+         type=float,
+         default=None,
+         help="If you want to use loudnorm, input target lufs",
+     )
+     parser.add_argument(
+         "--use_singletrackset",
+         type=str2bool,
+         default=False,
+         help="Use SingleTrackSet for X-UMX",
+     )
+     parser.add_argument(
+         "--best_model",
+         type=str2bool,
+         default=True,
+         help="Use the best model or the lastly saved model",
+     )
+     parser.add_argument(
+         "--save_output_loudnorm",
+         type=float,
+         default=None,
+         help="Save loudness normalized outputs or not. If you want to save, input target loudness",
+     )
+     parser.add_argument(
+         "--save_mixed_output",
+         type=float,
+         default=None,
+         help="Save original+delimited-estimation mixed output with a ratio of default 0.5 (original) and 1 - 0.5 (estimation)",
+     )
+     parser.add_argument(
+         "--save_16k_mono",
+         type=str2bool,
+         default=False,
+         help="Save 16k mono wav files for FAD evaluation.",
+     )
+     parser.add_argument(
+         "--save_histogram",
+         type=str2bool,
+         default=False,
+         help="Save histogram of the output. Only valid when the task is 'delimit'",
+     )
+
+     args, _ = parser.parse_known_args()
+
+     args.output_dir = f"{args.output_directory}/checkpoint/{args.exp_name}"
+     with open(f"{args.output_dir}/{args.target}.json", "r") as f:
+         args_dict = json.load(f)
+     args_dict = DotMap(args_dict)
+
+     for key, value in args_dict["args"].items():
+         if key in list(vars(args).keys()):
+             pass
+         else:
+             setattr(args, key, value)
+
+     args.test_output_dir = f"{args.output_directory}/test/{args.exp_name}"
+
+     if args.manual_output_name is not None:
+         args.test_output_dir = f"{args.output_directory}/test/{args.manual_output_name}"
+     os.makedirs(args.test_output_dir, exist_ok=True)
+
+     device = torch.device(
+         "cuda" if torch.cuda.is_available() and args.use_gpu else "cpu"
+     )
+
+     ###################### Define Models ######################
+     our_model = load_model_with_args(args)
+     our_model = our_model.to(device)
+     print(our_model)
+     pytorch_total_params = sum(
+         p.numel() for p in our_model.parameters() if p.requires_grad
+     )
+     print("Total number of parameters", pytorch_total_params)
+     # Future work => torchinfo would be better for this purpose.
+
+     if args.best_model:
+         target_model_path = f"{args.output_dir}/{args.target}.pth"
+         checkpoint = torch.load(target_model_path, map_location=device)
+         our_model.load_state_dict(checkpoint)
+     else:  # when using the lastly saved model
+         target_model_path = f"{args.output_dir}/{args.target}.chkpnt"
+         checkpoint = torch.load(target_model_path, map_location=device)
+         our_model.load_state_dict(checkpoint["state_dict"])
+
+     our_model.eval()
+
+     meter = pyln.Meter(44100)
+
+     if args.use_musdb:
+         test_tracks = musdb.DB(root=args.data_root, subsets="test", is_wav=True)
+
+         for track in tqdm.tqdm(test_tracks):
+             track_name = track.name
+             track_audio = track.audio
+
+             orig_audio = track_audio.copy()
+
+             augmented_gain = None
+             print("Now De-limiting : ", track_name)
+
+             if args.loudnorm_input_lufs:  # If you want to use loud-normalized input
+                 track_lufs = meter.integrated_loudness(track_audio)
+                 augmented_gain = args.loudnorm_input_lufs - track_lufs
+                 track_audio = track_audio * db2linear(augmented_gain, eps=0.0)
+
+             track_audio = (
+                 torch.as_tensor(track_audio.T, dtype=torch.float32)
+                 .unsqueeze(0)
+                 .to(device)
+             )
+
+             estimates = separate_track_with_model(
+                 args, our_model, device, track_audio, track_name, meter, augmented_gain
+             )
+
+             if args.save_mixed_output:
+                 orig_audio = orig_audio.T
+                 track_lufs = meter.integrated_loudness(orig_audio.T)
+                 augmented_gain = args.save_output_loudnorm - track_lufs
+                 orig_audio = orig_audio * db2linear(augmented_gain, eps=0.0)
+
+                 mixed_output = orig_audio * args.save_mixed_output + estimates * (
+                     1 - args.save_mixed_output
+                 )
+
+                 sf.write(
+                     f"{args.test_output_dir}/{track_name}/{str(args.save_mixed_output)}_mixed.wav",
+                     mixed_output.T,
+                     args.data_params.sample_rate,
+                 )
+     else:
+         test_tracks = glob.glob(f"{args.data_root}/*.wav") + glob.glob(
+             f"{args.data_root}/*.mp3"
+         )
+
+         for track in tqdm.tqdm(test_tracks):
+             track_name = os.path.basename(track).replace(".wav", "").replace(".mp3", "")
+             track_audio, sr = librosa.load(
+                 track, sr=None, mono=False
+             )  # sr should be 44100
+
+             orig_audio = track_audio.copy()
+
+             if sr != 44100:
+                 raise ValueError("Sample rate should be 44100")
+             augmented_gain = None
+             print("Now De-limiting : ", track_name)
+
+             if args.loudnorm_input_lufs:  # If you want to use loud-normalized input
+                 track_lufs = meter.integrated_loudness(track_audio.T)
+                 augmented_gain = args.loudnorm_input_lufs - track_lufs
+                 track_audio = track_audio * db2linear(augmented_gain, eps=0.0)
+
+             track_audio = (
+                 torch.as_tensor(track_audio, dtype=torch.float32)
+                 .unsqueeze(0)
+                 .to(device)
+             )
+
+             estimates = separate_track_with_model(
+                 args, our_model, device, track_audio, track_name, meter, augmented_gain
+             )
+
+             if args.save_mixed_output:
+                 track_lufs = meter.integrated_loudness(orig_audio.T)
+                 augmented_gain = args.save_output_loudnorm - track_lufs
+                 orig_audio = orig_audio * db2linear(augmented_gain, eps=0.0)
+
+                 mixed_output = orig_audio * args.save_mixed_output + estimates * (
+                     1 - args.save_mixed_output
+                 )
+
+                 sf.write(
+                     f"{args.test_output_dir}/{track_name}/{track_name}_mixed.wav",
+                     mixed_output.T,
+                     args.data_params.sample_rate,
+                 )
+
+
+ if __name__ == "__main__":
+     main()
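--save_mixed_output is a plain linear wet/dry blend between the input and the de-limited estimate. A one-liner sketch (function name illustrative):

    def blend(original, delimited, ratio=0.5):
        # ratio weights the original; 1 - ratio weights the estimate
        return ratio * original + (1.0 - ratio) * delimited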
train_ddp.py ADDED
@@ -0,0 +1,56 @@
+ import sys
+ import time
+
+ import torch
+ import torch.multiprocessing as mp
+ import torch.distributed as dist
+ import wandb
+
+ from solver_ddp import Solver
+
+
+ def train(args):
+     solver = Solver()
+
+     ngpus_per_node = int(torch.cuda.device_count() / args.sys_params.n_nodes)
+     print(f"use {ngpus_per_node} gpu machine")
+     args.sys_params.world_size = ngpus_per_node * args.sys_params.n_nodes
+     mp.spawn(worker, nprocs=ngpus_per_node, args=(solver, ngpus_per_node, args))
+
+
+ def worker(gpu, solver, ngpus_per_node, args):
+     args.sys_params.rank = args.sys_params.rank * ngpus_per_node + gpu
+     dist.init_process_group(
+         backend="nccl",
+         world_size=args.sys_params.world_size,
+         init_method="env://",
+         rank=args.sys_params.rank,
+     )
+     args.gpu = gpu
+     args.ngpus_per_node = ngpus_per_node
+
+     solver.set_gpu(args)
+
+     start_epoch = solver.start_epoch
+
+     if args.dir_params.resume:
+         start_epoch = start_epoch + 1
+
+     for epoch in range(start_epoch, args.hyperparams.epochs + 1):
+
+         solver.train_sampler.set_epoch(epoch)
+         solver.train(args, epoch)
+
+         time.sleep(1)
+
+         solver.multi_validate(args, epoch)
+
+         if solver.stop:
+             print("Apply Early Stopping")
+             if args.wandb_params.use_wandb:
+                 wandb.finish()
+             sys.exit()
+
+     if args.wandb_params.use_wandb:
+         wandb.finish()
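The global rank in worker() is node_rank * ngpus_per_node + local_gpu, so ranks tile contiguously across nodes. A quick check of the layout:

    ngpus_per_node = 4
    for node_rank in range(2):
        print([node_rank * ngpus_per_node + gpu for gpu in range(ngpus_per_node)])
    # [0, 1, 2, 3]
    # [4, 5, 6, 7]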
utils/__init__.py ADDED
@@ -0,0 +1,19 @@
+ from .read_wave_utils import (
+     load_wav_arbitrary_position_mono,
+     load_wav_specific_position_mono,
+     load_wav_arbitrary_position_stereo,
+     load_wav_specific_position_stereo,
+ )
+ from .loudness_utils import (
+     linear2db,
+     db2linear,
+     normalize_mag_spec,
+     denormalize_mag_spec,
+     loudness_match_and_norm,
+     loudness_normal_match_and_norm,
+     loudness_normal_match_and_norm_output_louder_first,
+     loudnorm,
+ )
+ from .logging import save_img_and_npy, save_checkpoint, AverageMeter, EarlyStopping
+ from .lr_scheduler import CosineAnnealingWarmUpRestarts
+ from .train_utils import worker_init_fn, str2bool, get_config
utils/logging.py ADDED
@@ -0,0 +1,79 @@
+ import os
+
+ import torch
+ import numpy as np
+ import matplotlib
+
+ matplotlib.use("Agg")
+ import matplotlib.pyplot as plt
+
+
+ def save_img_and_npy(path, matrix):
+     plt.imsave(path + ".png", matrix, origin="lower")
+
+
+ def save_checkpoint(state, state_dict_only, path, target):
+     torch.save(state, os.path.join(path, target + ".chkpnt"))
+     if state_dict_only:
+         # save just the weights
+         torch.save(state["state_dict"], os.path.join(path, target + ".pth"))
+
+
+ class AverageMeter(object):
+     """Computes and stores the average and current value"""
+
+     def __init__(self):
+         self.reset()
+
+     def reset(self):
+         self.val = 0
+         self.avg = 0
+         self.sum = 0
+         self.count = 0
+
+     def update(self, val, n=1):
+         self.val = val
+         self.sum += val * n
+         self.count += n
+         self.avg = self.sum / self.count
+
+
+ class EarlyStopping(object):
+     def __init__(self, mode="min", min_delta=0, patience=10):
+         self.mode = mode
+         self.min_delta = min_delta
+         self.patience = patience
+         self.best = None
+         self.num_bad_epochs = 0
+         self.is_better = None
+         self._init_is_better(mode, min_delta)
+
+         if patience == 0:
+             self.is_better = lambda a, b: True
+
+     def step(self, metrics):
+         if self.best is None:
+             self.best = metrics
+             return False
+
+         if np.isnan(metrics):
+             return True
+
+         if self.is_better(metrics, self.best):
+             self.num_bad_epochs = 0
+             self.best = metrics
+         else:
+             self.num_bad_epochs += 1
+
+         if self.num_bad_epochs >= self.patience:
+             return True
+
+         return False
+
+     def _init_is_better(self, mode, min_delta):
+         if mode not in {"min", "max"}:
+             raise ValueError("mode " + mode + " is unknown!")
+         if mode == "min":
+             self.is_better = lambda a, best: a < best - min_delta
+         if mode == "max":
+             self.is_better = lambda a, best: a > best + min_delta
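Typical wiring for the EarlyStopping helper above: feed it the validation metric once per epoch and stop when step() returns True. A minimal sketch with made-up loss values:

    es = EarlyStopping(mode="min", patience=3)

    for epoch, valid_loss in enumerate([1.0, 0.8, 0.85, 0.9, 0.82]):
        if es.step(valid_loss):  # True after `patience` epochs without improvement
            print(f"stopping at epoch {epoch}")
            break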
utils/loudness_utils.py ADDED
@@ -0,0 +1,71 @@
+ import random
+
+ import numpy as np
+ import torch
+
+
+ def linear2db(x, eps=1e-5, scale=20):
+     return scale * np.log10(x + eps)
+
+
+ def db2linear(x, eps=1e-5, scale=20):
+     return 10 ** (x / scale) - eps
+
+
+ def normalize_mag_spec(S, min_level_db=-100.0):
+     return torch.clamp((S - min_level_db) / -min_level_db, min=0.0, max=1.0)
+
+
+ def denormalize_mag_spec(S, min_level_db=-100.0):
+     return torch.clamp(S, min=0.0, max=1.0) * -min_level_db + min_level_db
+
+
+ def loudness_match_and_norm(audio1, audio2, meter):
+     # match audio2's integrated loudness (LUFS) to audio1's
+     lufs_1 = meter.integrated_loudness(audio1)
+     lufs_2 = meter.integrated_loudness(audio2)
+
+     if np.isinf(lufs_1) or np.isinf(lufs_2):
+         # silence measures as -inf LUFS; leave both signals untouched
+         return audio1, audio2
+     else:
+         audio2 = audio2 * db2linear(lufs_1 - lufs_2)
+
+     return audio1, audio2
+
+
+ def loudness_normal_match_and_norm(audio1, audio2, meter):
+     lufs_1 = meter.integrated_loudness(audio1)
+     lufs_2 = meter.integrated_loudness(audio2)
+
+     if np.isinf(lufs_1) or np.isinf(lufs_2):
+         return audio1, audio2
+     else:
+         target_lufs = random.normalvariate(lufs_1, 6.0)
+         audio2 = audio2 * db2linear(target_lufs - lufs_2)
+
+     return audio1, audio2
+
+
+ def loudness_normal_match_and_norm_output_louder_first(audio1, audio2, meter):
+     lufs_1 = meter.integrated_loudness(audio1)
+     lufs_2 = meter.integrated_loudness(audio2)
+
+     if np.isinf(lufs_1) or np.isinf(lufs_2):
+         return audio1, audio2
+     else:
+         # sample a target ~2 LUFS below audio1, so audio1 ends up louder on average
+         target_lufs = random.normalvariate(lufs_1 - 2.0, 2.0)
+         audio2 = audio2 * db2linear(target_lufs - lufs_2)
+
+     return audio1, audio2
+
+
+ def loudnorm(audio, target_lufs, meter, eps=1e-5):
+     lufs = meter.integrated_loudness(audio)
+     if np.isinf(lufs):
+         return audio, 0.0
+     else:
+         adjusted_gain = target_lufs - lufs
+         audio = audio * db2linear(adjusted_gain, eps)
+
+     return audio, adjusted_gain
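These helpers expect a `meter` object exposing `integrated_loudness()`; a `pyloudnorm.Meter` fits that interface and is assumed (not shown in this commit) in the sketch below. The -14 LUFS target matches `target_loudnorm_lufs` in `weight/all.json`:

```python
import numpy as np
import pyloudnorm as pyln  # assumed meter implementation

from utils.loudness_utils import loudnorm

sr = 44100
audio = 0.1 * np.random.randn(sr * 4)  # placeholder 4-second mono signal
meter = pyln.Meter(sr)                 # ITU-R BS.1770 loudness meter

normalized, gain_db = loudnorm(audio, -14.0, meter)
print(f"applied gain: {gain_db:.2f} dB")
```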
utils/lr_scheduler.py ADDED
@@ -0,0 +1,80 @@
+ import math
+
+ from torch.optim.lr_scheduler import _LRScheduler
+
+
+ class CosineAnnealingWarmUpRestarts(_LRScheduler):
+     def __init__(
+         self, optimizer, T_0, T_mult=1, eta_max=0.1, T_up=0, gamma=1.0, last_epoch=-1
+     ):
+         if T_0 <= 0 or not isinstance(T_0, int):
+             raise ValueError("Expected positive integer T_0, but got {}".format(T_0))
+         if T_mult < 1 or not isinstance(T_mult, int):
+             raise ValueError("Expected integer T_mult >= 1, but got {}".format(T_mult))
+         if T_up < 0 or not isinstance(T_up, int):
+             raise ValueError("Expected non-negative integer T_up, but got {}".format(T_up))
+         self.T_0 = T_0
+         self.T_mult = T_mult
+         self.base_eta_max = eta_max
+         self.eta_max = eta_max
+         self.T_up = T_up
+         self.T_i = T_0
+         self.gamma = gamma
+         self.cycle = 0
+         self.T_cur = last_epoch
+         super(CosineAnnealingWarmUpRestarts, self).__init__(optimizer, last_epoch)
+
+     def get_lr(self):
+         if self.T_cur == -1:
+             return self.base_lrs
+         elif self.T_cur < self.T_up:
+             # linear warm-up from base_lr to eta_max over T_up steps
+             return [
+                 (self.eta_max - base_lr) * self.T_cur / self.T_up + base_lr
+                 for base_lr in self.base_lrs
+             ]
+         else:
+             # cosine annealing from eta_max back down to base_lr
+             return [
+                 base_lr
+                 + (self.eta_max - base_lr)
+                 * (
+                     1
+                     + math.cos(
+                         math.pi * (self.T_cur - self.T_up) / (self.T_i - self.T_up)
+                     )
+                 )
+                 / 2
+                 for base_lr in self.base_lrs
+             ]
+
+     def step(self, epoch=None):
+         if epoch is None:
+             epoch = self.last_epoch + 1
+             self.T_cur = self.T_cur + 1
+             if self.T_cur >= self.T_i:
+                 # restart: begin a new cycle, lengthened by T_mult
+                 self.cycle += 1
+                 self.T_cur = self.T_cur - self.T_i
+                 self.T_i = (self.T_i - self.T_up) * self.T_mult + self.T_up
+         else:
+             if epoch >= self.T_0:
+                 if self.T_mult == 1:
+                     self.T_cur = epoch % self.T_0
+                     self.cycle = epoch // self.T_0
+                 else:
+                     n = int(
+                         math.log(
+                             (epoch / self.T_0 * (self.T_mult - 1) + 1), self.T_mult
+                         )
+                     )
+                     self.cycle = n
+                     self.T_cur = epoch - self.T_0 * (self.T_mult**n - 1) / (
+                         self.T_mult - 1
+                     )
+                     self.T_i = self.T_0 * self.T_mult ** (n)
+             else:
+                 self.T_i = self.T_0
+                 self.T_cur = epoch
+
+         self.eta_max = self.base_eta_max * (self.gamma**self.cycle)
+         self.last_epoch = math.floor(epoch)
+         for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
+             param_group["lr"] = lr
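A usage sketch with made-up hyperparameters (the shipped `weight/all.json` config actually trains with `step_lr`, so this scheduler is optional here). Note that the optimizer's initial `lr` acts as the warm-up floor: the schedule rises to `eta_max` over `T_up` epochs, then anneals with cosine restarts:

```python
import torch

from utils.lr_scheduler import CosineAnnealingWarmUpRestarts

model = torch.nn.Linear(10, 10)  # placeholder model
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-7)  # small warm-up floor
scheduler = CosineAnnealingWarmUpRestarts(
    optimizer, T_0=40, T_mult=2, eta_max=3e-5, T_up=5, gamma=0.5
)

for epoch in range(200):
    # ... one epoch of training ...
    scheduler.step()
```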
utils/read_wave_utils.py ADDED
@@ -0,0 +1,109 @@
+ import random
+ import math
+
+ import numpy as np
+ import librosa
+ import torchaudio
+
+
+ def load_wav_arbitrary_position_mono(filename, sample_rate, seq_duration):
+     # mono; seq_duration in seconds
+     length = torchaudio.info(filename).num_frames
+
+     read_length = librosa.time_to_samples(seq_duration, sr=sample_rate)
+     if length > read_length:
+         random_start = random.randint(0, int(length - read_length - 1)) / sample_rate
+         X, sr = librosa.load(
+             filename, sr=None, offset=random_start, duration=seq_duration
+         )
+     else:
+         # file is shorter than the requested excerpt: load it all and zero-pad
+         random_start = 0
+         total_pad_length = read_length - length
+         X, sr = librosa.load(filename, sr=None, offset=0, duration=seq_duration)
+         pad_left = random.randint(0, total_pad_length)
+         X = np.pad(X, (pad_left, total_pad_length - pad_left))
+
+     return X
+
+
+ def load_wav_specific_position_mono(
+     filename, sample_rate, seq_duration, start_position
+ ):
+     # mono; seq_duration and start_position in seconds
+     length = torchaudio.info(filename).num_frames
+     read_length = librosa.time_to_samples(seq_duration, sr=sample_rate)
+
+     start_pos_sec = max(start_position, 0)  # if start_position is negative, start from 0
+     start_pos_sample = librosa.time_to_samples(start_pos_sec, sr=sample_rate)
+
+     if length <= start_pos_sample:  # if the start position exceeds the audio length, start from 0
+         start_pos_sec = 0
+         start_pos_sample = 0
+     X, sr = librosa.load(filename, sr=None, offset=start_pos_sec, duration=seq_duration)
+
+     if length < start_pos_sample + read_length:
+         X = np.pad(X, (0, (start_pos_sample + read_length) - length))
+
+     return X
+
+
+ # load a wav excerpt from an arbitrary position of a 16-bit stereo wav file
+ def load_wav_arbitrary_position_stereo(
+     filename, sample_rate, seq_duration, return_pos=False
+ ):
+     # stereo; seq_duration in seconds; assumes the file is longer than seq_duration
+     length = torchaudio.info(filename).num_frames
+     read_length = librosa.time_to_samples(seq_duration, sr=sample_rate)
+
+     random_start_sample = random.randint(
+         0, int(length - math.ceil(seq_duration * sample_rate) - 1)
+     )
+     random_start_sec = librosa.samples_to_time(random_start_sample, sr=sample_rate)
+     X, sr = librosa.load(
+         filename, sr=None, mono=False, offset=random_start_sec, duration=seq_duration
+     )
+
+     if length < random_start_sample + read_length:
+         X = np.pad(X, ((0, 0), (0, (random_start_sample + read_length) - length)))
+
+     if return_pos:
+         return X, random_start_sec
+     else:
+         return X
+
+
+ def load_wav_specific_position_stereo(
+     filename, sample_rate, seq_duration, start_position
+ ):
+     # stereo; seq_duration and start_position in seconds
+     length = torchaudio.info(filename).num_frames
+     read_length = librosa.time_to_samples(seq_duration, sr=sample_rate)
+
+     start_pos_sec = max(start_position, 0)  # if start_position is negative, start from 0
+     start_pos_sample = librosa.time_to_samples(start_pos_sec, sr=sample_rate)
+
+     if length <= start_pos_sample:  # if the start position exceeds the audio length, start from 0
+         start_pos_sec = 0
+         start_pos_sample = 0
+     X, sr = librosa.load(
+         filename, sr=None, mono=False, offset=start_pos_sec, duration=seq_duration
+     )
+
+     if length < start_pos_sample + read_length:
+         X = np.pad(X, ((0, 0), (0, (start_pos_sample + read_length) - length)))
+
+     return X
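A quick sketch of the random-excerpt loader; `mixture.wav` is a hypothetical path and should be a stereo file at a native 44.1 kHz that is longer than `seq_duration` (the loaders pass `sr=None` to librosa, so no resampling happens):

```python
from utils.read_wave_utils import load_wav_arbitrary_position_stereo

X = load_wav_arbitrary_position_stereo(
    "mixture.wav", sample_rate=44100, seq_duration=4.0
)
print(X.shape)  # roughly (2, 176400): channels x samples for 4 s at 44.1 kHz
```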
utils/train_utils.py ADDED
@@ -0,0 +1,27 @@
+ import argparse
+
+ import yaml
+ from dotmap import DotMap
+ import numpy as np
+
+
+ def worker_init_fn(worker_id):
+     # give each DataLoader worker a distinct, reproducible numpy seed
+     np.random.seed(np.random.get_state()[1][0] + worker_id)
+
+
+ def str2bool(v):
+     if v.lower() in ("yes", "true", "t", "y", "1"):
+         return True
+     elif v.lower() in ("no", "false", "f", "n", "0"):
+         return False
+     else:
+         raise argparse.ArgumentTypeError("Boolean value expected.")
+
+
+ def get_config(config_name="default"):
+     with open(f"./configs/{config_name}.yaml", "r") as f:
+         config = yaml.load(f, Loader=yaml.FullLoader)
+     config = DotMap(config)
+     return config
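A sketch of how these helpers are meant to be used; the config keys accessed at the end are illustrative and assume a `configs/default.yaml` exists with that structure:

```python
import argparse

from utils.train_utils import get_config, str2bool

parser = argparse.ArgumentParser()
parser.add_argument("--use_wandb", type=str2bool, default=False)
args = parser.parse_args(["--use_wandb", "yes"])  # -> args.use_wandb == True

config = get_config("default")  # loads ./configs/default.yaml into a DotMap
print(config.hyperparams.lr)    # attribute-style access (assumed key)
```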
weight/all.json ADDED
@@ -0,0 +1,957 @@
+ {
+     "args": {
+         "classifier_params": {
+             "chosen_source_mean": 0.7,
+             "chosen_source_std": 0.15,
+             "classifier_activation": "softmax",
+             "classifier_n_classes": 4,
+             "classifier_n_srcs": 4,
+             "freeze_when_mixit": true,
+             "melspec_power": 2.0,
+             "model_name": "hrnet_w18_small",
+             "n_mels": 128,
+             "other_source_mean": 0.3,
+             "other_source_std": 0.15,
+             "pretrained_model": false,
+             "use_one_source_prob": 0.2,
+             "use_stereo": true
+         },
+         "conv_tasnet_params": {
+             "bn_chan": 128,
+             "decoder_activation": "sigmoid",
+             "encoder_activation": "relu",
+             "hid_chan": 512,
+             "kernel_size": 128,
+             "mask_act": "relu",
+             "n_blocks": 5,
+             "n_filters": 512,
+             "n_repeats": 2,
+             "skip_chan": 128,
+             "stride": 64
+         },
+         "data_params": {
+             "custom_limiter_attack_range": null,
+             "custom_limiter_release_range": null,
+             "limitaug_custom_target_lufs": null,
+             "limitaug_custom_target_lufs_std": null,
+             "limitaug_method": "ozone",
+             "limitaug_mode": null,
+             "nb_channels": 2,
+             "nfft": 4096,
+             "nhop": 1024,
+             "random_mix": true,
+             "sample_rate": 44100,
+             "samples_per_track": 128,
+             "seq_dur": 4.0,
+             "singleset_num_frames": null,
+             "target_limitaug_custom_target_lufs": null,
+             "target_limitaug_custom_target_lufs_std": null,
+             "target_limitaug_mode": null,
+             "target_loudnorm_lufs": -14.0,
+             "use_fixed": 0.019
+         },
+         "dir_params": {
+             "continual_train": false,
+             "delimit_valid_L_root": null,
+             "delimit_valid_root": null,
+             "exp_name": "convtasnet_35",
+             "output_directory": "/data2/personal/jeon/delimit/results",
+             "ozone_root": "/data5/personal/jeon/delimit/data",
+             "pretrained_classifier": null,
+             "resume": null,
+             "root": "/data1/Music/musdb18hq"
+         },
+         "gpu": 0,
+         "hyperparams": {
+             "batch_size": 8,
+             "ema": false,
+             "epochs": 200,
+             "gradient_clip": 5.0,
+             "lr": 3e-05,
+             "lr_decay_gamma": 0.5,
+             "lr_decay_patience": 15,
+             "lr_scheduler": "step_lr",
+             "optimizer": "adamw",
+             "patience": 50,
+             "weight_decay": 0.01
+         },
+         "img_check": "/data2/personal/jeon/delimit/results/img_check/convtasnet_35",
+         "invest_unet_params": {
+             "bn_factor": 16,
+             "f_down_layers": null,
+             "first_conv_activation": "relu",
+             "input_channels": 4,
+             "internal_channels": 24,
+             "kernel_size_f": 3,
+             "kernel_size_t": 3,
+             "last_activation": "identity",
+             "min_bn_units": 16,
+             "n_blocks": 7,
+             "n_internal_layers": 5,
+             "t_down_layers": null,
+             "tfc_tdf_activation": "relu",
+             "tfc_tdf_bias": true,
+             "tif_init_mode": null
+         },
+         "model_loss_params": {
+             "architecture": "conv_tasnet_mask_on_output",
+             "efficient_mixit_threshold": null,
+             "train_loss_func": [
+                 "si_sdr"
+             ],
+             "train_loss_scales": [
+                 1.0
+             ],
+             "valid_loss_func": [
+                 "si_sdr"
+             ],
+             "valid_loss_scales": [
+                 1.0
+             ]
+         },
+         "ngpus_per_node": 1,
+         "output": "/data2/personal/jeon/delimit/results/checkpoint/convtasnet_35",
+         "resume": {},
+         "sample_rate": {},
+         "sys_params": {
+             "n_nodes": 1,
+             "nb_workers": 4,
+             "port": null,
+             "rank": 0,
+             "seed": 777,
+             "world_size": 1
+         },
+         "task_params": {
+             "bleeding_nsrcs": null,
+             "dataset": "delimit",
+             "target": "all",
+             "train": true
+         },
+         "umx_params": {
+             "activation": "relu",
+             "dropout_rate": 0.05,
+             "hidden_size": 512,
+             "instead_tanh_activation": "tanh",
+             "lstm_dropout_rate": 0.4,
+             "nb_layers": 3,
+             "normalization": "bn",
+             "umx_get_statistics": false
+         },
+         "wandb_params": {
+             "entity": "vinyne",
+             "project": "delimit",
+             "rerun_id": null,
+             "sweep": false,
+             "use_wandb": true
+         }
+     },
+     "best_epoch": 183,
+     "best_loss": -14.165373802185059,
+     "epochs_trained": 200,
+     "num_bad_epochs": 17,
+     "train_loss_history": [
+         -11.723381042480469,
+         -11.759103775024414,
+         -11.818404197692871,
+         -11.88597583770752,
+         -11.882278442382812,
+         -11.943178176879883,
+         -11.909675598144531,
+         -11.93053913116455,
+         -11.922198295593262,
+         -12.013456344604492,
+         -12.106053352355957,
+         -11.999975204467773,
+         -12.067265510559082,
+         -12.079473495483398,
+         -12.13272762298584,
+         -12.15418529510498,
+         -12.08314037322998,
+         -12.152527809143066,
+         -12.096565246582031,
+         -12.219636917114258,
+         -12.246475219726562,
+         -12.170637130737305,
+         -12.188806533813477,
+         -12.230484962463379,
+         -12.207123756408691,
+         -12.307502746582031,
+         -12.200200080871582,
+         -12.284586906433105,
+         -12.244038581848145,
+         -12.302275657653809,
+         -12.200104713439941,
+         -12.31570816040039,
+         -12.42324447631836,
+         -12.352653503417969,
+         -12.367401123046875,
+         -12.295838356018066,
+         -12.404874801635742,
+         -12.338440895080566,
+         -12.365501403808594,
+         -12.365768432617188,
+         -12.225799560546875,
+         -12.26883602142334,
+         -12.390016555786133,
+         -12.410661697387695,
+         -12.311858177185059,
+         -12.408061027526855,
+         -12.396013259887695,
+         -12.353321075439453,
+         -12.470121383666992,
+         -12.469389915466309,
+         -12.452675819396973,
+         -12.381932258605957,
+         -12.31003475189209,
+         -12.412126541137695,
+         -12.267746925354004,
+         -12.440984725952148,
+         -12.413816452026367,
+         -12.417757034301758,
+         -12.4945650100708,
+         -12.445524215698242,
+         -12.38110065460205,
+         -12.454893112182617,
+         -12.390727996826172,
+         -12.339771270751953,
+         -12.528243064880371,
+         -12.434144973754883,
+         -12.43438720703125,
+         -12.458473205566406,
+         -12.424423217773438,
+         -12.387894630432129,
+         -12.438997268676758,
+         -12.528799057006836,
+         -12.423232078552246,
+         -12.534538269042969,
+         -12.495400428771973,
+         -12.53675651550293,
+         -12.551910400390625,
+         -12.478575706481934,
+         -12.461804389953613,
+         -12.483702659606934,
+         -12.474960327148438,
+         -12.441666603088379,
+         -12.42241096496582,
+         -12.48852252960205,
+         -12.513558387756348,
+         -12.40845012664795,
+         -12.555559158325195,
+         -12.589385032653809,
+         -12.395785331726074,
+         -12.496671676635742,
+         -12.554829597473145,
+         -12.530548095703125,
+         -12.564457893371582,
+         -12.52737808227539,
+         -12.608246803283691,
+         -12.3996000289917,
+         -12.433905601501465,
+         -12.490935325622559,
+         -12.477506637573242,
+         -12.470728874206543,
+         -12.564470291137695,
+         -12.525967597961426,
+         -12.502660751342773,
+         -12.440997123718262,
+         -12.576118469238281,
+         -12.538352966308594,
+         -12.512738227844238,
+         -12.525115966796875,
+         -12.511483192443848,
+         -12.571795463562012,
+         -12.59391975402832,
+         -12.442131996154785,
+         -12.617898941040039,
+         -12.495210647583008,
+         -12.551814079284668,
+         -12.4913330078125,
+         -12.626816749572754,
+         -12.556028366088867,
+         -12.477901458740234,
+         -12.596776008605957,
+         -12.597326278686523,
+         -12.484386444091797,
+         -12.660898208618164,
+         -12.440162658691406,
+         -12.530372619628906,
+         -12.51207447052002,
+         -12.503606796264648,
+         -12.670214653015137,
+         -12.51667308807373,
+         -12.546160697937012,
+         -12.504158020019531,
+         -12.6427001953125,
+         -12.56100082397461,
+         -12.506058692932129,
+         -12.637288093566895,
+         -12.572591781616211,
+         -12.544734001159668,
+         -12.604019165039062,
+         -12.549866676330566,
+         -12.521714210510254,
+         -12.601127624511719,
+         -12.629931449890137,
+         -12.587185859680176,
+         -12.605366706848145,
+         -12.606413841247559,
+         -12.536269187927246,
+         -12.577346801757812,
+         -12.703147888183594,
+         -12.60477066040039,
+         -12.603355407714844,
+         -12.536528587341309,
+         -12.601842880249023,
+         -12.698568344116211,
+         -12.72192668914795,
+         -12.663148880004883,
+         -12.644909858703613,
+         -12.631479263305664,
+         -12.596253395080566,
+         -12.61674690246582,
+         -12.701379776000977,
+         -12.664311408996582,
+         -12.646204948425293,
+         -12.597058296203613,
+         -12.652384757995605,
+         -12.579480171203613,
+         -12.757433891296387,
+         -12.686827659606934,
+         -12.65634536743164,
+         -12.552176475524902,
+         -12.625761032104492,
+         -12.652499198913574,
+         -12.668974876403809,
+         -12.700301170349121,
+         -12.591926574707031,
+         -12.54333782196045,
+         -12.541864395141602,
+         -12.720565795898438,
+         -12.625009536743164,
+         -12.577120780944824,
+         -12.67569637298584,
+         -12.634958267211914,
+         -12.660367012023926,
+         -12.646204948425293,
+         -12.713308334350586,
+         -12.734916687011719,
+         -12.602835655212402,
+         -12.596168518066406,
+         -12.66109848022461,
+         -12.568808555603027,
+         -12.719843864440918,
+         -12.746356010437012,
+         -12.602999687194824,
+         -12.632689476013184,
+         -12.715725898742676,
+         -12.671126365661621,
+         -12.659911155700684,
+         -12.755860328674316,
+         -12.591080665588379,
+         -12.623464584350586,
+         -12.643362045288086
+     ],
+     "train_time_history": [
+         308.12283968925476,
+         308.12408661842346,
+         305.56318974494934,
+         305.6093053817749,
+         304.1926734447479,
+         304.2103099822998,
+         301.78035831451416,
+         301.7819468975067,
+         317.8168547153473,
+         317.818119764328,
+         314.8585801124573,
+         314.8601076602936,
+         311.61795926094055,
+         311.61953926086426,
+         316.2616910934448,
+         316.2639091014862,
+         312.59282636642456,
+         312.59408020973206,
+         314.6765525341034,
+         314.6778757572174,
+         314.4039900302887,
+         314.40531301498413,
+         313.9343922138214,
+         313.9356322288513,
+         315.1470823287964,
+         315.14854192733765,
+         317.65793561935425,
+         317.65903544425964,
+         316.41589403152466,
+         316.4171371459961,
+         316.253050327301,
+         316.2544617652893,
+         316.2039670944214,
+         316.20542550086975,
+         316.30707120895386,
+         316.30964159965515,
+         315.7812213897705,
+         315.7832131385803,
+         315.77191638946533,
+         315.7732570171356,
+         315.7776229381561,
+         315.77907848358154,
+         315.80343294143677,
+         315.8051166534424,
+         314.40133929252625,
+         314.403112411499,
+         314.32283997535706,
+         314.32424092292786,
+         314.90000677108765,
+         314.90242648124695,
+         313.8207128047943,
+         313.8227391242981,
+         313.86938881874084,
+         313.87079215049744,
+         316.9037547111511,
+         316.9056947231293,
+         317.4321286678314,
+         317.43361139297485,
+         316.41515493392944,
+         316.4182825088501,
+         315.69741559028625,
+         315.699245929718,
+         315.9285054206848,
+         315.930716753006,
+         314.25376319885254,
+         314.25567531585693,
+         312.997665643692,
+         313.0005877017975,
+         315.5962414741516,
+         315.5977747440338,
+         315.49425506591797,
+         315.4961242675781,
+         315.980491399765,
+         315.98283791542053,
+         315.5533638000488,
+         315.55492901802063,
+         313.9896593093872,
+         313.99131321907043,
+         314.3214478492737,
+         314.3232262134552,
+         314.6442220211029,
+         314.64620661735535,
+         315.69726514816284,
+         315.7001700401306,
+         314.78302001953125,
+         314.7847316265106,
+         313.14448523521423,
+         313.1465194225311,
+         311.8232834339142,
+         311.8251144886017,
+         318.88225960731506,
+         318.8843643665314,
+         319.20725083351135,
+         319.20886182785034,
+         317.81429648399353,
+         317.8159878253937,
+         320.23738193511963,
+         320.23904752731323,
+         315.8315763473511,
+         315.83344054222107,
+         317.32581615448,
+         317.3274848461151,
+         316.7596924304962,
+         316.7628848552704,
+         316.3167974948883,
+         316.3188827037811,
+         316.44567823410034,
+         316.44802141189575,
+         313.8653395175934,
+         313.8687484264374,
+         308.43933939933777,
+         308.44151163101196,
+         312.1857454776764,
+         312.18967509269714,
+         307.8407344818115,
+         307.84401679039,
+         307.48447585105896,
+         307.48623728752136,
+         310.300940990448,
+         310.3029022216797,
+         310.32225275039673,
+         310.3257050514221,
+         309.351779460907,
+         309.3539865016937,
+         309.4356527328491,
+         309.4380919933319,
+         312.63360381126404,
+         312.63535809516907,
+         311.7453818321228,
+         311.7476508617401,
+         311.3258364200592,
+         311.327698469162,
+         312.28111600875854,
+         312.2828998565674,
+         311.3383209705353,
+         311.34200048446655,
+         306.9764757156372,
+         306.9787657260895,
+         309.35506653785706,
+         309.3569576740265,
+         310.2506465911865,
+         310.2529339790344,
+         310.65880727767944,
+         310.66108298301697,
+         311.18562865257263,
+         311.1874952316284,
+         309.07765316963196,
+         309.07997822761536,
+         313.3008818626404,
+         313.3029179573059,
+         311.267498254776,
+         311.26989102363586,
+         310.62635374069214,
+         310.6306185722351,
+         308.1883268356323,
+         308.19112515449524,
+         310.65689158439636,
+         310.65896558761597,
+         308.98754620552063,
+         309.03386878967285,
+         309.21512937545776,
+         309.2185757160187,
+         309.93750405311584,
+         309.93965554237366,
+         310.2938587665558,
+         310.29592084884644,
+         308.24257493019104,
+         308.2463102340698,
+         310.6870594024658,
+         310.6905345916748,
+         310.7875945568085,
+         310.78995156288147,
+         310.9882712364197,
+         310.9906806945801,
+         310.95856285095215,
+         310.96066546440125,
+         312.4489221572876,
+         312.45125246047974,
+         312.24022579193115,
+         312.2863116264343,
+         309.68400406837463,
+         309.6862533092499,
+         309.64014887809753,
+         309.64232993125916,
+         309.9094281196594,
+         309.9119017124176,
+         309.40677762031555,
+         309.40893173217773,
+         309.1595506668091,
+         309.1617259979248,
+         308.4178020954132,
+         308.4198989868164,
+         308.5063133239746,
+         308.5085346698761,
+         307.5796904563904,
+         307.5972898006439,
+         309.66309905052185,
+         309.66530561447144,
+         312.70798993110657,
+         312.7102212905884,
+         310.2431013584137,
+         310.2453660964966,
+         312.2640459537506,
+         312.26635122299194,
+         311.27055287361145,
+         311.27321219444275,
+         312.58145689964294,
+         312.58376598358154,
+         313.1553518772125,
+         313.1574249267578,
+         308.4067575931549,
+         308.4089684486389,
+         311.0251498222351,
+         311.0274658203125,
+         308.0227520465851,
+         308.02498388290405,
+         308.0182030200958,
+         308.0204634666443,
+         308.63523149490356,
+         308.63751220703125,
+         308.53969383239746,
+         308.5420751571655,
+         306.51329946517944,
+         306.51555824279785,
+         309.59846591949463,
+         309.60128831863403,
+         305.3712034225464,
+         305.37409830093384,
+         305.43984270095825,
+         305.4421238899231,
+         309.3166663646698,
+         309.3195414543152,
+         308.8618497848511,
+         308.86409974098206,
+         304.8731882572174,
+         304.8755958080292,
+         306.6576888561249,
+         306.663143157959,
+         306.6716537475586,
+         306.6740062236786,
+         309.47339940071106,
+         309.47578954696655,
+         307.73386335372925,
+         307.7363700866699,
+         308.0688214302063,
+         308.07209277153015,
+         311.58968901634216,
+         311.6099576950073,
+         308.70460844039917,
+         308.70710158348083,
+         312.0563473701477,
+         312.05881452560425,
+         310.89456367492676,
+         310.9119510650635,
+         308.73097705841064,
+         308.73414373397827,
+         309.4255359172821,
+         309.42857813835144,
+         311.0751721858978,
+         311.07801842689514,
+         309.5860447883606,
+         309.5896680355072,
+         309.87396597862244,
+         309.8803391456604,
+         310.9183626174927,
+         310.92147397994995,
+         308.4321529865265,
+         308.4359757900238,
+         312.4424922466278,
+         312.44731879234314,
+         312.3443009853363,
+         312.3491401672363,
+         310.3139410018921,
+         310.3165555000305,
+         312.09410762786865,
+         312.09656262397766,
+         311.11144399642944,
+         311.1577796936035,
+         309.1589603424072,
+         309.16152119636536,
+         312.51157093048096,
+         312.51463317871094,
+         314.15198159217834,
+         314.15485286712646,
+         310.00070810317993,
+         310.0033264160156,
+         311.2290298938751,
+         311.23188829421997,
+         313.0510983467102,
+         313.05362153053284,
+         313.48791670799255,
+         313.4910161495209,
+         307.60272216796875,
+         307.6053590774536,
+         303.84622287750244,
+         303.8494029045105,
+         304.8547012805939,
+         304.85784125328064,
+         310.63141536712646,
+         310.63450264930725,
+         304.8634753227234,
+         304.8664004802704,
+         308.1505949497223,
+         308.15428018569946,
+         310.18936228752136,
+         310.1920323371887,
+         309.2550263404846,
+         309.2577428817749,
+         310.08596634864807,
+         310.08910751342773,
+         307.4643654823303,
+         307.4670605659485,
+         308.558221578598,
+         308.5638659000397,
+         309.7440264225006,
+         309.7467608451843,
+         308.2091956138611,
+         308.2125828266144,
+         307.0199763774872,
+         307.02332496643066,
+         306.3482081890106,
+         306.35128688812256,
+         307.3764581680298,
+         307.37923669815063,
+         311.61060428619385,
+         311.6135311126709,
+         306.8187861442566,
+         306.8240280151367,
+         305.19880175590515,
+         305.20313119888306,
+         309.252712726593,
+         309.256165266037,
+         310.80801463127136,
+         310.81236577033997,
+         309.1079206466675,
+         309.11073756217957,
+         310.6556165218353,
+         310.65838623046875,
+         310.94868993759155,
+         310.95155143737793,
+         308.4552607536316,
+         308.4580717086792,
+         308.2857587337494,
+         308.2886221408844,
+         306.4856150150299,
+         306.4887855052948,
+         306.8667871952057,
+         306.86966013908386,
+         306.1964519023895,
+         306.2005341053009,
+         308.2178611755371,
+         308.22126364707947,
+         305.94888377189636,
+         305.9523375034332,
+         307.48926973342896,
+         307.4920620918274,
+         307.60354018211365,
+         307.63674998283386,
+         307.2473645210266,
+         307.2501358985901,
+         308.16573452949524,
+         308.2115182876587,
+         307.30736780166626,
+         307.3109815120697,
+         307.2137475013733,
+         307.2178246974945,
+         308.5944905281067,
+         308.59843826293945,
+         307.2346291542053,
+         307.2382435798645,
+         308.417338848114,
+         308.4208617210388,
+         305.5816307067871,
+         305.5852439403534,
+         307.69459652900696,
+         307.6975119113922,
+         307.20833134651184,
+         307.212299823761,
+         305.9614431858063,
+         305.965185880661,
+         305.31594157218933,
+         305.3195445537567,
+         307.46696519851685,
+         307.47079825401306,
+         306.23966455459595,
+         306.2433180809021,
+         306.1235647201538,
+         306.1273248195648,
+         307.02436780929565,
+         307.02733421325684,
+         306.9687819480896,
+         306.97225856781006,
+         306.23205065727234,
+         306.2356073856354,
+         305.3567383289337,
+         305.36028504371643,
+         305.94446635246277,
+         305.9480822086334,
+         307.2553553581238
+     ],
+     "valid_loss_history": [
+         -12.743322372436523,
+         -12.724347114562988,
+         -12.86701488494873,
+         -12.694435119628906,
+         -12.706733703613281,
+         -13.048251152038574,
+         -12.943618774414062,
+         -13.120084762573242,
+         -13.121935844421387,
+         -13.146740913391113,
+         -13.197364807128906,
+         -13.224929809570312,
+         -13.255891799926758,
+         -13.311783790588379,
+         -13.386489868164062,
+         -13.390006065368652,
+         -13.45509147644043,
+         -13.444679260253906,
+         -13.456311225891113,
+         -13.36051082611084,
+         -13.478644371032715,
+         -13.503388404846191,
+         -13.540580749511719,
+         -13.579903602600098,
+         -13.551591873168945,
+         -13.638075828552246,
+         -13.617512702941895,
+         -13.64240550994873,
+         -13.618767738342285,
+         -13.65319538116455,
+         -13.601574897766113,
+         -13.693778038024902,
+         -13.658882141113281,
+         -13.649510383605957,
+         -13.477263450622559,
+         -13.643564224243164,
+         -13.732584953308105,
+         -13.643271446228027,
+         -13.655325889587402,
+         -13.71172046661377,
+         -13.564180374145508,
+         -13.708178520202637,
+         -13.688010215759277,
+         -13.711198806762695,
+         -13.612863540649414,
+         -13.702019691467285,
+         -13.704530715942383,
+         -13.716957092285156,
+         -13.76714038848877,
+         -13.719636917114258,
+         -13.738469123840332,
+         -13.759002685546875,
+         -13.721348762512207,
+         -13.727803230285645,
+         -13.768327713012695,
+         -13.73253345489502,
+         -13.75208568572998,
+         -13.754429817199707,
+         -13.76417064666748,
+         -13.805985450744629,
+         -13.762914657592773,
+         -13.75927448272705,
+         -13.781553268432617,
+         -13.744827270507812,
+         -13.805213928222656,
+         -13.792055130004883,
+         -13.736992835998535,
+         -13.804685592651367,
+         -13.802186012268066,
+         -13.812178611755371,
+         -13.781081199645996,
+         -13.836441993713379,
+         -13.787053108215332,
+         -13.824462890625,
+         -13.827963829040527,
+         -13.768393516540527,
+         -13.824796676635742,
+         -13.809252738952637,
+         -13.820283889770508,
+         -13.811989784240723,
+         -13.845786094665527,
+         -13.801295280456543,
+         -13.795866966247559,
+         -13.847658157348633,
+         -13.841630935668945,
+         -13.887687683105469,
+         -13.838217735290527,
+         -13.833791732788086,
+         -13.8090181350708,
+         -13.810338973999023,
+         -13.812939643859863,
+         -13.813563346862793,
+         -13.72245979309082,
+         -13.829062461853027,
+         -13.820122718811035,
+         -13.764768600463867,
+         -13.882962226867676,
+         -13.887824058532715,
+         -13.874728202819824,
+         -13.83934211730957,
+         -13.854304313659668,
+         -13.853861808776855,
+         -13.878510475158691,
+         -13.855673789978027,
+         -13.935111999511719,
+         -13.873315811157227,
+         -13.88434886932373,
+         -13.913508415222168,
+         -13.804875373840332,
+         -13.874313354492188,
+         -13.925950050354004,
+         -13.898317337036133,
+         -13.861913681030273,
+         -13.83596134185791,
+         -13.907777786254883,
+         -13.832358360290527,
+         -13.936162948608398,
+         -13.925071716308594,
+         -13.906752586364746,
+         -13.87073040008545,
+         -13.964620590209961,
+         -13.925311088562012,
+         -13.974698066711426,
+         -13.957905769348145,
+         -13.918564796447754,
+         -13.975790023803711,
+         -13.988444328308105,
+         -13.959516525268555,
+         -14.01569652557373,
+         -13.992425918579102,
+         -14.039790153503418,
+         -13.940314292907715,
+         -14.011497497558594,
+         -13.953152656555176,
+         -13.920698165893555,
+         -13.960227966308594,
+         -13.907439231872559,
+         -14.014067649841309,
+         -13.972914695739746,
+         -13.942621231079102,
+         -14.019667625427246,
+         -14.037107467651367,
+         -13.85366153717041,
+         -13.980110168457031,
+         -13.97785472869873,
+         -13.983843803405762,
+         -13.843756675720215,
+         -14.002585411071777,
+         -14.026784896850586,
+         -14.028115272521973,
+         -14.02059268951416,
+         -13.985837936401367,
+         -14.076154708862305,
+         -14.060620307922363,
+         -13.936518669128418,
+         -13.957221031188965,
+         -14.017061233520508,
+         -13.995661735534668,
+         -14.056286811828613,
+         -14.037705421447754,
+         -13.940332412719727,
+         -14.092416763305664,
+         -14.024917602539062,
+         -14.002346992492676,
+         -14.026989936828613,
+         -13.944084167480469,
+         -14.002883911132812,
+         -14.120462417602539,
+         -14.043062210083008,
+         -14.008293151855469,
+         -14.040563583374023,
+         -13.994155883789062,
+         -14.08944034576416,
+         -14.078422546386719,
+         -14.014589309692383,
+         -14.083242416381836,
+         -14.104707717895508,
+         -14.103189468383789,
+         -14.063937187194824,
+         -14.0596284866333,
+         -14.059121131896973,
+         -14.102814674377441,
+         -14.165373802185059,
+         -14.106118202209473,
+         -14.107162475585938,
+         -14.085371017456055,
+         -14.123793601989746,
+         -14.053537368774414,
+         -14.077792167663574,
+         -14.056371688842773,
+         -14.033655166625977,
+         -14.096640586853027,
+         -14.057114601135254,
+         -14.115262985229492,
+         -14.074142456054688,
+         -14.067980766296387,
+         -14.118453025817871,
+         -14.117535591125488,
+         -14.126029968261719,
+         -14.117874145507812
+     ]
+ }
weight/all.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:34f2ef4e5c32542060621f7ea9f7a06a2acf91be22825a38f9270077a7346679
+ size 9424379
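Note that `weight/all.pth` is stored via Git LFS, so the pointer above must be resolved (`git lfs pull`) before loading. Given `save_checkpoint` in `utils/logging.py`, the `.pth` file should contain only a `state_dict`; a sketch of consuming both artifacts follows (the model class itself lives elsewhere in the repo and is elided):

```python
import json

import torch

# training metadata written alongside the weights
with open("weight/all.json") as f:
    meta = json.load(f)
print(meta["best_epoch"], meta["best_loss"])  # 183, about -14.17 (SI-SDR loss)

# all.pth holds the bare state_dict (see save_checkpoint in utils/logging.py)
state_dict = torch.load("weight/all.pth", map_location="cpu")
# model.load_state_dict(state_dict)  # with the repo's de-limiter network
```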