Commit
·
3978e51
1
Parent(s):
c3eda24
Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +5 -0
- .gitignore +76 -0
- .gradio/certificate.pem +31 -0
- LICENSE +21 -0
- README.md +12 -1
- WebUi2.py +0 -0
- ckpts/inst_gabox.yaml +48 -0
- clean_model.py +157 -0
- configs/KimberleyJensen/config_vocals_mel_band_roformer_kj.yaml +72 -0
- configs/config_apollo.yaml +33 -0
- configs/config_dnr_bandit_bsrnn_multi_mus64.yaml +78 -0
- configs/config_dnr_bandit_v2_mus64.yaml +77 -0
- configs/config_drumsep.yaml +72 -0
- configs/config_htdemucs_6stems.yaml +127 -0
- configs/config_musdb18_bs_mamba2.yaml +58 -0
- configs/config_musdb18_bs_roformer.yaml +137 -0
- configs/config_musdb18_bs_roformer_with_lora.yaml +205 -0
- configs/config_musdb18_demucs3_mmi.yaml +72 -0
- configs/config_musdb18_htdemucs.yaml +119 -0
- configs/config_musdb18_mdx23c.yaml +182 -0
- configs/config_musdb18_mdx23c_stht.yaml +182 -0
- configs/config_musdb18_mel_band_roformer.yaml +76 -0
- configs/config_musdb18_mel_band_roformer_all_stems.yaml +97 -0
- configs/config_musdb18_scnet.yaml +83 -0
- configs/config_musdb18_scnet_large.yaml +83 -0
- configs/config_musdb18_segm_models.yaml +92 -0
- configs/config_musdb18_torchseg.yaml +92 -0
- configs/config_vocals_bandit_bsrnn_multi_mus64.yaml +73 -0
- configs/config_vocals_bs_mamba2.yaml +51 -0
- configs/config_vocals_bs_roformer.yaml +141 -0
- configs/config_vocals_htdemucs.yaml +123 -0
- configs/config_vocals_mdx23c.yaml +96 -0
- configs/config_vocals_mel_band_roformer.yaml +80 -0
- configs/config_vocals_scnet.yaml +79 -0
- configs/config_vocals_scnet_large.yaml +79 -0
- configs/config_vocals_scnet_unofficial.yaml +62 -0
- configs/config_vocals_segm_models.yaml +78 -0
- configs/config_vocals_swin_upernet.yaml +51 -0
- configs/config_vocals_torchseg.yaml +58 -0
- configs/viperx/model_bs_roformer_ep_317_sdr_12.9755.yaml +126 -0
- configs/viperx/model_bs_roformer_ep_937_sdr_10.5309.yaml +138 -0
- configs/viperx/model_mel_band_roformer_ep_3005_sdr_11.4360.yaml +65 -0
- cookies.txt +28 -0
- dataset.py +669 -0
- docs/LoRA.md +114 -0
- docs/augmentations.md +146 -0
- docs/bs_roformer_info.md +145 -0
- docs/changes.md +20 -0
- docs/dataset_types.md +75 -0
- docs/ensemble.md +30 -0
.gitattributes
CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
gui/Poppins[[:space:]]Bold[[:space:]]700.ttf filter=lfs diff=lfs merge=lfs -text
|
37 |
+
gui/Poppins[[:space:]]Regular[[:space:]]400.ttf filter=lfs diff=lfs merge=lfs -text
|
38 |
+
input/APT.[[:space:]][[:space:]]YOR[[:space:]]X[[:space:]]LOID[[:space:]][[:space:]]AMV[[:space:]]4K.mp3 filter=lfs diff=lfs merge=lfs -text
|
39 |
+
old_output/APT.[[:space:]][[:space:]]YOR[[:space:]]X[[:space:]]LOID[[:space:]][[:space:]]AMV[[:space:]]4K.mp3_Instrumental_Inst_GaboxV7_(by[[:space:]]Gabox)_old.wav filter=lfs diff=lfs merge=lfs -text
|
40 |
+
output/APT.[[:space:]][[:space:]]YOR[[:space:]]X[[:space:]]LOID[[:space:]][[:space:]]AMV[[:space:]]4K.mp3_Instrumental_Inst_GaboxV7_(by[[:space:]]Gabox).wav filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
__pycache__
|
2 |
+
.DS_Store
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
|
6 |
+
# Distribution / packaging
|
7 |
+
.Python
|
8 |
+
build/
|
9 |
+
develop-eggs/
|
10 |
+
dist/
|
11 |
+
downloads/
|
12 |
+
eggs/
|
13 |
+
.eggs/
|
14 |
+
lib/
|
15 |
+
Lib/site-packages/
|
16 |
+
lib64/
|
17 |
+
parts/
|
18 |
+
sdist/
|
19 |
+
var/
|
20 |
+
wheels/
|
21 |
+
share/python-wheels/
|
22 |
+
share/man/man1/
|
23 |
+
*.egg-info/
|
24 |
+
.installed.cfg
|
25 |
+
*.egg
|
26 |
+
MANIFEST
|
27 |
+
|
28 |
+
# PyInstaller
|
29 |
+
# Usually these files are written by a python script from a template
|
30 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
31 |
+
*.manifest
|
32 |
+
*.spec
|
33 |
+
|
34 |
+
# Installer logs
|
35 |
+
pip-log.txt
|
36 |
+
pip-delete-this-directory.txt
|
37 |
+
|
38 |
+
# Unit test / coverage reports
|
39 |
+
htmlcov/
|
40 |
+
.tox/
|
41 |
+
.nox/
|
42 |
+
.coverage
|
43 |
+
.coverage.*
|
44 |
+
.cache
|
45 |
+
nosetests.xml
|
46 |
+
coverage.xml
|
47 |
+
*.cover
|
48 |
+
*.py,cover
|
49 |
+
.hypothesis/
|
50 |
+
.pytest_cache/
|
51 |
+
cover/
|
52 |
+
|
53 |
+
# Jupyter Notebook
|
54 |
+
.ipynb_checkpoints
|
55 |
+
share/jupyter
|
56 |
+
etc/jupyter
|
57 |
+
|
58 |
+
# IPython
|
59 |
+
profile_default/
|
60 |
+
ipython_config.py
|
61 |
+
|
62 |
+
# Environments
|
63 |
+
.env
|
64 |
+
.venv
|
65 |
+
env/
|
66 |
+
venv/
|
67 |
+
ENV/
|
68 |
+
env.bak/
|
69 |
+
venv.bak/
|
70 |
+
pyvenv.cfg
|
71 |
+
Scripts/
|
72 |
+
|
73 |
+
*.code-workspace
|
74 |
+
|
75 |
+
results/
|
76 |
+
wandb/
|
.gradio/certificate.pem
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
-----BEGIN CERTIFICATE-----
|
2 |
+
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
|
3 |
+
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
|
4 |
+
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
|
5 |
+
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
|
6 |
+
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
|
7 |
+
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
|
8 |
+
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
|
9 |
+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
|
10 |
+
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
|
11 |
+
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
|
12 |
+
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
|
13 |
+
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
|
14 |
+
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
|
15 |
+
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
|
16 |
+
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
|
17 |
+
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
|
18 |
+
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
|
19 |
+
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
|
20 |
+
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
|
21 |
+
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
|
22 |
+
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
|
23 |
+
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
|
24 |
+
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
|
25 |
+
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
|
26 |
+
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
|
27 |
+
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
|
28 |
+
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
|
29 |
+
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
|
30 |
+
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
|
31 |
+
-----END CERTIFICATE-----
|
LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2024 Roman Solovyev (ZFTurbo)
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
README.md
CHANGED
@@ -1,3 +1,14 @@
|
|
1 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
# Gecekondu Dubbing Production Space
|
3 |
-
Bu Space, ses ayrıştırma ve dublaj işlemleri için profesyonel bir arayüz sunar.
|
|
|
1 |
|
2 |
+
---
|
3 |
+
title: Gecekondu Dubbing Production Studio
|
4 |
+
emoji: 🎙️
|
5 |
+
colorFrom: red
|
6 |
+
colorTo: yellow # gold yerine yellow kullanıldı
|
7 |
+
sdk: gradio
|
8 |
+
sdk_version: "4.44.1"
|
9 |
+
app_file: app.py
|
10 |
+
pinned: false
|
11 |
+
---
|
12 |
+
|
13 |
# Gecekondu Dubbing Production Space
|
14 |
+
Bu Space, ses ayrıştırma ve dublaj işlemleri için profesyonel bir arayüz sunar. Gecekondu ekibi tarafından geliştirilmiştir.
|
WebUi2.py
ADDED
The diff for this file is too large to render.
See raw diff
|
|
ckpts/inst_gabox.yaml
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
audio:
|
2 |
+
chunk_size: 352800
|
3 |
+
dim_f: 1024
|
4 |
+
dim_t: 1101
|
5 |
+
hop_length: 441
|
6 |
+
n_fft: 2048
|
7 |
+
num_channels: 2
|
8 |
+
sample_rate: 44100
|
9 |
+
min_mean_abs: 0.0
|
10 |
+
model:
|
11 |
+
dim: 384
|
12 |
+
depth: 6
|
13 |
+
stereo: true
|
14 |
+
num_stems: 1
|
15 |
+
time_transformer_depth: 1
|
16 |
+
freq_transformer_depth: 1
|
17 |
+
num_bands: 60
|
18 |
+
dim_head: 64
|
19 |
+
heads: 8
|
20 |
+
attn_dropout: 0
|
21 |
+
ff_dropout: 0
|
22 |
+
flash_attn: true
|
23 |
+
dim_freqs_in: 1025
|
24 |
+
sample_rate: 44100
|
25 |
+
stft_n_fft: 2048
|
26 |
+
stft_hop_length: 441
|
27 |
+
stft_win_length: 2048
|
28 |
+
stft_normalized: false
|
29 |
+
mask_estimator_depth: 2
|
30 |
+
multi_stft_resolution_loss_weight: 1.0
|
31 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
32 |
+
- 4096
|
33 |
+
- 2048
|
34 |
+
- 1024
|
35 |
+
- 512
|
36 |
+
- 256
|
37 |
+
multi_stft_hop_size: 147
|
38 |
+
multi_stft_normalized: false
|
39 |
+
training:
|
40 |
+
instruments:
|
41 |
+
- Instrumental
|
42 |
+
- Vocals
|
43 |
+
target_instrument: Instrumental
|
44 |
+
use_amp: true
|
45 |
+
inference:
|
46 |
+
batch_size: 2
|
47 |
+
dim_t: 1101
|
48 |
+
num_overlap: 2
|
clean_model.py
ADDED
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import glob
|
3 |
+
import subprocess
|
4 |
+
import time
|
5 |
+
import gc
|
6 |
+
import shutil
|
7 |
+
import sys
|
8 |
+
from datetime import datetime
|
9 |
+
import torch
|
10 |
+
import yaml
|
11 |
+
import gradio as gr
|
12 |
+
import threading
|
13 |
+
import random
|
14 |
+
import librosa
|
15 |
+
import soundfile as sf
|
16 |
+
import numpy as np
|
17 |
+
import requests
|
18 |
+
import json
|
19 |
+
import locale
|
20 |
+
import re
|
21 |
+
import psutil
|
22 |
+
import concurrent.futures
|
23 |
+
from tqdm import tqdm
|
24 |
+
from google.oauth2.credentials import Credentials
|
25 |
+
import tempfile
|
26 |
+
from urllib.parse import urlparse, quote
|
27 |
+
import gdown
|
28 |
+
|
29 |
+
import warnings
|
30 |
+
warnings.filterwarnings("ignore")
|
31 |
+
|
32 |
+
# Resolve all working paths relative to the directory this script lives in,
# so the tool works no matter where it is launched from.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))  # directory containing this file
INFERENCE_PATH = os.path.join(BASE_DIR, "inference.py")  # absolute path to inference.py
OUTPUT_DIR = os.path.join(BASE_DIR, "output")  # separation results go here
AUTO_ENSEMBLE_OUTPUT = os.path.join(BASE_DIR, "ensemble_output")  # ensemble results go here
def clean_model_name(model):
    """Clean and standardize a model display name for use in filenames.

    Known display names are mapped through an explicit table; unknown names
    fall back to a generic cleanup (strip parenthetical credits, turn dashes
    into underscores, drop any character that is not alphanumeric or '_').

    Fix: the original dict literal contained duplicate keys
    ('VOCALS-MelBand-Roformer (by Becruily)' appeared twice with different
    values, 'INST-MelBand-Roformer (by Becruily)' twice with the same value).
    Python silently keeps the last occurrence, so the duplicates are removed
    here with the last-wins values preserved — behavior is unchanged.

    :param model: model display name as shown in the UI
    :return: filename-safe model name (str)
    """
    model_name_mapping = {
        'VOCALS-InstVocHQ': 'InstVocHQ',
        'VOCALS-MelBand-Roformer (by KimberleyJSN)': 'KimberleyJSN',
        'VOCALS-BS-Roformer_1297 (by viperx)': 'VOCALS_BS_Roformer1297',
        'VOCALS-BS-Roformer_1296 (by viperx)': 'VOCALS-BS-Roformer_1296',
        'VOCALS-BS-RoformerLargev1 (by unwa)': 'UnwaLargeV1',
        'VOCALS-Mel-Roformer big beta 4 (by unwa)': 'UnwaBigBeta4',
        'VOCALS-Melband-Roformer BigBeta5e (by unwa)': 'UnwaBigBeta5e',
        'INST-Mel-Roformer v1 (by unwa)': 'UnwaInstV1',
        'INST-Mel-Roformer v2 (by unwa)': 'UnwaInstV2',
        'INST-VOC-Mel-Roformer a.k.a. duality (by unwa)': 'UnwaDualityV1',
        'INST-VOC-Mel-Roformer a.k.a. duality v2 (by unwa)': 'UnwaDualityV2',
        'KARAOKE-MelBand-Roformer (by aufr33 & viperx)': 'KaraokeMelBandRoformer',
        'VOCALS-VitLarge23 (by ZFTurbo)': 'VitLarge23',
        # last-wins value kept from the former duplicate entry
        'VOCALS-MelBand-Roformer (by Becruily)': 'Vocals-MelBand-Roformer-(by Becruily)',
        'INST-MelBand-Roformer (by Becruily)': 'BecruilyInst',
        'VOCALS-MelBand-Roformer Kim FT (by Unwa)': 'KimFT',
        'INST-MelBand-Roformer Kim FT (by Unwa)': 'KimFTInst',
        'OTHER-BS-Roformer_1053 (by viperx)': 'OtherViperx1053',
        'CROWD-REMOVAL-MelBand-Roformer (by aufr33)': 'CrowdRemovalRoformer',
        'CINEMATIC-BandIt_Plus (by kwatcharasupat)': 'CinematicBandItPlus',
        'DRUMSEP-MDX23C_DrumSep_6stem (by aufr33 & jarredou)': 'DrumSepMDX23C',
        '4STEMS-SCNet_MUSDB18 (by starrytong)': 'FourStemsSCNet',
        'DE-REVERB-MDX23C (by aufr33 & jarredou)': 'DeReverbMDX23C',
        'DENOISE-MelBand-Roformer-1 (by aufr33)': 'DenoiseMelBand1',
        'DENOISE-MelBand-Roformer-2 (by aufr33)': 'DenoiseMelBand2',
        '4STEMS-SCNet_XL_MUSDB18 (by ZFTurbo)': 'FourStemsSCNetXL',
        '4STEMS-SCNet_Large (by starrytong)': 'FourStemsSCNetLarge',
        '4STEMS-BS-Roformer_MUSDB18 (by ZFTurbo)': 'FourStemsBSRoformer',
        'DE-REVERB-MelBand-Roformer aggr./v2/19.1729 (by anvuew)': 'DeReverbMelBandAggr',
        'DE-REVERB-Echo-MelBand-Roformer (by Sucial)': 'DeReverbEchoMelBand',
        'bleed_suppressor_v1 (by unwa)': 'BleedSuppressorV1',
        'inst_v1e (by unwa)': 'InstV1E',
        'inst_gabox (by Gabox)': 'InstGabox',
        'inst_gaboxBV1 (by Gabox)': 'InstGaboxBV1',
        'inst_gaboxBV2 (by Gabox)': 'InstGaboxBV2',
        'inst_gaboxBFV1 (by Gabox)': 'InstGaboxBFV1',
        'inst_gaboxFV2 (by Gabox)': 'InstGaboxFV2',
        'inst_gaboxFV1 (by Gabox)': 'InstGaboxFV1',
        'dereverb_mel_band_roformer_less_aggressive_anvuew': 'DereverbMelBandRoformerLessAggressive',
        'dereverb_mel_band_roformer_anvuew': 'DereverbMelBandRoformer',
        'VOCALS-Male Female-BS-RoFormer Male Female Beta 7_2889 (by aufr33)': 'MaleFemale-BS-RoFormer-(by aufr33)',
        'VOCALS-MelBand-Roformer Kim FT 2 (by Unwa)': 'Vocals-MelBand-Roformer-KİM-FT-2(by Unwa)',
        'voc_gaboxMelRoformer (by Gabox)': 'voc_gaboxMelRoformer',
        'voc_gaboxBSroformer (by Gabox)': 'voc_gaboxBSroformer',
        'voc_gaboxMelRoformerFV1 (by Gabox)': 'voc_gaboxMelRoformerFV1',
        'voc_gaboxMelRoformerFV2 (by Gabox)': 'voc_gaboxMelRoformerFV2',
        'SYH99999/MelBandRoformerSYHFTB1(by Amane)': 'MelBandRoformerSYHFTB1',
        'inst_V5 (by Gabox)': 'INSTV5-(by Gabox)',
        'inst_Fv4Noise (by Gabox)': 'Inst_Fv4Noise-(by Gabox)',
        'Intrumental_Gabox (by Gabox)': 'Intrumental_Gabox-(by Gabox)',
        'inst_GaboxFv3 (by Gabox)': 'INST_GaboxFv3-(by Gabox)',
        'SYH99999/MelBandRoformerSYHFTB1_Model1 (by Amane)': 'MelBandRoformerSYHFTB1_model1',
        'SYH99999/MelBandRoformerSYHFTB1_Model2 (by Amane)': 'MelBandRoformerSYHFTB1_model2',
        'SYH99999/MelBandRoformerSYHFTB1_Model3 (by Amane)': 'MelBandRoformerSYHFTB1_model3',
        'VOCALS-MelBand-Roformer Kim FT 2 Blendless (by unwa)': 'VOCALS-MelBand-Roformer-Kim-FT-2-Blendless-(by unwa)',
        'inst_gaboxFV6 (by Gabox)': 'inst_gaboxFV6-(by Gabox)',
        'denoisedebleed (by Gabox)': 'denoisedebleed-(by Gabox)',
        'INSTV5N (by Gabox)': 'INSTV5N_(by Gabox)',
        'Voc_Fv3 (by Gabox)': 'Voc_Fv3_(by Gabox)',
        'MelBandRoformer4StemFTLarge (SYH99999)': 'MelBandRoformer4StemFTLarge_(SYH99999)',
        'dereverb_mel_band_roformer_mono (by anvuew)': 'dereverb_mel_band_roformer_mono_(by anvuew)',
        'INSTV6N (by Gabox)': 'INSTV6N_(by Gabox)',
        'KaraokeGabox': 'KaraokeGabox',
        'FullnessVocalModel (by Amane)': 'FullnessVocalModel',
        'Inst_GaboxV7 (by Gabox)': 'Inst_GaboxV7_(by Gabox)',
    }

    if model in model_name_mapping:
        return model_name_mapping[model]

    # Generic fallback for names not in the table.
    cleaned = re.sub(r'\s*\(.*?\)', '', model)  # remove parenthetical info
    cleaned = cleaned.replace('-', '_')
    cleaned = ''.join(char for char in cleaned if char.isalnum() or char == '_')

    return cleaned
def shorten_filename(filename, max_length=30):
    """Shorten a filename whose stem exceeds *max_length* characters.

    Filenames with a stem of at most *max_length* characters are returned
    untouched; longer ones are collapsed to the first 15 and last 10
    characters of the stem joined by "...", keeping the extension.

    NOTE(review): the truncated form uses fixed widths (15/10), so
    *max_length* only controls the threshold, not the result length —
    presumably intentional; confirm before changing.
    """
    stem, extension = os.path.splitext(filename)
    if len(stem) <= max_length:
        return filename
    return f"{stem[:15]}...{stem[-10:]}{extension}"
def clean_filename(filename):
    """Strip timestamp suffixes and stem-type markers from a filename.

    Returns a tuple ``(clean_base, detected_type, ext)`` where
    ``detected_type`` is the first stem keyword still present in the
    cleaned stem (or ``None``), and ``ext`` is the original extension.
    """
    # Trailing numeric suffixes produced by the pipeline, most specific first.
    suffix_patterns = (
        r'_\d{8}_\d{6}_\d{6}$',  # _20231215_123456_123456
        r'_\d{14}$',             # _20231215123456
        r'_\d{10}$',             # _1702658400
        r'_\d+$',                # any trailing number
    )

    stem, ext = os.path.splitext(filename)
    for suffix in suffix_patterns:
        stem = re.sub(suffix, '', stem)

    stem_keywords = ['vocals', 'instrumental', 'drum', 'bass', 'other', 'effects',
                     'speech', 'music', 'dry', 'male', 'female']

    # Drop explicit "_<type>" markers from the stem.
    for keyword in stem_keywords:
        stem = stem.replace(f'_{keyword}', '')

    # Detect whichever stem keyword (if any) still appears in the name.
    detected_type = None
    lowered = stem.lower()
    for keyword in stem_keywords:
        if keyword in lowered:
            detected_type = keyword
            break

    clean_base = stem.strip('_- ')
    return clean_base, detected_type, ext
configs/KimberleyJensen/config_vocals_mel_band_roformer_kj.yaml
ADDED
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
audio:
|
2 |
+
chunk_size: 352800
|
3 |
+
dim_f: 1024
|
4 |
+
dim_t: 256
|
5 |
+
hop_length: 441
|
6 |
+
n_fft: 2048
|
7 |
+
num_channels: 2
|
8 |
+
sample_rate: 44100
|
9 |
+
min_mean_abs: 0.000
|
10 |
+
|
11 |
+
model:
|
12 |
+
dim: 384
|
13 |
+
depth: 6
|
14 |
+
stereo: true
|
15 |
+
num_stems: 1
|
16 |
+
time_transformer_depth: 1
|
17 |
+
freq_transformer_depth: 1
|
18 |
+
num_bands: 60
|
19 |
+
dim_head: 64
|
20 |
+
heads: 8
|
21 |
+
attn_dropout: 0
|
22 |
+
ff_dropout: 0
|
23 |
+
flash_attn: True
|
24 |
+
dim_freqs_in: 1025
|
25 |
+
sample_rate: 44100 # needed for mel filter bank from librosa
|
26 |
+
stft_n_fft: 2048
|
27 |
+
stft_hop_length: 441
|
28 |
+
stft_win_length: 2048
|
29 |
+
stft_normalized: False
|
30 |
+
mask_estimator_depth: 2
|
31 |
+
multi_stft_resolution_loss_weight: 1.0
|
32 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
33 |
+
- 4096
|
34 |
+
- 2048
|
35 |
+
- 1024
|
36 |
+
- 512
|
37 |
+
- 256
|
38 |
+
multi_stft_hop_size: 147
|
39 |
+
multi_stft_normalized: False
|
40 |
+
|
41 |
+
training:
|
42 |
+
batch_size: 4
|
43 |
+
gradient_accumulation_steps: 1
|
44 |
+
grad_clip: 0
|
45 |
+
instruments:
|
46 |
+
- vocals
|
47 |
+
- other
|
48 |
+
lr: 1.0e-05
|
49 |
+
patience: 2
|
50 |
+
reduce_factor: 0.95
|
51 |
+
target_instrument: vocals
|
52 |
+
num_epochs: 1000
|
53 |
+
num_steps: 1000
|
54 |
+
augmentation: false # enable augmentations by audiomentations and pedalboard
|
55 |
+
augmentation_type: null
|
56 |
+
use_mp3_compress: false # Deprecated
|
57 |
+
augmentation_mix: false # Mix several stems of the same type with some probability
|
58 |
+
augmentation_loudness: false # randomly change loudness of each stem
|
59 |
+
augmentation_loudness_type: 1 # Type 1 or 2
|
60 |
+
augmentation_loudness_min: 0
|
61 |
+
augmentation_loudness_max: 0
|
62 |
+
q: 0.95
|
63 |
+
coarse_loss_clip: false
|
64 |
+
ema_momentum: 0.999
|
65 |
+
optimizer: adam
|
66 |
+
other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
|
67 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
68 |
+
|
69 |
+
inference:
|
70 |
+
batch_size: 4
|
71 |
+
dim_t: 256
|
72 |
+
num_overlap: 2
|
configs/config_apollo.yaml
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
audio:
|
2 |
+
chunk_size: 132300
|
3 |
+
num_channels: 2
|
4 |
+
sample_rate: 44100
|
5 |
+
min_mean_abs: 0.0
|
6 |
+
|
7 |
+
model:
|
8 |
+
sr: 44100
|
9 |
+
win: 20
|
10 |
+
feature_dim: 256
|
11 |
+
layer: 6
|
12 |
+
|
13 |
+
training:
|
14 |
+
instruments: ['restored', 'addition']
|
15 |
+
target_instrument: 'restored'
|
16 |
+
batch_size: 2
|
17 |
+
num_steps: 1000
|
18 |
+
num_epochs: 1000
|
19 |
+
optimizer: 'prodigy'
|
20 |
+
lr: 1.0
|
21 |
+
patience: 2
|
22 |
+
reduce_factor: 0.95
|
23 |
+
coarse_loss_clip: true
|
24 |
+
grad_clip: 0
|
25 |
+
q: 0.95
|
26 |
+
use_amp: true
|
27 |
+
|
28 |
+
augmentations:
|
29 |
+
enable: false # enable or disable all augmentations (to fast disable if needed)
|
30 |
+
|
31 |
+
inference:
|
32 |
+
batch_size: 4
|
33 |
+
num_overlap: 4
|
configs/config_dnr_bandit_bsrnn_multi_mus64.yaml
ADDED
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: "MultiMaskMultiSourceBandSplitRNN"
|
2 |
+
audio:
|
3 |
+
chunk_size: 264600
|
4 |
+
num_channels: 2
|
5 |
+
sample_rate: 44100
|
6 |
+
min_mean_abs: 0.001
|
7 |
+
|
8 |
+
model:
|
9 |
+
in_channel: 1
|
10 |
+
stems: ['speech', 'music', 'effects']
|
11 |
+
band_specs: "musical"
|
12 |
+
n_bands: 64
|
13 |
+
fs: 44100
|
14 |
+
require_no_overlap: false
|
15 |
+
require_no_gap: true
|
16 |
+
normalize_channel_independently: false
|
17 |
+
treat_channel_as_feature: true
|
18 |
+
n_sqm_modules: 8
|
19 |
+
emb_dim: 128
|
20 |
+
rnn_dim: 256
|
21 |
+
bidirectional: true
|
22 |
+
rnn_type: "GRU"
|
23 |
+
mlp_dim: 512
|
24 |
+
hidden_activation: "Tanh"
|
25 |
+
hidden_activation_kwargs: null
|
26 |
+
complex_mask: true
|
27 |
+
n_fft: 2048
|
28 |
+
win_length: 2048
|
29 |
+
hop_length: 512
|
30 |
+
window_fn: "hann_window"
|
31 |
+
wkwargs: null
|
32 |
+
power: null
|
33 |
+
center: true
|
34 |
+
normalized: true
|
35 |
+
pad_mode: "constant"
|
36 |
+
onesided: true
|
37 |
+
|
38 |
+
training:
|
39 |
+
batch_size: 4
|
40 |
+
gradient_accumulation_steps: 4
|
41 |
+
grad_clip: 0
|
42 |
+
instruments:
|
43 |
+
- speech
|
44 |
+
- music
|
45 |
+
- effects
|
46 |
+
lr: 9.0e-05
|
47 |
+
patience: 2
|
48 |
+
reduce_factor: 0.95
|
49 |
+
target_instrument: null
|
50 |
+
num_epochs: 1000
|
51 |
+
num_steps: 1000
|
52 |
+
q: 0.95
|
53 |
+
coarse_loss_clip: true
|
54 |
+
ema_momentum: 0.999
|
55 |
+
optimizer: adam
|
56 |
+
other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
|
57 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
58 |
+
|
59 |
+
augmentations:
|
60 |
+
enable: true # enable or disable all augmentations (to fast disable if needed)
|
61 |
+
loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
|
62 |
+
loudness_min: 0.5
|
63 |
+
loudness_max: 1.5
|
64 |
+
mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
|
65 |
+
mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
|
66 |
+
- 0.2
|
67 |
+
- 0.02
|
68 |
+
mixup_loudness_min: 0.5
|
69 |
+
mixup_loudness_max: 1.5
|
70 |
+
all:
|
71 |
+
channel_shuffle: 0.5 # Set 0 or lower to disable
|
72 |
+
random_inverse: 0.1 # inverse track (better lower probability)
|
73 |
+
random_polarity: 0.5 # polarity change (multiply waveform to -1)
|
74 |
+
|
75 |
+
inference:
|
76 |
+
batch_size: 1
|
77 |
+
dim_t: 256
|
78 |
+
num_overlap: 4
|
configs/config_dnr_bandit_v2_mus64.yaml
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
cls: Bandit
|
2 |
+
|
3 |
+
audio:
|
4 |
+
chunk_size: 384000
|
5 |
+
num_channels: 2
|
6 |
+
sample_rate: 48000
|
7 |
+
min_mean_abs: 0.000
|
8 |
+
|
9 |
+
kwargs:
|
10 |
+
in_channels: 1
|
11 |
+
stems: ['speech', 'music', 'sfx']
|
12 |
+
band_type: musical
|
13 |
+
n_bands: 64
|
14 |
+
normalize_channel_independently: false
|
15 |
+
treat_channel_as_feature: true
|
16 |
+
n_sqm_modules: 8
|
17 |
+
emb_dim: 128
|
18 |
+
rnn_dim: 256
|
19 |
+
bidirectional: true
|
20 |
+
rnn_type: "GRU"
|
21 |
+
mlp_dim: 512
|
22 |
+
hidden_activation: "Tanh"
|
23 |
+
hidden_activation_kwargs: null
|
24 |
+
complex_mask: true
|
25 |
+
use_freq_weights: true
|
26 |
+
n_fft: 2048
|
27 |
+
win_length: 2048
|
28 |
+
hop_length: 512
|
29 |
+
window_fn: "hann_window"
|
30 |
+
wkwargs: null
|
31 |
+
power: null
|
32 |
+
center: true
|
33 |
+
normalized: true
|
34 |
+
pad_mode: "reflect"
|
35 |
+
onesided: true
|
36 |
+
|
37 |
+
training:
|
38 |
+
batch_size: 4
|
39 |
+
gradient_accumulation_steps: 4
|
40 |
+
grad_clip: 0
|
41 |
+
instruments:
|
42 |
+
- speech
|
43 |
+
- music
|
44 |
+
- sfx
|
45 |
+
lr: 9.0e-05
|
46 |
+
patience: 2
|
47 |
+
reduce_factor: 0.95
|
48 |
+
target_instrument: null
|
49 |
+
num_epochs: 1000
|
50 |
+
num_steps: 1000
|
51 |
+
q: 0.95
|
52 |
+
coarse_loss_clip: true
|
53 |
+
ema_momentum: 0.999
|
54 |
+
optimizer: adam
|
55 |
+
other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
|
56 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
57 |
+
|
58 |
+
augmentations:
|
59 |
+
enable: true # enable or disable all augmentations (to fast disable if needed)
|
60 |
+
loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
|
61 |
+
loudness_min: 0.5
|
62 |
+
loudness_max: 1.5
|
63 |
+
mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
|
64 |
+
mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
|
65 |
+
- 0.2
|
66 |
+
- 0.02
|
67 |
+
mixup_loudness_min: 0.5
|
68 |
+
mixup_loudness_max: 1.5
|
69 |
+
all:
|
70 |
+
channel_shuffle: 0.5 # Set 0 or lower to disable
|
71 |
+
random_inverse: 0.1 # inverse track (better lower probability)
|
72 |
+
random_polarity: 0.5 # polarity change (multiply waveform to -1)
|
73 |
+
|
74 |
+
inference:
|
75 |
+
batch_size: 8
|
76 |
+
dim_t: 256
|
77 |
+
num_overlap: 4
|
configs/config_drumsep.yaml
ADDED
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
audio:
|
2 |
+
chunk_size: 1764000 # samplerate * segment
|
3 |
+
min_mean_abs: 0.000
|
4 |
+
hop_length: 1024
|
5 |
+
|
6 |
+
training:
|
7 |
+
batch_size: 8
|
8 |
+
gradient_accumulation_steps: 1
|
9 |
+
grad_clip: 0
|
10 |
+
segment: 40
|
11 |
+
shift: 1
|
12 |
+
samplerate: 44100
|
13 |
+
channels: 2
|
14 |
+
normalize: true
|
15 |
+
instruments: ['kick', 'snare', 'cymbals', 'toms']
|
16 |
+
target_instrument: null
|
17 |
+
num_epochs: 1000
|
18 |
+
num_steps: 1000
|
19 |
+
optimizer: adam
|
20 |
+
lr: 9.0e-05
|
21 |
+
patience: 2
|
22 |
+
reduce_factor: 0.95
|
23 |
+
q: 0.95
|
24 |
+
coarse_loss_clip: true
|
25 |
+
ema_momentum: 0.999
|
26 |
+
other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
|
27 |
+
use_amp: false # enable or disable usage of mixed precision (float16) - usually it must be true
|
28 |
+
|
29 |
+
augmentations:
|
30 |
+
enable: true # enable or disable all augmentations (to fast disable if needed)
|
31 |
+
loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
|
32 |
+
loudness_min: 0.5
|
33 |
+
loudness_max: 1.5
|
34 |
+
|
35 |
+
inference:
|
36 |
+
num_overlap: 4
|
37 |
+
batch_size: 8
|
38 |
+
|
39 |
+
model: hdemucs
|
40 |
+
|
41 |
+
hdemucs: # see demucs/hdemucs.py for a detailed description
|
42 |
+
channels: 48
|
43 |
+
channels_time: null
|
44 |
+
growth: 2
|
45 |
+
nfft: 4096
|
46 |
+
wiener_iters: 0
|
47 |
+
end_iters: 0
|
48 |
+
wiener_residual: False
|
49 |
+
cac: True
|
50 |
+
depth: 6
|
51 |
+
rewrite: True
|
52 |
+
hybrid: True
|
53 |
+
hybrid_old: False
|
54 |
+
multi_freqs: []
|
55 |
+
multi_freqs_depth: 3
|
56 |
+
freq_emb: 0.2
|
57 |
+
emb_scale: 10
|
58 |
+
emb_smooth: True
|
59 |
+
kernel_size: 8
|
60 |
+
stride: 4
|
61 |
+
time_stride: 2
|
62 |
+
context: 1
|
63 |
+
context_enc: 0
|
64 |
+
norm_starts: 4
|
65 |
+
norm_groups: 4
|
66 |
+
dconv_mode: 1
|
67 |
+
dconv_depth: 2
|
68 |
+
dconv_comp: 4
|
69 |
+
dconv_attn: 4
|
70 |
+
dconv_lstm: 4
|
71 |
+
dconv_init: 0.001
|
72 |
+
rescale: 0.1
|
configs/config_htdemucs_6stems.yaml
ADDED
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
audio:
|
2 |
+
chunk_size: 485100 # samplerate * segment
|
3 |
+
min_mean_abs: 0.001
|
4 |
+
hop_length: 1024
|
5 |
+
|
6 |
+
training:
|
7 |
+
batch_size: 8
|
8 |
+
gradient_accumulation_steps: 1
|
9 |
+
grad_clip: 0
|
10 |
+
segment: 11
|
11 |
+
shift: 1
|
12 |
+
samplerate: 44100
|
13 |
+
channels: 2
|
14 |
+
normalize: true
|
15 |
+
instruments: ['drums', 'bass', 'other', 'vocals', 'guitar', 'piano']
|
16 |
+
target_instrument: null
|
17 |
+
num_epochs: 1000
|
18 |
+
num_steps: 1000
|
19 |
+
optimizer: adam
|
20 |
+
lr: 9.0e-05
|
21 |
+
patience: 2
|
22 |
+
reduce_factor: 0.95
|
23 |
+
q: 0.95
|
24 |
+
coarse_loss_clip: true
|
25 |
+
ema_momentum: 0.999
|
26 |
+
other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
|
27 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
28 |
+
|
29 |
+
augmentations:
|
30 |
+
enable: true # enable or disable all augmentations (to fast disable if needed)
|
31 |
+
loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
|
32 |
+
loudness_min: 0.5
|
33 |
+
loudness_max: 1.5
|
34 |
+
mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
|
35 |
+
mixup_probs: [0.2, 0.02]
|
36 |
+
mixup_loudness_min: 0.5
|
37 |
+
mixup_loudness_max: 1.5
|
38 |
+
all:
|
39 |
+
channel_shuffle: 0.5 # Set 0 or lower to disable
|
40 |
+
random_inverse: 0.1 # inverse track (better lower probability)
|
41 |
+
random_polarity: 0.5 # polarity change (multiply waveform to -1)
|
42 |
+
|
43 |
+
inference:
|
44 |
+
num_overlap: 4
|
45 |
+
batch_size: 8
|
46 |
+
|
47 |
+
model: htdemucs
|
48 |
+
|
49 |
+
htdemucs: # see demucs/htdemucs.py for a detailed description
|
50 |
+
# Channels
|
51 |
+
channels: 48
|
52 |
+
channels_time:
|
53 |
+
growth: 2
|
54 |
+
# STFT
|
55 |
+
num_subbands: 1
|
56 |
+
nfft: 4096
|
57 |
+
wiener_iters: 0
|
58 |
+
end_iters: 0
|
59 |
+
wiener_residual: false
|
60 |
+
cac: true
|
61 |
+
# Main structure
|
62 |
+
depth: 4
|
63 |
+
rewrite: true
|
64 |
+
# Frequency Branch
|
65 |
+
multi_freqs: []
|
66 |
+
multi_freqs_depth: 3
|
67 |
+
freq_emb: 0.2
|
68 |
+
emb_scale: 10
|
69 |
+
emb_smooth: true
|
70 |
+
# Convolutions
|
71 |
+
kernel_size: 8
|
72 |
+
stride: 4
|
73 |
+
time_stride: 2
|
74 |
+
context: 1
|
75 |
+
context_enc: 0
|
76 |
+
# normalization
|
77 |
+
norm_starts: 4
|
78 |
+
norm_groups: 4
|
79 |
+
# DConv residual branch
|
80 |
+
dconv_mode: 3
|
81 |
+
dconv_depth: 2
|
82 |
+
dconv_comp: 8
|
83 |
+
dconv_init: 1e-3
|
84 |
+
# Before the Transformer
|
85 |
+
bottom_channels: 0
|
86 |
+
# CrossTransformer
|
87 |
+
# ------ Common to all
|
88 |
+
# Regular parameters
|
89 |
+
t_layers: 5
|
90 |
+
t_hidden_scale: 4.0
|
91 |
+
t_heads: 8
|
92 |
+
t_dropout: 0.0
|
93 |
+
t_layer_scale: True
|
94 |
+
t_gelu: True
|
95 |
+
# ------------- Positional Embedding
|
96 |
+
t_emb: sin
|
97 |
+
t_max_positions: 10000 # for the scaled embedding
|
98 |
+
t_max_period: 10000.0
|
99 |
+
t_weight_pos_embed: 1.0
|
100 |
+
t_cape_mean_normalize: True
|
101 |
+
t_cape_augment: True
|
102 |
+
t_cape_glob_loc_scale: [5000.0, 1.0, 1.4]
|
103 |
+
t_sin_random_shift: 0
|
104 |
+
# ------------- norm before a transformer encoder
|
105 |
+
t_norm_in: True
|
106 |
+
t_norm_in_group: False
|
107 |
+
# ------------- norm inside the encoder
|
108 |
+
t_group_norm: False
|
109 |
+
t_norm_first: True
|
110 |
+
t_norm_out: True
|
111 |
+
# ------------- optim
|
112 |
+
t_weight_decay: 0.0
|
113 |
+
t_lr:
|
114 |
+
# ------------- sparsity
|
115 |
+
t_sparse_self_attn: False
|
116 |
+
t_sparse_cross_attn: False
|
117 |
+
t_mask_type: diag
|
118 |
+
t_mask_random_seed: 42
|
119 |
+
t_sparse_attn_window: 400
|
120 |
+
t_global_window: 100
|
121 |
+
t_sparsity: 0.95
|
122 |
+
t_auto_sparsity: False
|
123 |
+
# Cross Encoder First (False)
|
124 |
+
t_cross_first: False
|
125 |
+
# Weight init
|
126 |
+
rescale: 0.1
|
127 |
+
|
configs/config_musdb18_bs_mamba2.yaml
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
audio:
|
2 |
+
chunk_size: 132300 # samplerate * segment
|
3 |
+
hop_length: 1024
|
4 |
+
min_mean_abs: 0.0
|
5 |
+
|
6 |
+
training:
|
7 |
+
batch_size: 8
|
8 |
+
gradient_accumulation_steps: 1
|
9 |
+
grad_clip: 0
|
10 |
+
segment: 11
|
11 |
+
shift: 1
|
12 |
+
samplerate: 44100
|
13 |
+
channels: 2
|
14 |
+
normalize: true
|
15 |
+
instruments: ['drums', 'bass', 'other', 'vocals']
|
16 |
+
target_instrument: null
|
17 |
+
num_epochs: 1000
|
18 |
+
num_steps: 1000
|
19 |
+
optimizer: prodigy
|
20 |
+
lr: 1.0
|
21 |
+
patience: 2
|
22 |
+
reduce_factor: 0.95
|
23 |
+
q: 0.95
|
24 |
+
coarse_loss_clip: true
|
25 |
+
ema_momentum: 0.999
|
26 |
+
read_metadata_procs: 8
|
27 |
+
other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
|
28 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
29 |
+
|
30 |
+
model:
|
31 |
+
sr: 44100
|
32 |
+
win: 2048
|
33 |
+
stride: 512
|
34 |
+
feature_dim: 128
|
35 |
+
num_repeat_mask: 8
|
36 |
+
num_repeat_map: 4
|
37 |
+
num_output: 4
|
38 |
+
|
39 |
+
augmentations:
|
40 |
+
enable: true # enable or disable all augmentations (to fast disable if needed)
|
41 |
+
loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
|
42 |
+
loudness_min: 0.5
|
43 |
+
loudness_max: 1.5
|
44 |
+
mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
|
45 |
+
mixup_probs:
|
46 |
+
!!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
|
47 |
+
- 0.2
|
48 |
+
- 0.02
|
49 |
+
mixup_loudness_min: 0.5
|
50 |
+
mixup_loudness_max: 1.5
|
51 |
+
all:
|
52 |
+
channel_shuffle: 0.5 # Set 0 or lower to disable
|
53 |
+
random_inverse: 0.1 # inverse track (better lower probability)
|
54 |
+
random_polarity: 0.5 # polarity change (multiply waveform to -1)
|
55 |
+
|
56 |
+
inference:
|
57 |
+
num_overlap: 2
|
58 |
+
batch_size: 8
|
configs/config_musdb18_bs_roformer.yaml
ADDED
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
audio:
|
2 |
+
chunk_size: 131584
|
3 |
+
dim_f: 1024
|
4 |
+
dim_t: 256
|
5 |
+
hop_length: 512
|
6 |
+
n_fft: 2048
|
7 |
+
num_channels: 2
|
8 |
+
sample_rate: 44100
|
9 |
+
min_mean_abs: 0.001
|
10 |
+
|
11 |
+
model:
|
12 |
+
dim: 192
|
13 |
+
depth: 6
|
14 |
+
stereo: true
|
15 |
+
num_stems: 1
|
16 |
+
time_transformer_depth: 1
|
17 |
+
freq_transformer_depth: 1
|
18 |
+
linear_transformer_depth: 0
|
19 |
+
freqs_per_bands: !!python/tuple
|
20 |
+
- 2
|
21 |
+
- 2
|
22 |
+
- 2
|
23 |
+
- 2
|
24 |
+
- 2
|
25 |
+
- 2
|
26 |
+
- 2
|
27 |
+
- 2
|
28 |
+
- 2
|
29 |
+
- 2
|
30 |
+
- 2
|
31 |
+
- 2
|
32 |
+
- 2
|
33 |
+
- 2
|
34 |
+
- 2
|
35 |
+
- 2
|
36 |
+
- 2
|
37 |
+
- 2
|
38 |
+
- 2
|
39 |
+
- 2
|
40 |
+
- 2
|
41 |
+
- 2
|
42 |
+
- 2
|
43 |
+
- 2
|
44 |
+
- 4
|
45 |
+
- 4
|
46 |
+
- 4
|
47 |
+
- 4
|
48 |
+
- 4
|
49 |
+
- 4
|
50 |
+
- 4
|
51 |
+
- 4
|
52 |
+
- 4
|
53 |
+
- 4
|
54 |
+
- 4
|
55 |
+
- 4
|
56 |
+
- 12
|
57 |
+
- 12
|
58 |
+
- 12
|
59 |
+
- 12
|
60 |
+
- 12
|
61 |
+
- 12
|
62 |
+
- 12
|
63 |
+
- 12
|
64 |
+
- 24
|
65 |
+
- 24
|
66 |
+
- 24
|
67 |
+
- 24
|
68 |
+
- 24
|
69 |
+
- 24
|
70 |
+
- 24
|
71 |
+
- 24
|
72 |
+
- 48
|
73 |
+
- 48
|
74 |
+
- 48
|
75 |
+
- 48
|
76 |
+
- 48
|
77 |
+
- 48
|
78 |
+
- 48
|
79 |
+
- 48
|
80 |
+
- 128
|
81 |
+
- 129
|
82 |
+
dim_head: 64
|
83 |
+
heads: 8
|
84 |
+
attn_dropout: 0.1
|
85 |
+
ff_dropout: 0.1
|
86 |
+
flash_attn: true
|
87 |
+
dim_freqs_in: 1025
|
88 |
+
stft_n_fft: 2048
|
89 |
+
stft_hop_length: 512
|
90 |
+
stft_win_length: 2048
|
91 |
+
stft_normalized: false
|
92 |
+
mask_estimator_depth: 2
|
93 |
+
multi_stft_resolution_loss_weight: 1.0
|
94 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
95 |
+
- 4096
|
96 |
+
- 2048
|
97 |
+
- 1024
|
98 |
+
- 512
|
99 |
+
- 256
|
100 |
+
multi_stft_hop_size: 147
|
101 |
+
multi_stft_normalized: False
|
102 |
+
mlp_expansion_factor: 4 # Probably too big (requires a lot of memory for weights)
|
103 |
+
use_torch_checkpoint: False # it allows to greatly reduce GPU memory consumption during training (not fully tested)
|
104 |
+
skip_connection: False # Enable skip connection between transformer blocks - can solve problem with gradients and probably faster training
|
105 |
+
|
106 |
+
training:
|
107 |
+
batch_size: 10
|
108 |
+
gradient_accumulation_steps: 1
|
109 |
+
grad_clip: 0
|
110 |
+
instruments:
|
111 |
+
- vocals
|
112 |
+
- bass
|
113 |
+
- drums
|
114 |
+
- other
|
115 |
+
lr: 5.0e-05
|
116 |
+
patience: 2
|
117 |
+
reduce_factor: 0.95
|
118 |
+
target_instrument: vocals
|
119 |
+
num_epochs: 1000
|
120 |
+
num_steps: 1000
|
121 |
+
q: 0.95
|
122 |
+
coarse_loss_clip: true
|
123 |
+
ema_momentum: 0.999
|
124 |
+
optimizer: adam
|
125 |
+
other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
|
126 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
127 |
+
|
128 |
+
augmentations:
|
129 |
+
enable: true # enable or disable all augmentations (to fast disable if needed)
|
130 |
+
loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
|
131 |
+
loudness_min: 0.5
|
132 |
+
loudness_max: 1.5
|
133 |
+
|
134 |
+
inference:
|
135 |
+
batch_size: 1
|
136 |
+
dim_t: 256
|
137 |
+
num_overlap: 4
|
configs/config_musdb18_bs_roformer_with_lora.yaml
ADDED
@@ -0,0 +1,205 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
audio:
|
2 |
+
chunk_size: 485100
|
3 |
+
dim_f: 1024
|
4 |
+
dim_t: 801 # don't work (use in model)
|
5 |
+
hop_length: 441 # don't work (use in model)
|
6 |
+
n_fft: 2048
|
7 |
+
num_channels: 2
|
8 |
+
sample_rate: 44100
|
9 |
+
min_mean_abs: 0.000
|
10 |
+
|
11 |
+
lora:
|
12 |
+
r: 8
|
13 |
+
lora_alpha: 16 # alpha / rank > 1
|
14 |
+
lora_dropout: 0.05
|
15 |
+
merge_weights: False
|
16 |
+
fan_in_fan_out: False
|
17 |
+
enable_lora: [True, False, True] # This for QKV
|
18 |
+
# enable_lora: [True] # For non-Roformers architectures
|
19 |
+
|
20 |
+
model:
|
21 |
+
dim: 384
|
22 |
+
depth: 8
|
23 |
+
stereo: true
|
24 |
+
num_stems: 4
|
25 |
+
time_transformer_depth: 1
|
26 |
+
freq_transformer_depth: 1
|
27 |
+
linear_transformer_depth: 0
|
28 |
+
freqs_per_bands: !!python/tuple
|
29 |
+
- 2
|
30 |
+
- 2
|
31 |
+
- 2
|
32 |
+
- 2
|
33 |
+
- 2
|
34 |
+
- 2
|
35 |
+
- 2
|
36 |
+
- 2
|
37 |
+
- 2
|
38 |
+
- 2
|
39 |
+
- 2
|
40 |
+
- 2
|
41 |
+
- 2
|
42 |
+
- 2
|
43 |
+
- 2
|
44 |
+
- 2
|
45 |
+
- 2
|
46 |
+
- 2
|
47 |
+
- 2
|
48 |
+
- 2
|
49 |
+
- 2
|
50 |
+
- 2
|
51 |
+
- 2
|
52 |
+
- 2
|
53 |
+
- 4
|
54 |
+
- 4
|
55 |
+
- 4
|
56 |
+
- 4
|
57 |
+
- 4
|
58 |
+
- 4
|
59 |
+
- 4
|
60 |
+
- 4
|
61 |
+
- 4
|
62 |
+
- 4
|
63 |
+
- 4
|
64 |
+
- 4
|
65 |
+
- 12
|
66 |
+
- 12
|
67 |
+
- 12
|
68 |
+
- 12
|
69 |
+
- 12
|
70 |
+
- 12
|
71 |
+
- 12
|
72 |
+
- 12
|
73 |
+
- 24
|
74 |
+
- 24
|
75 |
+
- 24
|
76 |
+
- 24
|
77 |
+
- 24
|
78 |
+
- 24
|
79 |
+
- 24
|
80 |
+
- 24
|
81 |
+
- 48
|
82 |
+
- 48
|
83 |
+
- 48
|
84 |
+
- 48
|
85 |
+
- 48
|
86 |
+
- 48
|
87 |
+
- 48
|
88 |
+
- 48
|
89 |
+
- 128
|
90 |
+
- 129
|
91 |
+
dim_head: 64
|
92 |
+
heads: 8
|
93 |
+
attn_dropout: 0.1
|
94 |
+
ff_dropout: 0.1
|
95 |
+
flash_attn: true
|
96 |
+
dim_freqs_in: 1025
|
97 |
+
stft_n_fft: 2048
|
98 |
+
stft_hop_length: 441
|
99 |
+
stft_win_length: 2048
|
100 |
+
stft_normalized: false
|
101 |
+
mask_estimator_depth: 2
|
102 |
+
multi_stft_resolution_loss_weight: 1.0
|
103 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
104 |
+
- 4096
|
105 |
+
- 2048
|
106 |
+
- 1024
|
107 |
+
- 512
|
108 |
+
- 256
|
109 |
+
multi_stft_hop_size: 147
|
110 |
+
multi_stft_normalized: False
|
111 |
+
mlp_expansion_factor: 2
|
112 |
+
use_torch_checkpoint: False # it allows to greatly reduce GPU memory consumption during training (not fully tested)
|
113 |
+
skip_connection: False # Enable skip connection between transformer blocks - can solve problem with gradients and probably faster training
|
114 |
+
|
115 |
+
training:
|
116 |
+
batch_size: 1
|
117 |
+
gradient_accumulation_steps: 1
|
118 |
+
grad_clip: 0
|
119 |
+
instruments: ['drums', 'bass', 'other', 'vocals']
|
120 |
+
patience: 3
|
121 |
+
reduce_factor: 0.95
|
122 |
+
target_instrument: null
|
123 |
+
num_epochs: 1000
|
124 |
+
num_steps: 1000
|
125 |
+
augmentation: false # enable augmentations by audiomentations and pedalboard
|
126 |
+
augmentation_type: simple1
|
127 |
+
use_mp3_compress: false # Deprecated
|
128 |
+
augmentation_mix: true # Mix several stems of the same type with some probability
|
129 |
+
augmentation_loudness: true # randomly change loudness of each stem
|
130 |
+
augmentation_loudness_type: 1 # Type 1 or 2
|
131 |
+
augmentation_loudness_min: 0.5
|
132 |
+
augmentation_loudness_max: 1.5
|
133 |
+
q: 0.95
|
134 |
+
coarse_loss_clip: true
|
135 |
+
ema_momentum: 0.999
|
136 |
+
# optimizer: prodigy
|
137 |
+
optimizer: adam
|
138 |
+
# lr: 1.0
|
139 |
+
lr: 1.0e-5
|
140 |
+
other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
|
141 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
142 |
+
|
143 |
+
augmentations:
|
144 |
+
enable: true # enable or disable all augmentations (to fast disable if needed)
|
145 |
+
loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
|
146 |
+
loudness_min: 0.5
|
147 |
+
loudness_max: 1.5
|
148 |
+
mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
|
149 |
+
mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
|
150 |
+
- 0.2
|
151 |
+
- 0.02
|
152 |
+
mixup_loudness_min: 0.5
|
153 |
+
mixup_loudness_max: 1.5
|
154 |
+
|
155 |
+
all:
|
156 |
+
channel_shuffle: 0.5 # Set 0 or lower to disable
|
157 |
+
random_inverse: 0.1 # inverse track (better lower probability)
|
158 |
+
random_polarity: 0.5 # polarity change (multiply waveform to -1)
|
159 |
+
|
160 |
+
vocals:
|
161 |
+
pitch_shift: 0.1
|
162 |
+
pitch_shift_min_semitones: -5
|
163 |
+
pitch_shift_max_semitones: 5
|
164 |
+
seven_band_parametric_eq: 0.1
|
165 |
+
seven_band_parametric_eq_min_gain_db: -9
|
166 |
+
seven_band_parametric_eq_max_gain_db: 9
|
167 |
+
tanh_distortion: 0.1
|
168 |
+
tanh_distortion_min: 0.1
|
169 |
+
tanh_distortion_max: 0.7
|
170 |
+
bass:
|
171 |
+
pitch_shift: 0.1
|
172 |
+
pitch_shift_min_semitones: -2
|
173 |
+
pitch_shift_max_semitones: 2
|
174 |
+
seven_band_parametric_eq: 0.1
|
175 |
+
seven_band_parametric_eq_min_gain_db: -3
|
176 |
+
seven_band_parametric_eq_max_gain_db: 6
|
177 |
+
tanh_distortion: 0.1
|
178 |
+
tanh_distortion_min: 0.1
|
179 |
+
tanh_distortion_max: 0.5
|
180 |
+
drums:
|
181 |
+
pitch_shift: 0.1
|
182 |
+
pitch_shift_min_semitones: -5
|
183 |
+
pitch_shift_max_semitones: 5
|
184 |
+
seven_band_parametric_eq: 0.1
|
185 |
+
seven_band_parametric_eq_min_gain_db: -9
|
186 |
+
seven_band_parametric_eq_max_gain_db: 9
|
187 |
+
tanh_distortion: 0.1
|
188 |
+
tanh_distortion_min: 0.1
|
189 |
+
tanh_distortion_max: 0.6
|
190 |
+
other:
|
191 |
+
pitch_shift: 0.1
|
192 |
+
pitch_shift_min_semitones: -4
|
193 |
+
pitch_shift_max_semitones: 4
|
194 |
+
gaussian_noise: 0.1
|
195 |
+
gaussian_noise_min_amplitude: 0.001
|
196 |
+
gaussian_noise_max_amplitude: 0.015
|
197 |
+
time_stretch: 0.1
|
198 |
+
time_stretch_min_rate: 0.8
|
199 |
+
time_stretch_max_rate: 1.25
|
200 |
+
|
201 |
+
|
202 |
+
inference:
|
203 |
+
batch_size: 2
|
204 |
+
dim_t: 1101
|
205 |
+
num_overlap: 2
|
configs/config_musdb18_demucs3_mmi.yaml
ADDED
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
audio:
|
2 |
+
chunk_size: 485100 # samplerate * segment
|
3 |
+
min_mean_abs: 0.000
|
4 |
+
hop_length: 1024
|
5 |
+
|
6 |
+
training:
|
7 |
+
batch_size: 8
|
8 |
+
gradient_accumulation_steps: 1
|
9 |
+
grad_clip: 0
|
10 |
+
segment: 11
|
11 |
+
shift: 1
|
12 |
+
samplerate: 44100
|
13 |
+
channels: 2
|
14 |
+
normalize: true
|
15 |
+
instruments: ['drums', 'bass', 'other', 'vocals']
|
16 |
+
target_instrument: null
|
17 |
+
num_epochs: 1000
|
18 |
+
num_steps: 1000
|
19 |
+
optimizer: adam
|
20 |
+
lr: 9.0e-05
|
21 |
+
patience: 2
|
22 |
+
reduce_factor: 0.95
|
23 |
+
q: 0.95
|
24 |
+
coarse_loss_clip: true
|
25 |
+
ema_momentum: 0.999
|
26 |
+
other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
|
27 |
+
use_amp: false # enable or disable usage of mixed precision (float16) - usually it must be true
|
28 |
+
|
29 |
+
augmentations:
|
30 |
+
enable: true # enable or disable all augmentations (to fast disable if needed)
|
31 |
+
loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
|
32 |
+
loudness_min: 0.5
|
33 |
+
loudness_max: 1.5
|
34 |
+
|
35 |
+
inference:
|
36 |
+
num_overlap: 4
|
37 |
+
batch_size: 8
|
38 |
+
|
39 |
+
model: hdemucs
|
40 |
+
|
41 |
+
hdemucs: # see demucs/hdemucs.py for a detailed description
|
42 |
+
channels: 48
|
43 |
+
channels_time: null
|
44 |
+
growth: 2
|
45 |
+
nfft: 4096
|
46 |
+
wiener_iters: 0
|
47 |
+
end_iters: 0
|
48 |
+
wiener_residual: False
|
49 |
+
cac: True
|
50 |
+
depth: 6
|
51 |
+
rewrite: True
|
52 |
+
hybrid: True
|
53 |
+
hybrid_old: False
|
54 |
+
multi_freqs: []
|
55 |
+
multi_freqs_depth: 3
|
56 |
+
freq_emb: 0.2
|
57 |
+
emb_scale: 10
|
58 |
+
emb_smooth: True
|
59 |
+
kernel_size: 8
|
60 |
+
stride: 4
|
61 |
+
time_stride: 2
|
62 |
+
context: 1
|
63 |
+
context_enc: 0
|
64 |
+
norm_starts: 4
|
65 |
+
norm_groups: 4
|
66 |
+
dconv_mode: 1
|
67 |
+
dconv_depth: 2
|
68 |
+
dconv_comp: 4
|
69 |
+
dconv_attn: 4
|
70 |
+
dconv_lstm: 4
|
71 |
+
dconv_init: 0.001
|
72 |
+
rescale: 0.1
|
configs/config_musdb18_htdemucs.yaml
ADDED
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
audio:
|
2 |
+
chunk_size: 485100 # samplerate * segment
|
3 |
+
min_mean_abs: 0.001
|
4 |
+
hop_length: 1024
|
5 |
+
|
6 |
+
training:
|
7 |
+
batch_size: 8
|
8 |
+
gradient_accumulation_steps: 1
|
9 |
+
grad_clip: 0
|
10 |
+
segment: 11
|
11 |
+
shift: 1
|
12 |
+
samplerate: 44100
|
13 |
+
channels: 2
|
14 |
+
normalize: true
|
15 |
+
instruments: ['drums', 'bass', 'other', 'vocals']
|
16 |
+
target_instrument: null
|
17 |
+
num_epochs: 1000
|
18 |
+
num_steps: 1000
|
19 |
+
optimizer: adam
|
20 |
+
lr: 9.0e-05
|
21 |
+
patience: 2
|
22 |
+
reduce_factor: 0.95
|
23 |
+
q: 0.95
|
24 |
+
coarse_loss_clip: true
|
25 |
+
ema_momentum: 0.999
|
26 |
+
other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
|
27 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
28 |
+
|
29 |
+
augmentations:
|
30 |
+
enable: true # enable or disable all augmentations (to fast disable if needed)
|
31 |
+
loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
|
32 |
+
loudness_min: 0.5
|
33 |
+
loudness_max: 1.5
|
34 |
+
|
35 |
+
inference:
|
36 |
+
num_overlap: 4
|
37 |
+
batch_size: 8
|
38 |
+
|
39 |
+
model: htdemucs
|
40 |
+
|
41 |
+
htdemucs: # see demucs/htdemucs.py for a detailed description
|
42 |
+
# Channels
|
43 |
+
channels: 48
|
44 |
+
channels_time:
|
45 |
+
growth: 2
|
46 |
+
# STFT
|
47 |
+
num_subbands: 1
|
48 |
+
nfft: 4096
|
49 |
+
wiener_iters: 0
|
50 |
+
end_iters: 0
|
51 |
+
wiener_residual: false
|
52 |
+
cac: true
|
53 |
+
# Main structure
|
54 |
+
depth: 4
|
55 |
+
rewrite: true
|
56 |
+
# Frequency Branch
|
57 |
+
multi_freqs: []
|
58 |
+
multi_freqs_depth: 3
|
59 |
+
freq_emb: 0.2
|
60 |
+
emb_scale: 10
|
61 |
+
emb_smooth: true
|
62 |
+
# Convolutions
|
63 |
+
kernel_size: 8
|
64 |
+
stride: 4
|
65 |
+
time_stride: 2
|
66 |
+
context: 1
|
67 |
+
context_enc: 0
|
68 |
+
# normalization
|
69 |
+
norm_starts: 4
|
70 |
+
norm_groups: 4
|
71 |
+
# DConv residual branch
|
72 |
+
dconv_mode: 3
|
73 |
+
dconv_depth: 2
|
74 |
+
dconv_comp: 8
|
75 |
+
dconv_init: 1e-3
|
76 |
+
# Before the Transformer
|
77 |
+
bottom_channels: 512
|
78 |
+
# CrossTransformer
|
79 |
+
# ------ Common to all
|
80 |
+
# Regular parameters
|
81 |
+
t_layers: 5
|
82 |
+
t_hidden_scale: 4.0
|
83 |
+
t_heads: 8
|
84 |
+
t_dropout: 0.0
|
85 |
+
t_layer_scale: True
|
86 |
+
t_gelu: True
|
87 |
+
# ------------- Positional Embedding
|
88 |
+
t_emb: sin
|
89 |
+
t_max_positions: 10000 # for the scaled embedding
|
90 |
+
t_max_period: 10000.0
|
91 |
+
t_weight_pos_embed: 1.0
|
92 |
+
t_cape_mean_normalize: True
|
93 |
+
t_cape_augment: True
|
94 |
+
t_cape_glob_loc_scale: [5000.0, 1.0, 1.4]
|
95 |
+
t_sin_random_shift: 0
|
96 |
+
# ------------- norm before a transformer encoder
|
97 |
+
t_norm_in: True
|
98 |
+
t_norm_in_group: False
|
99 |
+
# ------------- norm inside the encoder
|
100 |
+
t_group_norm: False
|
101 |
+
t_norm_first: True
|
102 |
+
t_norm_out: True
|
103 |
+
# ------------- optim
|
104 |
+
t_weight_decay: 0.0
|
105 |
+
t_lr:
|
106 |
+
# ------------- sparsity
|
107 |
+
t_sparse_self_attn: False
|
108 |
+
t_sparse_cross_attn: False
|
109 |
+
t_mask_type: diag
|
110 |
+
t_mask_random_seed: 42
|
111 |
+
t_sparse_attn_window: 400
|
112 |
+
t_global_window: 100
|
113 |
+
t_sparsity: 0.95
|
114 |
+
t_auto_sparsity: False
|
115 |
+
# Cross Encoder First (False)
|
116 |
+
t_cross_first: False
|
117 |
+
# Weight init
|
118 |
+
rescale: 0.1
|
119 |
+
|
configs/config_musdb18_mdx23c.yaml
ADDED
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
audio:
|
2 |
+
chunk_size: 261120
|
3 |
+
dim_f: 4096
|
4 |
+
dim_t: 256
|
5 |
+
hop_length: 1024
|
6 |
+
n_fft: 8192
|
7 |
+
num_channels: 2
|
8 |
+
sample_rate: 44100
|
9 |
+
min_mean_abs: 0.001
|
10 |
+
|
11 |
+
model:
|
12 |
+
act: gelu
|
13 |
+
bottleneck_factor: 4
|
14 |
+
growth: 128
|
15 |
+
norm: InstanceNorm
|
16 |
+
num_blocks_per_scale: 2
|
17 |
+
num_channels: 128
|
18 |
+
num_scales: 5
|
19 |
+
num_subbands: 4
|
20 |
+
scale:
|
21 |
+
- 2
|
22 |
+
- 2
|
23 |
+
|
24 |
+
training:
|
25 |
+
batch_size: 6
|
26 |
+
gradient_accumulation_steps: 1
|
27 |
+
grad_clip: 0
|
28 |
+
instruments:
|
29 |
+
- vocals
|
30 |
+
- bass
|
31 |
+
- drums
|
32 |
+
- other
|
33 |
+
lr: 9.0e-05
|
34 |
+
patience: 2
|
35 |
+
reduce_factor: 0.95
|
36 |
+
target_instrument: null
|
37 |
+
num_epochs: 1000
|
38 |
+
num_steps: 1000
|
39 |
+
q: 0.95
|
40 |
+
coarse_loss_clip: true
|
41 |
+
ema_momentum: 0.999
|
42 |
+
optimizer: adam
|
43 |
+
other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
|
44 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
45 |
+
|
46 |
+
augmentations:
|
47 |
+
enable: true # enable or disable all augmentations (to fast disable if needed)
|
48 |
+
loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
|
49 |
+
loudness_min: 0.5
|
50 |
+
loudness_max: 1.5
|
51 |
+
mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
|
52 |
+
mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
|
53 |
+
- 0.2
|
54 |
+
- 0.02
|
55 |
+
mixup_loudness_min: 0.5
|
56 |
+
mixup_loudness_max: 1.5
|
57 |
+
|
58 |
+
# apply mp3 compression to mixture only (emulate downloading mp3 from internet)
|
59 |
+
mp3_compression_on_mixture: 0.01
|
60 |
+
mp3_compression_on_mixture_bitrate_min: 32
|
61 |
+
mp3_compression_on_mixture_bitrate_max: 320
|
62 |
+
mp3_compression_on_mixture_backend: "lameenc"
|
63 |
+
|
64 |
+
all:
|
65 |
+
channel_shuffle: 0.5 # Set 0 or lower to disable
|
66 |
+
random_inverse: 0.1 # inverse track (better lower probability)
|
67 |
+
random_polarity: 0.5 # polarity change (multiply waveform to -1)
|
68 |
+
mp3_compression: 0.01
|
69 |
+
mp3_compression_min_bitrate: 32
|
70 |
+
mp3_compression_max_bitrate: 320
|
71 |
+
mp3_compression_backend: "lameenc"
|
72 |
+
|
73 |
+
# pedalboard reverb block
|
74 |
+
pedalboard_reverb: 0.01
|
75 |
+
pedalboard_reverb_room_size_min: 0.1
|
76 |
+
pedalboard_reverb_room_size_max: 0.9
|
77 |
+
pedalboard_reverb_damping_min: 0.1
|
78 |
+
pedalboard_reverb_damping_max: 0.9
|
79 |
+
pedalboard_reverb_wet_level_min: 0.1
|
80 |
+
pedalboard_reverb_wet_level_max: 0.9
|
81 |
+
pedalboard_reverb_dry_level_min: 0.1
|
82 |
+
pedalboard_reverb_dry_level_max: 0.9
|
83 |
+
pedalboard_reverb_width_min: 0.9
|
84 |
+
pedalboard_reverb_width_max: 1.0
|
85 |
+
|
86 |
+
# pedalboard chorus block
|
87 |
+
pedalboard_chorus: 0.01
|
88 |
+
pedalboard_chorus_rate_hz_min: 1.0
|
89 |
+
pedalboard_chorus_rate_hz_max: 7.0
|
90 |
+
pedalboard_chorus_depth_min: 0.25
|
91 |
+
pedalboard_chorus_depth_max: 0.95
|
92 |
+
pedalboard_chorus_centre_delay_ms_min: 3
|
93 |
+
pedalboard_chorus_centre_delay_ms_max: 10
|
94 |
+
pedalboard_chorus_feedback_min: 0.0
|
95 |
+
pedalboard_chorus_feedback_max: 0.5
|
96 |
+
pedalboard_chorus_mix_min: 0.1
|
97 |
+
pedalboard_chorus_mix_max: 0.9
|
98 |
+
|
99 |
+
# pedalboard phazer block
|
100 |
+
pedalboard_phazer: 0.01
|
101 |
+
pedalboard_phazer_rate_hz_min: 1.0
|
102 |
+
pedalboard_phazer_rate_hz_max: 10.0
|
103 |
+
pedalboard_phazer_depth_min: 0.25
|
104 |
+
pedalboard_phazer_depth_max: 0.95
|
105 |
+
pedalboard_phazer_centre_frequency_hz_min: 200
|
106 |
+
pedalboard_phazer_centre_frequency_hz_max: 12000
|
107 |
+
pedalboard_phazer_feedback_min: 0.0
|
108 |
+
pedalboard_phazer_feedback_max: 0.5
|
109 |
+
pedalboard_phazer_mix_min: 0.1
|
110 |
+
pedalboard_phazer_mix_max: 0.9
|
111 |
+
|
112 |
+
# pedalboard distortion block
|
113 |
+
pedalboard_distortion: 0.01
|
114 |
+
pedalboard_distortion_drive_db_min: 1.0
|
115 |
+
pedalboard_distortion_drive_db_max: 25.0
|
116 |
+
|
117 |
+
# pedalboard pitch shift block
|
118 |
+
pedalboard_pitch_shift: 0.01
|
119 |
+
pedalboard_pitch_shift_semitones_min: -7
|
120 |
+
pedalboard_pitch_shift_semitones_max: 7
|
121 |
+
|
122 |
+
# pedalboard resample block
|
123 |
+
pedalboard_resample: 0.01
|
124 |
+
pedalboard_resample_target_sample_rate_min: 4000
|
125 |
+
pedalboard_resample_target_sample_rate_max: 44100
|
126 |
+
|
127 |
+
# pedalboard bitcrash block
|
128 |
+
pedalboard_bitcrash: 0.01
|
129 |
+
pedalboard_bitcrash_bit_depth_min: 4
|
130 |
+
pedalboard_bitcrash_bit_depth_max: 16
|
131 |
+
|
132 |
+
# pedalboard mp3 compressor block
|
133 |
+
pedalboard_mp3_compressor: 0.01
|
134 |
+
pedalboard_mp3_compressor_pedalboard_mp3_compressor_min: 0
|
135 |
+
pedalboard_mp3_compressor_pedalboard_mp3_compressor_max: 9.999
|
136 |
+
|
137 |
+
vocals:
|
138 |
+
pitch_shift: 0.1
|
139 |
+
pitch_shift_min_semitones: -5
|
140 |
+
pitch_shift_max_semitones: 5
|
141 |
+
seven_band_parametric_eq: 0.25
|
142 |
+
seven_band_parametric_eq_min_gain_db: -9
|
143 |
+
seven_band_parametric_eq_max_gain_db: 9
|
144 |
+
tanh_distortion: 0.1
|
145 |
+
tanh_distortion_min: 0.1
|
146 |
+
tanh_distortion_max: 0.7
|
147 |
+
bass:
|
148 |
+
pitch_shift: 0.1
|
149 |
+
pitch_shift_min_semitones: -2
|
150 |
+
pitch_shift_max_semitones: 2
|
151 |
+
seven_band_parametric_eq: 0.25
|
152 |
+
seven_band_parametric_eq_min_gain_db: -3
|
153 |
+
seven_band_parametric_eq_max_gain_db: 6
|
154 |
+
tanh_distortion: 0.2
|
155 |
+
tanh_distortion_min: 0.1
|
156 |
+
tanh_distortion_max: 0.5
|
157 |
+
drums:
|
158 |
+
pitch_shift: 0.33
|
159 |
+
pitch_shift_min_semitones: -5
|
160 |
+
pitch_shift_max_semitones: 5
|
161 |
+
seven_band_parametric_eq: 0.25
|
162 |
+
seven_band_parametric_eq_min_gain_db: -9
|
163 |
+
seven_band_parametric_eq_max_gain_db: 9
|
164 |
+
tanh_distortion: 0.33
|
165 |
+
tanh_distortion_min: 0.1
|
166 |
+
tanh_distortion_max: 0.6
|
167 |
+
other:
|
168 |
+
pitch_shift: 0.1
|
169 |
+
pitch_shift_min_semitones: -4
|
170 |
+
pitch_shift_max_semitones: 4
|
171 |
+
gaussian_noise: 0.1
|
172 |
+
gaussian_noise_min_amplitude: 0.001
|
173 |
+
gaussian_noise_max_amplitude: 0.015
|
174 |
+
time_stretch: 0.01
|
175 |
+
time_stretch_min_rate: 0.8
|
176 |
+
time_stretch_max_rate: 1.25
|
177 |
+
|
178 |
+
|
179 |
+
inference:
|
180 |
+
batch_size: 1
|
181 |
+
dim_t: 256
|
182 |
+
num_overlap: 4
|
configs/config_musdb18_mdx23c_stht.yaml
ADDED
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
audio:
|
2 |
+
chunk_size: 261120
|
3 |
+
dim_f: 4096
|
4 |
+
dim_t: 256
|
5 |
+
hop_length: 1024
|
6 |
+
n_fft: 8192
|
7 |
+
num_channels: 2
|
8 |
+
sample_rate: 44100
|
9 |
+
min_mean_abs: 0.001
|
10 |
+
|
11 |
+
model:
|
12 |
+
act: gelu
|
13 |
+
bottleneck_factor: 4
|
14 |
+
growth: 128
|
15 |
+
norm: InstanceNorm
|
16 |
+
num_blocks_per_scale: 2
|
17 |
+
num_channels: 128
|
18 |
+
num_scales: 5
|
19 |
+
num_subbands: 4
|
20 |
+
scale:
|
21 |
+
- 2
|
22 |
+
- 2
|
23 |
+
|
24 |
+
training:
|
25 |
+
batch_size: 6
|
26 |
+
gradient_accumulation_steps: 1
|
27 |
+
grad_clip: 0
|
28 |
+
instruments:
|
29 |
+
- vocals
|
30 |
+
- bass
|
31 |
+
- drums
|
32 |
+
- other
|
33 |
+
lr: 9.0e-05
|
34 |
+
patience: 2
|
35 |
+
reduce_factor: 0.95
|
36 |
+
target_instrument: null
|
37 |
+
num_epochs: 1000
|
38 |
+
num_steps: 1000
|
39 |
+
q: 0.95
|
40 |
+
coarse_loss_clip: true
|
41 |
+
ema_momentum: 0.999
|
42 |
+
optimizer: adam
|
43 |
+
other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
|
44 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
45 |
+
|
46 |
+
augmentations:
|
47 |
+
enable: true # enable or disable all augmentations (to fast disable if needed)
|
48 |
+
loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
|
49 |
+
loudness_min: 0.5
|
50 |
+
loudness_max: 1.5
|
51 |
+
mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
|
52 |
+
mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
|
53 |
+
- 0.2
|
54 |
+
- 0.02
|
55 |
+
mixup_loudness_min: 0.5
|
56 |
+
mixup_loudness_max: 1.5
|
57 |
+
|
58 |
+
# apply mp3 compression to mixture only (emulate downloading mp3 from internet)
|
59 |
+
mp3_compression_on_mixture: 0.01
|
60 |
+
mp3_compression_on_mixture_bitrate_min: 32
|
61 |
+
mp3_compression_on_mixture_bitrate_max: 320
|
62 |
+
mp3_compression_on_mixture_backend: "lameenc"
|
63 |
+
|
64 |
+
all:
|
65 |
+
channel_shuffle: 0.5 # Set 0 or lower to disable
|
66 |
+
random_inverse: 0.1 # inverse track (better lower probability)
|
67 |
+
random_polarity: 0.5 # polarity change (multiply waveform to -1)
|
68 |
+
mp3_compression: 0.01
|
69 |
+
mp3_compression_min_bitrate: 32
|
70 |
+
mp3_compression_max_bitrate: 320
|
71 |
+
mp3_compression_backend: "lameenc"
|
72 |
+
|
73 |
+
# pedalboard reverb block
|
74 |
+
pedalboard_reverb: 0.01
|
75 |
+
pedalboard_reverb_room_size_min: 0.1
|
76 |
+
pedalboard_reverb_room_size_max: 0.9
|
77 |
+
pedalboard_reverb_damping_min: 0.1
|
78 |
+
pedalboard_reverb_damping_max: 0.9
|
79 |
+
pedalboard_reverb_wet_level_min: 0.1
|
80 |
+
pedalboard_reverb_wet_level_max: 0.9
|
81 |
+
pedalboard_reverb_dry_level_min: 0.1
|
82 |
+
pedalboard_reverb_dry_level_max: 0.9
|
83 |
+
pedalboard_reverb_width_min: 0.9
|
84 |
+
pedalboard_reverb_width_max: 1.0
|
85 |
+
|
86 |
+
# pedalboard chorus block
|
87 |
+
pedalboard_chorus: 0.01
|
88 |
+
pedalboard_chorus_rate_hz_min: 1.0
|
89 |
+
pedalboard_chorus_rate_hz_max: 7.0
|
90 |
+
pedalboard_chorus_depth_min: 0.25
|
91 |
+
pedalboard_chorus_depth_max: 0.95
|
92 |
+
pedalboard_chorus_centre_delay_ms_min: 3
|
93 |
+
pedalboard_chorus_centre_delay_ms_max: 10
|
94 |
+
pedalboard_chorus_feedback_min: 0.0
|
95 |
+
pedalboard_chorus_feedback_max: 0.5
|
96 |
+
pedalboard_chorus_mix_min: 0.1
|
97 |
+
pedalboard_chorus_mix_max: 0.9
|
98 |
+
|
99 |
+
# pedalboard phazer block
|
100 |
+
pedalboard_phazer: 0.01
|
101 |
+
pedalboard_phazer_rate_hz_min: 1.0
|
102 |
+
pedalboard_phazer_rate_hz_max: 10.0
|
103 |
+
pedalboard_phazer_depth_min: 0.25
|
104 |
+
pedalboard_phazer_depth_max: 0.95
|
105 |
+
pedalboard_phazer_centre_frequency_hz_min: 200
|
106 |
+
pedalboard_phazer_centre_frequency_hz_max: 12000
|
107 |
+
pedalboard_phazer_feedback_min: 0.0
|
108 |
+
pedalboard_phazer_feedback_max: 0.5
|
109 |
+
pedalboard_phazer_mix_min: 0.1
|
110 |
+
pedalboard_phazer_mix_max: 0.9
|
111 |
+
|
112 |
+
# pedalboard distortion block
|
113 |
+
pedalboard_distortion: 0.01
|
114 |
+
pedalboard_distortion_drive_db_min: 1.0
|
115 |
+
pedalboard_distortion_drive_db_max: 25.0
|
116 |
+
|
117 |
+
# pedalboard pitch shift block
|
118 |
+
pedalboard_pitch_shift: 0.01
|
119 |
+
pedalboard_pitch_shift_semitones_min: -7
|
120 |
+
pedalboard_pitch_shift_semitones_max: 7
|
121 |
+
|
122 |
+
# pedalboard resample block
|
123 |
+
pedalboard_resample: 0.01
|
124 |
+
pedalboard_resample_target_sample_rate_min: 4000
|
125 |
+
pedalboard_resample_target_sample_rate_max: 44100
|
126 |
+
|
127 |
+
# pedalboard bitcrash block
|
128 |
+
pedalboard_bitcrash: 0.01
|
129 |
+
pedalboard_bitcrash_bit_depth_min: 4
|
130 |
+
pedalboard_bitcrash_bit_depth_max: 16
|
131 |
+
|
132 |
+
# pedalboard mp3 compressor block
|
133 |
+
pedalboard_mp3_compressor: 0.01
|
134 |
+
pedalboard_mp3_compressor_pedalboard_mp3_compressor_min: 0
|
135 |
+
pedalboard_mp3_compressor_pedalboard_mp3_compressor_max: 9.999
|
136 |
+
|
137 |
+
vocals:
|
138 |
+
pitch_shift: 0.1
|
139 |
+
pitch_shift_min_semitones: -5
|
140 |
+
pitch_shift_max_semitones: 5
|
141 |
+
seven_band_parametric_eq: 0.25
|
142 |
+
seven_band_parametric_eq_min_gain_db: -9
|
143 |
+
seven_band_parametric_eq_max_gain_db: 9
|
144 |
+
tanh_distortion: 0.1
|
145 |
+
tanh_distortion_min: 0.1
|
146 |
+
tanh_distortion_max: 0.7
|
147 |
+
bass:
|
148 |
+
pitch_shift: 0.1
|
149 |
+
pitch_shift_min_semitones: -2
|
150 |
+
pitch_shift_max_semitones: 2
|
151 |
+
seven_band_parametric_eq: 0.25
|
152 |
+
seven_band_parametric_eq_min_gain_db: -3
|
153 |
+
seven_band_parametric_eq_max_gain_db: 6
|
154 |
+
tanh_distortion: 0.2
|
155 |
+
tanh_distortion_min: 0.1
|
156 |
+
tanh_distortion_max: 0.5
|
157 |
+
drums:
|
158 |
+
pitch_shift: 0.33
|
159 |
+
pitch_shift_min_semitones: -5
|
160 |
+
pitch_shift_max_semitones: 5
|
161 |
+
seven_band_parametric_eq: 0.25
|
162 |
+
seven_band_parametric_eq_min_gain_db: -9
|
163 |
+
seven_band_parametric_eq_max_gain_db: 9
|
164 |
+
tanh_distortion: 0.33
|
165 |
+
tanh_distortion_min: 0.1
|
166 |
+
tanh_distortion_max: 0.6
|
167 |
+
other:
|
168 |
+
pitch_shift: 0.1
|
169 |
+
pitch_shift_min_semitones: -4
|
170 |
+
pitch_shift_max_semitones: 4
|
171 |
+
gaussian_noise: 0.1
|
172 |
+
gaussian_noise_min_amplitude: 0.001
|
173 |
+
gaussian_noise_max_amplitude: 0.015
|
174 |
+
time_stretch: 0.01
|
175 |
+
time_stretch_min_rate: 0.8
|
176 |
+
time_stretch_max_rate: 1.25
|
177 |
+
|
178 |
+
|
179 |
+
inference:
|
180 |
+
batch_size: 1
|
181 |
+
dim_t: 256
|
182 |
+
num_overlap: 4
|
configs/config_musdb18_mel_band_roformer.yaml
ADDED
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
audio:
|
2 |
+
chunk_size: 131584
|
3 |
+
dim_f: 1024
|
4 |
+
dim_t: 256
|
5 |
+
hop_length: 512
|
6 |
+
n_fft: 2048
|
7 |
+
num_channels: 2
|
8 |
+
sample_rate: 44100
|
9 |
+
min_mean_abs: 0.001
|
10 |
+
|
11 |
+
model:
|
12 |
+
dim: 192
|
13 |
+
depth: 8
|
14 |
+
stereo: true
|
15 |
+
num_stems: 1
|
16 |
+
time_transformer_depth: 1
|
17 |
+
freq_transformer_depth: 1
|
18 |
+
linear_transformer_depth: 0
|
19 |
+
num_bands: 60
|
20 |
+
dim_head: 64
|
21 |
+
heads: 8
|
22 |
+
attn_dropout: 0.1
|
23 |
+
ff_dropout: 0.1
|
24 |
+
flash_attn: True
|
25 |
+
dim_freqs_in: 1025
|
26 |
+
sample_rate: 44100 # needed for mel filter bank from librosa
|
27 |
+
stft_n_fft: 2048
|
28 |
+
stft_hop_length: 512
|
29 |
+
stft_win_length: 2048
|
30 |
+
stft_normalized: False
|
31 |
+
mask_estimator_depth: 2
|
32 |
+
multi_stft_resolution_loss_weight: 1.0
|
33 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
34 |
+
- 4096
|
35 |
+
- 2048
|
36 |
+
- 1024
|
37 |
+
- 512
|
38 |
+
- 256
|
39 |
+
multi_stft_hop_size: 147
|
40 |
+
multi_stft_normalized: False
|
41 |
+
mlp_expansion_factor: 4 # Probably too big (requires a lot of memory for weights)
|
42 |
+
use_torch_checkpoint: False # it allows to greatly reduce GPU memory consumption during training (not fully tested)
|
43 |
+
skip_connection: False # Enable skip connection between transformer blocks - can solve problem with gradients and probably faster training
|
44 |
+
|
45 |
+
training:
|
46 |
+
batch_size: 7
|
47 |
+
gradient_accumulation_steps: 1
|
48 |
+
grad_clip: 0
|
49 |
+
instruments:
|
50 |
+
- vocals
|
51 |
+
- bass
|
52 |
+
- drums
|
53 |
+
- other
|
54 |
+
lr: 5.0e-05
|
55 |
+
patience: 2
|
56 |
+
reduce_factor: 0.95
|
57 |
+
target_instrument: vocals
|
58 |
+
num_epochs: 1000
|
59 |
+
num_steps: 1000
|
60 |
+
q: 0.95
|
61 |
+
coarse_loss_clip: true
|
62 |
+
ema_momentum: 0.999
|
63 |
+
optimizer: adam
|
64 |
+
other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
|
65 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
66 |
+
|
67 |
+
augmentations:
|
68 |
+
enable: true # enable or disable all augmentations (to fast disable if needed)
|
69 |
+
loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
|
70 |
+
loudness_min: 0.5
|
71 |
+
loudness_max: 1.5
|
72 |
+
|
73 |
+
inference:
|
74 |
+
batch_size: 1
|
75 |
+
dim_t: 256
|
76 |
+
num_overlap: 4
|
configs/config_musdb18_mel_band_roformer_all_stems.yaml
ADDED
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
audio:
|
2 |
+
chunk_size: 352800
|
3 |
+
dim_f: 1024
|
4 |
+
dim_t: 256
|
5 |
+
hop_length: 441
|
6 |
+
n_fft: 2048
|
7 |
+
num_channels: 2
|
8 |
+
sample_rate: 44100
|
9 |
+
min_mean_abs: 0.000
|
10 |
+
|
11 |
+
model:
|
12 |
+
dim: 384
|
13 |
+
depth: 6
|
14 |
+
stereo: true
|
15 |
+
num_stems: 4
|
16 |
+
time_transformer_depth: 1
|
17 |
+
freq_transformer_depth: 1
|
18 |
+
linear_transformer_depth: 0
|
19 |
+
num_bands: 60
|
20 |
+
dim_head: 64
|
21 |
+
heads: 8
|
22 |
+
attn_dropout: 0
|
23 |
+
ff_dropout: 0
|
24 |
+
flash_attn: True
|
25 |
+
dim_freqs_in: 1025
|
26 |
+
sample_rate: 44100 # needed for mel filter bank from librosa
|
27 |
+
stft_n_fft: 2048
|
28 |
+
stft_hop_length: 441
|
29 |
+
stft_win_length: 2048
|
30 |
+
stft_normalized: False
|
31 |
+
mask_estimator_depth: 2
|
32 |
+
multi_stft_resolution_loss_weight: 1.0
|
33 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
34 |
+
- 4096
|
35 |
+
- 2048
|
36 |
+
- 1024
|
37 |
+
- 512
|
38 |
+
- 256
|
39 |
+
multi_stft_hop_size: 147
|
40 |
+
multi_stft_normalized: False
|
41 |
+
mlp_expansion_factor: 4 # Probably too big (requires a lot of memory for weights)
|
42 |
+
use_torch_checkpoint: False # it allows to greatly reduce GPU memory consumption during training (not fully tested)
|
43 |
+
skip_connection: False # Enable skip connection between transformer blocks - can solve problem with gradients and probably faster training
|
44 |
+
|
45 |
+
training:
|
46 |
+
batch_size: 1
|
47 |
+
gradient_accumulation_steps: 1
|
48 |
+
grad_clip: 0
|
49 |
+
instruments:
|
50 |
+
- drums
|
51 |
+
- bass
|
52 |
+
- other
|
53 |
+
- vocals
|
54 |
+
lr: 1.0e-05
|
55 |
+
patience: 2
|
56 |
+
reduce_factor: 0.95
|
57 |
+
target_instrument: null
|
58 |
+
num_epochs: 1000
|
59 |
+
num_steps: 1000
|
60 |
+
augmentation: false # enable augmentations by audiomentations and pedalboard
|
61 |
+
augmentation_type: null
|
62 |
+
use_mp3_compress: false # Deprecated
|
63 |
+
augmentation_mix: false # Mix several stems of the same type with some probability
|
64 |
+
augmentation_loudness: false # randomly change loudness of each stem
|
65 |
+
augmentation_loudness_type: 1 # Type 1 or 2
|
66 |
+
augmentation_loudness_min: 0
|
67 |
+
augmentation_loudness_max: 0
|
68 |
+
q: 0.95
|
69 |
+
coarse_loss_clip: false
|
70 |
+
ema_momentum: 0.999
|
71 |
+
optimizer: adam
|
72 |
+
other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
|
73 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
74 |
+
|
75 |
+
|
76 |
+
augmentations:
|
77 |
+
enable: true # enable or disable all augmentations (to fast disable if needed)
|
78 |
+
loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
|
79 |
+
loudness_min: 0.5
|
80 |
+
loudness_max: 1.5
|
81 |
+
mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
|
82 |
+
mixup_probs:
|
83 |
+
!!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
|
84 |
+
- 0.2
|
85 |
+
- 0.02
|
86 |
+
mixup_loudness_min: 0.5
|
87 |
+
mixup_loudness_max: 1.5
|
88 |
+
all:
|
89 |
+
channel_shuffle: 0.5 # Set 0 or lower to disable
|
90 |
+
random_inverse: 0.1 # inverse track (better lower probability)
|
91 |
+
random_polarity: 0.5 # polarity change (multiply waveform to -1)
|
92 |
+
|
93 |
+
|
94 |
+
inference:
|
95 |
+
batch_size: 4
|
96 |
+
dim_t: 256
|
97 |
+
num_overlap: 2
|
configs/config_musdb18_scnet.yaml
ADDED
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
audio:
|
2 |
+
chunk_size: 485100 # 44100 * 11
|
3 |
+
num_channels: 2
|
4 |
+
sample_rate: 44100
|
5 |
+
min_mean_abs: 0.000
|
6 |
+
|
7 |
+
model:
|
8 |
+
sources:
|
9 |
+
- drums
|
10 |
+
- bass
|
11 |
+
- other
|
12 |
+
- vocals
|
13 |
+
audio_channels: 2
|
14 |
+
dims:
|
15 |
+
- 4
|
16 |
+
- 32
|
17 |
+
- 64
|
18 |
+
- 128
|
19 |
+
nfft: 4096
|
20 |
+
hop_size: 1024
|
21 |
+
win_size: 4096
|
22 |
+
normalized: True
|
23 |
+
band_SR:
|
24 |
+
- 0.175
|
25 |
+
- 0.392
|
26 |
+
- 0.433
|
27 |
+
band_stride:
|
28 |
+
- 1
|
29 |
+
- 4
|
30 |
+
- 16
|
31 |
+
band_kernel:
|
32 |
+
- 3
|
33 |
+
- 4
|
34 |
+
- 16
|
35 |
+
conv_depths:
|
36 |
+
- 3
|
37 |
+
- 2
|
38 |
+
- 1
|
39 |
+
compress: 4
|
40 |
+
conv_kernel: 3
|
41 |
+
num_dplayer: 6
|
42 |
+
expand: 1
|
43 |
+
|
44 |
+
training:
|
45 |
+
batch_size: 10
|
46 |
+
gradient_accumulation_steps: 1
|
47 |
+
grad_clip: 0
|
48 |
+
instruments:
|
49 |
+
- drums
|
50 |
+
- bass
|
51 |
+
- other
|
52 |
+
- vocals
|
53 |
+
lr: 5.0e-04
|
54 |
+
patience: 2
|
55 |
+
reduce_factor: 0.95
|
56 |
+
target_instrument: null
|
57 |
+
num_epochs: 1000
|
58 |
+
num_steps: 1000
|
59 |
+
q: 0.95
|
60 |
+
coarse_loss_clip: true
|
61 |
+
ema_momentum: 0.999
|
62 |
+
optimizer: adam
|
63 |
+
other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
|
64 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
65 |
+
|
66 |
+
augmentations:
|
67 |
+
enable: true # enable or disable all augmentations (to fast disable if needed)
|
68 |
+
loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
|
69 |
+
loudness_min: 0.5
|
70 |
+
loudness_max: 1.5
|
71 |
+
mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
|
72 |
+
mixup_probs:
|
73 |
+
!!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
|
74 |
+
- 0.2
|
75 |
+
- 0.02
|
76 |
+
mixup_loudness_min: 0.5
|
77 |
+
mixup_loudness_max: 1.5
|
78 |
+
|
79 |
+
inference:
|
80 |
+
batch_size: 8
|
81 |
+
dim_t: 256
|
82 |
+
num_overlap: 4
|
83 |
+
normalize: true
|
configs/config_musdb18_scnet_large.yaml
ADDED
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
audio:
|
2 |
+
chunk_size: 485100 # 44100 * 11
|
3 |
+
num_channels: 2
|
4 |
+
sample_rate: 44100
|
5 |
+
min_mean_abs: 0.000
|
6 |
+
|
7 |
+
model:
|
8 |
+
sources:
|
9 |
+
- drums
|
10 |
+
- bass
|
11 |
+
- other
|
12 |
+
- vocals
|
13 |
+
audio_channels: 2
|
14 |
+
dims:
|
15 |
+
- 4
|
16 |
+
- 64
|
17 |
+
- 128
|
18 |
+
- 256
|
19 |
+
nfft: 4096
|
20 |
+
hop_size: 1024
|
21 |
+
win_size: 4096
|
22 |
+
normalized: True
|
23 |
+
band_SR:
|
24 |
+
- 0.225
|
25 |
+
- 0.372
|
26 |
+
- 0.403
|
27 |
+
band_stride:
|
28 |
+
- 1
|
29 |
+
- 4
|
30 |
+
- 16
|
31 |
+
band_kernel:
|
32 |
+
- 3
|
33 |
+
- 4
|
34 |
+
- 16
|
35 |
+
conv_depths:
|
36 |
+
- 3
|
37 |
+
- 2
|
38 |
+
- 1
|
39 |
+
compress: 4
|
40 |
+
conv_kernel: 3
|
41 |
+
num_dplayer: 6
|
42 |
+
expand: 1
|
43 |
+
|
44 |
+
training:
|
45 |
+
batch_size: 6
|
46 |
+
gradient_accumulation_steps: 1
|
47 |
+
grad_clip: 0
|
48 |
+
instruments:
|
49 |
+
- drums
|
50 |
+
- bass
|
51 |
+
- other
|
52 |
+
- vocals
|
53 |
+
lr: 5.0e-04
|
54 |
+
patience: 2
|
55 |
+
reduce_factor: 0.95
|
56 |
+
target_instrument: null
|
57 |
+
num_epochs: 1000
|
58 |
+
num_steps: 1000
|
59 |
+
q: 0.95
|
60 |
+
coarse_loss_clip: true
|
61 |
+
ema_momentum: 0.999
|
62 |
+
optimizer: adam
|
63 |
+
other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
|
64 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
65 |
+
|
66 |
+
augmentations:
|
67 |
+
enable: true # enable or disable all augmentations (to fast disable if needed)
|
68 |
+
loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
|
69 |
+
loudness_min: 0.5
|
70 |
+
loudness_max: 1.5
|
71 |
+
mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
|
72 |
+
mixup_probs:
|
73 |
+
!!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
|
74 |
+
- 0.2
|
75 |
+
- 0.02
|
76 |
+
mixup_loudness_min: 0.5
|
77 |
+
mixup_loudness_max: 1.5
|
78 |
+
|
79 |
+
inference:
|
80 |
+
batch_size: 8
|
81 |
+
dim_t: 256
|
82 |
+
num_overlap: 4
|
83 |
+
normalize: false
|
configs/config_musdb18_segm_models.yaml
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
audio:
|
2 |
+
chunk_size: 261632
|
3 |
+
dim_f: 4096
|
4 |
+
dim_t: 512
|
5 |
+
hop_length: 512
|
6 |
+
n_fft: 8192
|
7 |
+
num_channels: 2
|
8 |
+
sample_rate: 44100
|
9 |
+
min_mean_abs: 0.001
|
10 |
+
|
11 |
+
model:
|
12 |
+
encoder_name: tu-maxvit_large_tf_512 # look here for possibilities: https://github.com/qubvel/segmentation_models.pytorch#encoders-
|
13 |
+
decoder_type: unet # unet, fpn
|
14 |
+
act: gelu
|
15 |
+
num_channels: 128
|
16 |
+
num_subbands: 8
|
17 |
+
|
18 |
+
training:
|
19 |
+
batch_size: 7
|
20 |
+
gradient_accumulation_steps: 1
|
21 |
+
grad_clip: 0
|
22 |
+
instruments:
|
23 |
+
- vocals
|
24 |
+
- bass
|
25 |
+
- drums
|
26 |
+
- other
|
27 |
+
lr: 5.0e-05
|
28 |
+
patience: 2
|
29 |
+
reduce_factor: 0.95
|
30 |
+
target_instrument: null
|
31 |
+
num_epochs: 1000
|
32 |
+
num_steps: 2000
|
33 |
+
q: 0.95
|
34 |
+
coarse_loss_clip: true
|
35 |
+
ema_momentum: 0.999
|
36 |
+
optimizer: adamw
|
37 |
+
other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
|
38 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
39 |
+
|
40 |
+
augmentations:
|
41 |
+
enable: true # enable or disable all augmentations (to fast disable if needed)
|
42 |
+
loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
|
43 |
+
loudness_min: 0.5
|
44 |
+
loudness_max: 1.5
|
45 |
+
mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
|
46 |
+
mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
|
47 |
+
- 0.2
|
48 |
+
- 0.02
|
49 |
+
mixup_loudness_min: 0.5
|
50 |
+
mixup_loudness_max: 1.5
|
51 |
+
|
52 |
+
# apply mp3 compression to mixture only (emulate downloading mp3 from internet)
|
53 |
+
mp3_compression_on_mixture: 0.01
|
54 |
+
mp3_compression_on_mixture_bitrate_min: 32
|
55 |
+
mp3_compression_on_mixture_bitrate_max: 320
|
56 |
+
mp3_compression_on_mixture_backend: "lameenc"
|
57 |
+
|
58 |
+
all:
|
59 |
+
channel_shuffle: 0.5 # Set 0 or lower to disable
|
60 |
+
random_inverse: 0.1 # inverse track (better lower probability)
|
61 |
+
random_polarity: 0.5 # polarity change (multiply waveform to -1)
|
62 |
+
mp3_compression: 0.01
|
63 |
+
mp3_compression_min_bitrate: 32
|
64 |
+
mp3_compression_max_bitrate: 320
|
65 |
+
mp3_compression_backend: "lameenc"
|
66 |
+
|
67 |
+
vocals:
|
68 |
+
pitch_shift: 0.1
|
69 |
+
pitch_shift_min_semitones: -5
|
70 |
+
pitch_shift_max_semitones: 5
|
71 |
+
seven_band_parametric_eq: 0.25
|
72 |
+
seven_band_parametric_eq_min_gain_db: -9
|
73 |
+
seven_band_parametric_eq_max_gain_db: 9
|
74 |
+
tanh_distortion: 0.1
|
75 |
+
tanh_distortion_min: 0.1
|
76 |
+
tanh_distortion_max: 0.7
|
77 |
+
other:
|
78 |
+
pitch_shift: 0.1
|
79 |
+
pitch_shift_min_semitones: -4
|
80 |
+
pitch_shift_max_semitones: 4
|
81 |
+
gaussian_noise: 0.1
|
82 |
+
gaussian_noise_min_amplitude: 0.001
|
83 |
+
gaussian_noise_max_amplitude: 0.015
|
84 |
+
time_stretch: 0.01
|
85 |
+
time_stretch_min_rate: 0.8
|
86 |
+
time_stretch_max_rate: 1.25
|
87 |
+
|
88 |
+
|
89 |
+
inference:
|
90 |
+
batch_size: 1
|
91 |
+
dim_t: 512
|
92 |
+
num_overlap: 4
|
configs/config_musdb18_torchseg.yaml
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
audio:
|
2 |
+
chunk_size: 261632
|
3 |
+
dim_f: 4096
|
4 |
+
dim_t: 512
|
5 |
+
hop_length: 512
|
6 |
+
n_fft: 8192
|
7 |
+
num_channels: 2
|
8 |
+
sample_rate: 44100
|
9 |
+
min_mean_abs: 0.001
|
10 |
+
|
11 |
+
model:
|
12 |
+
encoder_name: maxvit_tiny_tf_512 # look with torchseg.list_encoders(). Currently 858 available
|
13 |
+
decoder_type: unet # unet, fpn
|
14 |
+
act: gelu
|
15 |
+
num_channels: 128
|
16 |
+
num_subbands: 8
|
17 |
+
|
18 |
+
training:
|
19 |
+
batch_size: 18
|
20 |
+
gradient_accumulation_steps: 1
|
21 |
+
grad_clip: 0
|
22 |
+
instruments:
|
23 |
+
- vocals
|
24 |
+
- bass
|
25 |
+
- drums
|
26 |
+
- other
|
27 |
+
lr: 5.0e-05
|
28 |
+
patience: 2
|
29 |
+
reduce_factor: 0.95
|
30 |
+
target_instrument: null
|
31 |
+
num_epochs: 1000
|
32 |
+
num_steps: 2000
|
33 |
+
q: 0.95
|
34 |
+
coarse_loss_clip: true
|
35 |
+
ema_momentum: 0.999
|
36 |
+
optimizer: adamw
|
37 |
+
other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
|
38 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
39 |
+
|
40 |
+
augmentations:
|
41 |
+
enable: true # enable or disable all augmentations (to fast disable if needed)
|
42 |
+
loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
|
43 |
+
loudness_min: 0.5
|
44 |
+
loudness_max: 1.5
|
45 |
+
mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
|
46 |
+
mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
|
47 |
+
- 0.2
|
48 |
+
- 0.02
|
49 |
+
mixup_loudness_min: 0.5
|
50 |
+
mixup_loudness_max: 1.5
|
51 |
+
|
52 |
+
# apply mp3 compression to mixture only (emulate downloading mp3 from internet)
|
53 |
+
mp3_compression_on_mixture: 0.01
|
54 |
+
mp3_compression_on_mixture_bitrate_min: 32
|
55 |
+
mp3_compression_on_mixture_bitrate_max: 320
|
56 |
+
mp3_compression_on_mixture_backend: "lameenc"
|
57 |
+
|
58 |
+
all:
|
59 |
+
channel_shuffle: 0.5 # Set 0 or lower to disable
|
60 |
+
random_inverse: 0.1 # inverse track (better lower probability)
|
61 |
+
random_polarity: 0.5 # polarity change (multiply waveform to -1)
|
62 |
+
mp3_compression: 0.01
|
63 |
+
mp3_compression_min_bitrate: 32
|
64 |
+
mp3_compression_max_bitrate: 320
|
65 |
+
mp3_compression_backend: "lameenc"
|
66 |
+
|
67 |
+
vocals:
|
68 |
+
pitch_shift: 0.1
|
69 |
+
pitch_shift_min_semitones: -5
|
70 |
+
pitch_shift_max_semitones: 5
|
71 |
+
seven_band_parametric_eq: 0.25
|
72 |
+
seven_band_parametric_eq_min_gain_db: -9
|
73 |
+
seven_band_parametric_eq_max_gain_db: 9
|
74 |
+
tanh_distortion: 0.1
|
75 |
+
tanh_distortion_min: 0.1
|
76 |
+
tanh_distortion_max: 0.7
|
77 |
+
other:
|
78 |
+
pitch_shift: 0.1
|
79 |
+
pitch_shift_min_semitones: -4
|
80 |
+
pitch_shift_max_semitones: 4
|
81 |
+
gaussian_noise: 0.1
|
82 |
+
gaussian_noise_min_amplitude: 0.001
|
83 |
+
gaussian_noise_max_amplitude: 0.015
|
84 |
+
time_stretch: 0.01
|
85 |
+
time_stretch_min_rate: 0.8
|
86 |
+
time_stretch_max_rate: 1.25
|
87 |
+
|
88 |
+
|
89 |
+
inference:
|
90 |
+
batch_size: 1
|
91 |
+
dim_t: 512
|
92 |
+
num_overlap: 4
|
configs/config_vocals_bandit_bsrnn_multi_mus64.yaml
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
name: "MultiMaskMultiSourceBandSplitRNN"
|
2 |
+
audio:
|
3 |
+
chunk_size: 264600
|
4 |
+
num_channels: 2
|
5 |
+
sample_rate: 44100
|
6 |
+
min_mean_abs: 0.001
|
7 |
+
|
8 |
+
model:
|
9 |
+
in_channel: 1
|
10 |
+
stems: ['vocals', 'other']
|
11 |
+
band_specs: "musical"
|
12 |
+
n_bands: 64
|
13 |
+
fs: 44100
|
14 |
+
require_no_overlap: false
|
15 |
+
require_no_gap: true
|
16 |
+
normalize_channel_independently: false
|
17 |
+
treat_channel_as_feature: true
|
18 |
+
n_sqm_modules: 8
|
19 |
+
emb_dim: 128
|
20 |
+
rnn_dim: 256
|
21 |
+
bidirectional: true
|
22 |
+
rnn_type: "GRU"
|
23 |
+
mlp_dim: 512
|
24 |
+
hidden_activation: "Tanh"
|
25 |
+
hidden_activation_kwargs: null
|
26 |
+
complex_mask: true
|
27 |
+
n_fft: 2048
|
28 |
+
win_length: 2048
|
29 |
+
hop_length: 512
|
30 |
+
window_fn: "hann_window"
|
31 |
+
wkwargs: null
|
32 |
+
power: null
|
33 |
+
center: true
|
34 |
+
normalized: true
|
35 |
+
pad_mode: "constant"
|
36 |
+
onesided: true
|
37 |
+
|
38 |
+
training:
|
39 |
+
batch_size: 4
|
40 |
+
gradient_accumulation_steps: 4
|
41 |
+
grad_clip: 0
|
42 |
+
instruments:
|
43 |
+
- vocals
|
44 |
+
- other
|
45 |
+
lr: 9.0e-05
|
46 |
+
patience: 2
|
47 |
+
reduce_factor: 0.95
|
48 |
+
target_instrument: null
|
49 |
+
num_epochs: 1000
|
50 |
+
num_steps: 1000
|
51 |
+
q: 0.95
|
52 |
+
coarse_loss_clip: true
|
53 |
+
ema_momentum: 0.999
|
54 |
+
optimizer: adam
|
55 |
+
other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
|
56 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
57 |
+
|
58 |
+
augmentations:
|
59 |
+
enable: true # enable or disable all augmentations (to fast disable if needed)
|
60 |
+
loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
|
61 |
+
loudness_min: 0.5
|
62 |
+
loudness_max: 1.5
|
63 |
+
mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
|
64 |
+
mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
|
65 |
+
- 0.2
|
66 |
+
- 0.02
|
67 |
+
mixup_loudness_min: 0.5
|
68 |
+
mixup_loudness_max: 1.5
|
69 |
+
|
70 |
+
inference:
|
71 |
+
batch_size: 1
|
72 |
+
dim_t: 256
|
73 |
+
num_overlap: 4
|
configs/config_vocals_bs_mamba2.yaml
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
audio:
|
2 |
+
chunk_size: 132300 # samplerate * segment
|
3 |
+
hop_length: 1024
|
4 |
+
min_mean_abs: 0.0
|
5 |
+
|
6 |
+
training:
|
7 |
+
batch_size: 8
|
8 |
+
gradient_accumulation_steps: 1
|
9 |
+
grad_clip: 0
|
10 |
+
segment: 11
|
11 |
+
shift: 1
|
12 |
+
samplerate: 44100
|
13 |
+
channels: 2
|
14 |
+
normalize: true
|
15 |
+
instruments: ['vocals', 'other']
|
16 |
+
target_instrument: null
|
17 |
+
num_epochs: 1000
|
18 |
+
num_steps: 1000
|
19 |
+
optimizer: prodigy
|
20 |
+
lr: 1.0
|
21 |
+
patience: 2
|
22 |
+
reduce_factor: 0.95
|
23 |
+
q: 0.95
|
24 |
+
coarse_loss_clip: true
|
25 |
+
ema_momentum: 0.999
|
26 |
+
read_metadata_procs: 8
|
27 |
+
other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
|
28 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
29 |
+
|
30 |
+
model:
|
31 |
+
sr: 44100
|
32 |
+
win: 2048
|
33 |
+
stride: 512
|
34 |
+
feature_dim: 128
|
35 |
+
num_repeat_mask: 8
|
36 |
+
num_repeat_map: 4
|
37 |
+
num_output: 2
|
38 |
+
|
39 |
+
augmentations:
|
40 |
+
enable: false # enable or disable all augmentations (to fast disable if needed)
|
41 |
+
loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
|
42 |
+
loudness_min: 0.5
|
43 |
+
loudness_max: 1.5
|
44 |
+
mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
|
45 |
+
mixup_probs: [0.2, 0.02]
|
46 |
+
mixup_loudness_min: 0.5
|
47 |
+
mixup_loudness_max: 1.5
|
48 |
+
|
49 |
+
inference:
|
50 |
+
num_overlap: 2
|
51 |
+
batch_size: 4
|
configs/config_vocals_bs_roformer.yaml
ADDED
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
audio:
|
2 |
+
chunk_size: 131584
|
3 |
+
dim_f: 1024
|
4 |
+
dim_t: 256
|
5 |
+
hop_length: 512
|
6 |
+
n_fft: 2048
|
7 |
+
num_channels: 2
|
8 |
+
sample_rate: 44100
|
9 |
+
min_mean_abs: 0.001
|
10 |
+
|
11 |
+
model:
|
12 |
+
dim: 192
|
13 |
+
depth: 6
|
14 |
+
stereo: true
|
15 |
+
num_stems: 1
|
16 |
+
time_transformer_depth: 1
|
17 |
+
freq_transformer_depth: 1
|
18 |
+
linear_transformer_depth: 0
|
19 |
+
freqs_per_bands: !!python/tuple
|
20 |
+
- 2
|
21 |
+
- 2
|
22 |
+
- 2
|
23 |
+
- 2
|
24 |
+
- 2
|
25 |
+
- 2
|
26 |
+
- 2
|
27 |
+
- 2
|
28 |
+
- 2
|
29 |
+
- 2
|
30 |
+
- 2
|
31 |
+
- 2
|
32 |
+
- 2
|
33 |
+
- 2
|
34 |
+
- 2
|
35 |
+
- 2
|
36 |
+
- 2
|
37 |
+
- 2
|
38 |
+
- 2
|
39 |
+
- 2
|
40 |
+
- 2
|
41 |
+
- 2
|
42 |
+
- 2
|
43 |
+
- 2
|
44 |
+
- 4
|
45 |
+
- 4
|
46 |
+
- 4
|
47 |
+
- 4
|
48 |
+
- 4
|
49 |
+
- 4
|
50 |
+
- 4
|
51 |
+
- 4
|
52 |
+
- 4
|
53 |
+
- 4
|
54 |
+
- 4
|
55 |
+
- 4
|
56 |
+
- 12
|
57 |
+
- 12
|
58 |
+
- 12
|
59 |
+
- 12
|
60 |
+
- 12
|
61 |
+
- 12
|
62 |
+
- 12
|
63 |
+
- 12
|
64 |
+
- 24
|
65 |
+
- 24
|
66 |
+
- 24
|
67 |
+
- 24
|
68 |
+
- 24
|
69 |
+
- 24
|
70 |
+
- 24
|
71 |
+
- 24
|
72 |
+
- 48
|
73 |
+
- 48
|
74 |
+
- 48
|
75 |
+
- 48
|
76 |
+
- 48
|
77 |
+
- 48
|
78 |
+
- 48
|
79 |
+
- 48
|
80 |
+
- 128
|
81 |
+
- 129
|
82 |
+
dim_head: 64
|
83 |
+
heads: 8
|
84 |
+
attn_dropout: 0.1
|
85 |
+
ff_dropout: 0.1
|
86 |
+
flash_attn: true
|
87 |
+
dim_freqs_in: 1025
|
88 |
+
stft_n_fft: 2048
|
89 |
+
stft_hop_length: 512
|
90 |
+
stft_win_length: 2048
|
91 |
+
stft_normalized: false
|
92 |
+
mask_estimator_depth: 2
|
93 |
+
multi_stft_resolution_loss_weight: 1.0
|
94 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
95 |
+
- 4096
|
96 |
+
- 2048
|
97 |
+
- 1024
|
98 |
+
- 512
|
99 |
+
- 256
|
100 |
+
multi_stft_hop_size: 147
|
101 |
+
multi_stft_normalized: False
|
102 |
+
mlp_expansion_factor: 4 # Probably too big (requires a lot of memory for weights)
|
103 |
+
use_torch_checkpoint: False # it allows to greatly reduce GPU memory consumption during training (not fully tested)
|
104 |
+
skip_connection: False # Enable skip connection between transformer blocks - can solve problem with gradients and probably faster training
|
105 |
+
|
106 |
+
training:
|
107 |
+
batch_size: 10
|
108 |
+
gradient_accumulation_steps: 1
|
109 |
+
grad_clip: 0
|
110 |
+
instruments:
|
111 |
+
- vocals
|
112 |
+
- other
|
113 |
+
lr: 5.0e-05
|
114 |
+
patience: 2
|
115 |
+
reduce_factor: 0.95
|
116 |
+
target_instrument: vocals
|
117 |
+
num_epochs: 1000
|
118 |
+
num_steps: 1000
|
119 |
+
q: 0.95
|
120 |
+
coarse_loss_clip: true
|
121 |
+
ema_momentum: 0.999
|
122 |
+
optimizer: adam
|
123 |
+
other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
|
124 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
125 |
+
|
126 |
+
augmentations:
|
127 |
+
enable: true # enable or disable all augmentations (to fast disable if needed)
|
128 |
+
loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
|
129 |
+
loudness_min: 0.5
|
130 |
+
loudness_max: 1.5
|
131 |
+
mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
|
132 |
+
mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
|
133 |
+
- 0.2
|
134 |
+
- 0.02
|
135 |
+
mixup_loudness_min: 0.5
|
136 |
+
mixup_loudness_max: 1.5
|
137 |
+
|
138 |
+
inference:
|
139 |
+
batch_size: 1
|
140 |
+
dim_t: 256
|
141 |
+
num_overlap: 4
|
configs/config_vocals_htdemucs.yaml
ADDED
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
audio:
|
2 |
+
chunk_size: 485100 # samplerate * segment
|
3 |
+
min_mean_abs: 0.001
|
4 |
+
hop_length: 1024
|
5 |
+
|
6 |
+
training:
|
7 |
+
batch_size: 10
|
8 |
+
gradient_accumulation_steps: 1
|
9 |
+
grad_clip: 0
|
10 |
+
segment: 11
|
11 |
+
shift: 1
|
12 |
+
samplerate: 44100
|
13 |
+
channels: 2
|
14 |
+
normalize: true
|
15 |
+
instruments: ['vocals', 'other']
|
16 |
+
target_instrument: null
|
17 |
+
num_epochs: 1000
|
18 |
+
num_steps: 1000
|
19 |
+
optimizer: adam
|
20 |
+
lr: 9.0e-05
|
21 |
+
patience: 2
|
22 |
+
reduce_factor: 0.95
|
23 |
+
q: 0.95
|
24 |
+
coarse_loss_clip: true
|
25 |
+
ema_momentum: 0.999
|
26 |
+
other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
|
27 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
28 |
+
|
29 |
+
augmentations:
|
30 |
+
enable: true # enable or disable all augmentations (to fast disable if needed)
|
31 |
+
loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
|
32 |
+
loudness_min: 0.5
|
33 |
+
loudness_max: 1.5
|
34 |
+
mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
|
35 |
+
mixup_probs: [0.2, 0.02]
|
36 |
+
mixup_loudness_min: 0.5
|
37 |
+
mixup_loudness_max: 1.5
|
38 |
+
|
39 |
+
inference:
|
40 |
+
num_overlap: 2
|
41 |
+
batch_size: 8
|
42 |
+
|
43 |
+
model: htdemucs
|
44 |
+
|
45 |
+
htdemucs: # see demucs/htdemucs.py for a detailed description
|
46 |
+
# Channels
|
47 |
+
channels: 48
|
48 |
+
channels_time:
|
49 |
+
growth: 2
|
50 |
+
# STFT
|
51 |
+
num_subbands: 1
|
52 |
+
nfft: 4096
|
53 |
+
wiener_iters: 0
|
54 |
+
end_iters: 0
|
55 |
+
wiener_residual: false
|
56 |
+
cac: true
|
57 |
+
# Main structure
|
58 |
+
depth: 4
|
59 |
+
rewrite: true
|
60 |
+
# Frequency Branch
|
61 |
+
multi_freqs: []
|
62 |
+
multi_freqs_depth: 3
|
63 |
+
freq_emb: 0.2
|
64 |
+
emb_scale: 10
|
65 |
+
emb_smooth: true
|
66 |
+
# Convolutions
|
67 |
+
kernel_size: 8
|
68 |
+
stride: 4
|
69 |
+
time_stride: 2
|
70 |
+
context: 1
|
71 |
+
context_enc: 0
|
72 |
+
# normalization
|
73 |
+
norm_starts: 4
|
74 |
+
norm_groups: 4
|
75 |
+
# DConv residual branch
|
76 |
+
dconv_mode: 3
|
77 |
+
dconv_depth: 2
|
78 |
+
dconv_comp: 8
|
79 |
+
dconv_init: 1e-3
|
80 |
+
# Before the Transformer
|
81 |
+
bottom_channels: 512
|
82 |
+
# CrossTransformer
|
83 |
+
# ------ Common to all
|
84 |
+
# Regular parameters
|
85 |
+
t_layers: 5
|
86 |
+
t_hidden_scale: 4.0
|
87 |
+
t_heads: 8
|
88 |
+
t_dropout: 0.0
|
89 |
+
t_layer_scale: True
|
90 |
+
t_gelu: True
|
91 |
+
# ------------- Positional Embedding
|
92 |
+
t_emb: sin
|
93 |
+
t_max_positions: 10000 # for the scaled embedding
|
94 |
+
t_max_period: 10000.0
|
95 |
+
t_weight_pos_embed: 1.0
|
96 |
+
t_cape_mean_normalize: True
|
97 |
+
t_cape_augment: True
|
98 |
+
t_cape_glob_loc_scale: [5000.0, 1.0, 1.4]
|
99 |
+
t_sin_random_shift: 0
|
100 |
+
# ------------- norm before a transformer encoder
|
101 |
+
t_norm_in: True
|
102 |
+
t_norm_in_group: False
|
103 |
+
# ------------- norm inside the encoder
|
104 |
+
t_group_norm: False
|
105 |
+
t_norm_first: True
|
106 |
+
t_norm_out: True
|
107 |
+
# ------------- optim
|
108 |
+
t_weight_decay: 0.0
|
109 |
+
t_lr:
|
110 |
+
# ------------- sparsity
|
111 |
+
t_sparse_self_attn: False
|
112 |
+
t_sparse_cross_attn: False
|
113 |
+
t_mask_type: diag
|
114 |
+
t_mask_random_seed: 42
|
115 |
+
t_sparse_attn_window: 400
|
116 |
+
t_global_window: 100
|
117 |
+
t_sparsity: 0.95
|
118 |
+
t_auto_sparsity: False
|
119 |
+
# Cross Encoder First (False)
|
120 |
+
t_cross_first: False
|
121 |
+
# Weight init
|
122 |
+
rescale: 0.1
|
123 |
+
|
configs/config_vocals_mdx23c.yaml
ADDED
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
audio:
|
2 |
+
chunk_size: 261120
|
3 |
+
dim_f: 4096
|
4 |
+
dim_t: 256
|
5 |
+
hop_length: 1024
|
6 |
+
n_fft: 8192
|
7 |
+
num_channels: 2
|
8 |
+
sample_rate: 44100
|
9 |
+
min_mean_abs: 0.001
|
10 |
+
|
11 |
+
model:
|
12 |
+
act: gelu
|
13 |
+
bottleneck_factor: 4
|
14 |
+
growth: 128
|
15 |
+
norm: InstanceNorm
|
16 |
+
num_blocks_per_scale: 2
|
17 |
+
num_channels: 128
|
18 |
+
num_scales: 5
|
19 |
+
num_subbands: 4
|
20 |
+
scale:
|
21 |
+
- 2
|
22 |
+
- 2
|
23 |
+
|
24 |
+
training:
|
25 |
+
batch_size: 6
|
26 |
+
gradient_accumulation_steps: 1
|
27 |
+
grad_clip: 0
|
28 |
+
instruments:
|
29 |
+
- vocals
|
30 |
+
- other
|
31 |
+
lr: 9.0e-05
|
32 |
+
patience: 2
|
33 |
+
reduce_factor: 0.95
|
34 |
+
target_instrument: null
|
35 |
+
num_epochs: 1000
|
36 |
+
num_steps: 1000
|
37 |
+
q: 0.95
|
38 |
+
coarse_loss_clip: true
|
39 |
+
ema_momentum: 0.999
|
40 |
+
optimizer: adam
|
41 |
+
read_metadata_procs: 8 # Number of processes to use during metadata reading for dataset. Can speed up metadata generation
|
42 |
+
other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
|
43 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
44 |
+
|
45 |
+
augmentations:
|
46 |
+
enable: true # enable or disable all augmentations (to fast disable if needed)
|
47 |
+
loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
|
48 |
+
loudness_min: 0.5
|
49 |
+
loudness_max: 1.5
|
50 |
+
mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
|
51 |
+
mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
|
52 |
+
- 0.2
|
53 |
+
- 0.02
|
54 |
+
mixup_loudness_min: 0.5
|
55 |
+
mixup_loudness_max: 1.5
|
56 |
+
|
57 |
+
# apply mp3 compression to mixture only (emulate downloading mp3 from internet)
|
58 |
+
mp3_compression_on_mixture: 0.01
|
59 |
+
mp3_compression_on_mixture_bitrate_min: 32
|
60 |
+
mp3_compression_on_mixture_bitrate_max: 320
|
61 |
+
mp3_compression_on_mixture_backend: "lameenc"
|
62 |
+
|
63 |
+
all:
|
64 |
+
channel_shuffle: 0.5 # Set 0 or lower to disable
|
65 |
+
random_inverse: 0.1 # inverse track (better lower probability)
|
66 |
+
random_polarity: 0.5 # polarity change (multiply waveform to -1)
|
67 |
+
mp3_compression: 0.01
|
68 |
+
mp3_compression_min_bitrate: 32
|
69 |
+
mp3_compression_max_bitrate: 320
|
70 |
+
mp3_compression_backend: "lameenc"
|
71 |
+
|
72 |
+
vocals:
|
73 |
+
pitch_shift: 0.1
|
74 |
+
pitch_shift_min_semitones: -5
|
75 |
+
pitch_shift_max_semitones: 5
|
76 |
+
seven_band_parametric_eq: 0.25
|
77 |
+
seven_band_parametric_eq_min_gain_db: -9
|
78 |
+
seven_band_parametric_eq_max_gain_db: 9
|
79 |
+
tanh_distortion: 0.1
|
80 |
+
tanh_distortion_min: 0.1
|
81 |
+
tanh_distortion_max: 0.7
|
82 |
+
other:
|
83 |
+
pitch_shift: 0.1
|
84 |
+
pitch_shift_min_semitones: -4
|
85 |
+
pitch_shift_max_semitones: 4
|
86 |
+
gaussian_noise: 0.1
|
87 |
+
gaussian_noise_min_amplitude: 0.001
|
88 |
+
gaussian_noise_max_amplitude: 0.015
|
89 |
+
time_stretch: 0.01
|
90 |
+
time_stretch_min_rate: 0.8
|
91 |
+
time_stretch_max_rate: 1.25
|
92 |
+
|
93 |
+
inference:
|
94 |
+
batch_size: 1
|
95 |
+
dim_t: 256
|
96 |
+
num_overlap: 4
|
configs/config_vocals_mel_band_roformer.yaml
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
audio:
|
2 |
+
chunk_size: 131584
|
3 |
+
dim_f: 1024
|
4 |
+
dim_t: 256
|
5 |
+
hop_length: 512
|
6 |
+
n_fft: 2048
|
7 |
+
num_channels: 2
|
8 |
+
sample_rate: 44100
|
9 |
+
min_mean_abs: 0.001
|
10 |
+
|
11 |
+
model:
|
12 |
+
dim: 192
|
13 |
+
depth: 8
|
14 |
+
stereo: true
|
15 |
+
num_stems: 1
|
16 |
+
time_transformer_depth: 1
|
17 |
+
freq_transformer_depth: 1
|
18 |
+
linear_transformer_depth: 0
|
19 |
+
num_bands: 60
|
20 |
+
dim_head: 64
|
21 |
+
heads: 8
|
22 |
+
attn_dropout: 0.1
|
23 |
+
ff_dropout: 0.1
|
24 |
+
flash_attn: True
|
25 |
+
dim_freqs_in: 1025
|
26 |
+
sample_rate: 44100 # needed for mel filter bank from librosa
|
27 |
+
stft_n_fft: 2048
|
28 |
+
stft_hop_length: 512
|
29 |
+
stft_win_length: 2048
|
30 |
+
stft_normalized: False
|
31 |
+
mask_estimator_depth: 2
|
32 |
+
multi_stft_resolution_loss_weight: 1.0
|
33 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
34 |
+
- 4096
|
35 |
+
- 2048
|
36 |
+
- 1024
|
37 |
+
- 512
|
38 |
+
- 256
|
39 |
+
multi_stft_hop_size: 147
|
40 |
+
multi_stft_normalized: False
|
41 |
+
mlp_expansion_factor: 4 # Probably too big (requires a lot of memory for weights)
|
42 |
+
use_torch_checkpoint: False # it allows to greatly reduce GPU memory consumption during training (not fully tested)
|
43 |
+
skip_connection: False # Enable skip connection between transformer blocks - can solve problem with gradients and probably faster training
|
44 |
+
|
45 |
+
training:
|
46 |
+
batch_size: 7
|
47 |
+
gradient_accumulation_steps: 1
|
48 |
+
grad_clip: 0
|
49 |
+
instruments:
|
50 |
+
- vocals
|
51 |
+
- other
|
52 |
+
lr: 5.0e-05
|
53 |
+
patience: 2
|
54 |
+
reduce_factor: 0.95
|
55 |
+
target_instrument: vocals
|
56 |
+
num_epochs: 1000
|
57 |
+
num_steps: 1000
|
58 |
+
q: 0.95
|
59 |
+
coarse_loss_clip: true
|
60 |
+
ema_momentum: 0.999
|
61 |
+
optimizer: adam
|
62 |
+
other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
|
63 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
64 |
+
|
65 |
+
augmentations:
|
66 |
+
enable: true # enable or disable all augmentations (to fast disable if needed)
|
67 |
+
loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
|
68 |
+
loudness_min: 0.5
|
69 |
+
loudness_max: 1.5
|
70 |
+
mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
|
71 |
+
mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
|
72 |
+
- 0.2
|
73 |
+
- 0.02
|
74 |
+
mixup_loudness_min: 0.5
|
75 |
+
mixup_loudness_max: 1.5
|
76 |
+
|
77 |
+
inference:
|
78 |
+
batch_size: 1
|
79 |
+
dim_t: 256
|
80 |
+
num_overlap: 4
|
configs/config_vocals_scnet.yaml
ADDED
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
audio:
|
2 |
+
chunk_size: 485100 # 44100 * 11
|
3 |
+
num_channels: 2
|
4 |
+
sample_rate: 44100
|
5 |
+
min_mean_abs: 0.000
|
6 |
+
|
7 |
+
model:
|
8 |
+
sources:
|
9 |
+
- vocals
|
10 |
+
- other
|
11 |
+
audio_channels: 2
|
12 |
+
dims:
|
13 |
+
- 4
|
14 |
+
- 32
|
15 |
+
- 64
|
16 |
+
- 128
|
17 |
+
nfft: 4096
|
18 |
+
hop_size: 1024
|
19 |
+
win_size: 4096
|
20 |
+
normalized: True
|
21 |
+
band_SR:
|
22 |
+
- 0.175
|
23 |
+
- 0.392
|
24 |
+
- 0.433
|
25 |
+
band_stride:
|
26 |
+
- 1
|
27 |
+
- 4
|
28 |
+
- 16
|
29 |
+
band_kernel:
|
30 |
+
- 3
|
31 |
+
- 4
|
32 |
+
- 16
|
33 |
+
conv_depths:
|
34 |
+
- 3
|
35 |
+
- 2
|
36 |
+
- 1
|
37 |
+
compress: 4
|
38 |
+
conv_kernel: 3
|
39 |
+
num_dplayer: 6
|
40 |
+
expand: 1
|
41 |
+
|
42 |
+
training:
|
43 |
+
batch_size: 10
|
44 |
+
gradient_accumulation_steps: 1
|
45 |
+
grad_clip: 0
|
46 |
+
instruments:
|
47 |
+
- vocals
|
48 |
+
- other
|
49 |
+
lr: 5.0e-04
|
50 |
+
patience: 2
|
51 |
+
reduce_factor: 0.95
|
52 |
+
target_instrument: null
|
53 |
+
num_epochs: 1000
|
54 |
+
num_steps: 10
|
55 |
+
q: 0.95
|
56 |
+
coarse_loss_clip: true
|
57 |
+
ema_momentum: 0.999
|
58 |
+
optimizer: adam
|
59 |
+
other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
|
60 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
61 |
+
|
62 |
+
augmentations:
|
63 |
+
enable: true # enable or disable all augmentations (to fast disable if needed)
|
64 |
+
loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
|
65 |
+
loudness_min: 0.5
|
66 |
+
loudness_max: 1.5
|
67 |
+
mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
|
68 |
+
mixup_probs:
|
69 |
+
!!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
|
70 |
+
- 0.2
|
71 |
+
- 0.02
|
72 |
+
mixup_loudness_min: 0.5
|
73 |
+
mixup_loudness_max: 1.5
|
74 |
+
|
75 |
+
inference:
|
76 |
+
batch_size: 8
|
77 |
+
dim_t: 256
|
78 |
+
num_overlap: 4
|
79 |
+
normalize: false
|
configs/config_vocals_scnet_large.yaml
ADDED
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
audio:
|
2 |
+
chunk_size: 485100 # 44100 * 11
|
3 |
+
num_channels: 2
|
4 |
+
sample_rate: 44100
|
5 |
+
min_mean_abs: 0.000
|
6 |
+
|
7 |
+
model:
|
8 |
+
sources:
|
9 |
+
- vocals
|
10 |
+
- other
|
11 |
+
audio_channels: 2
|
12 |
+
dims:
|
13 |
+
- 4
|
14 |
+
- 64
|
15 |
+
- 128
|
16 |
+
- 256
|
17 |
+
nfft: 4096
|
18 |
+
hop_size: 1024
|
19 |
+
win_size: 4096
|
20 |
+
normalized: True
|
21 |
+
band_SR:
|
22 |
+
- 0.225
|
23 |
+
- 0.372
|
24 |
+
- 0.403
|
25 |
+
band_stride:
|
26 |
+
- 1
|
27 |
+
- 4
|
28 |
+
- 16
|
29 |
+
band_kernel:
|
30 |
+
- 3
|
31 |
+
- 4
|
32 |
+
- 16
|
33 |
+
conv_depths:
|
34 |
+
- 3
|
35 |
+
- 2
|
36 |
+
- 1
|
37 |
+
compress: 4
|
38 |
+
conv_kernel: 3
|
39 |
+
num_dplayer: 6
|
40 |
+
expand: 1
|
41 |
+
|
42 |
+
training:
|
43 |
+
batch_size: 6
|
44 |
+
gradient_accumulation_steps: 1
|
45 |
+
grad_clip: 0
|
46 |
+
instruments:
|
47 |
+
- vocals
|
48 |
+
- other
|
49 |
+
lr: 1.0e-04
|
50 |
+
patience: 2
|
51 |
+
reduce_factor: 0.95
|
52 |
+
target_instrument: null
|
53 |
+
num_epochs: 1000
|
54 |
+
num_steps: 1000
|
55 |
+
q: 0.95
|
56 |
+
coarse_loss_clip: true
|
57 |
+
ema_momentum: 0.999
|
58 |
+
optimizer: adam
|
59 |
+
other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
|
60 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
61 |
+
|
62 |
+
augmentations:
|
63 |
+
enable: false # enable or disable all augmentations (to fast disable if needed)
|
64 |
+
loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
|
65 |
+
loudness_min: 0.5
|
66 |
+
loudness_max: 1.5
|
67 |
+
mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
|
68 |
+
mixup_probs:
|
69 |
+
!!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
|
70 |
+
- 0.2
|
71 |
+
- 0.02
|
72 |
+
mixup_loudness_min: 0.5
|
73 |
+
mixup_loudness_max: 1.5
|
74 |
+
|
75 |
+
inference:
|
76 |
+
batch_size: 8
|
77 |
+
dim_t: 256
|
78 |
+
num_overlap: 4
|
79 |
+
normalize: false
|
configs/config_vocals_scnet_unofficial.yaml
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
audio:
|
2 |
+
chunk_size: 264600
|
3 |
+
num_channels: 2
|
4 |
+
sample_rate: 44100
|
5 |
+
min_mean_abs: 0.000
|
6 |
+
|
7 |
+
model:
|
8 |
+
dims: [4, 32, 64, 128]
|
9 |
+
bandsplit_ratios: [.175, .392, .433]
|
10 |
+
downsample_strides: [1, 4, 16]
|
11 |
+
n_conv_modules: [3, 2, 1]
|
12 |
+
n_rnn_layers: 6
|
13 |
+
rnn_hidden_dim: 128
|
14 |
+
n_sources: 2
|
15 |
+
|
16 |
+
n_fft: 4096
|
17 |
+
hop_length: 1024
|
18 |
+
win_length: 4096
|
19 |
+
stft_normalized: false
|
20 |
+
|
21 |
+
use_mamba: false
|
22 |
+
d_state: 16
|
23 |
+
d_conv: 4
|
24 |
+
d_expand: 2
|
25 |
+
|
26 |
+
training:
|
27 |
+
batch_size: 10
|
28 |
+
gradient_accumulation_steps: 2
|
29 |
+
grad_clip: 0
|
30 |
+
instruments:
|
31 |
+
- vocals
|
32 |
+
- other
|
33 |
+
lr: 5.0e-04
|
34 |
+
patience: 2
|
35 |
+
reduce_factor: 0.95
|
36 |
+
target_instrument: null
|
37 |
+
num_epochs: 1000
|
38 |
+
num_steps: 1000
|
39 |
+
q: 0.95
|
40 |
+
coarse_loss_clip: true
|
41 |
+
ema_momentum: 0.999
|
42 |
+
optimizer: adam
|
43 |
+
other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
|
44 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
45 |
+
|
46 |
+
augmentations:
|
47 |
+
enable: true # enable or disable all augmentations (to fast disable if needed)
|
48 |
+
loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
|
49 |
+
loudness_min: 0.5
|
50 |
+
loudness_max: 1.5
|
51 |
+
mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
|
52 |
+
mixup_probs:
|
53 |
+
!!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
|
54 |
+
- 0.2
|
55 |
+
- 0.02
|
56 |
+
mixup_loudness_min: 0.5
|
57 |
+
mixup_loudness_max: 1.5
|
58 |
+
|
59 |
+
inference:
|
60 |
+
batch_size: 8
|
61 |
+
dim_t: 256
|
62 |
+
num_overlap: 4
|
configs/config_vocals_segm_models.yaml
ADDED
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
audio:
|
2 |
+
chunk_size: 261632
|
3 |
+
dim_f: 4096
|
4 |
+
dim_t: 512
|
5 |
+
hop_length: 512
|
6 |
+
n_fft: 8192
|
7 |
+
num_channels: 2
|
8 |
+
sample_rate: 44100
|
9 |
+
min_mean_abs: 0.001
|
10 |
+
|
11 |
+
model:
|
12 |
+
encoder_name: tu-maxvit_large_tf_512 # look here for possibilities: https://github.com/qubvel/segmentation_models.pytorch#encoders-
|
13 |
+
decoder_type: unet # unet, fpn
|
14 |
+
act: gelu
|
15 |
+
num_channels: 128
|
16 |
+
num_subbands: 8
|
17 |
+
|
18 |
+
loss_multistft:
|
19 |
+
fft_sizes:
|
20 |
+
- 1024
|
21 |
+
- 2048
|
22 |
+
- 4096
|
23 |
+
hop_sizes:
|
24 |
+
- 512
|
25 |
+
- 1024
|
26 |
+
- 2048
|
27 |
+
win_lengths:
|
28 |
+
- 1024
|
29 |
+
- 2048
|
30 |
+
- 4096
|
31 |
+
window: "hann_window"
|
32 |
+
scale: "mel"
|
33 |
+
n_bins: 128
|
34 |
+
sample_rate: 44100
|
35 |
+
perceptual_weighting: true
|
36 |
+
w_sc: 1.0
|
37 |
+
w_log_mag: 1.0
|
38 |
+
w_lin_mag: 0.0
|
39 |
+
w_phs: 0.0
|
40 |
+
mag_distance: "L1"
|
41 |
+
|
42 |
+
|
43 |
+
training:
|
44 |
+
batch_size: 8
|
45 |
+
gradient_accumulation_steps: 1
|
46 |
+
grad_clip: 0
|
47 |
+
instruments:
|
48 |
+
- vocals
|
49 |
+
- other
|
50 |
+
lr: 5.0e-05
|
51 |
+
patience: 2
|
52 |
+
reduce_factor: 0.95
|
53 |
+
target_instrument: null
|
54 |
+
num_epochs: 1000
|
55 |
+
num_steps: 2000
|
56 |
+
q: 0.95
|
57 |
+
coarse_loss_clip: true
|
58 |
+
ema_momentum: 0.999
|
59 |
+
optimizer: adamw
|
60 |
+
other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
|
61 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
62 |
+
|
63 |
+
augmentations:
|
64 |
+
enable: true # enable or disable all augmentations (to fast disable if needed)
|
65 |
+
loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
|
66 |
+
loudness_min: 0.5
|
67 |
+
loudness_max: 1.5
|
68 |
+
mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
|
69 |
+
mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
|
70 |
+
- 0.2
|
71 |
+
- 0.02
|
72 |
+
mixup_loudness_min: 0.5
|
73 |
+
mixup_loudness_max: 1.5
|
74 |
+
|
75 |
+
inference:
|
76 |
+
batch_size: 1
|
77 |
+
dim_t: 512
|
78 |
+
num_overlap: 4
|
configs/config_vocals_swin_upernet.yaml
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
audio:
|
2 |
+
chunk_size: 261632
|
3 |
+
dim_f: 4096
|
4 |
+
dim_t: 512
|
5 |
+
hop_length: 512
|
6 |
+
n_fft: 8192
|
7 |
+
num_channels: 2
|
8 |
+
sample_rate: 44100
|
9 |
+
min_mean_abs: 0.001
|
10 |
+
|
11 |
+
model:
|
12 |
+
act: gelu
|
13 |
+
num_channels: 16
|
14 |
+
num_subbands: 8
|
15 |
+
|
16 |
+
training:
|
17 |
+
batch_size: 14
|
18 |
+
gradient_accumulation_steps: 4
|
19 |
+
grad_clip: 0
|
20 |
+
instruments:
|
21 |
+
- vocals
|
22 |
+
- other
|
23 |
+
lr: 3.0e-05
|
24 |
+
patience: 2
|
25 |
+
reduce_factor: 0.95
|
26 |
+
target_instrument: null
|
27 |
+
num_epochs: 1000
|
28 |
+
num_steps: 1000
|
29 |
+
q: 0.95
|
30 |
+
coarse_loss_clip: true
|
31 |
+
ema_momentum: 0.999
|
32 |
+
optimizer: adamw
|
33 |
+
other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
|
34 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
35 |
+
|
36 |
+
augmentations:
|
37 |
+
enable: true # enable or disable all augmentations (to fast disable if needed)
|
38 |
+
loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
|
39 |
+
loudness_min: 0.5
|
40 |
+
loudness_max: 1.5
|
41 |
+
mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
|
42 |
+
mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
|
43 |
+
- 0.2
|
44 |
+
- 0.02
|
45 |
+
mixup_loudness_min: 0.5
|
46 |
+
mixup_loudness_max: 1.5
|
47 |
+
|
48 |
+
inference:
|
49 |
+
batch_size: 1
|
50 |
+
dim_t: 512
|
51 |
+
num_overlap: 4
|
configs/config_vocals_torchseg.yaml
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
audio:
|
2 |
+
chunk_size: 261632
|
3 |
+
dim_f: 4096
|
4 |
+
dim_t: 512
|
5 |
+
hop_length: 512
|
6 |
+
n_fft: 8192
|
7 |
+
num_channels: 2
|
8 |
+
sample_rate: 44100
|
9 |
+
min_mean_abs: 0.000
|
10 |
+
|
11 |
+
model:
|
12 |
+
encoder_name: maxvit_tiny_tf_512 # look with torchseg.list_encoders(). Currently 858 available
|
13 |
+
decoder_type: unet # unet, fpn
|
14 |
+
act: gelu
|
15 |
+
num_channels: 128
|
16 |
+
num_subbands: 8
|
17 |
+
|
18 |
+
training:
|
19 |
+
batch_size: 18
|
20 |
+
gradient_accumulation_steps: 1
|
21 |
+
grad_clip: 1.0
|
22 |
+
instruments:
|
23 |
+
- vocals
|
24 |
+
- other
|
25 |
+
lr: 1.0e-04
|
26 |
+
patience: 2
|
27 |
+
reduce_factor: 0.95
|
28 |
+
target_instrument: null
|
29 |
+
num_epochs: 1000
|
30 |
+
num_steps: 1000
|
31 |
+
q: 0.95
|
32 |
+
coarse_loss_clip: true
|
33 |
+
ema_momentum: 0.999
|
34 |
+
optimizer: radam
|
35 |
+
other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
|
36 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
37 |
+
|
38 |
+
augmentations:
|
39 |
+
enable: false # enable or disable all augmentations (to fast disable if needed)
|
40 |
+
loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
|
41 |
+
loudness_min: 0.5
|
42 |
+
loudness_max: 1.5
|
43 |
+
mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
|
44 |
+
mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
|
45 |
+
- 0.2
|
46 |
+
- 0.02
|
47 |
+
mixup_loudness_min: 0.5
|
48 |
+
mixup_loudness_max: 1.5
|
49 |
+
|
50 |
+
all:
|
51 |
+
channel_shuffle: 0.5 # Set 0 or lower to disable
|
52 |
+
random_inverse: 0.1 # inverse track (better lower probability)
|
53 |
+
random_polarity: 0.5 # polarity change (multiply waveform to -1)
|
54 |
+
|
55 |
+
inference:
|
56 |
+
batch_size: 8
|
57 |
+
dim_t: 512
|
58 |
+
num_overlap: 2
|
configs/viperx/model_bs_roformer_ep_317_sdr_12.9755.yaml
ADDED
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
audio:
|
2 |
+
chunk_size: 352800
|
3 |
+
dim_f: 1024
|
4 |
+
dim_t: 801 # don't work (use in model)
|
5 |
+
hop_length: 441 # don't work (use in model)
|
6 |
+
n_fft: 2048
|
7 |
+
num_channels: 2
|
8 |
+
sample_rate: 44100
|
9 |
+
min_mean_abs: 0.000
|
10 |
+
|
11 |
+
model:
|
12 |
+
dim: 512
|
13 |
+
depth: 12
|
14 |
+
stereo: true
|
15 |
+
num_stems: 1
|
16 |
+
time_transformer_depth: 1
|
17 |
+
freq_transformer_depth: 1
|
18 |
+
linear_transformer_depth: 0
|
19 |
+
freqs_per_bands: !!python/tuple
|
20 |
+
- 2
|
21 |
+
- 2
|
22 |
+
- 2
|
23 |
+
- 2
|
24 |
+
- 2
|
25 |
+
- 2
|
26 |
+
- 2
|
27 |
+
- 2
|
28 |
+
- 2
|
29 |
+
- 2
|
30 |
+
- 2
|
31 |
+
- 2
|
32 |
+
- 2
|
33 |
+
- 2
|
34 |
+
- 2
|
35 |
+
- 2
|
36 |
+
- 2
|
37 |
+
- 2
|
38 |
+
- 2
|
39 |
+
- 2
|
40 |
+
- 2
|
41 |
+
- 2
|
42 |
+
- 2
|
43 |
+
- 2
|
44 |
+
- 4
|
45 |
+
- 4
|
46 |
+
- 4
|
47 |
+
- 4
|
48 |
+
- 4
|
49 |
+
- 4
|
50 |
+
- 4
|
51 |
+
- 4
|
52 |
+
- 4
|
53 |
+
- 4
|
54 |
+
- 4
|
55 |
+
- 4
|
56 |
+
- 12
|
57 |
+
- 12
|
58 |
+
- 12
|
59 |
+
- 12
|
60 |
+
- 12
|
61 |
+
- 12
|
62 |
+
- 12
|
63 |
+
- 12
|
64 |
+
- 24
|
65 |
+
- 24
|
66 |
+
- 24
|
67 |
+
- 24
|
68 |
+
- 24
|
69 |
+
- 24
|
70 |
+
- 24
|
71 |
+
- 24
|
72 |
+
- 48
|
73 |
+
- 48
|
74 |
+
- 48
|
75 |
+
- 48
|
76 |
+
- 48
|
77 |
+
- 48
|
78 |
+
- 48
|
79 |
+
- 48
|
80 |
+
- 128
|
81 |
+
- 129
|
82 |
+
dim_head: 64
|
83 |
+
heads: 8
|
84 |
+
attn_dropout: 0.1
|
85 |
+
ff_dropout: 0.1
|
86 |
+
flash_attn: true
|
87 |
+
dim_freqs_in: 1025
|
88 |
+
stft_n_fft: 2048
|
89 |
+
stft_hop_length: 441
|
90 |
+
stft_win_length: 2048
|
91 |
+
stft_normalized: false
|
92 |
+
mask_estimator_depth: 2
|
93 |
+
multi_stft_resolution_loss_weight: 1.0
|
94 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
95 |
+
- 4096
|
96 |
+
- 2048
|
97 |
+
- 1024
|
98 |
+
- 512
|
99 |
+
- 256
|
100 |
+
multi_stft_hop_size: 147
|
101 |
+
multi_stft_normalized: False
|
102 |
+
|
103 |
+
training:
|
104 |
+
batch_size: 2
|
105 |
+
gradient_accumulation_steps: 1
|
106 |
+
grad_clip: 0
|
107 |
+
instruments:
|
108 |
+
- vocals
|
109 |
+
- other
|
110 |
+
lr: 1.0e-05
|
111 |
+
patience: 2
|
112 |
+
reduce_factor: 0.95
|
113 |
+
target_instrument: vocals
|
114 |
+
num_epochs: 1000
|
115 |
+
num_steps: 1000
|
116 |
+
q: 0.95
|
117 |
+
coarse_loss_clip: true
|
118 |
+
ema_momentum: 0.999
|
119 |
+
optimizer: adam
|
120 |
+
other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
|
121 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
122 |
+
|
123 |
+
inference:
|
124 |
+
batch_size: 4
|
125 |
+
dim_t: 801
|
126 |
+
num_overlap: 2
|
configs/viperx/model_bs_roformer_ep_937_sdr_10.5309.yaml
ADDED
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
audio:
|
2 |
+
chunk_size: 131584
|
3 |
+
dim_f: 1024
|
4 |
+
dim_t: 256
|
5 |
+
hop_length: 512
|
6 |
+
n_fft: 2048
|
7 |
+
num_channels: 2
|
8 |
+
sample_rate: 44100
|
9 |
+
min_mean_abs: 0.001
|
10 |
+
|
11 |
+
model:
|
12 |
+
dim: 384
|
13 |
+
depth: 12
|
14 |
+
stereo: true
|
15 |
+
num_stems: 1
|
16 |
+
time_transformer_depth: 1
|
17 |
+
freq_transformer_depth: 1
|
18 |
+
linear_transformer_depth: 0
|
19 |
+
freqs_per_bands: !!python/tuple
|
20 |
+
- 2
|
21 |
+
- 2
|
22 |
+
- 2
|
23 |
+
- 2
|
24 |
+
- 2
|
25 |
+
- 2
|
26 |
+
- 2
|
27 |
+
- 2
|
28 |
+
- 2
|
29 |
+
- 2
|
30 |
+
- 2
|
31 |
+
- 2
|
32 |
+
- 2
|
33 |
+
- 2
|
34 |
+
- 2
|
35 |
+
- 2
|
36 |
+
- 2
|
37 |
+
- 2
|
38 |
+
- 2
|
39 |
+
- 2
|
40 |
+
- 2
|
41 |
+
- 2
|
42 |
+
- 2
|
43 |
+
- 2
|
44 |
+
- 4
|
45 |
+
- 4
|
46 |
+
- 4
|
47 |
+
- 4
|
48 |
+
- 4
|
49 |
+
- 4
|
50 |
+
- 4
|
51 |
+
- 4
|
52 |
+
- 4
|
53 |
+
- 4
|
54 |
+
- 4
|
55 |
+
- 4
|
56 |
+
- 12
|
57 |
+
- 12
|
58 |
+
- 12
|
59 |
+
- 12
|
60 |
+
- 12
|
61 |
+
- 12
|
62 |
+
- 12
|
63 |
+
- 12
|
64 |
+
- 24
|
65 |
+
- 24
|
66 |
+
- 24
|
67 |
+
- 24
|
68 |
+
- 24
|
69 |
+
- 24
|
70 |
+
- 24
|
71 |
+
- 24
|
72 |
+
- 48
|
73 |
+
- 48
|
74 |
+
- 48
|
75 |
+
- 48
|
76 |
+
- 48
|
77 |
+
- 48
|
78 |
+
- 48
|
79 |
+
- 48
|
80 |
+
- 128
|
81 |
+
- 129
|
82 |
+
dim_head: 64
|
83 |
+
heads: 8
|
84 |
+
attn_dropout: 0.1
|
85 |
+
ff_dropout: 0.1
|
86 |
+
flash_attn: true
|
87 |
+
dim_freqs_in: 1025
|
88 |
+
stft_n_fft: 2048
|
89 |
+
stft_hop_length: 512
|
90 |
+
stft_win_length: 2048
|
91 |
+
stft_normalized: false
|
92 |
+
mask_estimator_depth: 2
|
93 |
+
multi_stft_resolution_loss_weight: 1.0
|
94 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
95 |
+
- 4096
|
96 |
+
- 2048
|
97 |
+
- 1024
|
98 |
+
- 512
|
99 |
+
- 256
|
100 |
+
multi_stft_hop_size: 147
|
101 |
+
multi_stft_normalized: False
|
102 |
+
|
103 |
+
training:
|
104 |
+
batch_size: 4
|
105 |
+
gradient_accumulation_steps: 1
|
106 |
+
grad_clip: 0
|
107 |
+
instruments:
|
108 |
+
- vocals
|
109 |
+
- other
|
110 |
+
lr: 5.0e-05
|
111 |
+
patience: 2
|
112 |
+
reduce_factor: 0.95
|
113 |
+
target_instrument: other
|
114 |
+
num_epochs: 1000
|
115 |
+
num_steps: 1000
|
116 |
+
q: 0.95
|
117 |
+
coarse_loss_clip: true
|
118 |
+
ema_momentum: 0.999
|
119 |
+
optimizer: adam
|
120 |
+
other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
|
121 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
122 |
+
|
123 |
+
augmentations:
|
124 |
+
enable: true # enable or disable all augmentations (to fast disable if needed)
|
125 |
+
loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
|
126 |
+
loudness_min: 0.5
|
127 |
+
loudness_max: 1.5
|
128 |
+
mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
|
129 |
+
mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
|
130 |
+
- 0.2
|
131 |
+
- 0.02
|
132 |
+
mixup_loudness_min: 0.5
|
133 |
+
mixup_loudness_max: 1.5
|
134 |
+
|
135 |
+
inference:
|
136 |
+
batch_size: 8
|
137 |
+
dim_t: 512
|
138 |
+
num_overlap: 2
|
configs/viperx/model_mel_band_roformer_ep_3005_sdr_11.4360.yaml
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
audio:
|
2 |
+
chunk_size: 352800
|
3 |
+
dim_f: 1024
|
4 |
+
dim_t: 801 # don't work (use in model)
|
5 |
+
hop_length: 441 # don't work (use in model)
|
6 |
+
n_fft: 2048
|
7 |
+
num_channels: 2
|
8 |
+
sample_rate: 44100
|
9 |
+
min_mean_abs: 0.000
|
10 |
+
|
11 |
+
model:
|
12 |
+
dim: 384
|
13 |
+
depth: 12
|
14 |
+
stereo: true
|
15 |
+
num_stems: 1
|
16 |
+
time_transformer_depth: 1
|
17 |
+
freq_transformer_depth: 1
|
18 |
+
linear_transformer_depth: 0
|
19 |
+
num_bands: 60
|
20 |
+
dim_head: 64
|
21 |
+
heads: 8
|
22 |
+
attn_dropout: 0.1
|
23 |
+
ff_dropout: 0.1
|
24 |
+
flash_attn: True
|
25 |
+
dim_freqs_in: 1025
|
26 |
+
sample_rate: 44100 # needed for mel filter bank from librosa
|
27 |
+
stft_n_fft: 2048
|
28 |
+
stft_hop_length: 441
|
29 |
+
stft_win_length: 2048
|
30 |
+
stft_normalized: False
|
31 |
+
mask_estimator_depth: 2
|
32 |
+
multi_stft_resolution_loss_weight: 1.0
|
33 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
34 |
+
- 4096
|
35 |
+
- 2048
|
36 |
+
- 1024
|
37 |
+
- 512
|
38 |
+
- 256
|
39 |
+
multi_stft_hop_size: 147
|
40 |
+
multi_stft_normalized: False
|
41 |
+
|
42 |
+
training:
|
43 |
+
batch_size: 1
|
44 |
+
gradient_accumulation_steps: 8
|
45 |
+
grad_clip: 0
|
46 |
+
instruments:
|
47 |
+
- vocals
|
48 |
+
- other
|
49 |
+
lr: 4.0e-05
|
50 |
+
patience: 2
|
51 |
+
reduce_factor: 0.95
|
52 |
+
target_instrument: vocals
|
53 |
+
num_epochs: 1000
|
54 |
+
num_steps: 1000
|
55 |
+
q: 0.95
|
56 |
+
coarse_loss_clip: true
|
57 |
+
ema_momentum: 0.999
|
58 |
+
optimizer: adam
|
59 |
+
other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
|
60 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
61 |
+
|
62 |
+
inference:
|
63 |
+
batch_size: 4
|
64 |
+
dim_t: 801
|
65 |
+
num_overlap: 2
|
cookies.txt
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Netscape HTTP Cookie File
|
2 |
+
# This file is generated by yt-dlp. Do not edit.
|
3 |
+
|
4 |
+
.youtube.com TRUE / FALSE 1756513080 HSID AkpR1gV80KfDyBAeq
|
5 |
+
.youtube.com TRUE / TRUE 1756513080 SSID AUkQz9BsAZ9dihvT7
|
6 |
+
.youtube.com TRUE / FALSE 1756513080 APISID FPGwyoC5hlxA_ztn/ADM7Q4t2tMF9LolFH
|
7 |
+
.youtube.com TRUE / TRUE 1756513080 SAPISID 4yc1vubX-H2x2gTg/A4eb_29p67eyBKNwo
|
8 |
+
.youtube.com TRUE / TRUE 1756513080 __Secure-1PAPISID 4yc1vubX-H2x2gTg/A4eb_29p67eyBKNwo
|
9 |
+
.youtube.com TRUE / TRUE 1756513080 __Secure-3PAPISID 4yc1vubX-H2x2gTg/A4eb_29p67eyBKNwo
|
10 |
+
.youtube.com TRUE / FALSE 0 PREF f4=4000000&tz=UTC&f7=100&f6=40000000&hl=en
|
11 |
+
.youtube.com TRUE / FALSE 1756513080 SID g.a000uQhpMC7F759FwOg4eAHYr_VFV7qLJJzrVdnrbB1Gg1ruHpzr7Q7JXHasofNz_IFpc8N2LgACgYKAfYSARISFQHGX2Miyesv7_oABGm-5jwErW1A3BoVAUF8yKrc9rgHmp5qJT6VRm79tW1A0076
|
12 |
+
.youtube.com TRUE / TRUE 1756513080 __Secure-1PSID g.a000uQhpMC7F759FwOg4eAHYr_VFV7qLJJzrVdnrbB1Gg1ruHpzrr6x28jzJl8SymGUnS601CAACgYKAVESARISFQHGX2MiMrjFi53JrIU9q__AtJkTHhoVAUF8yKq9HTb9EMf-IuIKrE24vlao0076
|
13 |
+
.youtube.com TRUE / TRUE 1756513080 __Secure-3PSID g.a000uQhpMC7F759FwOg4eAHYr_VFV7qLJJzrVdnrbB1Gg1ruHpzrggFtS3EfdibObQagLMZPwgACgYKAS0SARISFQHGX2MiVtkRAB_snp0m6Ci8U8_KdxoVAUF8yKpL1TslRsnn1zHR9IM89xyI0076
|
14 |
+
.youtube.com TRUE / TRUE 0 wide 0
|
15 |
+
.youtube.com TRUE / TRUE 1756638167 __Secure-1PSIDTS sidts-CjIBEJ3XV-avYWfDaATyg0Nhkmwux6CKyFaF1gYPa-AjJzR_e3PPij4K2ft8TRk6khgu2xAA
|
16 |
+
.youtube.com TRUE / TRUE 1756638167 __Secure-3PSIDTS sidts-CjIBEJ3XV-avYWfDaATyg0Nhkmwux6CKyFaF1gYPa-AjJzR_e3PPij4K2ft8TRk6khgu2xAA
|
17 |
+
.youtube.com TRUE / TRUE 1756638341 LOGIN_INFO AFmmF2swRgIhAKDOVmKULP27JwVcR_zerOJpO9GmXntRWiR4zWAazwz_AiEAwvt5os697PYAjWwVLGwA5oN3mFBrA1kh_4AlSuvoE-M:QUQ3MjNmem5mN3p3NVNiM2hzMEJ0R1EwUzI2SFNDXzhlQlNXemF2Z2IyZER1cmt1VXZSbk5EbkpaekFySGw4a09MQ3Z6c3RhSFhoXzFUNE9mdHB4ZEdPTFgzNEZrMTB2SWd2azlTdi1SUTdZWGczMEpRb3otemhHZ08wZDlzc0dhRE1sM2tJUTBfSkNiSmpuOTBXdG13eDhsR2JTVVlVQWZB
|
18 |
+
.youtube.com TRUE / TRUE 1741086952 CONSISTENCY AKreu9tDDatJYzjErs5c0WuYZjTQFRMZu7GKaDYzvFqROdgjqcrkvrsWqoTI1zZioac6yVWq7BCSzc1y0Pk0j8ikhC_l9YEyMmQs14Kg3IHcli61swZK3uWn
|
19 |
+
.youtube.com TRUE / FALSE 1756638353 SIDCC AKEyXzWXY14_kKbTxyT3AyRMPsKsEGUsuIHbGutwC42o1YlZS06ch-ug7SyZAYQ7jEDVx5EDfw
|
20 |
+
.youtube.com TRUE / TRUE 1756638353 __Secure-1PSIDCC AKEyXzX0UJz8MZ6u_9s7hOlSPGjbu-JwY0Q1l77e5oO5CJTXNIDO95oxQyCdFaP2D-4qbJrCI1I
|
21 |
+
.youtube.com TRUE / TRUE 1756638353 __Secure-3PSIDCC AKEyXzU5TOnpQc0o7qGCur58CMCcshJ2tsoLi9rVwsER2dK2P22VqU3jYG0yMz0LsNkMxjbXeg
|
22 |
+
.youtube.com TRUE / TRUE 0 YSC tBB8nN6HoE0
|
23 |
+
.youtube.com TRUE / TRUE 1756638392 __Secure-ROLLOUT_TOKEN CMe15b6p2vn47QEQsKafn6TwiwMYru_Yn6TwiwM%3D
|
24 |
+
.youtube.com TRUE / TRUE 1756638392 VISITOR_INFO1_LIVE 54yW_8GrQNM
|
25 |
+
.youtube.com TRUE / TRUE 1756638392 VISITOR_PRIVACY_METADATA CgJVUxIEGgAgDA%3D%3D
|
26 |
+
.youtube.com TRUE / TRUE 1756638392 YT_DEVICE_MEASUREMENT_ID 3rgPq1s=
|
27 |
+
.youtube.com TRUE / TRUE 1804158392 __Secure-YT_TVFAS t=483635&s=2
|
28 |
+
.youtube.com TRUE / TRUE 1756638392 DEVICE_INFO ChxOelEzTnprd09URXhNemMzTXpRNE1qZ3lNQT09ELi9m74GGLi9m74G
|
dataset.py
ADDED
@@ -0,0 +1,669 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding: utf-8
|
2 |
+
__author__ = 'Roman Solovyev (ZFTurbo): https://github.com/ZFTurbo/'
|
3 |
+
|
4 |
+
|
5 |
+
import os
|
6 |
+
import random
|
7 |
+
import numpy as np
|
8 |
+
import torch
|
9 |
+
import soundfile as sf
|
10 |
+
import pickle
|
11 |
+
import time
|
12 |
+
import itertools
|
13 |
+
import multiprocessing
|
14 |
+
from tqdm.auto import tqdm
|
15 |
+
from glob import glob
|
16 |
+
import audiomentations as AU
|
17 |
+
import pedalboard as PB
|
18 |
+
import warnings
|
19 |
+
warnings.filterwarnings("ignore")
|
20 |
+
|
21 |
+
|
22 |
+
def load_chunk(path, length, chunk_size, offset=None):
|
23 |
+
if chunk_size <= length:
|
24 |
+
if offset is None:
|
25 |
+
offset = np.random.randint(length - chunk_size + 1)
|
26 |
+
x = sf.read(path, dtype='float32', start=offset, frames=chunk_size)[0]
|
27 |
+
else:
|
28 |
+
x = sf.read(path, dtype='float32')[0]
|
29 |
+
if len(x.shape) == 1:
|
30 |
+
# Mono case
|
31 |
+
pad = np.zeros((chunk_size - length))
|
32 |
+
else:
|
33 |
+
pad = np.zeros([chunk_size - length, x.shape[-1]])
|
34 |
+
x = np.concatenate([x, pad], axis=0)
|
35 |
+
# Mono fix
|
36 |
+
if len(x.shape) == 1:
|
37 |
+
x = np.expand_dims(x, axis=1)
|
38 |
+
return x.T
|
39 |
+
|
40 |
+
|
41 |
+
def get_track_set_length(params):
|
42 |
+
path, instruments, file_types = params
|
43 |
+
# Check lengths of all instruments (it can be different in some cases)
|
44 |
+
lengths_arr = []
|
45 |
+
for instr in instruments:
|
46 |
+
length = -1
|
47 |
+
for extension in file_types:
|
48 |
+
path_to_audio_file = path + '/{}.{}'.format(instr, extension)
|
49 |
+
if os.path.isfile(path_to_audio_file):
|
50 |
+
length = len(sf.read(path_to_audio_file)[0])
|
51 |
+
break
|
52 |
+
if length == -1:
|
53 |
+
print('Cant find file "{}" in folder {}'.format(instr, path))
|
54 |
+
continue
|
55 |
+
lengths_arr.append(length)
|
56 |
+
lengths_arr = np.array(lengths_arr)
|
57 |
+
if lengths_arr.min() != lengths_arr.max():
|
58 |
+
print('Warning: lengths of stems are different for path: {}. ({} != {})'.format(
|
59 |
+
path,
|
60 |
+
lengths_arr.min(),
|
61 |
+
lengths_arr.max())
|
62 |
+
)
|
63 |
+
# We use minimum to allow overflow for soundfile read in non-equal length cases
|
64 |
+
return path, lengths_arr.min()
|
65 |
+
|
66 |
+
|
67 |
+
# For multiprocessing
|
68 |
+
def get_track_length(params):
|
69 |
+
path = params
|
70 |
+
length = len(sf.read(path)[0])
|
71 |
+
return (path, length)
|
72 |
+
|
73 |
+
|
74 |
+
class MSSDataset(torch.utils.data.Dataset):
|
75 |
+
def __init__(self, config, data_path, metadata_path="metadata.pkl", dataset_type=1, batch_size=None, verbose=True):
|
76 |
+
self.verbose = verbose
|
77 |
+
self.config = config
|
78 |
+
self.dataset_type = dataset_type # 1, 2, 3 or 4
|
79 |
+
self.data_path = data_path
|
80 |
+
self.instruments = instruments = config.training.instruments
|
81 |
+
if batch_size is None:
|
82 |
+
batch_size = config.training.batch_size
|
83 |
+
self.batch_size = batch_size
|
84 |
+
self.file_types = ['wav', 'flac']
|
85 |
+
self.metadata_path = metadata_path
|
86 |
+
|
87 |
+
# Augmentation block
|
88 |
+
self.aug = False
|
89 |
+
if 'augmentations' in config:
|
90 |
+
if config['augmentations'].enable is True:
|
91 |
+
if self.verbose:
|
92 |
+
print('Use augmentation for training')
|
93 |
+
self.aug = True
|
94 |
+
else:
|
95 |
+
if self.verbose:
|
96 |
+
print('There is no augmentations block in config. Augmentations disabled for training...')
|
97 |
+
|
98 |
+
metadata = self.get_metadata()
|
99 |
+
|
100 |
+
if self.dataset_type in [1, 4]:
|
101 |
+
if len(metadata) > 0:
|
102 |
+
if self.verbose:
|
103 |
+
print('Found tracks in dataset: {}'.format(len(metadata)))
|
104 |
+
else:
|
105 |
+
print('No tracks found for training. Check paths you provided!')
|
106 |
+
exit()
|
107 |
+
else:
|
108 |
+
for instr in self.instruments:
|
109 |
+
if self.verbose:
|
110 |
+
print('Found tracks for {} in dataset: {}'.format(instr, len(metadata[instr])))
|
111 |
+
self.metadata = metadata
|
112 |
+
self.chunk_size = config.audio.chunk_size
|
113 |
+
self.min_mean_abs = config.audio.min_mean_abs
|
114 |
+
|
115 |
+
def __len__(self):
|
116 |
+
return self.config.training.num_steps * self.batch_size
|
117 |
+
|
118 |
+
def read_from_metadata_cache(self, track_paths, instr=None):
|
119 |
+
metadata = []
|
120 |
+
if os.path.isfile(self.metadata_path):
|
121 |
+
if self.verbose:
|
122 |
+
print('Found metadata cache file: {}'.format(self.metadata_path))
|
123 |
+
old_metadata = pickle.load(open(self.metadata_path, 'rb'))
|
124 |
+
else:
|
125 |
+
return track_paths, metadata
|
126 |
+
|
127 |
+
if instr:
|
128 |
+
old_metadata = old_metadata[instr]
|
129 |
+
|
130 |
+
# We will not re-read tracks existed in old metadata file
|
131 |
+
track_paths_set = set(track_paths)
|
132 |
+
for old_path, file_size in old_metadata:
|
133 |
+
if old_path in track_paths_set:
|
134 |
+
metadata.append([old_path, file_size])
|
135 |
+
track_paths_set.remove(old_path)
|
136 |
+
track_paths = list(track_paths_set)
|
137 |
+
if len(metadata) > 0:
|
138 |
+
print('Old metadata was used for {} tracks.'.format(len(metadata)))
|
139 |
+
return track_paths, metadata
|
140 |
+
|
141 |
+
|
142 |
+
def get_metadata(self):
|
143 |
+
read_metadata_procs = multiprocessing.cpu_count()
|
144 |
+
if 'read_metadata_procs' in self.config['training']:
|
145 |
+
read_metadata_procs = int(self.config['training']['read_metadata_procs'])
|
146 |
+
|
147 |
+
if self.verbose:
|
148 |
+
print(
|
149 |
+
'Dataset type:', self.dataset_type,
|
150 |
+
'Processes to use:', read_metadata_procs,
|
151 |
+
'\nCollecting metadata for', str(self.data_path),
|
152 |
+
)
|
153 |
+
|
154 |
+
if self.dataset_type in [1, 4]:
|
155 |
+
track_paths = []
|
156 |
+
if type(self.data_path) == list:
|
157 |
+
for tp in self.data_path:
|
158 |
+
tracks_for_folder = sorted(glob(tp + '/*'))
|
159 |
+
if len(tracks_for_folder) == 0:
|
160 |
+
print('Warning: no tracks found in folder \'{}\'. Please check it!'.format(tp))
|
161 |
+
track_paths += tracks_for_folder
|
162 |
+
else:
|
163 |
+
track_paths += sorted(glob(self.data_path + '/*'))
|
164 |
+
|
165 |
+
track_paths = [path for path in track_paths if os.path.basename(path)[0] != '.' and os.path.isdir(path)]
|
166 |
+
track_paths, metadata = self.read_from_metadata_cache(track_paths, None)
|
167 |
+
|
168 |
+
if read_metadata_procs <= 1:
|
169 |
+
for path in tqdm(track_paths):
|
170 |
+
track_path, track_length = get_track_set_length((path, self.instruments, self.file_types))
|
171 |
+
metadata.append((track_path, track_length))
|
172 |
+
else:
|
173 |
+
p = multiprocessing.Pool(processes=read_metadata_procs)
|
174 |
+
with tqdm(total=len(track_paths)) as pbar:
|
175 |
+
track_iter = p.imap(
|
176 |
+
get_track_set_length,
|
177 |
+
zip(track_paths, itertools.repeat(self.instruments), itertools.repeat(self.file_types))
|
178 |
+
)
|
179 |
+
for track_path, track_length in track_iter:
|
180 |
+
metadata.append((track_path, track_length))
|
181 |
+
pbar.update()
|
182 |
+
p.close()
|
183 |
+
|
184 |
+
elif self.dataset_type == 2:
|
185 |
+
metadata = dict()
|
186 |
+
for instr in self.instruments:
|
187 |
+
metadata[instr] = []
|
188 |
+
track_paths = []
|
189 |
+
if type(self.data_path) == list:
|
190 |
+
for tp in self.data_path:
|
191 |
+
track_paths += sorted(glob(tp + '/{}/*.wav'.format(instr)))
|
192 |
+
track_paths += sorted(glob(tp + '/{}/*.flac'.format(instr)))
|
193 |
+
else:
|
194 |
+
track_paths += sorted(glob(self.data_path + '/{}/*.wav'.format(instr)))
|
195 |
+
track_paths += sorted(glob(self.data_path + '/{}/*.flac'.format(instr)))
|
196 |
+
|
197 |
+
track_paths, metadata[instr] = self.read_from_metadata_cache(track_paths, instr)
|
198 |
+
|
199 |
+
if read_metadata_procs <= 1:
|
200 |
+
for path in tqdm(track_paths):
|
201 |
+
length = len(sf.read(path)[0])
|
202 |
+
metadata[instr].append((path, length))
|
203 |
+
else:
|
204 |
+
p = multiprocessing.Pool(processes=read_metadata_procs)
|
205 |
+
for out in tqdm(p.imap(get_track_length, track_paths), total=len(track_paths)):
|
206 |
+
metadata[instr].append(out)
|
207 |
+
|
208 |
+
elif self.dataset_type == 3:
|
209 |
+
import pandas as pd
|
210 |
+
if type(self.data_path) != list:
|
211 |
+
data_path = [self.data_path]
|
212 |
+
|
213 |
+
metadata = dict()
|
214 |
+
for i in range(len(self.data_path)):
|
215 |
+
if self.verbose:
|
216 |
+
print('Reading tracks from: {}'.format(self.data_path[i]))
|
217 |
+
df = pd.read_csv(self.data_path[i])
|
218 |
+
|
219 |
+
skipped = 0
|
220 |
+
for instr in self.instruments:
|
221 |
+
part = df[df['instrum'] == instr].copy()
|
222 |
+
print('Tracks found for {}: {}'.format(instr, len(part)))
|
223 |
+
for instr in self.instruments:
|
224 |
+
part = df[df['instrum'] == instr].copy()
|
225 |
+
metadata[instr] = []
|
226 |
+
track_paths = list(part['path'].values)
|
227 |
+
track_paths, metadata[instr] = self.read_from_metadata_cache(track_paths, instr)
|
228 |
+
|
229 |
+
for path in tqdm(track_paths):
|
230 |
+
if not os.path.isfile(path):
|
231 |
+
print('Cant find track: {}'.format(path))
|
232 |
+
skipped += 1
|
233 |
+
continue
|
234 |
+
# print(path)
|
235 |
+
try:
|
236 |
+
length = len(sf.read(path)[0])
|
237 |
+
except:
|
238 |
+
print('Problem with path: {}'.format(path))
|
239 |
+
skipped += 1
|
240 |
+
continue
|
241 |
+
metadata[instr].append((path, length))
|
242 |
+
if skipped > 0:
|
243 |
+
print('Missing tracks: {} from {}'.format(skipped, len(df)))
|
244 |
+
else:
|
245 |
+
print('Unknown dataset type: {}. Must be 1, 2, 3 or 4'.format(self.dataset_type))
|
246 |
+
exit()
|
247 |
+
|
248 |
+
# Save metadata
|
249 |
+
pickle.dump(metadata, open(self.metadata_path, 'wb'))
|
250 |
+
return metadata
|
251 |
+
|
252 |
+
def load_source(self, metadata, instr):
|
253 |
+
while True:
|
254 |
+
if self.dataset_type in [1, 4]:
|
255 |
+
track_path, track_length = random.choice(metadata)
|
256 |
+
for extension in self.file_types:
|
257 |
+
path_to_audio_file = track_path + '/{}.{}'.format(instr, extension)
|
258 |
+
if os.path.isfile(path_to_audio_file):
|
259 |
+
try:
|
260 |
+
source = load_chunk(path_to_audio_file, track_length, self.chunk_size)
|
261 |
+
except Exception as e:
|
262 |
+
# Sometimes error during FLAC reading, catch it and use zero stem
|
263 |
+
print('Error: {} Path: {}'.format(e, path_to_audio_file))
|
264 |
+
source = np.zeros((2, self.chunk_size), dtype=np.float32)
|
265 |
+
break
|
266 |
+
else:
|
267 |
+
track_path, track_length = random.choice(metadata[instr])
|
268 |
+
try:
|
269 |
+
source = load_chunk(track_path, track_length, self.chunk_size)
|
270 |
+
except Exception as e:
|
271 |
+
# Sometimes error during FLAC reading, catch it and use zero stem
|
272 |
+
print('Error: {} Path: {}'.format(e, track_path))
|
273 |
+
source = np.zeros((2, self.chunk_size), dtype=np.float32)
|
274 |
+
|
275 |
+
if np.abs(source).mean() >= self.min_mean_abs: # remove quiet chunks
|
276 |
+
break
|
277 |
+
if self.aug:
|
278 |
+
source = self.augm_data(source, instr)
|
279 |
+
return torch.tensor(source, dtype=torch.float32)
|
280 |
+
|
281 |
+
def load_random_mix(self):
|
282 |
+
res = []
|
283 |
+
for instr in self.instruments:
|
284 |
+
s1 = self.load_source(self.metadata, instr)
|
285 |
+
# Mixup augmentation. Multiple mix of same type of stems
|
286 |
+
if self.aug:
|
287 |
+
if 'mixup' in self.config['augmentations']:
|
288 |
+
if self.config['augmentations'].mixup:
|
289 |
+
mixup = [s1]
|
290 |
+
for prob in self.config.augmentations.mixup_probs:
|
291 |
+
if random.uniform(0, 1) < prob:
|
292 |
+
s2 = self.load_source(self.metadata, instr)
|
293 |
+
mixup.append(s2)
|
294 |
+
mixup = torch.stack(mixup, dim=0)
|
295 |
+
loud_values = np.random.uniform(
|
296 |
+
low=self.config.augmentations.loudness_min,
|
297 |
+
high=self.config.augmentations.loudness_max,
|
298 |
+
size=(len(mixup),)
|
299 |
+
)
|
300 |
+
loud_values = torch.tensor(loud_values, dtype=torch.float32)
|
301 |
+
mixup *= loud_values[:, None, None]
|
302 |
+
s1 = mixup.mean(dim=0, dtype=torch.float32)
|
303 |
+
res.append(s1)
|
304 |
+
res = torch.stack(res)
|
305 |
+
return res
|
306 |
+
|
307 |
+
def load_aligned_data(self):
|
308 |
+
track_path, track_length = random.choice(self.metadata)
|
309 |
+
attempts = 10
|
310 |
+
while attempts:
|
311 |
+
if track_length >= self.chunk_size:
|
312 |
+
common_offset = np.random.randint(track_length - self.chunk_size + 1)
|
313 |
+
else:
|
314 |
+
common_offset = None
|
315 |
+
res = []
|
316 |
+
silent_chunks = 0
|
317 |
+
for i in self.instruments:
|
318 |
+
for extension in self.file_types:
|
319 |
+
path_to_audio_file = track_path + '/{}.{}'.format(i, extension)
|
320 |
+
if os.path.isfile(path_to_audio_file):
|
321 |
+
try:
|
322 |
+
source = load_chunk(path_to_audio_file, track_length, self.chunk_size, offset=common_offset)
|
323 |
+
except Exception as e:
|
324 |
+
# Sometimes error during FLAC reading, catch it and use zero stem
|
325 |
+
print('Error: {} Path: {}'.format(e, path_to_audio_file))
|
326 |
+
source = np.zeros((2, self.chunk_size), dtype=np.float32)
|
327 |
+
break
|
328 |
+
res.append(source)
|
329 |
+
if np.abs(source).mean() < self.min_mean_abs: # remove quiet chunks
|
330 |
+
silent_chunks += 1
|
331 |
+
if silent_chunks == 0:
|
332 |
+
break
|
333 |
+
|
334 |
+
attempts -= 1
|
335 |
+
if attempts <= 0:
|
336 |
+
print('Attempts max!', track_path)
|
337 |
+
if common_offset is None:
|
338 |
+
# If track is too small break immediately
|
339 |
+
break
|
340 |
+
|
341 |
+
res = np.stack(res, axis=0)
|
342 |
+
if self.aug:
|
343 |
+
for i, instr in enumerate(self.instruments):
|
344 |
+
res[i] = self.augm_data(res[i], instr)
|
345 |
+
return torch.tensor(res, dtype=torch.float32)
|
346 |
+
|
347 |
+
def augm_data(self, source, instr):
|
348 |
+
# source.shape = (2, 261120) - first channels, second length
|
349 |
+
source_shape = source.shape
|
350 |
+
applied_augs = []
|
351 |
+
if 'all' in self.config['augmentations']:
|
352 |
+
augs = self.config['augmentations']['all']
|
353 |
+
else:
|
354 |
+
augs = dict()
|
355 |
+
|
356 |
+
# We need to add to all augmentations specific augs for stem. And rewrite values if needed
|
357 |
+
if instr in self.config['augmentations']:
|
358 |
+
for el in self.config['augmentations'][instr]:
|
359 |
+
augs[el] = self.config['augmentations'][instr][el]
|
360 |
+
|
361 |
+
# Channel shuffle
|
362 |
+
if 'channel_shuffle' in augs:
|
363 |
+
if augs['channel_shuffle'] > 0:
|
364 |
+
if random.uniform(0, 1) < augs['channel_shuffle']:
|
365 |
+
source = source[::-1].copy()
|
366 |
+
applied_augs.append('channel_shuffle')
|
367 |
+
# Random inverse
|
368 |
+
if 'random_inverse' in augs:
|
369 |
+
if augs['random_inverse'] > 0:
|
370 |
+
if random.uniform(0, 1) < augs['random_inverse']:
|
371 |
+
source = source[:, ::-1].copy()
|
372 |
+
applied_augs.append('random_inverse')
|
373 |
+
# Random polarity (multiply -1)
|
374 |
+
if 'random_polarity' in augs:
|
375 |
+
if augs['random_polarity'] > 0:
|
376 |
+
if random.uniform(0, 1) < augs['random_polarity']:
|
377 |
+
source = -source.copy()
|
378 |
+
applied_augs.append('random_polarity')
|
379 |
+
# Random pitch shift
|
380 |
+
if 'pitch_shift' in augs:
|
381 |
+
if augs['pitch_shift'] > 0:
|
382 |
+
if random.uniform(0, 1) < augs['pitch_shift']:
|
383 |
+
apply_aug = AU.PitchShift(
|
384 |
+
min_semitones=augs['pitch_shift_min_semitones'],
|
385 |
+
max_semitones=augs['pitch_shift_max_semitones'],
|
386 |
+
p=1.0
|
387 |
+
)
|
388 |
+
source = apply_aug(samples=source, sample_rate=44100)
|
389 |
+
applied_augs.append('pitch_shift')
|
390 |
+
# Random seven band parametric eq
|
391 |
+
if 'seven_band_parametric_eq' in augs:
|
392 |
+
if augs['seven_band_parametric_eq'] > 0:
|
393 |
+
if random.uniform(0, 1) < augs['seven_band_parametric_eq']:
|
394 |
+
apply_aug = AU.SevenBandParametricEQ(
|
395 |
+
min_gain_db=augs['seven_band_parametric_eq_min_gain_db'],
|
396 |
+
max_gain_db=augs['seven_band_parametric_eq_max_gain_db'],
|
397 |
+
p=1.0
|
398 |
+
)
|
399 |
+
source = apply_aug(samples=source, sample_rate=44100)
|
400 |
+
applied_augs.append('seven_band_parametric_eq')
|
401 |
+
# Random tanh distortion
|
402 |
+
if 'tanh_distortion' in augs:
|
403 |
+
if augs['tanh_distortion'] > 0:
|
404 |
+
if random.uniform(0, 1) < augs['tanh_distortion']:
|
405 |
+
apply_aug = AU.TanhDistortion(
|
406 |
+
min_distortion=augs['tanh_distortion_min'],
|
407 |
+
max_distortion=augs['tanh_distortion_max'],
|
408 |
+
p=1.0
|
409 |
+
)
|
410 |
+
source = apply_aug(samples=source, sample_rate=44100)
|
411 |
+
applied_augs.append('tanh_distortion')
|
412 |
+
# Random MP3 Compression
|
413 |
+
if 'mp3_compression' in augs:
|
414 |
+
if augs['mp3_compression'] > 0:
|
415 |
+
if random.uniform(0, 1) < augs['mp3_compression']:
|
416 |
+
apply_aug = AU.Mp3Compression(
|
417 |
+
min_bitrate=augs['mp3_compression_min_bitrate'],
|
418 |
+
max_bitrate=augs['mp3_compression_max_bitrate'],
|
419 |
+
backend=augs['mp3_compression_backend'],
|
420 |
+
p=1.0
|
421 |
+
)
|
422 |
+
source = apply_aug(samples=source, sample_rate=44100)
|
423 |
+
applied_augs.append('mp3_compression')
|
424 |
+
# Random AddGaussianNoise
|
425 |
+
if 'gaussian_noise' in augs:
|
426 |
+
if augs['gaussian_noise'] > 0:
|
427 |
+
if random.uniform(0, 1) < augs['gaussian_noise']:
|
428 |
+
apply_aug = AU.AddGaussianNoise(
|
429 |
+
min_amplitude=augs['gaussian_noise_min_amplitude'],
|
430 |
+
max_amplitude=augs['gaussian_noise_max_amplitude'],
|
431 |
+
p=1.0
|
432 |
+
)
|
433 |
+
source = apply_aug(samples=source, sample_rate=44100)
|
434 |
+
applied_augs.append('gaussian_noise')
|
435 |
+
# Random TimeStretch
|
436 |
+
if 'time_stretch' in augs:
|
437 |
+
if augs['time_stretch'] > 0:
|
438 |
+
if random.uniform(0, 1) < augs['time_stretch']:
|
439 |
+
apply_aug = AU.TimeStretch(
|
440 |
+
min_rate=augs['time_stretch_min_rate'],
|
441 |
+
max_rate=augs['time_stretch_max_rate'],
|
442 |
+
leave_length_unchanged=True,
|
443 |
+
p=1.0
|
444 |
+
)
|
445 |
+
source = apply_aug(samples=source, sample_rate=44100)
|
446 |
+
applied_augs.append('time_stretch')
|
447 |
+
|
448 |
+
# Possible fix of shape
|
449 |
+
if source_shape != source.shape:
|
450 |
+
source = source[..., :source_shape[-1]]
|
451 |
+
|
452 |
+
# Random Reverb
|
453 |
+
if 'pedalboard_reverb' in augs:
|
454 |
+
if augs['pedalboard_reverb'] > 0:
|
455 |
+
if random.uniform(0, 1) < augs['pedalboard_reverb']:
|
456 |
+
room_size = random.uniform(
|
457 |
+
augs['pedalboard_reverb_room_size_min'],
|
458 |
+
augs['pedalboard_reverb_room_size_max'],
|
459 |
+
)
|
460 |
+
damping = random.uniform(
|
461 |
+
augs['pedalboard_reverb_damping_min'],
|
462 |
+
augs['pedalboard_reverb_damping_max'],
|
463 |
+
)
|
464 |
+
wet_level = random.uniform(
|
465 |
+
augs['pedalboard_reverb_wet_level_min'],
|
466 |
+
augs['pedalboard_reverb_wet_level_max'],
|
467 |
+
)
|
468 |
+
dry_level = random.uniform(
|
469 |
+
augs['pedalboard_reverb_dry_level_min'],
|
470 |
+
augs['pedalboard_reverb_dry_level_max'],
|
471 |
+
)
|
472 |
+
width = random.uniform(
|
473 |
+
augs['pedalboard_reverb_width_min'],
|
474 |
+
augs['pedalboard_reverb_width_max'],
|
475 |
+
)
|
476 |
+
board = PB.Pedalboard([PB.Reverb(
|
477 |
+
room_size=room_size, # 0.1 - 0.9
|
478 |
+
damping=damping, # 0.1 - 0.9
|
479 |
+
wet_level=wet_level, # 0.1 - 0.9
|
480 |
+
dry_level=dry_level, # 0.1 - 0.9
|
481 |
+
width=width, # 0.9 - 1.0
|
482 |
+
freeze_mode=0.0,
|
483 |
+
)])
|
484 |
+
source = board(source, 44100)
|
485 |
+
applied_augs.append('pedalboard_reverb')
|
486 |
+
|
487 |
+
# Random Chorus
|
488 |
+
if 'pedalboard_chorus' in augs:
|
489 |
+
if augs['pedalboard_chorus'] > 0:
|
490 |
+
if random.uniform(0, 1) < augs['pedalboard_chorus']:
|
491 |
+
rate_hz = random.uniform(
|
492 |
+
augs['pedalboard_chorus_rate_hz_min'],
|
493 |
+
augs['pedalboard_chorus_rate_hz_max'],
|
494 |
+
)
|
495 |
+
depth = random.uniform(
|
496 |
+
augs['pedalboard_chorus_depth_min'],
|
497 |
+
augs['pedalboard_chorus_depth_max'],
|
498 |
+
)
|
499 |
+
centre_delay_ms = random.uniform(
|
500 |
+
augs['pedalboard_chorus_centre_delay_ms_min'],
|
501 |
+
augs['pedalboard_chorus_centre_delay_ms_max'],
|
502 |
+
)
|
503 |
+
feedback = random.uniform(
|
504 |
+
augs['pedalboard_chorus_feedback_min'],
|
505 |
+
augs['pedalboard_chorus_feedback_max'],
|
506 |
+
)
|
507 |
+
mix = random.uniform(
|
508 |
+
augs['pedalboard_chorus_mix_min'],
|
509 |
+
augs['pedalboard_chorus_mix_max'],
|
510 |
+
)
|
511 |
+
board = PB.Pedalboard([PB.Chorus(
|
512 |
+
rate_hz=rate_hz,
|
513 |
+
depth=depth,
|
514 |
+
centre_delay_ms=centre_delay_ms,
|
515 |
+
feedback=feedback,
|
516 |
+
mix=mix,
|
517 |
+
)])
|
518 |
+
source = board(source, 44100)
|
519 |
+
applied_augs.append('pedalboard_chorus')
|
520 |
+
|
521 |
+
# Random Phazer
|
522 |
+
if 'pedalboard_phazer' in augs:
|
523 |
+
if augs['pedalboard_phazer'] > 0:
|
524 |
+
if random.uniform(0, 1) < augs['pedalboard_phazer']:
|
525 |
+
rate_hz = random.uniform(
|
526 |
+
augs['pedalboard_phazer_rate_hz_min'],
|
527 |
+
augs['pedalboard_phazer_rate_hz_max'],
|
528 |
+
)
|
529 |
+
depth = random.uniform(
|
530 |
+
augs['pedalboard_phazer_depth_min'],
|
531 |
+
augs['pedalboard_phazer_depth_max'],
|
532 |
+
)
|
533 |
+
centre_frequency_hz = random.uniform(
|
534 |
+
augs['pedalboard_phazer_centre_frequency_hz_min'],
|
535 |
+
augs['pedalboard_phazer_centre_frequency_hz_max'],
|
536 |
+
)
|
537 |
+
feedback = random.uniform(
|
538 |
+
augs['pedalboard_phazer_feedback_min'],
|
539 |
+
augs['pedalboard_phazer_feedback_max'],
|
540 |
+
)
|
541 |
+
mix = random.uniform(
|
542 |
+
augs['pedalboard_phazer_mix_min'],
|
543 |
+
augs['pedalboard_phazer_mix_max'],
|
544 |
+
)
|
545 |
+
board = PB.Pedalboard([PB.Phaser(
|
546 |
+
rate_hz=rate_hz,
|
547 |
+
depth=depth,
|
548 |
+
centre_frequency_hz=centre_frequency_hz,
|
549 |
+
feedback=feedback,
|
550 |
+
mix=mix,
|
551 |
+
)])
|
552 |
+
source = board(source, 44100)
|
553 |
+
applied_augs.append('pedalboard_phazer')
|
554 |
+
|
555 |
+
# Random Distortion
|
556 |
+
if 'pedalboard_distortion' in augs:
|
557 |
+
if augs['pedalboard_distortion'] > 0:
|
558 |
+
if random.uniform(0, 1) < augs['pedalboard_distortion']:
|
559 |
+
drive_db = random.uniform(
|
560 |
+
augs['pedalboard_distortion_drive_db_min'],
|
561 |
+
augs['pedalboard_distortion_drive_db_max'],
|
562 |
+
)
|
563 |
+
board = PB.Pedalboard([PB.Distortion(
|
564 |
+
drive_db=drive_db,
|
565 |
+
)])
|
566 |
+
source = board(source, 44100)
|
567 |
+
applied_augs.append('pedalboard_distortion')
|
568 |
+
|
569 |
+
# Random PitchShift
|
570 |
+
if 'pedalboard_pitch_shift' in augs:
|
571 |
+
if augs['pedalboard_pitch_shift'] > 0:
|
572 |
+
if random.uniform(0, 1) < augs['pedalboard_pitch_shift']:
|
573 |
+
semitones = random.uniform(
|
574 |
+
augs['pedalboard_pitch_shift_semitones_min'],
|
575 |
+
augs['pedalboard_pitch_shift_semitones_max'],
|
576 |
+
)
|
577 |
+
board = PB.Pedalboard([PB.PitchShift(
|
578 |
+
semitones=semitones
|
579 |
+
)])
|
580 |
+
source = board(source, 44100)
|
581 |
+
applied_augs.append('pedalboard_pitch_shift')
|
582 |
+
|
583 |
+
# Random Resample
|
584 |
+
if 'pedalboard_resample' in augs:
|
585 |
+
if augs['pedalboard_resample'] > 0:
|
586 |
+
if random.uniform(0, 1) < augs['pedalboard_resample']:
|
587 |
+
target_sample_rate = random.uniform(
|
588 |
+
augs['pedalboard_resample_target_sample_rate_min'],
|
589 |
+
augs['pedalboard_resample_target_sample_rate_max'],
|
590 |
+
)
|
591 |
+
board = PB.Pedalboard([PB.Resample(
|
592 |
+
target_sample_rate=target_sample_rate
|
593 |
+
)])
|
594 |
+
source = board(source, 44100)
|
595 |
+
applied_augs.append('pedalboard_resample')
|
596 |
+
|
597 |
+
# Random Bitcrash
|
598 |
+
if 'pedalboard_bitcrash' in augs:
|
599 |
+
if augs['pedalboard_bitcrash'] > 0:
|
600 |
+
if random.uniform(0, 1) < augs['pedalboard_bitcrash']:
|
601 |
+
bit_depth = random.uniform(
|
602 |
+
augs['pedalboard_bitcrash_bit_depth_min'],
|
603 |
+
augs['pedalboard_bitcrash_bit_depth_max'],
|
604 |
+
)
|
605 |
+
board = PB.Pedalboard([PB.Bitcrush(
|
606 |
+
bit_depth=bit_depth
|
607 |
+
)])
|
608 |
+
source = board(source, 44100)
|
609 |
+
applied_augs.append('pedalboard_bitcrash')
|
610 |
+
|
611 |
+
# Random MP3Compressor
|
612 |
+
if 'pedalboard_mp3_compressor' in augs:
|
613 |
+
if augs['pedalboard_mp3_compressor'] > 0:
|
614 |
+
if random.uniform(0, 1) < augs['pedalboard_mp3_compressor']:
|
615 |
+
vbr_quality = random.uniform(
|
616 |
+
augs['pedalboard_mp3_compressor_pedalboard_mp3_compressor_min'],
|
617 |
+
augs['pedalboard_mp3_compressor_pedalboard_mp3_compressor_max'],
|
618 |
+
)
|
619 |
+
board = PB.Pedalboard([PB.MP3Compressor(
|
620 |
+
vbr_quality=vbr_quality
|
621 |
+
)])
|
622 |
+
source = board(source, 44100)
|
623 |
+
applied_augs.append('pedalboard_mp3_compressor')
|
624 |
+
|
625 |
+
# print(applied_augs)
|
626 |
+
return source
|
627 |
+
|
628 |
+
def __getitem__(self, index):
    """Fetch one training example as a ``(stems, mixture)`` pair.

    For dataset types 1-3 stems are drawn independently (random mix);
    otherwise time-aligned stems are loaded from a single song.
    When ``training.target_instrument`` is set, the returned stems tensor
    is narrowed to that single stem.
    """
    if self.dataset_type in (1, 2, 3):
        res = self.load_random_mix()
    else:
        res = self.load_aligned_data()

    # Randomly rescale the loudness of every stem independently.
    if self.aug:
        aug_cfg = self.config['augmentations']
        if 'loudness' in aug_cfg and aug_cfg['loudness']:
            gains = np.random.uniform(
                low=aug_cfg['loudness_min'],
                high=aug_cfg['loudness_max'],
                size=(len(res),),
            )
            gains = torch.tensor(gains, dtype=torch.float32)
            res *= gains[:, None, None]

    mix = res.sum(0)

    # Emulate a lossy download by mp3-compressing the mixture only.
    if self.aug:
        aug_cfg = self.config['augmentations']
        if 'mp3_compression_on_mixture' in aug_cfg:
            compressor = AU.Mp3Compression(
                min_bitrate=aug_cfg['mp3_compression_on_mixture_bitrate_min'],
                max_bitrate=aug_cfg['mp3_compression_on_mixture_bitrate_max'],
                backend=aug_cfg['mp3_compression_on_mixture_backend'],
                p=aug_cfg['mp3_compression_on_mixture']
            )
            mix_np = mix.cpu().numpy().astype(np.float32)
            required_shape = mix_np.shape
            mix = compressor(samples=mix_np, sample_rate=44100)
            # The codec sometimes pads the signal; trim back to original length.
            if mix.shape != required_shape:
                mix = mix[..., :required_shape[-1]]
            mix = torch.tensor(mix, dtype=torch.float32)

    # Single-target training: return only the requested stem.
    target = self.config.training.target_instrument
    if target is not None:
        pos = self.config.training.instruments.index(target)
        return res[pos:pos + 1], mix

    return res, mix
|
docs/LoRA.md
ADDED
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## Training with LoRA
|
2 |
+
|
3 |
+
### What is LoRA?
|
4 |
+
|
5 |
+
LoRA (Low-Rank Adaptation) is a technique designed to reduce the computational and memory cost of fine-tuning large-scale neural networks. Instead of fine-tuning all the model parameters, LoRA introduces small trainable low-rank matrices that are injected into the network. This allows significant reductions in the number of trainable parameters, making it more efficient to adapt pre-trained models to new tasks. For more details, you can refer to the original paper.
|
6 |
+
|
7 |
+
### Enabling LoRA in Training
|
8 |
+
|
9 |
+
To include LoRA in your training pipeline, you need to:
|
10 |
+
|
11 |
+
Add the `--train_lora` flag to the training command.
|
12 |
+
|
13 |
+
Add the following configuration for LoRA in your config file:
|
14 |
+
|
15 |
+
Example:
|
16 |
+
```
|
17 |
+
lora:
|
18 |
+
r: 8
|
19 |
+
lora_alpha: 16 # alpha / rank > 1
|
20 |
+
lora_dropout: 0.05
|
21 |
+
merge_weights: False
|
22 |
+
fan_in_fan_out: False
|
23 |
+
enable_lora: [True]
|
24 |
+
```
|
25 |
+
|
26 |
+
Configuration Parameters Explained:
|
27 |
+
|
28 |
+
* `r` (Rank): This determines the rank of the low-rank adaptation matrices. A smaller rank reduces memory usage and file size but may limit the model's adaptability to new tasks. Common values are 4, 8, or 16.
|
29 |
+
|
30 |
+
* `lora_alpha`: Scaling factor for the LoRA weights. The ratio lora_alpha / r should generally be greater than 1 to ensure sufficient expressive power. For example, with r=8 and lora_alpha=16, the scaling factor is 2.
|
31 |
+
|
32 |
+
* `lora_dropout`: Dropout rate applied to LoRA layers. It helps regularize the model and prevent overfitting, especially for smaller datasets. Typical values are in the range [0.0, 0.1].
|
33 |
+
|
34 |
+
* `merge_weights`: Whether to merge the LoRA weights into the original model weights during inference. Set this to True only if you want to save the final model with merged weights for deployment.
|
35 |
+
|
36 |
+
* `fan_in_fan_out`: Defines the weight initialization convention. Leave this as False for most scenarios unless your model uses a specific convention requiring it.
|
37 |
+
|
38 |
+
* `enable_lora`: A list of booleans specifying whether LoRA should be applied to certain layers.
|
39 |
+
* For example, `[True, False, True]` enables LoRA for the 1st and 3rd layers but not the 2nd.
|
40 |
+
* The number of output neurons in the layer must be divisible by the length of enable_lora to ensure proper distribution of LoRA parameters across layers.
|
41 |
+
* For transformer architectures such as GPT models, `enable_lora = [True, False, True]` is typically used to apply LoRA to the Query (Q) and Value (V) projection matrices while skipping the Key (K) projection matrix. This setup allows efficient fine-tuning of the attention mechanism while maintaining computational efficiency.
|
42 |
+
|
43 |
+
### Benefits of Using LoRA
|
44 |
+
|
45 |
+
* File Size Reduction: With LoRA, only the LoRA layer weights are saved, which significantly reduces the size of the saved model.
|
46 |
+
|
47 |
+
* Flexible Fine-Tuning: You can fine-tune the LoRA layers while keeping the base model frozen, preserving the original model's generalization capabilities.
|
48 |
+
|
49 |
+
* Using Pretrained Weights with LoRA
|
50 |
+
|
51 |
+
### To train a model using both pretrained weights and LoRA weights, you need to:
|
52 |
+
|
53 |
+
1. Include the `--lora_checkpoint` parameter in the training command.
|
54 |
+
|
55 |
+
2. Specify the path to the LoRA checkpoint file.
|
56 |
+
|
57 |
+
### Validating and Inferencing with LoRA
|
58 |
+
|
59 |
+
When using a model fine-tuned with LoRA for validation or inference, you must provide the LoRA checkpoint using the `--lora_checkpoint` parameter.
|
60 |
+
|
61 |
+
### Example Commands
|
62 |
+
|
63 |
+
* Training with LoRA
|
64 |
+
|
65 |
+
```
|
66 |
+
python train.py --model_type scnet \
|
67 |
+
--config_path configs/config_musdb18_scnet_large_starrytong.yaml \
|
68 |
+
--start_check_point weights/last_scnet.ckpt \
|
69 |
+
--results_path results/ \
|
70 |
+
--data_path datasets/moisesdb/train_tracks \
|
71 |
+
--valid_path datasets/moisesdb/valid \
|
72 |
+
--device_ids 0 \
|
73 |
+
--metrics neg_log_wmse l1_freq sdr \
|
74 |
+
--metric_for_scheduler neg_log_wmse \
|
75 |
+
--train_lora
|
76 |
+
```
|
77 |
+
|
78 |
+
* Validating with LoRA
|
79 |
+
```
|
80 |
+
python valid.py --model_type scnet \
|
81 |
+
--config_path configs/config_musdb18_scnet_large_starrytong.yaml \
|
82 |
+
--start_check_point weights/last_scnet.ckpt \
|
83 |
+
--store_dir results_store/ \
|
84 |
+
--valid_path datasets/moisesdb/valid \
|
85 |
+
--device_ids 0 \
|
86 |
+
--metrics neg_log_wmse l1_freq si_sdr sdr aura_stft aura_mrstft bleedless fullness
|
87 |
+
```
|
88 |
+
|
89 |
+
* Inference with LoRA
|
90 |
+
```
|
91 |
+
python inference.py --lora_checkpoint weights/lora_last_scnet.ckpt \
|
92 |
+
--model_type scnet \
|
93 |
+
--config_path configs/config_musdb18_scnet_large_starrytong.yaml \
|
94 |
+
--start_check_point weights/last_scnet.ckpt \
|
95 |
+
--store_dir inference_results/ \
|
96 |
+
--input_folder datasets/moisesdb/mixtures_for_inference \
|
97 |
+
--device_ids 0
|
98 |
+
```
|
99 |
+
|
100 |
+
### Train example with BSRoformer and LoRA
|
101 |
+
|
102 |
+
You can use this [config](../configs/config_musdb18_bs_roformer_with_lora.yaml) and these [weights](https://github.com/ZFTurbo/Music-Source-Separation-Training/releases/download/v1.0.12/model_bs_roformer_ep_17_sdr_9.6568.ckpt) to fine-tune BSRoformer on your dataset.
|
103 |
+
|
104 |
+
```
|
105 |
+
python train.py --model_type bs_roformer \
|
106 |
+
--config_path configs/config_musdb18_bs_roformer_with_lora.yaml \
|
107 |
+
--start_check_point weights/model_bs_roformer_ep_17_sdr_9.6568.ckpt \
|
108 |
+
--results_path results/ \
|
109 |
+
--data_path musdb18hq/train \
|
110 |
+
--valid_path musdb18hq/test \
|
111 |
+
--device_ids 0 \
|
112 |
+
--metrics sdr \
|
113 |
+
--train_lora
|
114 |
+
```
|
docs/augmentations.md
ADDED
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
### Augmentations
|
2 |
+
|
3 |
+
Augmentations allow changing stems on the fly, increasing the size of the dataset by creating new samples from old ones.
|
4 |
+
Now control for augmentations is done from config file. Below you can find the example of full config,
|
5 |
+
which includes all available augmentations:
|
6 |
+
|
7 |
+
```config
|
8 |
+
augmentations:
|
9 |
+
enable: true # enable or disable all augmentations (to fast disable if needed)
|
10 |
+
loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
|
11 |
+
loudness_min: 0.5
|
12 |
+
loudness_max: 1.5
|
13 |
+
mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
|
14 |
+
mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
|
15 |
+
- 0.2
|
16 |
+
- 0.02
|
17 |
+
mixup_loudness_min: 0.5
|
18 |
+
mixup_loudness_max: 1.5
|
19 |
+
|
20 |
+
# apply mp3 compression to mixture only (emulate downloading mp3 from internet)
|
21 |
+
mp3_compression_on_mixture: 0.01
|
22 |
+
mp3_compression_on_mixture_bitrate_min: 32
|
23 |
+
mp3_compression_on_mixture_bitrate_max: 320
|
24 |
+
mp3_compression_on_mixture_backend: "lameenc"
|
25 |
+
|
26 |
+
all:
|
27 |
+
channel_shuffle: 0.5 # Set 0 or lower to disable
|
28 |
+
random_inverse: 0.1 # inverse track (better lower probability)
|
29 |
+
random_polarity: 0.5 # polarity change (multiply waveform to -1)
|
30 |
+
mp3_compression: 0.01
|
31 |
+
mp3_compression_min_bitrate: 32
|
32 |
+
mp3_compression_max_bitrate: 320
|
33 |
+
mp3_compression_backend: "lameenc"
|
34 |
+
|
35 |
+
# pedalboard reverb block
|
36 |
+
pedalboard_reverb: 0.01
|
37 |
+
pedalboard_reverb_room_size_min: 0.1
|
38 |
+
pedalboard_reverb_room_size_max: 0.9
|
39 |
+
pedalboard_reverb_damping_min: 0.1
|
40 |
+
pedalboard_reverb_damping_max: 0.9
|
41 |
+
pedalboard_reverb_wet_level_min: 0.1
|
42 |
+
pedalboard_reverb_wet_level_max: 0.9
|
43 |
+
pedalboard_reverb_dry_level_min: 0.1
|
44 |
+
pedalboard_reverb_dry_level_max: 0.9
|
45 |
+
pedalboard_reverb_width_min: 0.9
|
46 |
+
pedalboard_reverb_width_max: 1.0
|
47 |
+
|
48 |
+
# pedalboard chorus block
|
49 |
+
pedalboard_chorus: 0.01
|
50 |
+
pedalboard_chorus_rate_hz_min: 1.0
|
51 |
+
pedalboard_chorus_rate_hz_max: 7.0
|
52 |
+
pedalboard_chorus_depth_min: 0.25
|
53 |
+
pedalboard_chorus_depth_max: 0.95
|
54 |
+
pedalboard_chorus_centre_delay_ms_min: 3
|
55 |
+
pedalboard_chorus_centre_delay_ms_max: 10
|
56 |
+
pedalboard_chorus_feedback_min: 0.0
|
57 |
+
pedalboard_chorus_feedback_max: 0.5
|
58 |
+
pedalboard_chorus_mix_min: 0.1
|
59 |
+
pedalboard_chorus_mix_max: 0.9
|
60 |
+
|
61 |
+
# pedalboard phazer block
|
62 |
+
pedalboard_phazer: 0.01
|
63 |
+
pedalboard_phazer_rate_hz_min: 1.0
|
64 |
+
pedalboard_phazer_rate_hz_max: 10.0
|
65 |
+
pedalboard_phazer_depth_min: 0.25
|
66 |
+
pedalboard_phazer_depth_max: 0.95
|
67 |
+
pedalboard_phazer_centre_frequency_hz_min: 200
|
68 |
+
pedalboard_phazer_centre_frequency_hz_max: 12000
|
69 |
+
pedalboard_phazer_feedback_min: 0.0
|
70 |
+
pedalboard_phazer_feedback_max: 0.5
|
71 |
+
pedalboard_phazer_mix_min: 0.1
|
72 |
+
pedalboard_phazer_mix_max: 0.9
|
73 |
+
|
74 |
+
# pedalboard distortion block
|
75 |
+
pedalboard_distortion: 0.01
|
76 |
+
pedalboard_distortion_drive_db_min: 1.0
|
77 |
+
pedalboard_distortion_drive_db_max: 25.0
|
78 |
+
|
79 |
+
# pedalboard pitch shift block
|
80 |
+
pedalboard_pitch_shift: 0.01
|
81 |
+
pedalboard_pitch_shift_semitones_min: -7
|
82 |
+
pedalboard_pitch_shift_semitones_max: 7
|
83 |
+
|
84 |
+
# pedalboard resample block
|
85 |
+
pedalboard_resample: 0.01
|
86 |
+
pedalboard_resample_target_sample_rate_min: 4000
|
87 |
+
pedalboard_resample_target_sample_rate_max: 44100
|
88 |
+
|
89 |
+
# pedalboard bitcrash block
|
90 |
+
pedalboard_bitcrash: 0.01
|
91 |
+
pedalboard_bitcrash_bit_depth_min: 4
|
92 |
+
pedalboard_bitcrash_bit_depth_max: 16
|
93 |
+
|
94 |
+
# pedalboard mp3 compressor block
|
95 |
+
pedalboard_mp3_compressor: 0.01
|
96 |
+
pedalboard_mp3_compressor_pedalboard_mp3_compressor_min: 0
|
97 |
+
pedalboard_mp3_compressor_pedalboard_mp3_compressor_max: 9.999
|
98 |
+
|
99 |
+
vocals:
|
100 |
+
pitch_shift: 0.1
|
101 |
+
pitch_shift_min_semitones: -5
|
102 |
+
pitch_shift_max_semitones: 5
|
103 |
+
seven_band_parametric_eq: 0.25
|
104 |
+
seven_band_parametric_eq_min_gain_db: -9
|
105 |
+
seven_band_parametric_eq_max_gain_db: 9
|
106 |
+
tanh_distortion: 0.1
|
107 |
+
tanh_distortion_min: 0.1
|
108 |
+
tanh_distortion_max: 0.7
|
109 |
+
bass:
|
110 |
+
pitch_shift: 0.1
|
111 |
+
pitch_shift_min_semitones: -2
|
112 |
+
pitch_shift_max_semitones: 2
|
113 |
+
seven_band_parametric_eq: 0.25
|
114 |
+
seven_band_parametric_eq_min_gain_db: -3
|
115 |
+
seven_band_parametric_eq_max_gain_db: 6
|
116 |
+
tanh_distortion: 0.2
|
117 |
+
tanh_distortion_min: 0.1
|
118 |
+
tanh_distortion_max: 0.5
|
119 |
+
drums:
|
120 |
+
pitch_shift: 0.33
|
121 |
+
pitch_shift_min_semitones: -5
|
122 |
+
pitch_shift_max_semitones: 5
|
123 |
+
seven_band_parametric_eq: 0.25
|
124 |
+
seven_band_parametric_eq_min_gain_db: -9
|
125 |
+
seven_band_parametric_eq_max_gain_db: 9
|
126 |
+
tanh_distortion: 0.33
|
127 |
+
tanh_distortion_min: 0.1
|
128 |
+
tanh_distortion_max: 0.6
|
129 |
+
other:
|
130 |
+
pitch_shift: 0.1
|
131 |
+
pitch_shift_min_semitones: -4
|
132 |
+
pitch_shift_max_semitones: 4
|
133 |
+
gaussian_noise: 0.1
|
134 |
+
gaussian_noise_min_amplitude: 0.001
|
135 |
+
gaussian_noise_max_amplitude: 0.015
|
136 |
+
time_stretch: 0.01
|
137 |
+
time_stretch_min_rate: 0.8
|
138 |
+
time_stretch_max_rate: 1.25
|
139 |
+
```
|
140 |
+
|
141 |
+
You can copy and paste it into your config to use augmentations.
|
142 |
+
Notes:
|
143 |
+
* To completely disable all augmentations you can either remove `augmentations` section from config or set `enable` to `false`.
|
144 |
+
* If you want to disable some augmentation, just set it to zero.
|
145 |
+
* Augmentations in the `all` subsection are applied to all stems
|
146 |
+
* Augmentations in the `vocals`, `bass`, etc. subsections are applied only to the corresponding stems. You can create such subsections for any of the stems listed in `training.instruments`.
|
docs/bs_roformer_info.md
ADDED
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
### Batch sizes for BSRoformer
|
2 |
+
|
3 |
+
You can use table below to choose BS Roformer `batch_size` parameter for training based on your GPUs. Batch size values provided for single GPU. If you have several GPUs you need to multiply value on number of GPUs.
|
4 |
+
|
5 |
+
| chunk_size | dim | depth | batch_size (A6000 48GB) | batch_size (3090/4090 24GB) | batch_size (16GB) |
|
6 |
+
|:----------:|:---:|:-----:|:-----------------------:|:---------------------------:|:-----------------:|
|
7 |
+
| 131584 | 128 | 6 | 10 | 5 | 3 |
|
8 |
+
| 131584 | 256 | 6 | 8 | 4 | 2 |
|
9 |
+
| 131584 | 384 | 6 | 7 | 3 | 2 |
|
10 |
+
| 131584 | 512 | 6 | 6 | 3 | 2 |
|
11 |
+
| 131584 | 256 | 8 | 6 | 3 | 2 |
|
12 |
+
| 131584 | 256 | 12 | 4 | 2 | 1 |
|
13 |
+
| 263168 | 128 | 6 | 4 | 2 | 1 |
|
14 |
+
| 263168 | 256 | 6 | 3 | 1 | 1 |
|
15 |
+
| 352800 | 128 | 6 | 2 | 1 | - |
|
16 |
+
| 352800 | 256 | 6 | 2 | 1 | - |
|
17 |
+
| 352800 | 384 | 12 | 1 | - | - |
|
18 |
+
| 352800 | 512 | 12 | - | - | - |
|
19 |
+
|
20 |
+
|
21 |
+
Parameters obtained with initial config:
|
22 |
+
|
23 |
+
```
|
24 |
+
audio:
|
25 |
+
chunk_size: 131584
|
26 |
+
dim_f: 1024
|
27 |
+
dim_t: 515
|
28 |
+
hop_length: 512
|
29 |
+
n_fft: 2048
|
30 |
+
num_channels: 2
|
31 |
+
sample_rate: 44100
|
32 |
+
min_mean_abs: 0.000
|
33 |
+
|
34 |
+
model:
|
35 |
+
dim: 384
|
36 |
+
depth: 12
|
37 |
+
stereo: true
|
38 |
+
num_stems: 1
|
39 |
+
time_transformer_depth: 1
|
40 |
+
freq_transformer_depth: 1
|
41 |
+
linear_transformer_depth: 0
|
42 |
+
freqs_per_bands: !!python/tuple
|
43 |
+
- 2
|
44 |
+
- 2
|
45 |
+
- 2
|
46 |
+
- 2
|
47 |
+
- 2
|
48 |
+
- 2
|
49 |
+
- 2
|
50 |
+
- 2
|
51 |
+
- 2
|
52 |
+
- 2
|
53 |
+
- 2
|
54 |
+
- 2
|
55 |
+
- 2
|
56 |
+
- 2
|
57 |
+
- 2
|
58 |
+
- 2
|
59 |
+
- 2
|
60 |
+
- 2
|
61 |
+
- 2
|
62 |
+
- 2
|
63 |
+
- 2
|
64 |
+
- 2
|
65 |
+
- 2
|
66 |
+
- 2
|
67 |
+
- 4
|
68 |
+
- 4
|
69 |
+
- 4
|
70 |
+
- 4
|
71 |
+
- 4
|
72 |
+
- 4
|
73 |
+
- 4
|
74 |
+
- 4
|
75 |
+
- 4
|
76 |
+
- 4
|
77 |
+
- 4
|
78 |
+
- 4
|
79 |
+
- 12
|
80 |
+
- 12
|
81 |
+
- 12
|
82 |
+
- 12
|
83 |
+
- 12
|
84 |
+
- 12
|
85 |
+
- 12
|
86 |
+
- 12
|
87 |
+
- 24
|
88 |
+
- 24
|
89 |
+
- 24
|
90 |
+
- 24
|
91 |
+
- 24
|
92 |
+
- 24
|
93 |
+
- 24
|
94 |
+
- 24
|
95 |
+
- 48
|
96 |
+
- 48
|
97 |
+
- 48
|
98 |
+
- 48
|
99 |
+
- 48
|
100 |
+
- 48
|
101 |
+
- 48
|
102 |
+
- 48
|
103 |
+
- 128
|
104 |
+
- 129
|
105 |
+
dim_head: 64
|
106 |
+
heads: 8
|
107 |
+
attn_dropout: 0.1
|
108 |
+
ff_dropout: 0.1
|
109 |
+
flash_attn: false
|
110 |
+
dim_freqs_in: 1025
|
111 |
+
stft_n_fft: 2048
|
112 |
+
stft_hop_length: 512
|
113 |
+
stft_win_length: 2048
|
114 |
+
stft_normalized: false
|
115 |
+
mask_estimator_depth: 2
|
116 |
+
multi_stft_resolution_loss_weight: 1.0
|
117 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
118 |
+
- 4096
|
119 |
+
- 2048
|
120 |
+
- 1024
|
121 |
+
- 512
|
122 |
+
- 256
|
123 |
+
multi_stft_hop_size: 147
|
124 |
+
multi_stft_normalized: False
|
125 |
+
|
126 |
+
training:
|
127 |
+
batch_size: 1
|
128 |
+
gradient_accumulation_steps: 1
|
129 |
+
grad_clip: 0
|
130 |
+
instruments:
|
131 |
+
- vocals
|
132 |
+
- other
|
133 |
+
lr: 3.0e-05
|
134 |
+
patience: 2
|
135 |
+
reduce_factor: 0.95
|
136 |
+
target_instrument: vocals
|
137 |
+
num_epochs: 1000
|
138 |
+
num_steps: 1000
|
139 |
+
q: 0.95
|
140 |
+
coarse_loss_clip: true
|
141 |
+
ema_momentum: 0.999
|
142 |
+
optimizer: adam
|
143 |
+
other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
|
144 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
145 |
+
```
|
docs/changes.md
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
### Changes
|
2 |
+
|
3 |
+
#### v1.0.2
|
4 |
+
|
5 |
+
* Added multi GPU validation (earlier validation was performed on single GPU)
|
6 |
+
* `training.batch_size` in config now must be set for single GPU (if you use multiple GPUs it will be automatically multiplied by number of GPUs)
|
7 |
+
|
8 |
+
#### v1.0.3
|
9 |
+
|
10 |
+
* Added "spawn" fix for multiprocessing
|
11 |
+
* Function `get_model_from_config` now takes path of config as input.
|
12 |
+
* On the latest versions of PyTorch, `torch.backends.cudnn.benchmark = True` caused a big slowdown, so the version was pinned to 2.0.1 in requirements.txt
|
13 |
+
* `--valid_path` parameter for train.py now can accept several validation folders instead of one. Added warning if validation folder is empty.
|
14 |
+
* Small fix for AMP usage in Demucs models taken from config
|
15 |
+
* Support for Demucs3 mmi model was added
|
16 |
+
* GPU memory consumption was reduced during inference and validation.
|
17 |
+
* Some changes to repair click problems on the edges of segment.
|
18 |
+
* Added support to train on FLAC files. Some more error checks added.
|
19 |
+
* viperx's Roformer weights and configs added
|
20 |
+
* `--extract_instrumental` argument added to inference.py
|
docs/dataset_types.md
ADDED
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
### Dataset types for training
|
2 |
+
|
3 |
+
* **Type 1 (MUSDB)**: different folders. Each folder contains all needed stems in format _< stem name >.wav_. The same as in MUSDBHQ18 dataset. In latest code releases it's possible to use `flac` instead of `wav`.
|
4 |
+
|
5 |
+
Example:
|
6 |
+
```
|
7 |
+
--- Song 1:
|
8 |
+
------ vocals.wav
|
9 |
+
------ bass.wav
|
10 |
+
------ drums.wav
|
11 |
+
------ other.wav
|
12 |
+
--- Song 2:
|
13 |
+
------ vocals.wav
|
14 |
+
------ bass.wav
|
15 |
+
------ drums.wav
|
16 |
+
------ other.wav
|
17 |
+
--- Song 3:
|
18 |
+
...........
|
19 |
+
```
|
20 |
+
|
21 |
+
* **Type 2 (Stems)**: each folder is "stem name". Folder contains wav files which consists only of required stem.
|
22 |
+
```
|
23 |
+
--- vocals:
|
24 |
+
------ vocals_1.wav
|
25 |
+
------ vocals_2.wav
|
26 |
+
------ vocals_3.wav
|
27 |
+
------ vocals_4.wav
|
28 |
+
------ ...
|
29 |
+
--- bass:
|
30 |
+
------ bass_1.wav
|
31 |
+
------ bass_2.wav
|
32 |
+
------ bass_3.wav
|
33 |
+
------ bass_4.wav
|
34 |
+
------ ...
|
35 |
+
...........
|
36 |
+
```
|
37 |
+
|
38 |
+
* **Type 3 (CSV file)**:
|
39 |
+
|
40 |
+
You can provide CSV-file (or list of CSV-files) with following structure:
|
41 |
+
```
|
42 |
+
instrum,path
|
43 |
+
vocals,/path/to/dataset/vocals_1.wav
|
44 |
+
vocals,/path/to/dataset2/vocals_v2.wav
|
45 |
+
vocals,/path/to/dataset3/vocals_some.wav
|
46 |
+
...
|
47 |
+
drums,/path/to/dataset/drums_good.wav
|
48 |
+
...
|
49 |
+
```
|
50 |
+
|
51 |
+
* **Type 4 (MUSDB Aligned)**:
|
52 |
+
|
53 |
+
The same as Type 1, but during training all instruments will be from the same position of song.
|
54 |
+
|
55 |
+
### Dataset for validation
|
56 |
+
|
57 |
+
* The validation dataset must have the same structure as a type 1 dataset (regardless of which dataset type you use for training), but each folder must also include a `mixture.wav` for each song. `mixture.wav` is the sum of all stems of the song.
|
58 |
+
|
59 |
+
Example:
|
60 |
+
```
|
61 |
+
--- Song 1:
|
62 |
+
------ vocals.wav
|
63 |
+
------ bass.wav
|
64 |
+
------ drums.wav
|
65 |
+
------ other.wav
|
66 |
+
------ mixture.wav
|
67 |
+
--- Song 2:
|
68 |
+
------ vocals.wav
|
69 |
+
------ bass.wav
|
70 |
+
------ drums.wav
|
71 |
+
------ other.wav
|
72 |
+
------ mixture.wav
|
73 |
+
--- Song 3:
|
74 |
+
...........
|
75 |
+
```
|
docs/ensemble.md
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
### Ensemble usage
|
2 |
+
|
3 |
+
Repository contains `ensemble.py` script which can be used to ensemble results of different algorithms.
|
4 |
+
|
5 |
+
Arguments:
|
6 |
+
* `--files` - Path to all audio-files to ensemble
|
7 |
+
* `--type` - Method to do ensemble. One of avg_wave, median_wave, min_wave, max_wave, avg_fft, median_fft, min_fft, max_fft. Default: avg_wave.
|
8 |
+
* `--weights` - Weights to create ensemble. Number of weights must be equal to number of files
|
9 |
+
* `--output` - Path to wav file where ensemble result will be stored (Default: res.wav)
|
10 |
+
|
11 |
+
Example:
|
12 |
+
```
|
13 |
+
ensemble.py --files ./results_tracks/vocals1.wav ./results_tracks/vocals2.wav --weights 2 1 --type max_fft --output out.wav
|
14 |
+
```
|
15 |
+
|
16 |
+
### Ensemble types:
|
17 |
+
|
18 |
+
* `avg_wave` - ensemble on 1D variant, find average for every sample of waveform independently
|
19 |
+
* `median_wave` - ensemble on 1D variant, find median value for every sample of waveform independently
|
20 |
+
* `min_wave` - ensemble on 1D variant, find minimum absolute value for every sample of waveform independently
|
21 |
+
* `max_wave` - ensemble on 1D variant, find maximum absolute value for every sample of waveform independently
|
22 |
+
* `avg_fft` - ensemble on spectrogram (Short-time Fourier transform (STFT), 2D variant), find average for every pixel of spectrogram independently. After averaging use inverse STFT to obtain original 1D-waveform back.
|
23 |
+
* `median_fft` - the same as avg_fft but use median instead of mean (only useful for ensembling of 3 or more sources).
|
24 |
+
* `min_fft` - the same as avg_fft but use minimum function instead of mean (reduce aggressiveness).
|
25 |
+
* `max_fft` - the same as avg_fft but use maximum function instead of mean (the most aggressive).
|
26 |
+
|
27 |
+
### Notes
|
28 |
+
* `min_fft` can be used to do more conservative ensemble - it will reduce influence of more aggressive models.
|
29 |
+
* It's better to ensemble models of equal quality - in this case it will give a gain. If one of the models is bad, it will reduce the overall quality.
|
30 |
+
* In my experiments `avg_wave` was always better or equal in SDR score comparing with other methods.
|