ASesYusuf1 commited on
Commit
3978e51
·
1 Parent(s): c3eda24

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +5 -0
  2. .gitignore +76 -0
  3. .gradio/certificate.pem +31 -0
  4. LICENSE +21 -0
  5. README.md +12 -1
  6. WebUi2.py +0 -0
  7. ckpts/inst_gabox.yaml +48 -0
  8. clean_model.py +157 -0
  9. configs/KimberleyJensen/config_vocals_mel_band_roformer_kj.yaml +72 -0
  10. configs/config_apollo.yaml +33 -0
  11. configs/config_dnr_bandit_bsrnn_multi_mus64.yaml +78 -0
  12. configs/config_dnr_bandit_v2_mus64.yaml +77 -0
  13. configs/config_drumsep.yaml +72 -0
  14. configs/config_htdemucs_6stems.yaml +127 -0
  15. configs/config_musdb18_bs_mamba2.yaml +58 -0
  16. configs/config_musdb18_bs_roformer.yaml +137 -0
  17. configs/config_musdb18_bs_roformer_with_lora.yaml +205 -0
  18. configs/config_musdb18_demucs3_mmi.yaml +72 -0
  19. configs/config_musdb18_htdemucs.yaml +119 -0
  20. configs/config_musdb18_mdx23c.yaml +182 -0
  21. configs/config_musdb18_mdx23c_stht.yaml +182 -0
  22. configs/config_musdb18_mel_band_roformer.yaml +76 -0
  23. configs/config_musdb18_mel_band_roformer_all_stems.yaml +97 -0
  24. configs/config_musdb18_scnet.yaml +83 -0
  25. configs/config_musdb18_scnet_large.yaml +83 -0
  26. configs/config_musdb18_segm_models.yaml +92 -0
  27. configs/config_musdb18_torchseg.yaml +92 -0
  28. configs/config_vocals_bandit_bsrnn_multi_mus64.yaml +73 -0
  29. configs/config_vocals_bs_mamba2.yaml +51 -0
  30. configs/config_vocals_bs_roformer.yaml +141 -0
  31. configs/config_vocals_htdemucs.yaml +123 -0
  32. configs/config_vocals_mdx23c.yaml +96 -0
  33. configs/config_vocals_mel_band_roformer.yaml +80 -0
  34. configs/config_vocals_scnet.yaml +79 -0
  35. configs/config_vocals_scnet_large.yaml +79 -0
  36. configs/config_vocals_scnet_unofficial.yaml +62 -0
  37. configs/config_vocals_segm_models.yaml +78 -0
  38. configs/config_vocals_swin_upernet.yaml +51 -0
  39. configs/config_vocals_torchseg.yaml +58 -0
  40. configs/viperx/model_bs_roformer_ep_317_sdr_12.9755.yaml +126 -0
  41. configs/viperx/model_bs_roformer_ep_937_sdr_10.5309.yaml +138 -0
  42. configs/viperx/model_mel_band_roformer_ep_3005_sdr_11.4360.yaml +65 -0
  43. cookies.txt +28 -0
  44. dataset.py +669 -0
  45. docs/LoRA.md +114 -0
  46. docs/augmentations.md +146 -0
  47. docs/bs_roformer_info.md +145 -0
  48. docs/changes.md +20 -0
  49. docs/dataset_types.md +75 -0
  50. docs/ensemble.md +30 -0
.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ gui/Poppins[[:space:]]Bold[[:space:]]700.ttf filter=lfs diff=lfs merge=lfs -text
37
+ gui/Poppins[[:space:]]Regular[[:space:]]400.ttf filter=lfs diff=lfs merge=lfs -text
38
+ input/APT.[[:space:]][[:space:]]YOR[[:space:]]X[[:space:]]LOID[[:space:]][[:space:]]AMV[[:space:]]4K.mp3 filter=lfs diff=lfs merge=lfs -text
39
+ old_output/APT.[[:space:]][[:space:]]YOR[[:space:]]X[[:space:]]LOID[[:space:]][[:space:]]AMV[[:space:]]4K.mp3_Instrumental_Inst_GaboxV7_(by[[:space:]]Gabox)_old.wav filter=lfs diff=lfs merge=lfs -text
40
+ output/APT.[[:space:]][[:space:]]YOR[[:space:]]X[[:space:]]LOID[[:space:]][[:space:]]AMV[[:space:]]4K.mp3_Instrumental_Inst_GaboxV7_(by[[:space:]]Gabox).wav filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__
2
+ .DS_Store
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # Distribution / packaging
7
+ .Python
8
+ build/
9
+ develop-eggs/
10
+ dist/
11
+ downloads/
12
+ eggs/
13
+ .eggs/
14
+ lib/
15
+ Lib/site-packages/
16
+ lib64/
17
+ parts/
18
+ sdist/
19
+ var/
20
+ wheels/
21
+ share/python-wheels/
22
+ share/man/man1/
23
+ *.egg-info/
24
+ .installed.cfg
25
+ *.egg
26
+ MANIFEST
27
+
28
+ # PyInstaller
29
+ # Usually these files are written by a python script from a template
30
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
31
+ *.manifest
32
+ *.spec
33
+
34
+ # Installer logs
35
+ pip-log.txt
36
+ pip-delete-this-directory.txt
37
+
38
+ # Unit test / coverage reports
39
+ htmlcov/
40
+ .tox/
41
+ .nox/
42
+ .coverage
43
+ .coverage.*
44
+ .cache
45
+ nosetests.xml
46
+ coverage.xml
47
+ *.cover
48
+ *.py,cover
49
+ .hypothesis/
50
+ .pytest_cache/
51
+ cover/
52
+
53
+ # Jupyter Notebook
54
+ .ipynb_checkpoints
55
+ share/jupyter
56
+ etc/jupyter
57
+
58
+ # IPython
59
+ profile_default/
60
+ ipython_config.py
61
+
62
+ # Environments
63
+ .env
64
+ .venv
65
+ env/
66
+ venv/
67
+ ENV/
68
+ env.bak/
69
+ venv.bak/
70
+ pyvenv.cfg
71
+ Scripts/
72
+
73
+ *.code-workspace
74
+
75
+ results/
76
+ wandb/
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Roman Solovyev (ZFTurbo)
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,3 +1,14 @@
1
 
 
 
 
 
 
 
 
 
 
 
 
2
  # Gecekondu Dubbing Production Space
3
- Bu Space, ses ayrıştırma ve dublaj işlemleri için profesyonel bir arayüz sunar.
 
1
 
2
+ ---
3
+ title: Gecekondu Dubbing Production Studio
4
+ emoji: 🎙️
5
+ colorFrom: red
6
+ colorTo: yellow # yellow used instead of gold (gold is not a supported value)
7
+ sdk: gradio
8
+ sdk_version: "4.44.1"
9
+ app_file: app.py
10
+ pinned: false
11
+ ---
12
+
13
  # Gecekondu Dubbing Production Space
14
+ Bu Space, ses ayrıştırma ve dublaj işlemleri için profesyonel bir arayüz sunar. Gecekondu ekibi tarafından geliştirilmiştir.
WebUi2.py ADDED
The diff for this file is too large to render. See raw diff
 
ckpts/inst_gabox.yaml ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 352800
3
+ dim_f: 1024
4
+ dim_t: 1101
5
+ hop_length: 441
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.0
10
+ model:
11
+ dim: 384
12
+ depth: 6
13
+ stereo: true
14
+ num_stems: 1
15
+ time_transformer_depth: 1
16
+ freq_transformer_depth: 1
17
+ num_bands: 60
18
+ dim_head: 64
19
+ heads: 8
20
+ attn_dropout: 0
21
+ ff_dropout: 0
22
+ flash_attn: true
23
+ dim_freqs_in: 1025
24
+ sample_rate: 44100
25
+ stft_n_fft: 2048
26
+ stft_hop_length: 441
27
+ stft_win_length: 2048
28
+ stft_normalized: false
29
+ mask_estimator_depth: 2
30
+ multi_stft_resolution_loss_weight: 1.0
31
+ multi_stft_resolutions_window_sizes: !!python/tuple
32
+ - 4096
33
+ - 2048
34
+ - 1024
35
+ - 512
36
+ - 256
37
+ multi_stft_hop_size: 147
38
+ multi_stft_normalized: false
39
+ training:
40
+ instruments:
41
+ - Instrumental
42
+ - Vocals
43
+ target_instrument: Instrumental
44
+ use_amp: true
45
+ inference:
46
+ batch_size: 2
47
+ dim_t: 1101
48
+ num_overlap: 2
clean_model.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import glob
3
+ import subprocess
4
+ import time
5
+ import gc
6
+ import shutil
7
+ import sys
8
+ from datetime import datetime
9
+ import torch
10
+ import yaml
11
+ import gradio as gr
12
+ import threading
13
+ import random
14
+ import librosa
15
+ import soundfile as sf
16
+ import numpy as np
17
+ import requests
18
+ import json
19
+ import locale
20
+ import re
21
+ import psutil
22
+ import concurrent.futures
23
+ from tqdm import tqdm
24
+ from google.oauth2.credentials import Credentials
25
+ import tempfile
26
+ from urllib.parse import urlparse, quote
27
+ import gdown
28
+
29
import warnings

# Silence library warnings (librosa/torch emit many at import time).
warnings.filterwarnings("ignore")

# Resolve every path relative to this file's directory so the app behaves
# the same regardless of the current working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
INFERENCE_PATH = os.path.join(BASE_DIR, "inference.py")          # separation entry script
OUTPUT_DIR = os.path.join(BASE_DIR, "output")                    # default separation output
AUTO_ENSEMBLE_OUTPUT = os.path.join(BASE_DIR, "ensemble_output") # ensemble output
37
+
38
def clean_model_name(model):
    """Map a UI model label to a short, filesystem-safe model name.

    Known labels are translated through an explicit mapping.  Labels not in
    the mapping fall back to a generic cleanup: parenthetical author credits
    are dropped, dashes become underscores, and every remaining character
    that is not alphanumeric or an underscore (including spaces) is removed.

    Args:
        model: Display name of the model as shown in the UI.

    Returns:
        str: Cleaned model name suitable for embedding in output filenames.
    """
    # NOTE: the original literal contained duplicate keys; Python keeps the
    # last occurrence, so the effective (last-wins) values are kept here.
    model_name_mapping = {
        'VOCALS-InstVocHQ': 'InstVocHQ',
        'VOCALS-MelBand-Roformer (by KimberleyJSN)': 'KimberleyJSN',
        'VOCALS-BS-Roformer_1297 (by viperx)': 'VOCALS_BS_Roformer1297',
        'VOCALS-BS-Roformer_1296 (by viperx)': 'VOCALS-BS-Roformer_1296',
        'VOCALS-BS-RoformerLargev1 (by unwa)': 'UnwaLargeV1',
        'VOCALS-Mel-Roformer big beta 4 (by unwa)': 'UnwaBigBeta4',
        'VOCALS-Melband-Roformer BigBeta5e (by unwa)': 'UnwaBigBeta5e',
        'INST-Mel-Roformer v1 (by unwa)': 'UnwaInstV1',
        'INST-Mel-Roformer v2 (by unwa)': 'UnwaInstV2',
        'INST-VOC-Mel-Roformer a.k.a. duality (by unwa)': 'UnwaDualityV1',
        'INST-VOC-Mel-Roformer a.k.a. duality v2 (by unwa)': 'UnwaDualityV2',
        'KARAOKE-MelBand-Roformer (by aufr33 & viperx)': 'KaraokeMelBandRoformer',
        'VOCALS-VitLarge23 (by ZFTurbo)': 'VitLarge23',
        'INST-MelBand-Roformer (by Becruily)': 'BecruilyInst',
        'VOCALS-MelBand-Roformer Kim FT (by Unwa)': 'KimFT',
        'INST-MelBand-Roformer Kim FT (by Unwa)': 'KimFTInst',
        'OTHER-BS-Roformer_1053 (by viperx)': 'OtherViperx1053',
        'CROWD-REMOVAL-MelBand-Roformer (by aufr33)': 'CrowdRemovalRoformer',
        'CINEMATIC-BandIt_Plus (by kwatcharasupat)': 'CinematicBandItPlus',
        'DRUMSEP-MDX23C_DrumSep_6stem (by aufr33 & jarredou)': 'DrumSepMDX23C',
        '4STEMS-SCNet_MUSDB18 (by starrytong)': 'FourStemsSCNet',
        'DE-REVERB-MDX23C (by aufr33 & jarredou)': 'DeReverbMDX23C',
        'DENOISE-MelBand-Roformer-1 (by aufr33)': 'DenoiseMelBand1',
        'DENOISE-MelBand-Roformer-2 (by aufr33)': 'DenoiseMelBand2',
        '4STEMS-SCNet_XL_MUSDB18 (by ZFTurbo)': 'FourStemsSCNetXL',
        '4STEMS-SCNet_Large (by starrytong)': 'FourStemsSCNetLarge',
        '4STEMS-BS-Roformer_MUSDB18 (by ZFTurbo)': 'FourStemsBSRoformer',
        'DE-REVERB-MelBand-Roformer aggr./v2/19.1729 (by anvuew)': 'DeReverbMelBandAggr',
        'DE-REVERB-Echo-MelBand-Roformer (by Sucial)': 'DeReverbEchoMelBand',
        'bleed_suppressor_v1 (by unwa)': 'BleedSuppressorV1',
        'inst_v1e (by unwa)': 'InstV1E',
        'inst_gabox (by Gabox)': 'InstGabox',
        'inst_gaboxBV1 (by Gabox)': 'InstGaboxBV1',
        'inst_gaboxBV2 (by Gabox)': 'InstGaboxBV2',
        'inst_gaboxBFV1 (by Gabox)': 'InstGaboxBFV1',
        'inst_gaboxFV2 (by Gabox)': 'InstGaboxFV2',
        'inst_gaboxFV1 (by Gabox)': 'InstGaboxFV1',
        'dereverb_mel_band_roformer_less_aggressive_anvuew': 'DereverbMelBandRoformerLessAggressive',
        'dereverb_mel_band_roformer_anvuew': 'DereverbMelBandRoformer',
        'VOCALS-Male Female-BS-RoFormer Male Female Beta 7_2889 (by aufr33)': 'MaleFemale-BS-RoFormer-(by aufr33)',
        'VOCALS-MelBand-Roformer (by Becruily)': 'Vocals-MelBand-Roformer-(by Becruily)',
        'VOCALS-MelBand-Roformer Kim FT 2 (by Unwa)': 'Vocals-MelBand-Roformer-KİM-FT-2(by Unwa)',
        'voc_gaboxMelRoformer (by Gabox)': 'voc_gaboxMelRoformer',
        'voc_gaboxBSroformer (by Gabox)': 'voc_gaboxBSroformer',
        'voc_gaboxMelRoformerFV1 (by Gabox)': 'voc_gaboxMelRoformerFV1',
        'voc_gaboxMelRoformerFV2 (by Gabox)': 'voc_gaboxMelRoformerFV2',
        'SYH99999/MelBandRoformerSYHFTB1(by Amane)': 'MelBandRoformerSYHFTB1',
        'inst_V5 (by Gabox)': 'INSTV5-(by Gabox)',
        'inst_Fv4Noise (by Gabox)': 'Inst_Fv4Noise-(by Gabox)',
        'Intrumental_Gabox (by Gabox)': 'Intrumental_Gabox-(by Gabox)',
        'inst_GaboxFv3 (by Gabox)': 'INST_GaboxFv3-(by Gabox)',
        'SYH99999/MelBandRoformerSYHFTB1_Model1 (by Amane)': 'MelBandRoformerSYHFTB1_model1',
        'SYH99999/MelBandRoformerSYHFTB1_Model2 (by Amane)': 'MelBandRoformerSYHFTB1_model2',
        'SYH99999/MelBandRoformerSYHFTB1_Model3 (by Amane)': 'MelBandRoformerSYHFTB1_model3',
        'VOCALS-MelBand-Roformer Kim FT 2 Blendless (by unwa)': 'VOCALS-MelBand-Roformer-Kim-FT-2-Blendless-(by unwa)',
        'inst_gaboxFV6 (by Gabox)': 'inst_gaboxFV6-(by Gabox)',
        'denoisedebleed (by Gabox)': 'denoisedebleed-(by Gabox)',
        'INSTV5N (by Gabox)': 'INSTV5N_(by Gabox)',
        'Voc_Fv3 (by Gabox)': 'Voc_Fv3_(by Gabox)',
        'MelBandRoformer4StemFTLarge (SYH99999)': 'MelBandRoformer4StemFTLarge_(SYH99999)',
        'dereverb_mel_band_roformer_mono (by anvuew)': 'dereverb_mel_band_roformer_mono_(by anvuew)',
        'INSTV6N (by Gabox)': 'INSTV6N_(by Gabox)',
        'KaraokeGabox': 'KaraokeGabox',
        'FullnessVocalModel (by Amane)': 'FullnessVocalModel',
        'Inst_GaboxV7 (by Gabox)': 'Inst_GaboxV7_(by Gabox)',
    }

    if model in model_name_mapping:
        return model_name_mapping[model]

    # Fallback sanitization for labels not present in the mapping.
    cleaned = re.sub(r'\s*\(.*?\)', '', model)  # drop "(by Author)" credits
    cleaned = cleaned.replace('-', '_')
    cleaned = ''.join(char for char in cleaned if char.isalnum() or char == '_')

    return cleaned
120
+
121
def shorten_filename(filename, max_length=30):
    """Abbreviate a filename whose stem exceeds *max_length* characters.

    Short names are returned unchanged.  Longer ones become
    ``head...tail`` built from the first 15 and last 10 characters of the
    stem, with the extension preserved.

    NOTE(review): the 15/10 split is fixed; ``max_length`` only controls
    the threshold, not the abbreviated length — confirm this is intended.
    """
    stem, ext = os.path.splitext(filename)
    if len(stem) <= max_length:
        return filename
    return f"{stem[:15]}...{stem[-10:]}{ext}"
130
+
131
def clean_filename(filename):
    """Strip timestamp suffixes and stem-type markers from a filename.

    Args:
        filename: File name (with extension) to clean.

    Returns:
        tuple: ``(clean_base, detected_type, ext)`` — the cleaned stem with
        leading/trailing ``_``, ``-`` and spaces removed, the first stem-type
        keyword still present in the stem (or ``None``), and the extension.
    """
    base, ext = os.path.splitext(filename)

    # Drop trailing timestamp-like suffixes, most specific pattern first.
    timestamp_patterns = (
        r'_\d{8}_\d{6}_\d{6}$',  # e.g. _20231215_123456_123456
        r'_\d{14}$',             # e.g. _20231215123456
        r'_\d{10}$',             # e.g. _1702658400
        r'_\d+$',                # any trailing number
    )
    for pattern in timestamp_patterns:
        base = re.sub(pattern, '', base)

    stem_types = ['vocals', 'instrumental', 'drum', 'bass', 'other', 'effects',
                  'speech', 'music', 'dry', 'male', 'female']

    # Remove "_<type>" markers (case-sensitive, matching the marker format).
    for keyword in stem_types:
        base = base.replace(f'_{keyword}', '')

    # Report the first stem-type keyword still embedded in the name, if any.
    detected_type = next((k for k in stem_types if k in base.lower()), None)

    return base.strip('_- '), detected_type, ext
configs/KimberleyJensen/config_vocals_mel_band_roformer_kj.yaml ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 352800
3
+ dim_f: 1024
4
+ dim_t: 256
5
+ hop_length: 441
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.000
10
+
11
+ model:
12
+ dim: 384
13
+ depth: 6
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ num_bands: 60
19
+ dim_head: 64
20
+ heads: 8
21
+ attn_dropout: 0
22
+ ff_dropout: 0
23
+ flash_attn: True
24
+ dim_freqs_in: 1025
25
+ sample_rate: 44100 # needed for mel filter bank from librosa
26
+ stft_n_fft: 2048
27
+ stft_hop_length: 441
28
+ stft_win_length: 2048
29
+ stft_normalized: False
30
+ mask_estimator_depth: 2
31
+ multi_stft_resolution_loss_weight: 1.0
32
+ multi_stft_resolutions_window_sizes: !!python/tuple
33
+ - 4096
34
+ - 2048
35
+ - 1024
36
+ - 512
37
+ - 256
38
+ multi_stft_hop_size: 147
39
+ multi_stft_normalized: False
40
+
41
+ training:
42
+ batch_size: 4
43
+ gradient_accumulation_steps: 1
44
+ grad_clip: 0
45
+ instruments:
46
+ - vocals
47
+ - other
48
+ lr: 1.0e-05
49
+ patience: 2
50
+ reduce_factor: 0.95
51
+ target_instrument: vocals
52
+ num_epochs: 1000
53
+ num_steps: 1000
54
+ augmentation: false # enable augmentations by audiomentations and pedalboard
55
+ augmentation_type: null
56
+ use_mp3_compress: false # Deprecated
57
+ augmentation_mix: false # Mix several stems of the same type with some probability
58
+ augmentation_loudness: false # randomly change loudness of each stem
59
+ augmentation_loudness_type: 1 # Type 1 or 2
60
+ augmentation_loudness_min: 0
61
+ augmentation_loudness_max: 0
62
+ q: 0.95
63
+ coarse_loss_clip: false
64
+ ema_momentum: 0.999
65
+ optimizer: adam
66
+ other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
67
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
68
+
69
+ inference:
70
+ batch_size: 4
71
+ dim_t: 256
72
+ num_overlap: 2
configs/config_apollo.yaml ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 132300
3
+ num_channels: 2
4
+ sample_rate: 44100
5
+ min_mean_abs: 0.0
6
+
7
+ model:
8
+ sr: 44100
9
+ win: 20
10
+ feature_dim: 256
11
+ layer: 6
12
+
13
+ training:
14
+ instruments: ['restored', 'addition']
15
+ target_instrument: 'restored'
16
+ batch_size: 2
17
+ num_steps: 1000
18
+ num_epochs: 1000
19
+ optimizer: 'prodigy'
20
+ lr: 1.0
21
+ patience: 2
22
+ reduce_factor: 0.95
23
+ coarse_loss_clip: true
24
+ grad_clip: 0
25
+ q: 0.95
26
+ use_amp: true
27
+
28
+ augmentations:
29
+ enable: false # enable or disable all augmentations (to fast disable if needed)
30
+
31
+ inference:
32
+ batch_size: 4
33
+ num_overlap: 4
configs/config_dnr_bandit_bsrnn_multi_mus64.yaml ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: "MultiMaskMultiSourceBandSplitRNN"
2
+ audio:
3
+ chunk_size: 264600
4
+ num_channels: 2
5
+ sample_rate: 44100
6
+ min_mean_abs: 0.001
7
+
8
+ model:
9
+ in_channel: 1
10
+ stems: ['speech', 'music', 'effects']
11
+ band_specs: "musical"
12
+ n_bands: 64
13
+ fs: 44100
14
+ require_no_overlap: false
15
+ require_no_gap: true
16
+ normalize_channel_independently: false
17
+ treat_channel_as_feature: true
18
+ n_sqm_modules: 8
19
+ emb_dim: 128
20
+ rnn_dim: 256
21
+ bidirectional: true
22
+ rnn_type: "GRU"
23
+ mlp_dim: 512
24
+ hidden_activation: "Tanh"
25
+ hidden_activation_kwargs: null
26
+ complex_mask: true
27
+ n_fft: 2048
28
+ win_length: 2048
29
+ hop_length: 512
30
+ window_fn: "hann_window"
31
+ wkwargs: null
32
+ power: null
33
+ center: true
34
+ normalized: true
35
+ pad_mode: "constant"
36
+ onesided: true
37
+
38
+ training:
39
+ batch_size: 4
40
+ gradient_accumulation_steps: 4
41
+ grad_clip: 0
42
+ instruments:
43
+ - speech
44
+ - music
45
+ - effects
46
+ lr: 9.0e-05
47
+ patience: 2
48
+ reduce_factor: 0.95
49
+ target_instrument: null
50
+ num_epochs: 1000
51
+ num_steps: 1000
52
+ q: 0.95
53
+ coarse_loss_clip: true
54
+ ema_momentum: 0.999
55
+ optimizer: adam
56
+ other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
57
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
58
+
59
+ augmentations:
60
+ enable: true # enable or disable all augmentations (to fast disable if needed)
61
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
62
+ loudness_min: 0.5
63
+ loudness_max: 1.5
64
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
65
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
66
+ - 0.2
67
+ - 0.02
68
+ mixup_loudness_min: 0.5
69
+ mixup_loudness_max: 1.5
70
+ all:
71
+ channel_shuffle: 0.5 # Set 0 or lower to disable
72
+ random_inverse: 0.1 # inverse track (better lower probability)
73
+ random_polarity: 0.5 # polarity change (multiply waveform to -1)
74
+
75
+ inference:
76
+ batch_size: 1
77
+ dim_t: 256
78
+ num_overlap: 4
configs/config_dnr_bandit_v2_mus64.yaml ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ cls: Bandit
2
+
3
+ audio:
4
+ chunk_size: 384000
5
+ num_channels: 2
6
+ sample_rate: 48000
7
+ min_mean_abs: 0.000
8
+
9
+ kwargs:
10
+ in_channels: 1
11
+ stems: ['speech', 'music', 'sfx']
12
+ band_type: musical
13
+ n_bands: 64
14
+ normalize_channel_independently: false
15
+ treat_channel_as_feature: true
16
+ n_sqm_modules: 8
17
+ emb_dim: 128
18
+ rnn_dim: 256
19
+ bidirectional: true
20
+ rnn_type: "GRU"
21
+ mlp_dim: 512
22
+ hidden_activation: "Tanh"
23
+ hidden_activation_kwargs: null
24
+ complex_mask: true
25
+ use_freq_weights: true
26
+ n_fft: 2048
27
+ win_length: 2048
28
+ hop_length: 512
29
+ window_fn: "hann_window"
30
+ wkwargs: null
31
+ power: null
32
+ center: true
33
+ normalized: true
34
+ pad_mode: "reflect"
35
+ onesided: true
36
+
37
+ training:
38
+ batch_size: 4
39
+ gradient_accumulation_steps: 4
40
+ grad_clip: 0
41
+ instruments:
42
+ - speech
43
+ - music
44
+ - sfx
45
+ lr: 9.0e-05
46
+ patience: 2
47
+ reduce_factor: 0.95
48
+ target_instrument: null
49
+ num_epochs: 1000
50
+ num_steps: 1000
51
+ q: 0.95
52
+ coarse_loss_clip: true
53
+ ema_momentum: 0.999
54
+ optimizer: adam
55
+ other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
56
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
57
+
58
+ augmentations:
59
+ enable: true # enable or disable all augmentations (to fast disable if needed)
60
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
61
+ loudness_min: 0.5
62
+ loudness_max: 1.5
63
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
64
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
65
+ - 0.2
66
+ - 0.02
67
+ mixup_loudness_min: 0.5
68
+ mixup_loudness_max: 1.5
69
+ all:
70
+ channel_shuffle: 0.5 # Set 0 or lower to disable
71
+ random_inverse: 0.1 # inverse track (better lower probability)
72
+ random_polarity: 0.5 # polarity change (multiply waveform to -1)
73
+
74
+ inference:
75
+ batch_size: 8
76
+ dim_t: 256
77
+ num_overlap: 4
configs/config_drumsep.yaml ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 1764000 # samplerate * segment
3
+ min_mean_abs: 0.000
4
+ hop_length: 1024
5
+
6
+ training:
7
+ batch_size: 8
8
+ gradient_accumulation_steps: 1
9
+ grad_clip: 0
10
+ segment: 40
11
+ shift: 1
12
+ samplerate: 44100
13
+ channels: 2
14
+ normalize: true
15
+ instruments: ['kick', 'snare', 'cymbals', 'toms']
16
+ target_instrument: null
17
+ num_epochs: 1000
18
+ num_steps: 1000
19
+ optimizer: adam
20
+ lr: 9.0e-05
21
+ patience: 2
22
+ reduce_factor: 0.95
23
+ q: 0.95
24
+ coarse_loss_clip: true
25
+ ema_momentum: 0.999
26
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
27
+ use_amp: false # enable or disable usage of mixed precision (float16) - usually it must be true
28
+
29
+ augmentations:
30
+ enable: true # enable or disable all augmentations (to fast disable if needed)
31
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
32
+ loudness_min: 0.5
33
+ loudness_max: 1.5
34
+
35
+ inference:
36
+ num_overlap: 4
37
+ batch_size: 8
38
+
39
+ model: hdemucs
40
+
41
+ hdemucs: # see demucs/hdemucs.py for a detailed description
42
+ channels: 48
43
+ channels_time: null
44
+ growth: 2
45
+ nfft: 4096
46
+ wiener_iters: 0
47
+ end_iters: 0
48
+ wiener_residual: False
49
+ cac: True
50
+ depth: 6
51
+ rewrite: True
52
+ hybrid: True
53
+ hybrid_old: False
54
+ multi_freqs: []
55
+ multi_freqs_depth: 3
56
+ freq_emb: 0.2
57
+ emb_scale: 10
58
+ emb_smooth: True
59
+ kernel_size: 8
60
+ stride: 4
61
+ time_stride: 2
62
+ context: 1
63
+ context_enc: 0
64
+ norm_starts: 4
65
+ norm_groups: 4
66
+ dconv_mode: 1
67
+ dconv_depth: 2
68
+ dconv_comp: 4
69
+ dconv_attn: 4
70
+ dconv_lstm: 4
71
+ dconv_init: 0.001
72
+ rescale: 0.1
configs/config_htdemucs_6stems.yaml ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 485100 # samplerate * segment
3
+ min_mean_abs: 0.001
4
+ hop_length: 1024
5
+
6
+ training:
7
+ batch_size: 8
8
+ gradient_accumulation_steps: 1
9
+ grad_clip: 0
10
+ segment: 11
11
+ shift: 1
12
+ samplerate: 44100
13
+ channels: 2
14
+ normalize: true
15
+ instruments: ['drums', 'bass', 'other', 'vocals', 'guitar', 'piano']
16
+ target_instrument: null
17
+ num_epochs: 1000
18
+ num_steps: 1000
19
+ optimizer: adam
20
+ lr: 9.0e-05
21
+ patience: 2
22
+ reduce_factor: 0.95
23
+ q: 0.95
24
+ coarse_loss_clip: true
25
+ ema_momentum: 0.999
26
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
27
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
28
+
29
+ augmentations:
30
+ enable: true # enable or disable all augmentations (to fast disable if needed)
31
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
32
+ loudness_min: 0.5
33
+ loudness_max: 1.5
34
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
35
+ mixup_probs: [0.2, 0.02]
36
+ mixup_loudness_min: 0.5
37
+ mixup_loudness_max: 1.5
38
+ all:
39
+ channel_shuffle: 0.5 # Set 0 or lower to disable
40
+ random_inverse: 0.1 # inverse track (better lower probability)
41
+ random_polarity: 0.5 # polarity change (multiply waveform to -1)
42
+
43
+ inference:
44
+ num_overlap: 4
45
+ batch_size: 8
46
+
47
+ model: htdemucs
48
+
49
+ htdemucs: # see demucs/htdemucs.py for a detailed description
50
+ # Channels
51
+ channels: 48
52
+ channels_time:
53
+ growth: 2
54
+ # STFT
55
+ num_subbands: 1
56
+ nfft: 4096
57
+ wiener_iters: 0
58
+ end_iters: 0
59
+ wiener_residual: false
60
+ cac: true
61
+ # Main structure
62
+ depth: 4
63
+ rewrite: true
64
+ # Frequency Branch
65
+ multi_freqs: []
66
+ multi_freqs_depth: 3
67
+ freq_emb: 0.2
68
+ emb_scale: 10
69
+ emb_smooth: true
70
+ # Convolutions
71
+ kernel_size: 8
72
+ stride: 4
73
+ time_stride: 2
74
+ context: 1
75
+ context_enc: 0
76
+ # normalization
77
+ norm_starts: 4
78
+ norm_groups: 4
79
+ # DConv residual branch
80
+ dconv_mode: 3
81
+ dconv_depth: 2
82
+ dconv_comp: 8
83
+ dconv_init: 1e-3
84
+ # Before the Transformer
85
+ bottom_channels: 0
86
+ # CrossTransformer
87
+ # ------ Common to all
88
+ # Regular parameters
89
+ t_layers: 5
90
+ t_hidden_scale: 4.0
91
+ t_heads: 8
92
+ t_dropout: 0.0
93
+ t_layer_scale: True
94
+ t_gelu: True
95
+ # ------------- Positional Embedding
96
+ t_emb: sin
97
+ t_max_positions: 10000 # for the scaled embedding
98
+ t_max_period: 10000.0
99
+ t_weight_pos_embed: 1.0
100
+ t_cape_mean_normalize: True
101
+ t_cape_augment: True
102
+ t_cape_glob_loc_scale: [5000.0, 1.0, 1.4]
103
+ t_sin_random_shift: 0
104
+ # ------------- norm before a transformer encoder
105
+ t_norm_in: True
106
+ t_norm_in_group: False
107
+ # ------------- norm inside the encoder
108
+ t_group_norm: False
109
+ t_norm_first: True
110
+ t_norm_out: True
111
+ # ------------- optim
112
+ t_weight_decay: 0.0
113
+ t_lr:
114
+ # ------------- sparsity
115
+ t_sparse_self_attn: False
116
+ t_sparse_cross_attn: False
117
+ t_mask_type: diag
118
+ t_mask_random_seed: 42
119
+ t_sparse_attn_window: 400
120
+ t_global_window: 100
121
+ t_sparsity: 0.95
122
+ t_auto_sparsity: False
123
+ # Cross Encoder First (False)
124
+ t_cross_first: False
125
+ # Weight init
126
+ rescale: 0.1
127
+
configs/config_musdb18_bs_mamba2.yaml ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 132300 # samplerate * segment
3
+ hop_length: 1024
4
+ min_mean_abs: 0.0
5
+
6
+ training:
7
+ batch_size: 8
8
+ gradient_accumulation_steps: 1
9
+ grad_clip: 0
10
+ segment: 11
11
+ shift: 1
12
+ samplerate: 44100
13
+ channels: 2
14
+ normalize: true
15
+ instruments: ['drums', 'bass', 'other', 'vocals']
16
+ target_instrument: null
17
+ num_epochs: 1000
18
+ num_steps: 1000
19
+ optimizer: prodigy
20
+ lr: 1.0
21
+ patience: 2
22
+ reduce_factor: 0.95
23
+ q: 0.95
24
+ coarse_loss_clip: true
25
+ ema_momentum: 0.999
26
+ read_metadata_procs: 8
27
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
28
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
29
+
30
+ model:
31
+ sr: 44100
32
+ win: 2048
33
+ stride: 512
34
+ feature_dim: 128
35
+ num_repeat_mask: 8
36
+ num_repeat_map: 4
37
+ num_output: 4
38
+
39
+ augmentations:
40
+ enable: true # enable or disable all augmentations (to fast disable if needed)
41
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
42
+ loudness_min: 0.5
43
+ loudness_max: 1.5
44
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
45
+ mixup_probs:
46
+ !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
47
+ - 0.2
48
+ - 0.02
49
+ mixup_loudness_min: 0.5
50
+ mixup_loudness_max: 1.5
51
+ all:
52
+ channel_shuffle: 0.5 # Set 0 or lower to disable
53
+ random_inverse: 0.1 # inverse track (better lower probability)
54
+ random_polarity: 0.5 # polarity change (multiply waveform to -1)
55
+
56
+ inference:
57
+ num_overlap: 2
58
+ batch_size: 8
configs/config_musdb18_bs_roformer.yaml ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 131584
3
+ dim_f: 1024
4
+ dim_t: 256
5
+ hop_length: 512
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+
11
+ model:
12
+ dim: 192
13
+ depth: 6
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ linear_transformer_depth: 0
19
+ freqs_per_bands: !!python/tuple
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 2
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 4
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 12
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 24
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 48
80
+ - 128
81
+ - 129
82
+ dim_head: 64
83
+ heads: 8
84
+ attn_dropout: 0.1
85
+ ff_dropout: 0.1
86
+ flash_attn: true
87
+ dim_freqs_in: 1025
88
+ stft_n_fft: 2048
89
+ stft_hop_length: 512
90
+ stft_win_length: 2048
91
+ stft_normalized: false
92
+ mask_estimator_depth: 2
93
+ multi_stft_resolution_loss_weight: 1.0
94
+ multi_stft_resolutions_window_sizes: !!python/tuple
95
+ - 4096
96
+ - 2048
97
+ - 1024
98
+ - 512
99
+ - 256
100
+ multi_stft_hop_size: 147
101
+ multi_stft_normalized: False
102
+ mlp_expansion_factor: 4 # Probably too big (requires a lot of memory for weights)
103
+ use_torch_checkpoint: False # it allows to greatly reduce GPU memory consumption during training (not fully tested)
104
+ skip_connection: False # Enable skip connection between transformer blocks - can solve problem with gradients and probably faster training
105
+
106
+ training:
107
+ batch_size: 10
108
+ gradient_accumulation_steps: 1
109
+ grad_clip: 0
110
+ instruments:
111
+ - vocals
112
+ - bass
113
+ - drums
114
+ - other
115
+ lr: 5.0e-05
116
+ patience: 2
117
+ reduce_factor: 0.95
118
+ target_instrument: vocals
119
+ num_epochs: 1000
120
+ num_steps: 1000
121
+ q: 0.95
122
+ coarse_loss_clip: true
123
+ ema_momentum: 0.999
124
+ optimizer: adam
125
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
126
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
127
+
128
+ augmentations:
129
+ enable: true # enable or disable all augmentations (to fast disable if needed)
130
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
131
+ loudness_min: 0.5
132
+ loudness_max: 1.5
133
+
134
+ inference:
135
+ batch_size: 1
136
+ dim_t: 256
137
+ num_overlap: 4
configs/config_musdb18_bs_roformer_with_lora.yaml ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 485100
3
+ dim_f: 1024
4
+ dim_t: 801 # don't work (use in model)
5
+ hop_length: 441 # don't work (use in model)
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.000
10
+
11
+ lora:
12
+ r: 8
13
+ lora_alpha: 16 # alpha / rank > 1
14
+ lora_dropout: 0.05
15
+ merge_weights: False
16
+ fan_in_fan_out: False
17
+ enable_lora: [True, False, True] # This for QKV
18
+ # enable_lora: [True] # For non-Roformers architectures
19
+
20
+ model:
21
+ dim: 384
22
+ depth: 8
23
+ stereo: true
24
+ num_stems: 4
25
+ time_transformer_depth: 1
26
+ freq_transformer_depth: 1
27
+ linear_transformer_depth: 0
28
+ freqs_per_bands: !!python/tuple
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 2
44
+ - 2
45
+ - 2
46
+ - 2
47
+ - 2
48
+ - 2
49
+ - 2
50
+ - 2
51
+ - 2
52
+ - 2
53
+ - 4
54
+ - 4
55
+ - 4
56
+ - 4
57
+ - 4
58
+ - 4
59
+ - 4
60
+ - 4
61
+ - 4
62
+ - 4
63
+ - 4
64
+ - 4
65
+ - 12
66
+ - 12
67
+ - 12
68
+ - 12
69
+ - 12
70
+ - 12
71
+ - 12
72
+ - 12
73
+ - 24
74
+ - 24
75
+ - 24
76
+ - 24
77
+ - 24
78
+ - 24
79
+ - 24
80
+ - 24
81
+ - 48
82
+ - 48
83
+ - 48
84
+ - 48
85
+ - 48
86
+ - 48
87
+ - 48
88
+ - 48
89
+ - 128
90
+ - 129
91
+ dim_head: 64
92
+ heads: 8
93
+ attn_dropout: 0.1
94
+ ff_dropout: 0.1
95
+ flash_attn: true
96
+ dim_freqs_in: 1025
97
+ stft_n_fft: 2048
98
+ stft_hop_length: 441
99
+ stft_win_length: 2048
100
+ stft_normalized: false
101
+ mask_estimator_depth: 2
102
+ multi_stft_resolution_loss_weight: 1.0
103
+ multi_stft_resolutions_window_sizes: !!python/tuple
104
+ - 4096
105
+ - 2048
106
+ - 1024
107
+ - 512
108
+ - 256
109
+ multi_stft_hop_size: 147
110
+ multi_stft_normalized: False
111
+ mlp_expansion_factor: 2
112
+ use_torch_checkpoint: False # it allows to greatly reduce GPU memory consumption during training (not fully tested)
113
+ skip_connection: False # Enable skip connection between transformer blocks - can solve problem with gradients and probably faster training
114
+
115
+ training:
116
+ batch_size: 1
117
+ gradient_accumulation_steps: 1
118
+ grad_clip: 0
119
+ instruments: ['drums', 'bass', 'other', 'vocals']
120
+ patience: 3
121
+ reduce_factor: 0.95
122
+ target_instrument: null
123
+ num_epochs: 1000
124
+ num_steps: 1000
125
+ augmentation: false # enable augmentations by audiomentations and pedalboard
126
+ augmentation_type: simple1
127
+ use_mp3_compress: false # Deprecated
128
+ augmentation_mix: true # Mix several stems of the same type with some probability
129
+ augmentation_loudness: true # randomly change loudness of each stem
130
+ augmentation_loudness_type: 1 # Type 1 or 2
131
+ augmentation_loudness_min: 0.5
132
+ augmentation_loudness_max: 1.5
133
+ q: 0.95
134
+ coarse_loss_clip: true
135
+ ema_momentum: 0.999
136
+ # optimizer: prodigy
137
+ optimizer: adam
138
+ # lr: 1.0
139
+ lr: 1.0e-5
140
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
141
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
142
+
143
+ augmentations:
144
+ enable: true # enable or disable all augmentations (to fast disable if needed)
145
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
146
+ loudness_min: 0.5
147
+ loudness_max: 1.5
148
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
149
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
150
+ - 0.2
151
+ - 0.02
152
+ mixup_loudness_min: 0.5
153
+ mixup_loudness_max: 1.5
154
+
155
+ all:
156
+ channel_shuffle: 0.5 # Set 0 or lower to disable
157
+ random_inverse: 0.1 # inverse track (better lower probability)
158
+ random_polarity: 0.5 # polarity change (multiply waveform to -1)
159
+
160
+ vocals:
161
+ pitch_shift: 0.1
162
+ pitch_shift_min_semitones: -5
163
+ pitch_shift_max_semitones: 5
164
+ seven_band_parametric_eq: 0.1
165
+ seven_band_parametric_eq_min_gain_db: -9
166
+ seven_band_parametric_eq_max_gain_db: 9
167
+ tanh_distortion: 0.1
168
+ tanh_distortion_min: 0.1
169
+ tanh_distortion_max: 0.7
170
+ bass:
171
+ pitch_shift: 0.1
172
+ pitch_shift_min_semitones: -2
173
+ pitch_shift_max_semitones: 2
174
+ seven_band_parametric_eq: 0.1
175
+ seven_band_parametric_eq_min_gain_db: -3
176
+ seven_band_parametric_eq_max_gain_db: 6
177
+ tanh_distortion: 0.1
178
+ tanh_distortion_min: 0.1
179
+ tanh_distortion_max: 0.5
180
+ drums:
181
+ pitch_shift: 0.1
182
+ pitch_shift_min_semitones: -5
183
+ pitch_shift_max_semitones: 5
184
+ seven_band_parametric_eq: 0.1
185
+ seven_band_parametric_eq_min_gain_db: -9
186
+ seven_band_parametric_eq_max_gain_db: 9
187
+ tanh_distortion: 0.1
188
+ tanh_distortion_min: 0.1
189
+ tanh_distortion_max: 0.6
190
+ other:
191
+ pitch_shift: 0.1
192
+ pitch_shift_min_semitones: -4
193
+ pitch_shift_max_semitones: 4
194
+ gaussian_noise: 0.1
195
+ gaussian_noise_min_amplitude: 0.001
196
+ gaussian_noise_max_amplitude: 0.015
197
+ time_stretch: 0.1
198
+ time_stretch_min_rate: 0.8
199
+ time_stretch_max_rate: 1.25
200
+
201
+
202
+ inference:
203
+ batch_size: 2
204
+ dim_t: 1101
205
+ num_overlap: 2
configs/config_musdb18_demucs3_mmi.yaml ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 485100 # samplerate * segment
3
+ min_mean_abs: 0.000
4
+ hop_length: 1024
5
+
6
+ training:
7
+ batch_size: 8
8
+ gradient_accumulation_steps: 1
9
+ grad_clip: 0
10
+ segment: 11
11
+ shift: 1
12
+ samplerate: 44100
13
+ channels: 2
14
+ normalize: true
15
+ instruments: ['drums', 'bass', 'other', 'vocals']
16
+ target_instrument: null
17
+ num_epochs: 1000
18
+ num_steps: 1000
19
+ optimizer: adam
20
+ lr: 9.0e-05
21
+ patience: 2
22
+ reduce_factor: 0.95
23
+ q: 0.95
24
+ coarse_loss_clip: true
25
+ ema_momentum: 0.999
26
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
27
+ use_amp: false # enable or disable usage of mixed precision (float16) - usually it must be true
28
+
29
+ augmentations:
30
+ enable: true # enable or disable all augmentations (to fast disable if needed)
31
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
32
+ loudness_min: 0.5
33
+ loudness_max: 1.5
34
+
35
+ inference:
36
+ num_overlap: 4
37
+ batch_size: 8
38
+
39
+ model: hdemucs
40
+
41
+ hdemucs: # see demucs/hdemucs.py for a detailed description
42
+ channels: 48
43
+ channels_time: null
44
+ growth: 2
45
+ nfft: 4096
46
+ wiener_iters: 0
47
+ end_iters: 0
48
+ wiener_residual: False
49
+ cac: True
50
+ depth: 6
51
+ rewrite: True
52
+ hybrid: True
53
+ hybrid_old: False
54
+ multi_freqs: []
55
+ multi_freqs_depth: 3
56
+ freq_emb: 0.2
57
+ emb_scale: 10
58
+ emb_smooth: True
59
+ kernel_size: 8
60
+ stride: 4
61
+ time_stride: 2
62
+ context: 1
63
+ context_enc: 0
64
+ norm_starts: 4
65
+ norm_groups: 4
66
+ dconv_mode: 1
67
+ dconv_depth: 2
68
+ dconv_comp: 4
69
+ dconv_attn: 4
70
+ dconv_lstm: 4
71
+ dconv_init: 0.001
72
+ rescale: 0.1
configs/config_musdb18_htdemucs.yaml ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 485100 # samplerate * segment
3
+ min_mean_abs: 0.001
4
+ hop_length: 1024
5
+
6
+ training:
7
+ batch_size: 8
8
+ gradient_accumulation_steps: 1
9
+ grad_clip: 0
10
+ segment: 11
11
+ shift: 1
12
+ samplerate: 44100
13
+ channels: 2
14
+ normalize: true
15
+ instruments: ['drums', 'bass', 'other', 'vocals']
16
+ target_instrument: null
17
+ num_epochs: 1000
18
+ num_steps: 1000
19
+ optimizer: adam
20
+ lr: 9.0e-05
21
+ patience: 2
22
+ reduce_factor: 0.95
23
+ q: 0.95
24
+ coarse_loss_clip: true
25
+ ema_momentum: 0.999
26
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
27
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
28
+
29
+ augmentations:
30
+ enable: true # enable or disable all augmentations (to fast disable if needed)
31
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
32
+ loudness_min: 0.5
33
+ loudness_max: 1.5
34
+
35
+ inference:
36
+ num_overlap: 4
37
+ batch_size: 8
38
+
39
+ model: htdemucs
40
+
41
+ htdemucs: # see demucs/htdemucs.py for a detailed description
42
+ # Channels
43
+ channels: 48
44
+ channels_time:
45
+ growth: 2
46
+ # STFT
47
+ num_subbands: 1
48
+ nfft: 4096
49
+ wiener_iters: 0
50
+ end_iters: 0
51
+ wiener_residual: false
52
+ cac: true
53
+ # Main structure
54
+ depth: 4
55
+ rewrite: true
56
+ # Frequency Branch
57
+ multi_freqs: []
58
+ multi_freqs_depth: 3
59
+ freq_emb: 0.2
60
+ emb_scale: 10
61
+ emb_smooth: true
62
+ # Convolutions
63
+ kernel_size: 8
64
+ stride: 4
65
+ time_stride: 2
66
+ context: 1
67
+ context_enc: 0
68
+ # normalization
69
+ norm_starts: 4
70
+ norm_groups: 4
71
+ # DConv residual branch
72
+ dconv_mode: 3
73
+ dconv_depth: 2
74
+ dconv_comp: 8
75
+ dconv_init: 1e-3
76
+ # Before the Transformer
77
+ bottom_channels: 512
78
+ # CrossTransformer
79
+ # ------ Common to all
80
+ # Regular parameters
81
+ t_layers: 5
82
+ t_hidden_scale: 4.0
83
+ t_heads: 8
84
+ t_dropout: 0.0
85
+ t_layer_scale: True
86
+ t_gelu: True
87
+ # ------------- Positional Embedding
88
+ t_emb: sin
89
+ t_max_positions: 10000 # for the scaled embedding
90
+ t_max_period: 10000.0
91
+ t_weight_pos_embed: 1.0
92
+ t_cape_mean_normalize: True
93
+ t_cape_augment: True
94
+ t_cape_glob_loc_scale: [5000.0, 1.0, 1.4]
95
+ t_sin_random_shift: 0
96
+ # ------------- norm before a transformer encoder
97
+ t_norm_in: True
98
+ t_norm_in_group: False
99
+ # ------------- norm inside the encoder
100
+ t_group_norm: False
101
+ t_norm_first: True
102
+ t_norm_out: True
103
+ # ------------- optim
104
+ t_weight_decay: 0.0
105
+ t_lr:
106
+ # ------------- sparsity
107
+ t_sparse_self_attn: False
108
+ t_sparse_cross_attn: False
109
+ t_mask_type: diag
110
+ t_mask_random_seed: 42
111
+ t_sparse_attn_window: 400
112
+ t_global_window: 100
113
+ t_sparsity: 0.95
114
+ t_auto_sparsity: False
115
+ # Cross Encoder First (False)
116
+ t_cross_first: False
117
+ # Weight init
118
+ rescale: 0.1
119
+
configs/config_musdb18_mdx23c.yaml ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 261120
3
+ dim_f: 4096
4
+ dim_t: 256
5
+ hop_length: 1024
6
+ n_fft: 8192
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+
11
+ model:
12
+ act: gelu
13
+ bottleneck_factor: 4
14
+ growth: 128
15
+ norm: InstanceNorm
16
+ num_blocks_per_scale: 2
17
+ num_channels: 128
18
+ num_scales: 5
19
+ num_subbands: 4
20
+ scale:
21
+ - 2
22
+ - 2
23
+
24
+ training:
25
+ batch_size: 6
26
+ gradient_accumulation_steps: 1
27
+ grad_clip: 0
28
+ instruments:
29
+ - vocals
30
+ - bass
31
+ - drums
32
+ - other
33
+ lr: 9.0e-05
34
+ patience: 2
35
+ reduce_factor: 0.95
36
+ target_instrument: null
37
+ num_epochs: 1000
38
+ num_steps: 1000
39
+ q: 0.95
40
+ coarse_loss_clip: true
41
+ ema_momentum: 0.999
42
+ optimizer: adam
43
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
44
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
45
+
46
+ augmentations:
47
+ enable: true # enable or disable all augmentations (to fast disable if needed)
48
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
49
+ loudness_min: 0.5
50
+ loudness_max: 1.5
51
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
52
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
53
+ - 0.2
54
+ - 0.02
55
+ mixup_loudness_min: 0.5
56
+ mixup_loudness_max: 1.5
57
+
58
+ # apply mp3 compression to mixture only (emulate downloading mp3 from internet)
59
+ mp3_compression_on_mixture: 0.01
60
+ mp3_compression_on_mixture_bitrate_min: 32
61
+ mp3_compression_on_mixture_bitrate_max: 320
62
+ mp3_compression_on_mixture_backend: "lameenc"
63
+
64
+ all:
65
+ channel_shuffle: 0.5 # Set 0 or lower to disable
66
+ random_inverse: 0.1 # inverse track (better lower probability)
67
+ random_polarity: 0.5 # polarity change (multiply waveform to -1)
68
+ mp3_compression: 0.01
69
+ mp3_compression_min_bitrate: 32
70
+ mp3_compression_max_bitrate: 320
71
+ mp3_compression_backend: "lameenc"
72
+
73
+ # pedalboard reverb block
74
+ pedalboard_reverb: 0.01
75
+ pedalboard_reverb_room_size_min: 0.1
76
+ pedalboard_reverb_room_size_max: 0.9
77
+ pedalboard_reverb_damping_min: 0.1
78
+ pedalboard_reverb_damping_max: 0.9
79
+ pedalboard_reverb_wet_level_min: 0.1
80
+ pedalboard_reverb_wet_level_max: 0.9
81
+ pedalboard_reverb_dry_level_min: 0.1
82
+ pedalboard_reverb_dry_level_max: 0.9
83
+ pedalboard_reverb_width_min: 0.9
84
+ pedalboard_reverb_width_max: 1.0
85
+
86
+ # pedalboard chorus block
87
+ pedalboard_chorus: 0.01
88
+ pedalboard_chorus_rate_hz_min: 1.0
89
+ pedalboard_chorus_rate_hz_max: 7.0
90
+ pedalboard_chorus_depth_min: 0.25
91
+ pedalboard_chorus_depth_max: 0.95
92
+ pedalboard_chorus_centre_delay_ms_min: 3
93
+ pedalboard_chorus_centre_delay_ms_max: 10
94
+ pedalboard_chorus_feedback_min: 0.0
95
+ pedalboard_chorus_feedback_max: 0.5
96
+ pedalboard_chorus_mix_min: 0.1
97
+ pedalboard_chorus_mix_max: 0.9
98
+
99
+ # pedalboard phazer block
100
+ pedalboard_phazer: 0.01
101
+ pedalboard_phazer_rate_hz_min: 1.0
102
+ pedalboard_phazer_rate_hz_max: 10.0
103
+ pedalboard_phazer_depth_min: 0.25
104
+ pedalboard_phazer_depth_max: 0.95
105
+ pedalboard_phazer_centre_frequency_hz_min: 200
106
+ pedalboard_phazer_centre_frequency_hz_max: 12000
107
+ pedalboard_phazer_feedback_min: 0.0
108
+ pedalboard_phazer_feedback_max: 0.5
109
+ pedalboard_phazer_mix_min: 0.1
110
+ pedalboard_phazer_mix_max: 0.9
111
+
112
+ # pedalboard distortion block
113
+ pedalboard_distortion: 0.01
114
+ pedalboard_distortion_drive_db_min: 1.0
115
+ pedalboard_distortion_drive_db_max: 25.0
116
+
117
+ # pedalboard pitch shift block
118
+ pedalboard_pitch_shift: 0.01
119
+ pedalboard_pitch_shift_semitones_min: -7
120
+ pedalboard_pitch_shift_semitones_max: 7
121
+
122
+ # pedalboard resample block
123
+ pedalboard_resample: 0.01
124
+ pedalboard_resample_target_sample_rate_min: 4000
125
+ pedalboard_resample_target_sample_rate_max: 44100
126
+
127
+ # pedalboard bitcrash block
128
+ pedalboard_bitcrash: 0.01
129
+ pedalboard_bitcrash_bit_depth_min: 4
130
+ pedalboard_bitcrash_bit_depth_max: 16
131
+
132
+ # pedalboard mp3 compressor block
133
+ pedalboard_mp3_compressor: 0.01
134
+ pedalboard_mp3_compressor_pedalboard_mp3_compressor_min: 0
135
+ pedalboard_mp3_compressor_pedalboard_mp3_compressor_max: 9.999
136
+
137
+ vocals:
138
+ pitch_shift: 0.1
139
+ pitch_shift_min_semitones: -5
140
+ pitch_shift_max_semitones: 5
141
+ seven_band_parametric_eq: 0.25
142
+ seven_band_parametric_eq_min_gain_db: -9
143
+ seven_band_parametric_eq_max_gain_db: 9
144
+ tanh_distortion: 0.1
145
+ tanh_distortion_min: 0.1
146
+ tanh_distortion_max: 0.7
147
+ bass:
148
+ pitch_shift: 0.1
149
+ pitch_shift_min_semitones: -2
150
+ pitch_shift_max_semitones: 2
151
+ seven_band_parametric_eq: 0.25
152
+ seven_band_parametric_eq_min_gain_db: -3
153
+ seven_band_parametric_eq_max_gain_db: 6
154
+ tanh_distortion: 0.2
155
+ tanh_distortion_min: 0.1
156
+ tanh_distortion_max: 0.5
157
+ drums:
158
+ pitch_shift: 0.33
159
+ pitch_shift_min_semitones: -5
160
+ pitch_shift_max_semitones: 5
161
+ seven_band_parametric_eq: 0.25
162
+ seven_band_parametric_eq_min_gain_db: -9
163
+ seven_band_parametric_eq_max_gain_db: 9
164
+ tanh_distortion: 0.33
165
+ tanh_distortion_min: 0.1
166
+ tanh_distortion_max: 0.6
167
+ other:
168
+ pitch_shift: 0.1
169
+ pitch_shift_min_semitones: -4
170
+ pitch_shift_max_semitones: 4
171
+ gaussian_noise: 0.1
172
+ gaussian_noise_min_amplitude: 0.001
173
+ gaussian_noise_max_amplitude: 0.015
174
+ time_stretch: 0.01
175
+ time_stretch_min_rate: 0.8
176
+ time_stretch_max_rate: 1.25
177
+
178
+
179
+ inference:
180
+ batch_size: 1
181
+ dim_t: 256
182
+ num_overlap: 4
configs/config_musdb18_mdx23c_stht.yaml ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 261120
3
+ dim_f: 4096
4
+ dim_t: 256
5
+ hop_length: 1024
6
+ n_fft: 8192
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+
11
+ model:
12
+ act: gelu
13
+ bottleneck_factor: 4
14
+ growth: 128
15
+ norm: InstanceNorm
16
+ num_blocks_per_scale: 2
17
+ num_channels: 128
18
+ num_scales: 5
19
+ num_subbands: 4
20
+ scale:
21
+ - 2
22
+ - 2
23
+
24
+ training:
25
+ batch_size: 6
26
+ gradient_accumulation_steps: 1
27
+ grad_clip: 0
28
+ instruments:
29
+ - vocals
30
+ - bass
31
+ - drums
32
+ - other
33
+ lr: 9.0e-05
34
+ patience: 2
35
+ reduce_factor: 0.95
36
+ target_instrument: null
37
+ num_epochs: 1000
38
+ num_steps: 1000
39
+ q: 0.95
40
+ coarse_loss_clip: true
41
+ ema_momentum: 0.999
42
+ optimizer: adam
43
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
44
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
45
+
46
+ augmentations:
47
+ enable: true # enable or disable all augmentations (to fast disable if needed)
48
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
49
+ loudness_min: 0.5
50
+ loudness_max: 1.5
51
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
52
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
53
+ - 0.2
54
+ - 0.02
55
+ mixup_loudness_min: 0.5
56
+ mixup_loudness_max: 1.5
57
+
58
+ # apply mp3 compression to mixture only (emulate downloading mp3 from internet)
59
+ mp3_compression_on_mixture: 0.01
60
+ mp3_compression_on_mixture_bitrate_min: 32
61
+ mp3_compression_on_mixture_bitrate_max: 320
62
+ mp3_compression_on_mixture_backend: "lameenc"
63
+
64
+ all:
65
+ channel_shuffle: 0.5 # Set 0 or lower to disable
66
+ random_inverse: 0.1 # inverse track (better lower probability)
67
+ random_polarity: 0.5 # polarity change (multiply waveform to -1)
68
+ mp3_compression: 0.01
69
+ mp3_compression_min_bitrate: 32
70
+ mp3_compression_max_bitrate: 320
71
+ mp3_compression_backend: "lameenc"
72
+
73
+ # pedalboard reverb block
74
+ pedalboard_reverb: 0.01
75
+ pedalboard_reverb_room_size_min: 0.1
76
+ pedalboard_reverb_room_size_max: 0.9
77
+ pedalboard_reverb_damping_min: 0.1
78
+ pedalboard_reverb_damping_max: 0.9
79
+ pedalboard_reverb_wet_level_min: 0.1
80
+ pedalboard_reverb_wet_level_max: 0.9
81
+ pedalboard_reverb_dry_level_min: 0.1
82
+ pedalboard_reverb_dry_level_max: 0.9
83
+ pedalboard_reverb_width_min: 0.9
84
+ pedalboard_reverb_width_max: 1.0
85
+
86
+ # pedalboard chorus block
87
+ pedalboard_chorus: 0.01
88
+ pedalboard_chorus_rate_hz_min: 1.0
89
+ pedalboard_chorus_rate_hz_max: 7.0
90
+ pedalboard_chorus_depth_min: 0.25
91
+ pedalboard_chorus_depth_max: 0.95
92
+ pedalboard_chorus_centre_delay_ms_min: 3
93
+ pedalboard_chorus_centre_delay_ms_max: 10
94
+ pedalboard_chorus_feedback_min: 0.0
95
+ pedalboard_chorus_feedback_max: 0.5
96
+ pedalboard_chorus_mix_min: 0.1
97
+ pedalboard_chorus_mix_max: 0.9
98
+
99
+ # pedalboard phazer block
100
+ pedalboard_phazer: 0.01
101
+ pedalboard_phazer_rate_hz_min: 1.0
102
+ pedalboard_phazer_rate_hz_max: 10.0
103
+ pedalboard_phazer_depth_min: 0.25
104
+ pedalboard_phazer_depth_max: 0.95
105
+ pedalboard_phazer_centre_frequency_hz_min: 200
106
+ pedalboard_phazer_centre_frequency_hz_max: 12000
107
+ pedalboard_phazer_feedback_min: 0.0
108
+ pedalboard_phazer_feedback_max: 0.5
109
+ pedalboard_phazer_mix_min: 0.1
110
+ pedalboard_phazer_mix_max: 0.9
111
+
112
+ # pedalboard distortion block
113
+ pedalboard_distortion: 0.01
114
+ pedalboard_distortion_drive_db_min: 1.0
115
+ pedalboard_distortion_drive_db_max: 25.0
116
+
117
+ # pedalboard pitch shift block
118
+ pedalboard_pitch_shift: 0.01
119
+ pedalboard_pitch_shift_semitones_min: -7
120
+ pedalboard_pitch_shift_semitones_max: 7
121
+
122
+ # pedalboard resample block
123
+ pedalboard_resample: 0.01
124
+ pedalboard_resample_target_sample_rate_min: 4000
125
+ pedalboard_resample_target_sample_rate_max: 44100
126
+
127
+ # pedalboard bitcrash block
128
+ pedalboard_bitcrash: 0.01
129
+ pedalboard_bitcrash_bit_depth_min: 4
130
+ pedalboard_bitcrash_bit_depth_max: 16
131
+
132
+ # pedalboard mp3 compressor block
133
+ pedalboard_mp3_compressor: 0.01
134
+ pedalboard_mp3_compressor_pedalboard_mp3_compressor_min: 0
135
+ pedalboard_mp3_compressor_pedalboard_mp3_compressor_max: 9.999
136
+
137
+ vocals:
138
+ pitch_shift: 0.1
139
+ pitch_shift_min_semitones: -5
140
+ pitch_shift_max_semitones: 5
141
+ seven_band_parametric_eq: 0.25
142
+ seven_band_parametric_eq_min_gain_db: -9
143
+ seven_band_parametric_eq_max_gain_db: 9
144
+ tanh_distortion: 0.1
145
+ tanh_distortion_min: 0.1
146
+ tanh_distortion_max: 0.7
147
+ bass:
148
+ pitch_shift: 0.1
149
+ pitch_shift_min_semitones: -2
150
+ pitch_shift_max_semitones: 2
151
+ seven_band_parametric_eq: 0.25
152
+ seven_band_parametric_eq_min_gain_db: -3
153
+ seven_band_parametric_eq_max_gain_db: 6
154
+ tanh_distortion: 0.2
155
+ tanh_distortion_min: 0.1
156
+ tanh_distortion_max: 0.5
157
+ drums:
158
+ pitch_shift: 0.33
159
+ pitch_shift_min_semitones: -5
160
+ pitch_shift_max_semitones: 5
161
+ seven_band_parametric_eq: 0.25
162
+ seven_band_parametric_eq_min_gain_db: -9
163
+ seven_band_parametric_eq_max_gain_db: 9
164
+ tanh_distortion: 0.33
165
+ tanh_distortion_min: 0.1
166
+ tanh_distortion_max: 0.6
167
+ other:
168
+ pitch_shift: 0.1
169
+ pitch_shift_min_semitones: -4
170
+ pitch_shift_max_semitones: 4
171
+ gaussian_noise: 0.1
172
+ gaussian_noise_min_amplitude: 0.001
173
+ gaussian_noise_max_amplitude: 0.015
174
+ time_stretch: 0.01
175
+ time_stretch_min_rate: 0.8
176
+ time_stretch_max_rate: 1.25
177
+
178
+
179
+ inference:
180
+ batch_size: 1
181
+ dim_t: 256
182
+ num_overlap: 4
configs/config_musdb18_mel_band_roformer.yaml ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 131584
3
+ dim_f: 1024
4
+ dim_t: 256
5
+ hop_length: 512
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+
11
+ model:
12
+ dim: 192
13
+ depth: 8
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ linear_transformer_depth: 0
19
+ num_bands: 60
20
+ dim_head: 64
21
+ heads: 8
22
+ attn_dropout: 0.1
23
+ ff_dropout: 0.1
24
+ flash_attn: True
25
+ dim_freqs_in: 1025
26
+ sample_rate: 44100 # needed for mel filter bank from librosa
27
+ stft_n_fft: 2048
28
+ stft_hop_length: 512
29
+ stft_win_length: 2048
30
+ stft_normalized: False
31
+ mask_estimator_depth: 2
32
+ multi_stft_resolution_loss_weight: 1.0
33
+ multi_stft_resolutions_window_sizes: !!python/tuple
34
+ - 4096
35
+ - 2048
36
+ - 1024
37
+ - 512
38
+ - 256
39
+ multi_stft_hop_size: 147
40
+ multi_stft_normalized: False
41
+ mlp_expansion_factor: 4 # Probably too big (requires a lot of memory for weights)
42
+ use_torch_checkpoint: False # it allows to greatly reduce GPU memory consumption during training (not fully tested)
43
+ skip_connection: False # Enable skip connection between transformer blocks - can solve problem with gradients and probably faster training
44
+
45
+ training:
46
+ batch_size: 7
47
+ gradient_accumulation_steps: 1
48
+ grad_clip: 0
49
+ instruments:
50
+ - vocals
51
+ - bass
52
+ - drums
53
+ - other
54
+ lr: 5.0e-05
55
+ patience: 2
56
+ reduce_factor: 0.95
57
+ target_instrument: vocals
58
+ num_epochs: 1000
59
+ num_steps: 1000
60
+ q: 0.95
61
+ coarse_loss_clip: true
62
+ ema_momentum: 0.999
63
+ optimizer: adam
64
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
65
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
66
+
67
+ augmentations:
68
+ enable: true # enable or disable all augmentations (to fast disable if needed)
69
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
70
+ loudness_min: 0.5
71
+ loudness_max: 1.5
72
+
73
+ inference:
74
+ batch_size: 1
75
+ dim_t: 256
76
+ num_overlap: 4
configs/config_musdb18_mel_band_roformer_all_stems.yaml ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 352800
3
+ dim_f: 1024
4
+ dim_t: 256
5
+ hop_length: 441
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.000
10
+
11
+ model:
12
+ dim: 384
13
+ depth: 6
14
+ stereo: true
15
+ num_stems: 4
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ linear_transformer_depth: 0
19
+ num_bands: 60
20
+ dim_head: 64
21
+ heads: 8
22
+ attn_dropout: 0
23
+ ff_dropout: 0
24
+ flash_attn: True
25
+ dim_freqs_in: 1025
26
+ sample_rate: 44100 # needed for mel filter bank from librosa
27
+ stft_n_fft: 2048
28
+ stft_hop_length: 441
29
+ stft_win_length: 2048
30
+ stft_normalized: False
31
+ mask_estimator_depth: 2
32
+ multi_stft_resolution_loss_weight: 1.0
33
+ multi_stft_resolutions_window_sizes: !!python/tuple
34
+ - 4096
35
+ - 2048
36
+ - 1024
37
+ - 512
38
+ - 256
39
+ multi_stft_hop_size: 147
40
+ multi_stft_normalized: False
41
+ mlp_expansion_factor: 4 # Probably too big (requires a lot of memory for weights)
42
+ use_torch_checkpoint: False # it allows to greatly reduce GPU memory consumption during training (not fully tested)
43
+ skip_connection: False # Enable skip connection between transformer blocks - can solve problem with gradients and probably faster training
44
+
45
+ training:
46
+ batch_size: 1
47
+ gradient_accumulation_steps: 1
48
+ grad_clip: 0
49
+ instruments:
50
+ - drums
51
+ - bass
52
+ - other
53
+ - vocals
54
+ lr: 1.0e-05
55
+ patience: 2
56
+ reduce_factor: 0.95
57
+ target_instrument: null
58
+ num_epochs: 1000
59
+ num_steps: 1000
60
+ augmentation: false # enable augmentations by audiomentations and pedalboard
61
+ augmentation_type: null
62
+ use_mp3_compress: false # Deprecated
63
+ augmentation_mix: false # Mix several stems of the same type with some probability
64
+ augmentation_loudness: false # randomly change loudness of each stem
65
+ augmentation_loudness_type: 1 # Type 1 or 2
66
+ augmentation_loudness_min: 0
67
+ augmentation_loudness_max: 0
68
+ q: 0.95
69
+ coarse_loss_clip: false
70
+ ema_momentum: 0.999
71
+ optimizer: adam
72
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
73
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
74
+
75
+
76
+ augmentations:
77
+ enable: true # enable or disable all augmentations (to fast disable if needed)
78
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
79
+ loudness_min: 0.5
80
+ loudness_max: 1.5
81
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
82
+ mixup_probs:
83
+ !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
84
+ - 0.2
85
+ - 0.02
86
+ mixup_loudness_min: 0.5
87
+ mixup_loudness_max: 1.5
88
+ all:
89
+ channel_shuffle: 0.5 # Set 0 or lower to disable
90
+ random_inverse: 0.1 # inverse track (better lower probability)
91
+ random_polarity: 0.5 # polarity change (multiply waveform to -1)
92
+
93
+
94
+ inference:
95
+ batch_size: 4
96
+ dim_t: 256
97
+ num_overlap: 2
configs/config_musdb18_scnet.yaml ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 485100 # 44100 * 11
3
+ num_channels: 2
4
+ sample_rate: 44100
5
+ min_mean_abs: 0.000
6
+
7
+ model:
8
+ sources:
9
+ - drums
10
+ - bass
11
+ - other
12
+ - vocals
13
+ audio_channels: 2
14
+ dims:
15
+ - 4
16
+ - 32
17
+ - 64
18
+ - 128
19
+ nfft: 4096
20
+ hop_size: 1024
21
+ win_size: 4096
22
+ normalized: True
23
+ band_SR:
24
+ - 0.175
25
+ - 0.392
26
+ - 0.433
27
+ band_stride:
28
+ - 1
29
+ - 4
30
+ - 16
31
+ band_kernel:
32
+ - 3
33
+ - 4
34
+ - 16
35
+ conv_depths:
36
+ - 3
37
+ - 2
38
+ - 1
39
+ compress: 4
40
+ conv_kernel: 3
41
+ num_dplayer: 6
42
+ expand: 1
43
+
44
+ training:
45
+ batch_size: 10
46
+ gradient_accumulation_steps: 1
47
+ grad_clip: 0
48
+ instruments:
49
+ - drums
50
+ - bass
51
+ - other
52
+ - vocals
53
+ lr: 5.0e-04
54
+ patience: 2
55
+ reduce_factor: 0.95
56
+ target_instrument: null
57
+ num_epochs: 1000
58
+ num_steps: 1000
59
+ q: 0.95
60
+ coarse_loss_clip: true
61
+ ema_momentum: 0.999
62
+ optimizer: adam
63
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
64
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
65
+
66
+ augmentations:
67
+ enable: true # enable or disable all augmentations (to fast disable if needed)
68
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
69
+ loudness_min: 0.5
70
+ loudness_max: 1.5
71
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
72
+ mixup_probs:
73
+ !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
74
+ - 0.2
75
+ - 0.02
76
+ mixup_loudness_min: 0.5
77
+ mixup_loudness_max: 1.5
78
+
79
+ inference:
80
+ batch_size: 8
81
+ dim_t: 256
82
+ num_overlap: 4
83
+ normalize: true
configs/config_musdb18_scnet_large.yaml ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 485100 # 44100 * 11
3
+ num_channels: 2
4
+ sample_rate: 44100
5
+ min_mean_abs: 0.000
6
+
7
+ model:
8
+ sources:
9
+ - drums
10
+ - bass
11
+ - other
12
+ - vocals
13
+ audio_channels: 2
14
+ dims:
15
+ - 4
16
+ - 64
17
+ - 128
18
+ - 256
19
+ nfft: 4096
20
+ hop_size: 1024
21
+ win_size: 4096
22
+ normalized: True
23
+ band_SR:
24
+ - 0.225
25
+ - 0.372
26
+ - 0.403
27
+ band_stride:
28
+ - 1
29
+ - 4
30
+ - 16
31
+ band_kernel:
32
+ - 3
33
+ - 4
34
+ - 16
35
+ conv_depths:
36
+ - 3
37
+ - 2
38
+ - 1
39
+ compress: 4
40
+ conv_kernel: 3
41
+ num_dplayer: 6
42
+ expand: 1
43
+
44
+ training:
45
+ batch_size: 6
46
+ gradient_accumulation_steps: 1
47
+ grad_clip: 0
48
+ instruments:
49
+ - drums
50
+ - bass
51
+ - other
52
+ - vocals
53
+ lr: 5.0e-04
54
+ patience: 2
55
+ reduce_factor: 0.95
56
+ target_instrument: null
57
+ num_epochs: 1000
58
+ num_steps: 1000
59
+ q: 0.95
60
+ coarse_loss_clip: true
61
+ ema_momentum: 0.999
62
+ optimizer: adam
63
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
64
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
65
+
66
+ augmentations:
67
+ enable: true # enable or disable all augmentations (to fast disable if needed)
68
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
69
+ loudness_min: 0.5
70
+ loudness_max: 1.5
71
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
72
+ mixup_probs:
73
+ !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
74
+ - 0.2
75
+ - 0.02
76
+ mixup_loudness_min: 0.5
77
+ mixup_loudness_max: 1.5
78
+
79
+ inference:
80
+ batch_size: 8
81
+ dim_t: 256
82
+ num_overlap: 4
83
+ normalize: false
configs/config_musdb18_segm_models.yaml ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 261632
3
+ dim_f: 4096
4
+ dim_t: 512
5
+ hop_length: 512
6
+ n_fft: 8192
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+
11
+ model:
12
+ encoder_name: tu-maxvit_large_tf_512 # look here for possibilities: https://github.com/qubvel/segmentation_models.pytorch#encoders-
13
+ decoder_type: unet # unet, fpn
14
+ act: gelu
15
+ num_channels: 128
16
+ num_subbands: 8
17
+
18
+ training:
19
+ batch_size: 7
20
+ gradient_accumulation_steps: 1
21
+ grad_clip: 0
22
+ instruments:
23
+ - vocals
24
+ - bass
25
+ - drums
26
+ - other
27
+ lr: 5.0e-05
28
+ patience: 2
29
+ reduce_factor: 0.95
30
+ target_instrument: null
31
+ num_epochs: 1000
32
+ num_steps: 2000
33
+ q: 0.95
34
+ coarse_loss_clip: true
35
+ ema_momentum: 0.999
36
+ optimizer: adamw
37
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
38
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
39
+
40
+ augmentations:
41
+ enable: true # enable or disable all augmentations (to fast disable if needed)
42
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
43
+ loudness_min: 0.5
44
+ loudness_max: 1.5
45
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
46
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
47
+ - 0.2
48
+ - 0.02
49
+ mixup_loudness_min: 0.5
50
+ mixup_loudness_max: 1.5
51
+
52
+ # apply mp3 compression to mixture only (emulate downloading mp3 from internet)
53
+ mp3_compression_on_mixture: 0.01
54
+ mp3_compression_on_mixture_bitrate_min: 32
55
+ mp3_compression_on_mixture_bitrate_max: 320
56
+ mp3_compression_on_mixture_backend: "lameenc"
57
+
58
+ all:
59
+ channel_shuffle: 0.5 # Set 0 or lower to disable
60
+ random_inverse: 0.1 # inverse track (better lower probability)
61
+ random_polarity: 0.5 # polarity change (multiply waveform to -1)
62
+ mp3_compression: 0.01
63
+ mp3_compression_min_bitrate: 32
64
+ mp3_compression_max_bitrate: 320
65
+ mp3_compression_backend: "lameenc"
66
+
67
+ vocals:
68
+ pitch_shift: 0.1
69
+ pitch_shift_min_semitones: -5
70
+ pitch_shift_max_semitones: 5
71
+ seven_band_parametric_eq: 0.25
72
+ seven_band_parametric_eq_min_gain_db: -9
73
+ seven_band_parametric_eq_max_gain_db: 9
74
+ tanh_distortion: 0.1
75
+ tanh_distortion_min: 0.1
76
+ tanh_distortion_max: 0.7
77
+ other:
78
+ pitch_shift: 0.1
79
+ pitch_shift_min_semitones: -4
80
+ pitch_shift_max_semitones: 4
81
+ gaussian_noise: 0.1
82
+ gaussian_noise_min_amplitude: 0.001
83
+ gaussian_noise_max_amplitude: 0.015
84
+ time_stretch: 0.01
85
+ time_stretch_min_rate: 0.8
86
+ time_stretch_max_rate: 1.25
87
+
88
+
89
+ inference:
90
+ batch_size: 1
91
+ dim_t: 512
92
+ num_overlap: 4
configs/config_musdb18_torchseg.yaml ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 261632
3
+ dim_f: 4096
4
+ dim_t: 512
5
+ hop_length: 512
6
+ n_fft: 8192
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+
11
+ model:
12
+ encoder_name: maxvit_tiny_tf_512 # look with torchseg.list_encoders(). Currently 858 available
13
+ decoder_type: unet # unet, fpn
14
+ act: gelu
15
+ num_channels: 128
16
+ num_subbands: 8
17
+
18
+ training:
19
+ batch_size: 18
20
+ gradient_accumulation_steps: 1
21
+ grad_clip: 0
22
+ instruments:
23
+ - vocals
24
+ - bass
25
+ - drums
26
+ - other
27
+ lr: 5.0e-05
28
+ patience: 2
29
+ reduce_factor: 0.95
30
+ target_instrument: null
31
+ num_epochs: 1000
32
+ num_steps: 2000
33
+ q: 0.95
34
+ coarse_loss_clip: true
35
+ ema_momentum: 0.999
36
+ optimizer: adamw
37
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
38
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
39
+
40
+ augmentations:
41
+ enable: true # enable or disable all augmentations (to fast disable if needed)
42
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
43
+ loudness_min: 0.5
44
+ loudness_max: 1.5
45
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
46
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
47
+ - 0.2
48
+ - 0.02
49
+ mixup_loudness_min: 0.5
50
+ mixup_loudness_max: 1.5
51
+
52
+ # apply mp3 compression to mixture only (emulate downloading mp3 from internet)
53
+ mp3_compression_on_mixture: 0.01
54
+ mp3_compression_on_mixture_bitrate_min: 32
55
+ mp3_compression_on_mixture_bitrate_max: 320
56
+ mp3_compression_on_mixture_backend: "lameenc"
57
+
58
+ all:
59
+ channel_shuffle: 0.5 # Set 0 or lower to disable
60
+ random_inverse: 0.1 # inverse track (better lower probability)
61
+ random_polarity: 0.5 # polarity change (multiply waveform to -1)
62
+ mp3_compression: 0.01
63
+ mp3_compression_min_bitrate: 32
64
+ mp3_compression_max_bitrate: 320
65
+ mp3_compression_backend: "lameenc"
66
+
67
+ vocals:
68
+ pitch_shift: 0.1
69
+ pitch_shift_min_semitones: -5
70
+ pitch_shift_max_semitones: 5
71
+ seven_band_parametric_eq: 0.25
72
+ seven_band_parametric_eq_min_gain_db: -9
73
+ seven_band_parametric_eq_max_gain_db: 9
74
+ tanh_distortion: 0.1
75
+ tanh_distortion_min: 0.1
76
+ tanh_distortion_max: 0.7
77
+ other:
78
+ pitch_shift: 0.1
79
+ pitch_shift_min_semitones: -4
80
+ pitch_shift_max_semitones: 4
81
+ gaussian_noise: 0.1
82
+ gaussian_noise_min_amplitude: 0.001
83
+ gaussian_noise_max_amplitude: 0.015
84
+ time_stretch: 0.01
85
+ time_stretch_min_rate: 0.8
86
+ time_stretch_max_rate: 1.25
87
+
88
+
89
+ inference:
90
+ batch_size: 1
91
+ dim_t: 512
92
+ num_overlap: 4
configs/config_vocals_bandit_bsrnn_multi_mus64.yaml ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: "MultiMaskMultiSourceBandSplitRNN"
2
+ audio:
3
+ chunk_size: 264600
4
+ num_channels: 2
5
+ sample_rate: 44100
6
+ min_mean_abs: 0.001
7
+
8
+ model:
9
+ in_channel: 1
10
+ stems: ['vocals', 'other']
11
+ band_specs: "musical"
12
+ n_bands: 64
13
+ fs: 44100
14
+ require_no_overlap: false
15
+ require_no_gap: true
16
+ normalize_channel_independently: false
17
+ treat_channel_as_feature: true
18
+ n_sqm_modules: 8
19
+ emb_dim: 128
20
+ rnn_dim: 256
21
+ bidirectional: true
22
+ rnn_type: "GRU"
23
+ mlp_dim: 512
24
+ hidden_activation: "Tanh"
25
+ hidden_activation_kwargs: null
26
+ complex_mask: true
27
+ n_fft: 2048
28
+ win_length: 2048
29
+ hop_length: 512
30
+ window_fn: "hann_window"
31
+ wkwargs: null
32
+ power: null
33
+ center: true
34
+ normalized: true
35
+ pad_mode: "constant"
36
+ onesided: true
37
+
38
+ training:
39
+ batch_size: 4
40
+ gradient_accumulation_steps: 4
41
+ grad_clip: 0
42
+ instruments:
43
+ - vocals
44
+ - other
45
+ lr: 9.0e-05
46
+ patience: 2
47
+ reduce_factor: 0.95
48
+ target_instrument: null
49
+ num_epochs: 1000
50
+ num_steps: 1000
51
+ q: 0.95
52
+ coarse_loss_clip: true
53
+ ema_momentum: 0.999
54
+ optimizer: adam
55
+ other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
56
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
57
+
58
+ augmentations:
59
+ enable: true # enable or disable all augmentations (to fast disable if needed)
60
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
61
+ loudness_min: 0.5
62
+ loudness_max: 1.5
63
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
64
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
65
+ - 0.2
66
+ - 0.02
67
+ mixup_loudness_min: 0.5
68
+ mixup_loudness_max: 1.5
69
+
70
+ inference:
71
+ batch_size: 1
72
+ dim_t: 256
73
+ num_overlap: 4
configs/config_vocals_bs_mamba2.yaml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 132300 # samplerate * segment
3
+ hop_length: 1024
4
+ min_mean_abs: 0.0
5
+
6
+ training:
7
+ batch_size: 8
8
+ gradient_accumulation_steps: 1
9
+ grad_clip: 0
10
+ segment: 11
11
+ shift: 1
12
+ samplerate: 44100
13
+ channels: 2
14
+ normalize: true
15
+ instruments: ['vocals', 'other']
16
+ target_instrument: null
17
+ num_epochs: 1000
18
+ num_steps: 1000
19
+ optimizer: prodigy
20
+ lr: 1.0
21
+ patience: 2
22
+ reduce_factor: 0.95
23
+ q: 0.95
24
+ coarse_loss_clip: true
25
+ ema_momentum: 0.999
26
+ read_metadata_procs: 8
27
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
28
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
29
+
30
+ model:
31
+ sr: 44100
32
+ win: 2048
33
+ stride: 512
34
+ feature_dim: 128
35
+ num_repeat_mask: 8
36
+ num_repeat_map: 4
37
+ num_output: 2
38
+
39
+ augmentations:
40
+ enable: false # enable or disable all augmentations (to fast disable if needed)
41
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
42
+ loudness_min: 0.5
43
+ loudness_max: 1.5
44
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
45
+ mixup_probs: [0.2, 0.02]
46
+ mixup_loudness_min: 0.5
47
+ mixup_loudness_max: 1.5
48
+
49
+ inference:
50
+ num_overlap: 2
51
+ batch_size: 4
configs/config_vocals_bs_roformer.yaml ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 131584
3
+ dim_f: 1024
4
+ dim_t: 256
5
+ hop_length: 512
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+
11
+ model:
12
+ dim: 192
13
+ depth: 6
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ linear_transformer_depth: 0
19
+ freqs_per_bands: !!python/tuple
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 2
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 4
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 12
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 24
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 48
80
+ - 128
81
+ - 129
82
+ dim_head: 64
83
+ heads: 8
84
+ attn_dropout: 0.1
85
+ ff_dropout: 0.1
86
+ flash_attn: true
87
+ dim_freqs_in: 1025
88
+ stft_n_fft: 2048
89
+ stft_hop_length: 512
90
+ stft_win_length: 2048
91
+ stft_normalized: false
92
+ mask_estimator_depth: 2
93
+ multi_stft_resolution_loss_weight: 1.0
94
+ multi_stft_resolutions_window_sizes: !!python/tuple
95
+ - 4096
96
+ - 2048
97
+ - 1024
98
+ - 512
99
+ - 256
100
+ multi_stft_hop_size: 147
101
+ multi_stft_normalized: False
102
+ mlp_expansion_factor: 4 # Probably too big (requires a lot of memory for weights)
103
+ use_torch_checkpoint: False # it allows to greatly reduce GPU memory consumption during training (not fully tested)
104
+ skip_connection: False # Enable skip connection between transformer blocks - can solve problem with gradients and probably faster training
105
+
106
+ training:
107
+ batch_size: 10
108
+ gradient_accumulation_steps: 1
109
+ grad_clip: 0
110
+ instruments:
111
+ - vocals
112
+ - other
113
+ lr: 5.0e-05
114
+ patience: 2
115
+ reduce_factor: 0.95
116
+ target_instrument: vocals
117
+ num_epochs: 1000
118
+ num_steps: 1000
119
+ q: 0.95
120
+ coarse_loss_clip: true
121
+ ema_momentum: 0.999
122
+ optimizer: adam
123
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
124
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
125
+
126
+ augmentations:
127
+ enable: true # enable or disable all augmentations (to fast disable if needed)
128
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
129
+ loudness_min: 0.5
130
+ loudness_max: 1.5
131
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
132
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
133
+ - 0.2
134
+ - 0.02
135
+ mixup_loudness_min: 0.5
136
+ mixup_loudness_max: 1.5
137
+
138
+ inference:
139
+ batch_size: 1
140
+ dim_t: 256
141
+ num_overlap: 4
configs/config_vocals_htdemucs.yaml ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 485100 # samplerate * segment
3
+ min_mean_abs: 0.001
4
+ hop_length: 1024
5
+
6
+ training:
7
+ batch_size: 10
8
+ gradient_accumulation_steps: 1
9
+ grad_clip: 0
10
+ segment: 11
11
+ shift: 1
12
+ samplerate: 44100
13
+ channels: 2
14
+ normalize: true
15
+ instruments: ['vocals', 'other']
16
+ target_instrument: null
17
+ num_epochs: 1000
18
+ num_steps: 1000
19
+ optimizer: adam
20
+ lr: 9.0e-05
21
+ patience: 2
22
+ reduce_factor: 0.95
23
+ q: 0.95
24
+ coarse_loss_clip: true
25
+ ema_momentum: 0.999
26
+ other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
27
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
28
+
29
+ augmentations:
30
+ enable: true # enable or disable all augmentations (to fast disable if needed)
31
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
32
+ loudness_min: 0.5
33
+ loudness_max: 1.5
34
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
35
+ mixup_probs: [0.2, 0.02]
36
+ mixup_loudness_min: 0.5
37
+ mixup_loudness_max: 1.5
38
+
39
+ inference:
40
+ num_overlap: 2
41
+ batch_size: 8
42
+
43
+ model: htdemucs
44
+
45
+ htdemucs: # see demucs/htdemucs.py for a detailed description
46
+ # Channels
47
+ channels: 48
48
+ channels_time:
49
+ growth: 2
50
+ # STFT
51
+ num_subbands: 1
52
+ nfft: 4096
53
+ wiener_iters: 0
54
+ end_iters: 0
55
+ wiener_residual: false
56
+ cac: true
57
+ # Main structure
58
+ depth: 4
59
+ rewrite: true
60
+ # Frequency Branch
61
+ multi_freqs: []
62
+ multi_freqs_depth: 3
63
+ freq_emb: 0.2
64
+ emb_scale: 10
65
+ emb_smooth: true
66
+ # Convolutions
67
+ kernel_size: 8
68
+ stride: 4
69
+ time_stride: 2
70
+ context: 1
71
+ context_enc: 0
72
+ # normalization
73
+ norm_starts: 4
74
+ norm_groups: 4
75
+ # DConv residual branch
76
+ dconv_mode: 3
77
+ dconv_depth: 2
78
+ dconv_comp: 8
79
+ dconv_init: 1e-3
80
+ # Before the Transformer
81
+ bottom_channels: 512
82
+ # CrossTransformer
83
+ # ------ Common to all
84
+ # Regular parameters
85
+ t_layers: 5
86
+ t_hidden_scale: 4.0
87
+ t_heads: 8
88
+ t_dropout: 0.0
89
+ t_layer_scale: True
90
+ t_gelu: True
91
+ # ------------- Positional Embedding
92
+ t_emb: sin
93
+ t_max_positions: 10000 # for the scaled embedding
94
+ t_max_period: 10000.0
95
+ t_weight_pos_embed: 1.0
96
+ t_cape_mean_normalize: True
97
+ t_cape_augment: True
98
+ t_cape_glob_loc_scale: [5000.0, 1.0, 1.4]
99
+ t_sin_random_shift: 0
100
+ # ------------- norm before a transformer encoder
101
+ t_norm_in: True
102
+ t_norm_in_group: False
103
+ # ------------- norm inside the encoder
104
+ t_group_norm: False
105
+ t_norm_first: True
106
+ t_norm_out: True
107
+ # ------------- optim
108
+ t_weight_decay: 0.0
109
+ t_lr:
110
+ # ------------- sparsity
111
+ t_sparse_self_attn: False
112
+ t_sparse_cross_attn: False
113
+ t_mask_type: diag
114
+ t_mask_random_seed: 42
115
+ t_sparse_attn_window: 400
116
+ t_global_window: 100
117
+ t_sparsity: 0.95
118
+ t_auto_sparsity: False
119
+ # Cross Encoder First (False)
120
+ t_cross_first: False
121
+ # Weight init
122
+ rescale: 0.1
123
+
configs/config_vocals_mdx23c.yaml ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 261120
3
+ dim_f: 4096
4
+ dim_t: 256
5
+ hop_length: 1024
6
+ n_fft: 8192
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+
11
+ model:
12
+ act: gelu
13
+ bottleneck_factor: 4
14
+ growth: 128
15
+ norm: InstanceNorm
16
+ num_blocks_per_scale: 2
17
+ num_channels: 128
18
+ num_scales: 5
19
+ num_subbands: 4
20
+ scale:
21
+ - 2
22
+ - 2
23
+
24
+ training:
25
+ batch_size: 6
26
+ gradient_accumulation_steps: 1
27
+ grad_clip: 0
28
+ instruments:
29
+ - vocals
30
+ - other
31
+ lr: 9.0e-05
32
+ patience: 2
33
+ reduce_factor: 0.95
34
+ target_instrument: null
35
+ num_epochs: 1000
36
+ num_steps: 1000
37
+ q: 0.95
38
+ coarse_loss_clip: true
39
+ ema_momentum: 0.999
40
+ optimizer: adam
41
+ read_metadata_procs: 8 # Number of processes to use during metadata reading for dataset. Can speed up metadata generation
42
+ other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
43
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
44
+
45
+ augmentations:
46
+ enable: true # enable or disable all augmentations (to fast disable if needed)
47
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
48
+ loudness_min: 0.5
49
+ loudness_max: 1.5
50
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
51
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
52
+ - 0.2
53
+ - 0.02
54
+ mixup_loudness_min: 0.5
55
+ mixup_loudness_max: 1.5
56
+
57
+ # apply mp3 compression to mixture only (emulate downloading mp3 from internet)
58
+ mp3_compression_on_mixture: 0.01
59
+ mp3_compression_on_mixture_bitrate_min: 32
60
+ mp3_compression_on_mixture_bitrate_max: 320
61
+ mp3_compression_on_mixture_backend: "lameenc"
62
+
63
+ all:
64
+ channel_shuffle: 0.5 # Set 0 or lower to disable
65
+ random_inverse: 0.1 # inverse track (better lower probability)
66
+ random_polarity: 0.5 # polarity change (multiply waveform to -1)
67
+ mp3_compression: 0.01
68
+ mp3_compression_min_bitrate: 32
69
+ mp3_compression_max_bitrate: 320
70
+ mp3_compression_backend: "lameenc"
71
+
72
+ vocals:
73
+ pitch_shift: 0.1
74
+ pitch_shift_min_semitones: -5
75
+ pitch_shift_max_semitones: 5
76
+ seven_band_parametric_eq: 0.25
77
+ seven_band_parametric_eq_min_gain_db: -9
78
+ seven_band_parametric_eq_max_gain_db: 9
79
+ tanh_distortion: 0.1
80
+ tanh_distortion_min: 0.1
81
+ tanh_distortion_max: 0.7
82
+ other:
83
+ pitch_shift: 0.1
84
+ pitch_shift_min_semitones: -4
85
+ pitch_shift_max_semitones: 4
86
+ gaussian_noise: 0.1
87
+ gaussian_noise_min_amplitude: 0.001
88
+ gaussian_noise_max_amplitude: 0.015
89
+ time_stretch: 0.01
90
+ time_stretch_min_rate: 0.8
91
+ time_stretch_max_rate: 1.25
92
+
93
+ inference:
94
+ batch_size: 1
95
+ dim_t: 256
96
+ num_overlap: 4
configs/config_vocals_mel_band_roformer.yaml ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 131584
3
+ dim_f: 1024
4
+ dim_t: 256
5
+ hop_length: 512
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+
11
+ model:
12
+ dim: 192
13
+ depth: 8
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ linear_transformer_depth: 0
19
+ num_bands: 60
20
+ dim_head: 64
21
+ heads: 8
22
+ attn_dropout: 0.1
23
+ ff_dropout: 0.1
24
+ flash_attn: True
25
+ dim_freqs_in: 1025
26
+ sample_rate: 44100 # needed for mel filter bank from librosa
27
+ stft_n_fft: 2048
28
+ stft_hop_length: 512
29
+ stft_win_length: 2048
30
+ stft_normalized: False
31
+ mask_estimator_depth: 2
32
+ multi_stft_resolution_loss_weight: 1.0
33
+ multi_stft_resolutions_window_sizes: !!python/tuple
34
+ - 4096
35
+ - 2048
36
+ - 1024
37
+ - 512
38
+ - 256
39
+ multi_stft_hop_size: 147
40
+ multi_stft_normalized: False
41
+ mlp_expansion_factor: 4 # Probably too big (requires a lot of memory for weights)
42
+ use_torch_checkpoint: False # it allows to greatly reduce GPU memory consumption during training (not fully tested)
43
+ skip_connection: False # Enable skip connection between transformer blocks - can solve problem with gradients and probably faster training
44
+
45
+ training:
46
+ batch_size: 7
47
+ gradient_accumulation_steps: 1
48
+ grad_clip: 0
49
+ instruments:
50
+ - vocals
51
+ - other
52
+ lr: 5.0e-05
53
+ patience: 2
54
+ reduce_factor: 0.95
55
+ target_instrument: vocals
56
+ num_epochs: 1000
57
+ num_steps: 1000
58
+ q: 0.95
59
+ coarse_loss_clip: true
60
+ ema_momentum: 0.999
61
+ optimizer: adam
62
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
63
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
64
+
65
+ augmentations:
66
+ enable: true # enable or disable all augmentations (to fast disable if needed)
67
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
68
+ loudness_min: 0.5
69
+ loudness_max: 1.5
70
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
71
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
72
+ - 0.2
73
+ - 0.02
74
+ mixup_loudness_min: 0.5
75
+ mixup_loudness_max: 1.5
76
+
77
+ inference:
78
+ batch_size: 1
79
+ dim_t: 256
80
+ num_overlap: 4
configs/config_vocals_scnet.yaml ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 485100 # 44100 * 11
3
+ num_channels: 2
4
+ sample_rate: 44100
5
+ min_mean_abs: 0.000
6
+
7
+ model:
8
+ sources:
9
+ - vocals
10
+ - other
11
+ audio_channels: 2
12
+ dims:
13
+ - 4
14
+ - 32
15
+ - 64
16
+ - 128
17
+ nfft: 4096
18
+ hop_size: 1024
19
+ win_size: 4096
20
+ normalized: True
21
+ band_SR:
22
+ - 0.175
23
+ - 0.392
24
+ - 0.433
25
+ band_stride:
26
+ - 1
27
+ - 4
28
+ - 16
29
+ band_kernel:
30
+ - 3
31
+ - 4
32
+ - 16
33
+ conv_depths:
34
+ - 3
35
+ - 2
36
+ - 1
37
+ compress: 4
38
+ conv_kernel: 3
39
+ num_dplayer: 6
40
+ expand: 1
41
+
42
+ training:
43
+ batch_size: 10
44
+ gradient_accumulation_steps: 1
45
+ grad_clip: 0
46
+ instruments:
47
+ - vocals
48
+ - other
49
+ lr: 5.0e-04
50
+ patience: 2
51
+ reduce_factor: 0.95
52
+ target_instrument: null
53
+ num_epochs: 1000
54
+ num_steps: 10
55
+ q: 0.95
56
+ coarse_loss_clip: true
57
+ ema_momentum: 0.999
58
+ optimizer: adam
59
+ other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
60
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
61
+
62
+ augmentations:
63
+ enable: true # enable or disable all augmentations (to fast disable if needed)
64
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
65
+ loudness_min: 0.5
66
+ loudness_max: 1.5
67
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
68
+ mixup_probs:
69
+ !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
70
+ - 0.2
71
+ - 0.02
72
+ mixup_loudness_min: 0.5
73
+ mixup_loudness_max: 1.5
74
+
75
+ inference:
76
+ batch_size: 8
77
+ dim_t: 256
78
+ num_overlap: 4
79
+ normalize: false
configs/config_vocals_scnet_large.yaml ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 485100 # 44100 * 11
3
+ num_channels: 2
4
+ sample_rate: 44100
5
+ min_mean_abs: 0.000
6
+
7
+ model:
8
+ sources:
9
+ - vocals
10
+ - other
11
+ audio_channels: 2
12
+ dims:
13
+ - 4
14
+ - 64
15
+ - 128
16
+ - 256
17
+ nfft: 4096
18
+ hop_size: 1024
19
+ win_size: 4096
20
+ normalized: True
21
+ band_SR:
22
+ - 0.225
23
+ - 0.372
24
+ - 0.403
25
+ band_stride:
26
+ - 1
27
+ - 4
28
+ - 16
29
+ band_kernel:
30
+ - 3
31
+ - 4
32
+ - 16
33
+ conv_depths:
34
+ - 3
35
+ - 2
36
+ - 1
37
+ compress: 4
38
+ conv_kernel: 3
39
+ num_dplayer: 6
40
+ expand: 1
41
+
42
+ training:
43
+ batch_size: 6
44
+ gradient_accumulation_steps: 1
45
+ grad_clip: 0
46
+ instruments:
47
+ - vocals
48
+ - other
49
+ lr: 1.0e-04
50
+ patience: 2
51
+ reduce_factor: 0.95
52
+ target_instrument: null
53
+ num_epochs: 1000
54
+ num_steps: 1000
55
+ q: 0.95
56
+ coarse_loss_clip: true
57
+ ema_momentum: 0.999
58
+ optimizer: adam
59
+ other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
60
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
61
+
62
+ augmentations:
63
+ enable: false # enable or disable all augmentations (to fast disable if needed)
64
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
65
+ loudness_min: 0.5
66
+ loudness_max: 1.5
67
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
68
+ mixup_probs:
69
+ !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
70
+ - 0.2
71
+ - 0.02
72
+ mixup_loudness_min: 0.5
73
+ mixup_loudness_max: 1.5
74
+
75
+ inference:
76
+ batch_size: 8
77
+ dim_t: 256
78
+ num_overlap: 4
79
+ normalize: false
configs/config_vocals_scnet_unofficial.yaml ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 264600
3
+ num_channels: 2
4
+ sample_rate: 44100
5
+ min_mean_abs: 0.000
6
+
7
+ model:
8
+ dims: [4, 32, 64, 128]
9
+ bandsplit_ratios: [.175, .392, .433]
10
+ downsample_strides: [1, 4, 16]
11
+ n_conv_modules: [3, 2, 1]
12
+ n_rnn_layers: 6
13
+ rnn_hidden_dim: 128
14
+ n_sources: 2
15
+
16
+ n_fft: 4096
17
+ hop_length: 1024
18
+ win_length: 4096
19
+ stft_normalized: false
20
+
21
+ use_mamba: false
22
+ d_state: 16
23
+ d_conv: 4
24
+ d_expand: 2
25
+
26
+ training:
27
+ batch_size: 10
28
+ gradient_accumulation_steps: 2
29
+ grad_clip: 0
30
+ instruments:
31
+ - vocals
32
+ - other
33
+ lr: 5.0e-04
34
+ patience: 2
35
+ reduce_factor: 0.95
36
+ target_instrument: null
37
+ num_epochs: 1000
38
+ num_steps: 1000
39
+ q: 0.95
40
+ coarse_loss_clip: true
41
+ ema_momentum: 0.999
42
+ optimizer: adam
43
+ other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
44
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
45
+
46
+ augmentations:
47
+ enable: true # enable or disable all augmentations (to fast disable if needed)
48
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
49
+ loudness_min: 0.5
50
+ loudness_max: 1.5
51
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
52
+ mixup_probs:
53
+ !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
54
+ - 0.2
55
+ - 0.02
56
+ mixup_loudness_min: 0.5
57
+ mixup_loudness_max: 1.5
58
+
59
+ inference:
60
+ batch_size: 8
61
+ dim_t: 256
62
+ num_overlap: 4
configs/config_vocals_segm_models.yaml ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 261632
3
+ dim_f: 4096
4
+ dim_t: 512
5
+ hop_length: 512
6
+ n_fft: 8192
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+
11
+ model:
12
+ encoder_name: tu-maxvit_large_tf_512 # look here for possibilities: https://github.com/qubvel/segmentation_models.pytorch#encoders-
13
+ decoder_type: unet # unet, fpn
14
+ act: gelu
15
+ num_channels: 128
16
+ num_subbands: 8
17
+
18
+ loss_multistft:
19
+ fft_sizes:
20
+ - 1024
21
+ - 2048
22
+ - 4096
23
+ hop_sizes:
24
+ - 512
25
+ - 1024
26
+ - 2048
27
+ win_lengths:
28
+ - 1024
29
+ - 2048
30
+ - 4096
31
+ window: "hann_window"
32
+ scale: "mel"
33
+ n_bins: 128
34
+ sample_rate: 44100
35
+ perceptual_weighting: true
36
+ w_sc: 1.0
37
+ w_log_mag: 1.0
38
+ w_lin_mag: 0.0
39
+ w_phs: 0.0
40
+ mag_distance: "L1"
41
+
42
+
43
+ training:
44
+ batch_size: 8
45
+ gradient_accumulation_steps: 1
46
+ grad_clip: 0
47
+ instruments:
48
+ - vocals
49
+ - other
50
+ lr: 5.0e-05
51
+ patience: 2
52
+ reduce_factor: 0.95
53
+ target_instrument: null
54
+ num_epochs: 1000
55
+ num_steps: 2000
56
+ q: 0.95
57
+ coarse_loss_clip: true
58
+ ema_momentum: 0.999
59
+ optimizer: adamw
60
+ other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
61
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
62
+
63
+ augmentations:
64
+ enable: true # enable or disable all augmentations (to quickly disable them all if needed)
65
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
66
+ loudness_min: 0.5
67
+ loudness_max: 1.5
68
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
69
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
70
+ - 0.2
71
+ - 0.02
72
+ mixup_loudness_min: 0.5
73
+ mixup_loudness_max: 1.5
74
+
75
+ inference:
76
+ batch_size: 1
77
+ dim_t: 512
78
+ num_overlap: 4
configs/config_vocals_swin_upernet.yaml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 261632
3
+ dim_f: 4096
4
+ dim_t: 512
5
+ hop_length: 512
6
+ n_fft: 8192
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+
11
+ model:
12
+ act: gelu
13
+ num_channels: 16
14
+ num_subbands: 8
15
+
16
+ training:
17
+ batch_size: 14
18
+ gradient_accumulation_steps: 4
19
+ grad_clip: 0
20
+ instruments:
21
+ - vocals
22
+ - other
23
+ lr: 3.0e-05
24
+ patience: 2
25
+ reduce_factor: 0.95
26
+ target_instrument: null
27
+ num_epochs: 1000
28
+ num_steps: 1000
29
+ q: 0.95
30
+ coarse_loss_clip: true
31
+ ema_momentum: 0.999
32
+ optimizer: adamw
33
+ other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
34
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
35
+
36
+ augmentations:
37
+ enable: true # enable or disable all augmentations (to quickly disable them all if needed)
38
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
39
+ loudness_min: 0.5
40
+ loudness_max: 1.5
41
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
42
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
43
+ - 0.2
44
+ - 0.02
45
+ mixup_loudness_min: 0.5
46
+ mixup_loudness_max: 1.5
47
+
48
+ inference:
49
+ batch_size: 1
50
+ dim_t: 512
51
+ num_overlap: 4
configs/config_vocals_torchseg.yaml ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 261632
3
+ dim_f: 4096
4
+ dim_t: 512
5
+ hop_length: 512
6
+ n_fft: 8192
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.000
10
+
11
+ model:
12
+ encoder_name: maxvit_tiny_tf_512 # look with torchseg.list_encoders(). Currently 858 available
13
+ decoder_type: unet # unet, fpn
14
+ act: gelu
15
+ num_channels: 128
16
+ num_subbands: 8
17
+
18
+ training:
19
+ batch_size: 18
20
+ gradient_accumulation_steps: 1
21
+ grad_clip: 1.0
22
+ instruments:
23
+ - vocals
24
+ - other
25
+ lr: 1.0e-04
26
+ patience: 2
27
+ reduce_factor: 0.95
28
+ target_instrument: null
29
+ num_epochs: 1000
30
+ num_steps: 1000
31
+ q: 0.95
32
+ coarse_loss_clip: true
33
+ ema_momentum: 0.999
34
+ optimizer: radam
35
+ other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
36
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
37
+
38
+ augmentations:
39
+ enable: false # enable or disable all augmentations (to quickly disable them all if needed)
40
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
41
+ loudness_min: 0.5
42
+ loudness_max: 1.5
43
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
44
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
45
+ - 0.2
46
+ - 0.02
47
+ mixup_loudness_min: 0.5
48
+ mixup_loudness_max: 1.5
49
+
50
+ all:
51
+ channel_shuffle: 0.5 # Set 0 or lower to disable
52
+ random_inverse: 0.1 # inverse track (better lower probability)
53
+ random_polarity: 0.5 # polarity change (multiply waveform to -1)
54
+
55
+ inference:
56
+ batch_size: 8
57
+ dim_t: 512
58
+ num_overlap: 2
configs/viperx/model_bs_roformer_ep_317_sdr_12.9755.yaml ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 352800
3
+ dim_f: 1024
4
+ dim_t: 801 # not used here; the STFT parameters in the model section take effect
5
+ hop_length: 441 # not used here; the STFT parameters in the model section take effect
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.000
10
+
11
+ model:
12
+ dim: 512
13
+ depth: 12
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ linear_transformer_depth: 0
19
+ freqs_per_bands: !!python/tuple
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 2
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 4
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 12
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 24
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 48
80
+ - 128
81
+ - 129
82
+ dim_head: 64
83
+ heads: 8
84
+ attn_dropout: 0.1
85
+ ff_dropout: 0.1
86
+ flash_attn: true
87
+ dim_freqs_in: 1025
88
+ stft_n_fft: 2048
89
+ stft_hop_length: 441
90
+ stft_win_length: 2048
91
+ stft_normalized: false
92
+ mask_estimator_depth: 2
93
+ multi_stft_resolution_loss_weight: 1.0
94
+ multi_stft_resolutions_window_sizes: !!python/tuple
95
+ - 4096
96
+ - 2048
97
+ - 1024
98
+ - 512
99
+ - 256
100
+ multi_stft_hop_size: 147
101
+ multi_stft_normalized: False
102
+
103
+ training:
104
+ batch_size: 2
105
+ gradient_accumulation_steps: 1
106
+ grad_clip: 0
107
+ instruments:
108
+ - vocals
109
+ - other
110
+ lr: 1.0e-05
111
+ patience: 2
112
+ reduce_factor: 0.95
113
+ target_instrument: vocals
114
+ num_epochs: 1000
115
+ num_steps: 1000
116
+ q: 0.95
117
+ coarse_loss_clip: true
118
+ ema_momentum: 0.999
119
+ optimizer: adam
120
+ other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
121
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
122
+
123
+ inference:
124
+ batch_size: 4
125
+ dim_t: 801
126
+ num_overlap: 2
configs/viperx/model_bs_roformer_ep_937_sdr_10.5309.yaml ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 131584
3
+ dim_f: 1024
4
+ dim_t: 256
5
+ hop_length: 512
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+
11
+ model:
12
+ dim: 384
13
+ depth: 12
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ linear_transformer_depth: 0
19
+ freqs_per_bands: !!python/tuple
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 2
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 4
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 12
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 24
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 48
80
+ - 128
81
+ - 129
82
+ dim_head: 64
83
+ heads: 8
84
+ attn_dropout: 0.1
85
+ ff_dropout: 0.1
86
+ flash_attn: true
87
+ dim_freqs_in: 1025
88
+ stft_n_fft: 2048
89
+ stft_hop_length: 512
90
+ stft_win_length: 2048
91
+ stft_normalized: false
92
+ mask_estimator_depth: 2
93
+ multi_stft_resolution_loss_weight: 1.0
94
+ multi_stft_resolutions_window_sizes: !!python/tuple
95
+ - 4096
96
+ - 2048
97
+ - 1024
98
+ - 512
99
+ - 256
100
+ multi_stft_hop_size: 147
101
+ multi_stft_normalized: False
102
+
103
+ training:
104
+ batch_size: 4
105
+ gradient_accumulation_steps: 1
106
+ grad_clip: 0
107
+ instruments:
108
+ - vocals
109
+ - other
110
+ lr: 5.0e-05
111
+ patience: 2
112
+ reduce_factor: 0.95
113
+ target_instrument: other
114
+ num_epochs: 1000
115
+ num_steps: 1000
116
+ q: 0.95
117
+ coarse_loss_clip: true
118
+ ema_momentum: 0.999
119
+ optimizer: adam
120
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
121
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
122
+
123
+ augmentations:
124
+ enable: true # enable or disable all augmentations (to quickly disable them all if needed)
125
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
126
+ loudness_min: 0.5
127
+ loudness_max: 1.5
128
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
129
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
130
+ - 0.2
131
+ - 0.02
132
+ mixup_loudness_min: 0.5
133
+ mixup_loudness_max: 1.5
134
+
135
+ inference:
136
+ batch_size: 8
137
+ dim_t: 512
138
+ num_overlap: 2
configs/viperx/model_mel_band_roformer_ep_3005_sdr_11.4360.yaml ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 352800
3
+ dim_f: 1024
4
+ dim_t: 801 # not used here; the STFT parameters in the model section take effect
5
+ hop_length: 441 # not used here; the STFT parameters in the model section take effect
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.000
10
+
11
+ model:
12
+ dim: 384
13
+ depth: 12
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ linear_transformer_depth: 0
19
+ num_bands: 60
20
+ dim_head: 64
21
+ heads: 8
22
+ attn_dropout: 0.1
23
+ ff_dropout: 0.1
24
+ flash_attn: True
25
+ dim_freqs_in: 1025
26
+ sample_rate: 44100 # needed for mel filter bank from librosa
27
+ stft_n_fft: 2048
28
+ stft_hop_length: 441
29
+ stft_win_length: 2048
30
+ stft_normalized: False
31
+ mask_estimator_depth: 2
32
+ multi_stft_resolution_loss_weight: 1.0
33
+ multi_stft_resolutions_window_sizes: !!python/tuple
34
+ - 4096
35
+ - 2048
36
+ - 1024
37
+ - 512
38
+ - 256
39
+ multi_stft_hop_size: 147
40
+ multi_stft_normalized: False
41
+
42
+ training:
43
+ batch_size: 1
44
+ gradient_accumulation_steps: 8
45
+ grad_clip: 0
46
+ instruments:
47
+ - vocals
48
+ - other
49
+ lr: 4.0e-05
50
+ patience: 2
51
+ reduce_factor: 0.95
52
+ target_instrument: vocals
53
+ num_epochs: 1000
54
+ num_steps: 1000
55
+ q: 0.95
56
+ coarse_loss_clip: true
57
+ ema_momentum: 0.999
58
+ optimizer: adam
59
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
60
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
61
+
62
+ inference:
63
+ batch_size: 4
64
+ dim_t: 801
65
+ num_overlap: 2
cookies.txt ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Netscape HTTP Cookie File
2
+ # This file is generated by yt-dlp. Do not edit.
+ # SECURITY WARNING: this file contains live YouTube session/authentication cookies that
+ # were committed to the repository. Revoke these sessions immediately and purge this
+ # file from version control history; never commit credential files.
3
+
4
+ .youtube.com TRUE / FALSE 1756513080 HSID AkpR1gV80KfDyBAeq
5
+ .youtube.com TRUE / TRUE 1756513080 SSID AUkQz9BsAZ9dihvT7
6
+ .youtube.com TRUE / FALSE 1756513080 APISID FPGwyoC5hlxA_ztn/ADM7Q4t2tMF9LolFH
7
+ .youtube.com TRUE / TRUE 1756513080 SAPISID 4yc1vubX-H2x2gTg/A4eb_29p67eyBKNwo
8
+ .youtube.com TRUE / TRUE 1756513080 __Secure-1PAPISID 4yc1vubX-H2x2gTg/A4eb_29p67eyBKNwo
9
+ .youtube.com TRUE / TRUE 1756513080 __Secure-3PAPISID 4yc1vubX-H2x2gTg/A4eb_29p67eyBKNwo
10
+ .youtube.com TRUE / FALSE 0 PREF f4=4000000&tz=UTC&f7=100&f6=40000000&hl=en
11
+ .youtube.com TRUE / FALSE 1756513080 SID g.a000uQhpMC7F759FwOg4eAHYr_VFV7qLJJzrVdnrbB1Gg1ruHpzr7Q7JXHasofNz_IFpc8N2LgACgYKAfYSARISFQHGX2Miyesv7_oABGm-5jwErW1A3BoVAUF8yKrc9rgHmp5qJT6VRm79tW1A0076
12
+ .youtube.com TRUE / TRUE 1756513080 __Secure-1PSID g.a000uQhpMC7F759FwOg4eAHYr_VFV7qLJJzrVdnrbB1Gg1ruHpzrr6x28jzJl8SymGUnS601CAACgYKAVESARISFQHGX2MiMrjFi53JrIU9q__AtJkTHhoVAUF8yKq9HTb9EMf-IuIKrE24vlao0076
13
+ .youtube.com TRUE / TRUE 1756513080 __Secure-3PSID g.a000uQhpMC7F759FwOg4eAHYr_VFV7qLJJzrVdnrbB1Gg1ruHpzrggFtS3EfdibObQagLMZPwgACgYKAS0SARISFQHGX2MiVtkRAB_snp0m6Ci8U8_KdxoVAUF8yKpL1TslRsnn1zHR9IM89xyI0076
14
+ .youtube.com TRUE / TRUE 0 wide 0
15
+ .youtube.com TRUE / TRUE 1756638167 __Secure-1PSIDTS sidts-CjIBEJ3XV-avYWfDaATyg0Nhkmwux6CKyFaF1gYPa-AjJzR_e3PPij4K2ft8TRk6khgu2xAA
16
+ .youtube.com TRUE / TRUE 1756638167 __Secure-3PSIDTS sidts-CjIBEJ3XV-avYWfDaATyg0Nhkmwux6CKyFaF1gYPa-AjJzR_e3PPij4K2ft8TRk6khgu2xAA
17
+ .youtube.com TRUE / TRUE 1756638341 LOGIN_INFO AFmmF2swRgIhAKDOVmKULP27JwVcR_zerOJpO9GmXntRWiR4zWAazwz_AiEAwvt5os697PYAjWwVLGwA5oN3mFBrA1kh_4AlSuvoE-M:QUQ3MjNmem5mN3p3NVNiM2hzMEJ0R1EwUzI2SFNDXzhlQlNXemF2Z2IyZER1cmt1VXZSbk5EbkpaekFySGw4a09MQ3Z6c3RhSFhoXzFUNE9mdHB4ZEdPTFgzNEZrMTB2SWd2azlTdi1SUTdZWGczMEpRb3otemhHZ08wZDlzc0dhRE1sM2tJUTBfSkNiSmpuOTBXdG13eDhsR2JTVVlVQWZB
18
+ .youtube.com TRUE / TRUE 1741086952 CONSISTENCY AKreu9tDDatJYzjErs5c0WuYZjTQFRMZu7GKaDYzvFqROdgjqcrkvrsWqoTI1zZioac6yVWq7BCSzc1y0Pk0j8ikhC_l9YEyMmQs14Kg3IHcli61swZK3uWn
19
+ .youtube.com TRUE / FALSE 1756638353 SIDCC AKEyXzWXY14_kKbTxyT3AyRMPsKsEGUsuIHbGutwC42o1YlZS06ch-ug7SyZAYQ7jEDVx5EDfw
20
+ .youtube.com TRUE / TRUE 1756638353 __Secure-1PSIDCC AKEyXzX0UJz8MZ6u_9s7hOlSPGjbu-JwY0Q1l77e5oO5CJTXNIDO95oxQyCdFaP2D-4qbJrCI1I
21
+ .youtube.com TRUE / TRUE 1756638353 __Secure-3PSIDCC AKEyXzU5TOnpQc0o7qGCur58CMCcshJ2tsoLi9rVwsER2dK2P22VqU3jYG0yMz0LsNkMxjbXeg
22
+ .youtube.com TRUE / TRUE 0 YSC tBB8nN6HoE0
23
+ .youtube.com TRUE / TRUE 1756638392 __Secure-ROLLOUT_TOKEN CMe15b6p2vn47QEQsKafn6TwiwMYru_Yn6TwiwM%3D
24
+ .youtube.com TRUE / TRUE 1756638392 VISITOR_INFO1_LIVE 54yW_8GrQNM
25
+ .youtube.com TRUE / TRUE 1756638392 VISITOR_PRIVACY_METADATA CgJVUxIEGgAgDA%3D%3D
26
+ .youtube.com TRUE / TRUE 1756638392 YT_DEVICE_MEASUREMENT_ID 3rgPq1s=
27
+ .youtube.com TRUE / TRUE 1804158392 __Secure-YT_TVFAS t=483635&s=2
28
+ .youtube.com TRUE / TRUE 1756638392 DEVICE_INFO ChxOelEzTnprd09URXhNemMzTXpRNE1qZ3lNQT09ELi9m74GGLi9m74G
dataset.py ADDED
@@ -0,0 +1,669 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding: utf-8
2
+ __author__ = 'Roman Solovyev (ZFTurbo): https://github.com/ZFTurbo/'
3
+
4
+
5
+ import os
6
+ import random
7
+ import numpy as np
8
+ import torch
9
+ import soundfile as sf
10
+ import pickle
11
+ import time
12
+ import itertools
13
+ import multiprocessing
14
+ from tqdm.auto import tqdm
15
+ from glob import glob
16
+ import audiomentations as AU
17
+ import pedalboard as PB
18
+ import warnings
19
+ warnings.filterwarnings("ignore")
20
+
21
+
22
def load_chunk(path, length, chunk_size, offset=None):
    """Read a chunk of exactly `chunk_size` frames from an audio file.

    Args:
        path: path to the audio file (wav/flac).
        length: cached number of frames of the file (may differ slightly from
            the real file length when stems of a track are not equal-sized).
        chunk_size: number of frames to return.
        offset: fixed start frame; a random offset is drawn when None.

    Returns:
        np.ndarray of shape (channels, chunk_size); mono files are returned
        with a single channel dimension.
    """
    if chunk_size <= length:
        if offset is None:
            offset = np.random.randint(length - chunk_size + 1)
        x = sf.read(path, dtype='float32', start=offset, frames=chunk_size)[0]
    else:
        x = sf.read(path, dtype='float32')[0]
        # Bug fix: pad/truncate based on the number of frames actually read,
        # not on the cached `length` — stems of one track can differ in size,
        # and the old code could return a chunk that was not `chunk_size` long.
        read_frames = x.shape[0]
        if read_frames < chunk_size:
            if x.ndim == 1:
                # Mono case
                pad = np.zeros((chunk_size - read_frames,), dtype=x.dtype)
            else:
                pad = np.zeros((chunk_size - read_frames, x.shape[-1]), dtype=x.dtype)
            x = np.concatenate([x, pad], axis=0)
        elif read_frames > chunk_size:
            x = x[:chunk_size]
    # Mono fix: always return a 2D (time, channels) array before transposing
    if x.ndim == 1:
        x = np.expand_dims(x, axis=1)
    return x.T
39
+
40
+
41
def get_track_set_length(params):
    """Return (path, min_frames) for a track folder containing one file per stem.

    Args:
        params: tuple (folder_path, instruments, file_types) — packed into one
            argument so the function can be used with multiprocessing `imap`.

    Returns:
        (folder_path, minimum stem length in frames). The minimum is used so a
        later read never starts beyond the end of the shortest stem.
    """
    path, instruments, file_types = params
    # Check lengths of all instruments (they can differ in some cases)
    lengths_arr = []
    for instr in instruments:
        length = -1
        for extension in file_types:
            path_to_audio_file = path + '/{}.{}'.format(instr, extension)
            if os.path.isfile(path_to_audio_file):
                # Perf fix: sf.info parses only the header instead of decoding
                # the whole audio file just to learn its length.
                length = sf.info(path_to_audio_file).frames
                break
        if length == -1:
            print('Cant find file "{}" in folder {}'.format(instr, path))
            continue
        lengths_arr.append(length)
    if not lengths_arr:
        # Bug fix: the old code crashed on min() of an empty array when no
        # stem file was found at all; report zero length instead.
        print('No valid stems found in folder {}'.format(path))
        return path, 0
    lengths_arr = np.array(lengths_arr)
    if lengths_arr.min() != lengths_arr.max():
        print('Warning: lengths of stems are different for path: {}. ({} != {})'.format(
            path,
            lengths_arr.min(),
            lengths_arr.max())
        )
    # We use minimum to allow overflow for soundfile read in non-equal length cases
    return path, lengths_arr.min()
65
+
66
+
67
+ # For multiprocessing
68
def get_track_length(params):
    """Return (path, frames) for one audio file; used by multiprocessing pools."""
    path = params
    # Perf fix: sf.info reads only the header — the old len(sf.read(path)[0])
    # decoded the entire file just to measure its length.
    length = sf.info(path).frames
    return (path, length)
72
+
73
+
74
+ class MSSDataset(torch.utils.data.Dataset):
75
    def __init__(self, config, data_path, metadata_path="metadata.pkl", dataset_type=1, batch_size=None, verbose=True):
        """Dataset sampling random (or time-aligned) chunks of stems for MSS training.

        Args:
            config: parsed config object with `training`, `audio` and optional
                `augmentations` sections (OmegaConf-style mapping access).
            data_path: folder, list of folders, or (type 3) CSV file(s) with tracks.
            metadata_path: pickle file caching per-track lengths between runs.
            dataset_type: 1, 2, 3 or 4 — on-disk layout of the training data.
            batch_size: overrides config.training.batch_size when given.
            verbose: print progress/info messages when True.
        """
        self.verbose = verbose
        self.config = config
        self.dataset_type = dataset_type  # 1, 2, 3 or 4
        self.data_path = data_path
        self.instruments = instruments = config.training.instruments
        if batch_size is None:
            # Fall back to the batch size from the training config
            batch_size = config.training.batch_size
        self.batch_size = batch_size
        self.file_types = ['wav', 'flac']
        self.metadata_path = metadata_path

        # Augmentation block: enabled only when the config explicitly asks for it
        self.aug = False
        if 'augmentations' in config:
            if config['augmentations'].enable is True:
                if self.verbose:
                    print('Use augmentation for training')
                self.aug = True
        else:
            if self.verbose:
                print('There is no augmentations block in config. Augmentations disabled for training...')

        # Collect (path, length) metadata, using the pickle cache when present
        metadata = self.get_metadata()

        if self.dataset_type in [1, 4]:
            if len(metadata) > 0:
                if self.verbose:
                    print('Found tracks in dataset: {}'.format(len(metadata)))
            else:
                # NOTE(review): exits the whole process on an empty dataset —
                # callers cannot catch this as a normal exception.
                print('No tracks found for training. Check paths you provided!')
                exit()
        else:
            # Types 2/3 keep a per-instrument dict of tracks
            for instr in self.instruments:
                if self.verbose:
                    print('Found tracks for {} in dataset: {}'.format(instr, len(metadata[instr])))
        self.metadata = metadata
        self.chunk_size = config.audio.chunk_size
        self.min_mean_abs = config.audio.min_mean_abs
114
+
115
+ def __len__(self):
116
+ return self.config.training.num_steps * self.batch_size
117
+
118
+ def read_from_metadata_cache(self, track_paths, instr=None):
119
+ metadata = []
120
+ if os.path.isfile(self.metadata_path):
121
+ if self.verbose:
122
+ print('Found metadata cache file: {}'.format(self.metadata_path))
123
+ old_metadata = pickle.load(open(self.metadata_path, 'rb'))
124
+ else:
125
+ return track_paths, metadata
126
+
127
+ if instr:
128
+ old_metadata = old_metadata[instr]
129
+
130
+ # We will not re-read tracks existed in old metadata file
131
+ track_paths_set = set(track_paths)
132
+ for old_path, file_size in old_metadata:
133
+ if old_path in track_paths_set:
134
+ metadata.append([old_path, file_size])
135
+ track_paths_set.remove(old_path)
136
+ track_paths = list(track_paths_set)
137
+ if len(metadata) > 0:
138
+ print('Old metadata was used for {} tracks.'.format(len(metadata)))
139
+ return track_paths, metadata
140
+
141
+
142
+ def get_metadata(self):
143
+ read_metadata_procs = multiprocessing.cpu_count()
144
+ if 'read_metadata_procs' in self.config['training']:
145
+ read_metadata_procs = int(self.config['training']['read_metadata_procs'])
146
+
147
+ if self.verbose:
148
+ print(
149
+ 'Dataset type:', self.dataset_type,
150
+ 'Processes to use:', read_metadata_procs,
151
+ '\nCollecting metadata for', str(self.data_path),
152
+ )
153
+
154
+ if self.dataset_type in [1, 4]:
155
+ track_paths = []
156
+ if type(self.data_path) == list:
157
+ for tp in self.data_path:
158
+ tracks_for_folder = sorted(glob(tp + '/*'))
159
+ if len(tracks_for_folder) == 0:
160
+ print('Warning: no tracks found in folder \'{}\'. Please check it!'.format(tp))
161
+ track_paths += tracks_for_folder
162
+ else:
163
+ track_paths += sorted(glob(self.data_path + '/*'))
164
+
165
+ track_paths = [path for path in track_paths if os.path.basename(path)[0] != '.' and os.path.isdir(path)]
166
+ track_paths, metadata = self.read_from_metadata_cache(track_paths, None)
167
+
168
+ if read_metadata_procs <= 1:
169
+ for path in tqdm(track_paths):
170
+ track_path, track_length = get_track_set_length((path, self.instruments, self.file_types))
171
+ metadata.append((track_path, track_length))
172
+ else:
173
+ p = multiprocessing.Pool(processes=read_metadata_procs)
174
+ with tqdm(total=len(track_paths)) as pbar:
175
+ track_iter = p.imap(
176
+ get_track_set_length,
177
+ zip(track_paths, itertools.repeat(self.instruments), itertools.repeat(self.file_types))
178
+ )
179
+ for track_path, track_length in track_iter:
180
+ metadata.append((track_path, track_length))
181
+ pbar.update()
182
+ p.close()
183
+
184
+ elif self.dataset_type == 2:
185
+ metadata = dict()
186
+ for instr in self.instruments:
187
+ metadata[instr] = []
188
+ track_paths = []
189
+ if type(self.data_path) == list:
190
+ for tp in self.data_path:
191
+ track_paths += sorted(glob(tp + '/{}/*.wav'.format(instr)))
192
+ track_paths += sorted(glob(tp + '/{}/*.flac'.format(instr)))
193
+ else:
194
+ track_paths += sorted(glob(self.data_path + '/{}/*.wav'.format(instr)))
195
+ track_paths += sorted(glob(self.data_path + '/{}/*.flac'.format(instr)))
196
+
197
+ track_paths, metadata[instr] = self.read_from_metadata_cache(track_paths, instr)
198
+
199
+ if read_metadata_procs <= 1:
200
+ for path in tqdm(track_paths):
201
+ length = len(sf.read(path)[0])
202
+ metadata[instr].append((path, length))
203
+ else:
204
+ p = multiprocessing.Pool(processes=read_metadata_procs)
205
+ for out in tqdm(p.imap(get_track_length, track_paths), total=len(track_paths)):
206
+ metadata[instr].append(out)
207
+
208
+ elif self.dataset_type == 3:
209
+ import pandas as pd
210
+ if type(self.data_path) != list:
211
+ data_path = [self.data_path]
212
+
213
+ metadata = dict()
214
+ for i in range(len(self.data_path)):
215
+ if self.verbose:
216
+ print('Reading tracks from: {}'.format(self.data_path[i]))
217
+ df = pd.read_csv(self.data_path[i])
218
+
219
+ skipped = 0
220
+ for instr in self.instruments:
221
+ part = df[df['instrum'] == instr].copy()
222
+ print('Tracks found for {}: {}'.format(instr, len(part)))
223
+ for instr in self.instruments:
224
+ part = df[df['instrum'] == instr].copy()
225
+ metadata[instr] = []
226
+ track_paths = list(part['path'].values)
227
+ track_paths, metadata[instr] = self.read_from_metadata_cache(track_paths, instr)
228
+
229
+ for path in tqdm(track_paths):
230
+ if not os.path.isfile(path):
231
+ print('Cant find track: {}'.format(path))
232
+ skipped += 1
233
+ continue
234
+ # print(path)
235
+ try:
236
+ length = len(sf.read(path)[0])
237
+ except:
238
+ print('Problem with path: {}'.format(path))
239
+ skipped += 1
240
+ continue
241
+ metadata[instr].append((path, length))
242
+ if skipped > 0:
243
+ print('Missing tracks: {} from {}'.format(skipped, len(df)))
244
+ else:
245
+ print('Unknown dataset type: {}. Must be 1, 2, 3 or 4'.format(self.dataset_type))
246
+ exit()
247
+
248
+ # Save metadata
249
+ pickle.dump(metadata, open(self.metadata_path, 'wb'))
250
+ return metadata
251
+
252
+ def load_source(self, metadata, instr):
253
+ while True:
254
+ if self.dataset_type in [1, 4]:
255
+ track_path, track_length = random.choice(metadata)
256
+ for extension in self.file_types:
257
+ path_to_audio_file = track_path + '/{}.{}'.format(instr, extension)
258
+ if os.path.isfile(path_to_audio_file):
259
+ try:
260
+ source = load_chunk(path_to_audio_file, track_length, self.chunk_size)
261
+ except Exception as e:
262
+ # Sometimes error during FLAC reading, catch it and use zero stem
263
+ print('Error: {} Path: {}'.format(e, path_to_audio_file))
264
+ source = np.zeros((2, self.chunk_size), dtype=np.float32)
265
+ break
266
+ else:
267
+ track_path, track_length = random.choice(metadata[instr])
268
+ try:
269
+ source = load_chunk(track_path, track_length, self.chunk_size)
270
+ except Exception as e:
271
+ # Sometimes error during FLAC reading, catch it and use zero stem
272
+ print('Error: {} Path: {}'.format(e, track_path))
273
+ source = np.zeros((2, self.chunk_size), dtype=np.float32)
274
+
275
+ if np.abs(source).mean() >= self.min_mean_abs: # remove quiet chunks
276
+ break
277
+ if self.aug:
278
+ source = self.augm_data(source, instr)
279
+ return torch.tensor(source, dtype=torch.float32)
280
+
281
+ def load_random_mix(self):
282
+ res = []
283
+ for instr in self.instruments:
284
+ s1 = self.load_source(self.metadata, instr)
285
+ # Mixup augmentation. Multiple mix of same type of stems
286
+ if self.aug:
287
+ if 'mixup' in self.config['augmentations']:
288
+ if self.config['augmentations'].mixup:
289
+ mixup = [s1]
290
+ for prob in self.config.augmentations.mixup_probs:
291
+ if random.uniform(0, 1) < prob:
292
+ s2 = self.load_source(self.metadata, instr)
293
+ mixup.append(s2)
294
+ mixup = torch.stack(mixup, dim=0)
295
+ loud_values = np.random.uniform(
296
+ low=self.config.augmentations.loudness_min,
297
+ high=self.config.augmentations.loudness_max,
298
+ size=(len(mixup),)
299
+ )
300
+ loud_values = torch.tensor(loud_values, dtype=torch.float32)
301
+ mixup *= loud_values[:, None, None]
302
+ s1 = mixup.mean(dim=0, dtype=torch.float32)
303
+ res.append(s1)
304
+ res = torch.stack(res)
305
+ return res
306
+
307
+ def load_aligned_data(self):
308
+ track_path, track_length = random.choice(self.metadata)
309
+ attempts = 10
310
+ while attempts:
311
+ if track_length >= self.chunk_size:
312
+ common_offset = np.random.randint(track_length - self.chunk_size + 1)
313
+ else:
314
+ common_offset = None
315
+ res = []
316
+ silent_chunks = 0
317
+ for i in self.instruments:
318
+ for extension in self.file_types:
319
+ path_to_audio_file = track_path + '/{}.{}'.format(i, extension)
320
+ if os.path.isfile(path_to_audio_file):
321
+ try:
322
+ source = load_chunk(path_to_audio_file, track_length, self.chunk_size, offset=common_offset)
323
+ except Exception as e:
324
+ # Sometimes error during FLAC reading, catch it and use zero stem
325
+ print('Error: {} Path: {}'.format(e, path_to_audio_file))
326
+ source = np.zeros((2, self.chunk_size), dtype=np.float32)
327
+ break
328
+ res.append(source)
329
+ if np.abs(source).mean() < self.min_mean_abs: # remove quiet chunks
330
+ silent_chunks += 1
331
+ if silent_chunks == 0:
332
+ break
333
+
334
+ attempts -= 1
335
+ if attempts <= 0:
336
+ print('Attempts max!', track_path)
337
+ if common_offset is None:
338
+ # If track is too small break immediately
339
+ break
340
+
341
+ res = np.stack(res, axis=0)
342
+ if self.aug:
343
+ for i, instr in enumerate(self.instruments):
344
+ res[i] = self.augm_data(res[i], instr)
345
+ return torch.tensor(res, dtype=torch.float32)
346
+
347
def augm_data(self, source, instr):
    """Apply randomized waveform augmentations to a single stem.

    Args:
        source: numpy array of shape (channels, length), e.g. (2, 261120).
        instr: stem name (e.g. 'vocals', 'bass'), used to look up
            stem-specific augmentation settings in the config.

    Returns:
        The (possibly) augmented waveform with the same shape as `source`.
    """
    source_shape = source.shape
    applied_augs = []  # kept for debugging (names of applied augmentations)

    # Start from augmentations common to all stems. NOTE: take a *copy* —
    # the previous implementation aliased self.config['augmentations']['all']
    # and then wrote stem-specific overrides into it, permanently leaking
    # them into the shared config for every other stem and later calls.
    if 'all' in self.config['augmentations']:
        augs = dict(self.config['augmentations']['all'])
    else:
        augs = dict()

    # Stem-specific augmentations extend/override the common ones.
    if instr in self.config['augmentations']:
        for el in self.config['augmentations'][instr]:
            augs[el] = self.config['augmentations'][instr][el]

    def _roll(name):
        # True when augmentation `name` is enabled (prob > 0) and the
        # random draw selects it. Short-circuit keeps RNG usage identical
        # to the original per-branch checks.
        return augs.get(name, 0) > 0 and random.uniform(0, 1) < augs[name]

    def _uniform(key):
        # Draw a random parameter value from [<key>_min, <key>_max].
        return random.uniform(augs[key + '_min'], augs[key + '_max'])

    # --- cheap numpy-level augmentations -----------------------------------
    # Channel shuffle (swap stereo channels)
    if _roll('channel_shuffle'):
        source = source[::-1].copy()
        applied_augs.append('channel_shuffle')

    # Random inverse (reverse waveform in time)
    if _roll('random_inverse'):
        source = source[:, ::-1].copy()
        applied_augs.append('random_inverse')

    # Random polarity (multiply waveform by -1)
    if _roll('random_polarity'):
        source = -source.copy()
        applied_augs.append('random_polarity')

    # --- audiomentations-based augmentations -------------------------------
    # Random pitch shift
    if _roll('pitch_shift'):
        apply_aug = AU.PitchShift(
            min_semitones=augs['pitch_shift_min_semitones'],
            max_semitones=augs['pitch_shift_max_semitones'],
            p=1.0,
        )
        source = apply_aug(samples=source, sample_rate=44100)
        applied_augs.append('pitch_shift')

    # Random seven band parametric eq
    if _roll('seven_band_parametric_eq'):
        apply_aug = AU.SevenBandParametricEQ(
            min_gain_db=augs['seven_band_parametric_eq_min_gain_db'],
            max_gain_db=augs['seven_band_parametric_eq_max_gain_db'],
            p=1.0,
        )
        source = apply_aug(samples=source, sample_rate=44100)
        applied_augs.append('seven_band_parametric_eq')

    # Random tanh distortion
    if _roll('tanh_distortion'):
        apply_aug = AU.TanhDistortion(
            min_distortion=augs['tanh_distortion_min'],
            max_distortion=augs['tanh_distortion_max'],
            p=1.0,
        )
        source = apply_aug(samples=source, sample_rate=44100)
        applied_augs.append('tanh_distortion')

    # Random MP3 compression
    if _roll('mp3_compression'):
        apply_aug = AU.Mp3Compression(
            min_bitrate=augs['mp3_compression_min_bitrate'],
            max_bitrate=augs['mp3_compression_max_bitrate'],
            backend=augs['mp3_compression_backend'],
            p=1.0,
        )
        source = apply_aug(samples=source, sample_rate=44100)
        applied_augs.append('mp3_compression')

    # Random additive gaussian noise
    if _roll('gaussian_noise'):
        apply_aug = AU.AddGaussianNoise(
            min_amplitude=augs['gaussian_noise_min_amplitude'],
            max_amplitude=augs['gaussian_noise_max_amplitude'],
            p=1.0,
        )
        source = apply_aug(samples=source, sample_rate=44100)
        applied_augs.append('gaussian_noise')

    # Random time stretch (length is kept unchanged by the transform)
    if _roll('time_stretch'):
        apply_aug = AU.TimeStretch(
            min_rate=augs['time_stretch_min_rate'],
            max_rate=augs['time_stretch_max_rate'],
            leave_length_unchanged=True,
            p=1.0,
        )
        source = apply_aug(samples=source, sample_rate=44100)
        applied_augs.append('time_stretch')

    # Some audiomentations transforms can return slightly longer audio,
    # so trim back to the original length.
    if source_shape != source.shape:
        source = source[..., :source_shape[-1]]

    # --- pedalboard-based augmentations ------------------------------------
    # Random reverb
    if _roll('pedalboard_reverb'):
        board = PB.Pedalboard([PB.Reverb(
            room_size=_uniform('pedalboard_reverb_room_size'),    # 0.1 - 0.9
            damping=_uniform('pedalboard_reverb_damping'),        # 0.1 - 0.9
            wet_level=_uniform('pedalboard_reverb_wet_level'),    # 0.1 - 0.9
            dry_level=_uniform('pedalboard_reverb_dry_level'),    # 0.1 - 0.9
            width=_uniform('pedalboard_reverb_width'),            # 0.9 - 1.0
            freeze_mode=0.0,
        )])
        source = board(source, 44100)
        applied_augs.append('pedalboard_reverb')

    # Random chorus
    if _roll('pedalboard_chorus'):
        board = PB.Pedalboard([PB.Chorus(
            rate_hz=_uniform('pedalboard_chorus_rate_hz'),
            depth=_uniform('pedalboard_chorus_depth'),
            centre_delay_ms=_uniform('pedalboard_chorus_centre_delay_ms'),
            feedback=_uniform('pedalboard_chorus_feedback'),
            mix=_uniform('pedalboard_chorus_mix'),
        )])
        source = board(source, 44100)
        applied_augs.append('pedalboard_chorus')

    # Random phaser (config keys use the historical 'phazer' spelling)
    if _roll('pedalboard_phazer'):
        board = PB.Pedalboard([PB.Phaser(
            rate_hz=_uniform('pedalboard_phazer_rate_hz'),
            depth=_uniform('pedalboard_phazer_depth'),
            centre_frequency_hz=_uniform('pedalboard_phazer_centre_frequency_hz'),
            feedback=_uniform('pedalboard_phazer_feedback'),
            mix=_uniform('pedalboard_phazer_mix'),
        )])
        source = board(source, 44100)
        applied_augs.append('pedalboard_phazer')

    # Random distortion
    if _roll('pedalboard_distortion'):
        board = PB.Pedalboard([PB.Distortion(
            drive_db=_uniform('pedalboard_distortion_drive_db'),
        )])
        source = board(source, 44100)
        applied_augs.append('pedalboard_distortion')

    # Random pitch shift (pedalboard variant)
    if _roll('pedalboard_pitch_shift'):
        board = PB.Pedalboard([PB.PitchShift(
            semitones=_uniform('pedalboard_pitch_shift_semitones'),
        )])
        source = board(source, 44100)
        applied_augs.append('pedalboard_pitch_shift')

    # Random resample (emulates low sample-rate material)
    if _roll('pedalboard_resample'):
        board = PB.Pedalboard([PB.Resample(
            target_sample_rate=_uniform('pedalboard_resample_target_sample_rate'),
        )])
        source = board(source, 44100)
        applied_augs.append('pedalboard_resample')

    # Random bitcrush (config keys use the historical 'bitcrash' spelling)
    if _roll('pedalboard_bitcrash'):
        board = PB.Pedalboard([PB.Bitcrush(
            bit_depth=_uniform('pedalboard_bitcrash_bit_depth'),
        )])
        source = board(source, 44100)
        applied_augs.append('pedalboard_bitcrash')

    # Random MP3 compressor
    if _roll('pedalboard_mp3_compressor'):
        board = PB.Pedalboard([PB.MP3Compressor(
            vbr_quality=_uniform('pedalboard_mp3_compressor_pedalboard_mp3_compressor'),
        )])
        source = board(source, 44100)
        applied_augs.append('pedalboard_mp3_compressor')

    # print(applied_augs)
    return source
627
+
628
def __getitem__(self, index):
    """Return one training example as (stems tensor, mixture waveform).

    The stems tensor has shape (num_stems, channels, length); the mixture
    is the sum of all stems, optionally degraded with MP3 compression.
    """
    # Dataset types 1/2/3 sample stems independently from the pool;
    # type 4 loads time-aligned stems from a single song.
    if self.dataset_type in [1, 2, 3]:
        res = self.load_random_mix()
    else:
        res = self.load_aligned_data()

    # Randomly change loudness of each stem independently.
    if self.aug:
        aug_cfg = self.config['augmentations']
        if 'loudness' in aug_cfg and aug_cfg['loudness']:
            gains = np.random.uniform(
                low=aug_cfg['loudness_min'],
                high=aug_cfg['loudness_max'],
                size=(len(res),),
            )
            gains = torch.tensor(gains, dtype=torch.float32)
            res *= gains[:, None, None]

    mix = res.sum(0)

    # Optionally degrade the mixture with MP3 compression
    # (emulates audio downloaded from the internet).
    if self.aug:
        aug_cfg = self.config['augmentations']
        if 'mp3_compression_on_mixture' in aug_cfg:
            compressor = AU.Mp3Compression(
                min_bitrate=aug_cfg['mp3_compression_on_mixture_bitrate_min'],
                max_bitrate=aug_cfg['mp3_compression_on_mixture_bitrate_max'],
                backend=aug_cfg['mp3_compression_on_mixture_backend'],
                p=aug_cfg['mp3_compression_on_mixture'],
            )
            mix_np = mix.cpu().numpy().astype(np.float32)
            required_shape = mix_np.shape
            mix = compressor(samples=mix_np, sample_rate=44100)
            # The codec sometimes returns longer audio, so trim back.
            if mix.shape != required_shape:
                mix = mix[..., :required_shape[-1]]
            mix = torch.tensor(mix, dtype=torch.float32)

    # When training targets a single stem, return only that stem.
    if self.config.training.target_instrument is not None:
        idx = self.config.training.instruments.index(self.config.training.target_instrument)
        return res[idx:idx + 1], mix

    return res, mix
docs/LoRA.md ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Training with LoRA
2
+
3
+ ### What is LoRA?
4
+
5
+ LoRA (Low-Rank Adaptation) is a technique designed to reduce the computational and memory cost of fine-tuning large-scale neural networks. Instead of fine-tuning all the model parameters, LoRA introduces small trainable low-rank matrices that are injected into the network. This allows significant reductions in the number of trainable parameters, making it more efficient to adapt pre-trained models to new tasks. For more details, you can refer to the original paper.
6
+
7
+ ### Enabling LoRA in Training
8
+
9
+ To include LoRA in your training pipeline, you need to:
10
+
11
+ Add the `--train_lora` flag to the training command.
12
+
13
+ Add the following configuration for LoRA in your config file:
14
+
15
+ Example:
16
+ ```
17
+ lora:
18
+ r: 8
19
+ lora_alpha: 16 # alpha / rank > 1
20
+ lora_dropout: 0.05
21
+ merge_weights: False
22
+ fan_in_fan_out: False
23
+ enable_lora: [True]
24
+ ```
25
+
26
+ Configuration Parameters Explained:
27
+
28
+ * `r` (Rank): This determines the rank of the low-rank adaptation matrices. A smaller rank reduces memory usage and file size but may limit the model's adaptability to new tasks. Common values are 4, 8, or 16.
29
+
30
+ * `lora_alpha`: Scaling factor for the LoRA weights. The ratio lora_alpha / r should generally be greater than 1 to ensure sufficient expressive power. For example, with r=8 and lora_alpha=16, the scaling factor is 2.
31
+
32
+ * `lora_dropout`: Dropout rate applied to LoRA layers. It helps regularize the model and prevent overfitting, especially for smaller datasets. Typical values are in the range [0.0, 0.1].
33
+
34
+ * `merge_weights`: Whether to merge the LoRA weights into the original model weights during inference. Set this to True only if you want to save the final model with merged weights for deployment.
35
+
36
+ * `fan_in_fan_out`: Defines the weight initialization convention. Leave this as False for most scenarios unless your model uses a specific convention requiring it.
37
+
38
+ * `enable_lora`: A list of booleans specifying whether LoRA should be applied to certain layers.
39
+ * For example, `[True, False, True]` enables LoRA for the 1st and 3rd layers but not the 2nd.
40
+ * The number of output neurons in the layer must be divisible by the length of enable_lora to ensure proper distribution of LoRA parameters across layers.
41
+ * For transformer architectures such as GPT models, `enable_lora = [True, False, True]` is typically used to apply LoRA to the Query (Q) and Value (V) projection matrices while skipping the Key (K) projection matrix. This setup allows efficient fine-tuning of the attention mechanism while maintaining computational efficiency.
42
+
43
+ ### Benefits of Using LoRA
44
+
45
+ * File Size Reduction: With LoRA, only the LoRA layer weights are saved, which significantly reduces the size of the saved model.
46
+
47
+ * Flexible Fine-Tuning: You can fine-tune the LoRA layers while keeping the base model frozen, preserving the original model's generalization capabilities.
48
+
49
+ * Using Pretrained Weights with LoRA
50
+
51
+ ### To train a model using both pretrained weights and LoRA weights, you need to:
52
+
53
+ 1. Include the `--lora_checkpoint` parameter in the training command.
54
+
55
+ 2. Specify the path to the LoRA checkpoint file.
56
+
57
+ ### Validating and Inferencing with LoRA
58
+
59
+ When using a model fine-tuned with LoRA for validation or inference, you must provide the LoRA checkpoint using the `--lora_checkpoint` parameter.
60
+
61
+ ### Example Commands
62
+
63
+ * Training with LoRA
64
+
65
+ ```
66
+ python train.py --model_type scnet \
67
+ --config_path configs/config_musdb18_scnet_large_starrytong.yaml \
68
+ --start_check_point weights/last_scnet.ckpt \
69
+ --results_path results/ \
70
+ --data_path datasets/moisesdb/train_tracks \
71
+ --valid_path datasets/moisesdb/valid \
72
+ --device_ids 0 \
73
+ --metrics neg_log_wmse l1_freq sdr \
74
+ --metric_for_scheduler neg_log_wmse \
75
+ --train_lora
76
+ ```
77
+
78
+ * Validating with LoRA
79
+ ```
80
+ python valid.py --model_type scnet \
81
+ --config_path configs/config_musdb18_scnet_large_starrytong.yaml \
82
+ --start_check_point weights/last_scnet.ckpt \
83
+ --store_dir results_store/ \
84
+ --valid_path datasets/moisesdb/valid \
85
+ --device_ids 0 \
86
+ --metrics neg_log_wmse l1_freq si_sdr sdr aura_stft aura_mrstft bleedless fullness
87
+ ```
88
+
89
+ * Inference with LoRA
90
+ ```
91
+ python inference.py --lora_checkpoint weights/lora_last_scnet.ckpt \
92
+ --model_type scnet \
93
+ --config_path configs/config_musdb18_scnet_large_starrytong.yaml \
94
+ --start_check_point weights/last_scnet.ckpt \
95
+ --store_dir inference_results/ \
96
+ --input_folder datasets/moisesdb/mixtures_for_inference \
97
+ --device_ids 0
98
+ ```
99
+
100
+ ### Train example with BSRoformer and LoRA
101
+
102
+ You can use this [config](../configs/config_musdb18_bs_roformer_with_lora.yaml) and this [weights](https://github.com/ZFTurbo/Music-Source-Separation-Training/releases/download/v1.0.12/model_bs_roformer_ep_17_sdr_9.6568.ckpt) to finetune BSRoformer on your dataset.
103
+
104
+ ```
105
+ python train.py --model_type bs_roformer \
106
+ --config_path configs/config_musdb18_bs_roformer_with_lora.yaml \
107
+ --start_check_point weights/model_bs_roformer_ep_17_sdr_9.6568.ckpt \
108
+ --results_path results/ \
109
+ --data_path musdb18hq/train \
110
+ --valid_path musdb18hq/test \
111
+ --device_ids 0 \
112
+ --metrics sdr \
113
+ --train_lora
114
+ ```
docs/augmentations.md ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### Augmentations
2
+
3
+ Augmentations allow changing stems on the fly, increasing the effective size of the dataset by creating new samples from old ones.
4
+ Now control for augmentations is done from config file. Below you can find the example of full config,
5
+ which includes all available augmentations:
6
+
7
+ ```config
8
+ augmentations:
9
+ enable: true # enable or disable all augmentations (to fast disable if needed)
10
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
11
+ loudness_min: 0.5
12
+ loudness_max: 1.5
13
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
14
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
15
+ - 0.2
16
+ - 0.02
17
+ mixup_loudness_min: 0.5
18
+ mixup_loudness_max: 1.5
19
+
20
+ # apply mp3 compression to mixture only (emulate downloading mp3 from internet)
21
+ mp3_compression_on_mixture: 0.01
22
+ mp3_compression_on_mixture_bitrate_min: 32
23
+ mp3_compression_on_mixture_bitrate_max: 320
24
+ mp3_compression_on_mixture_backend: "lameenc"
25
+
26
+ all:
27
+ channel_shuffle: 0.5 # Set 0 or lower to disable
28
+ random_inverse: 0.1 # inverse track (better lower probability)
29
+ random_polarity: 0.5 # polarity change (multiply waveform to -1)
30
+ mp3_compression: 0.01
31
+ mp3_compression_min_bitrate: 32
32
+ mp3_compression_max_bitrate: 320
33
+ mp3_compression_backend: "lameenc"
34
+
35
+ # pedalboard reverb block
36
+ pedalboard_reverb: 0.01
37
+ pedalboard_reverb_room_size_min: 0.1
38
+ pedalboard_reverb_room_size_max: 0.9
39
+ pedalboard_reverb_damping_min: 0.1
40
+ pedalboard_reverb_damping_max: 0.9
41
+ pedalboard_reverb_wet_level_min: 0.1
42
+ pedalboard_reverb_wet_level_max: 0.9
43
+ pedalboard_reverb_dry_level_min: 0.1
44
+ pedalboard_reverb_dry_level_max: 0.9
45
+ pedalboard_reverb_width_min: 0.9
46
+ pedalboard_reverb_width_max: 1.0
47
+
48
+ # pedalboard chorus block
49
+ pedalboard_chorus: 0.01
50
+ pedalboard_chorus_rate_hz_min: 1.0
51
+ pedalboard_chorus_rate_hz_max: 7.0
52
+ pedalboard_chorus_depth_min: 0.25
53
+ pedalboard_chorus_depth_max: 0.95
54
+ pedalboard_chorus_centre_delay_ms_min: 3
55
+ pedalboard_chorus_centre_delay_ms_max: 10
56
+ pedalboard_chorus_feedback_min: 0.0
57
+ pedalboard_chorus_feedback_max: 0.5
58
+ pedalboard_chorus_mix_min: 0.1
59
+ pedalboard_chorus_mix_max: 0.9
60
+
61
+ # pedalboard phazer block
62
+ pedalboard_phazer: 0.01
63
+ pedalboard_phazer_rate_hz_min: 1.0
64
+ pedalboard_phazer_rate_hz_max: 10.0
65
+ pedalboard_phazer_depth_min: 0.25
66
+ pedalboard_phazer_depth_max: 0.95
67
+ pedalboard_phazer_centre_frequency_hz_min: 200
68
+ pedalboard_phazer_centre_frequency_hz_max: 12000
69
+ pedalboard_phazer_feedback_min: 0.0
70
+ pedalboard_phazer_feedback_max: 0.5
71
+ pedalboard_phazer_mix_min: 0.1
72
+ pedalboard_phazer_mix_max: 0.9
73
+
74
+ # pedalboard distortion block
75
+ pedalboard_distortion: 0.01
76
+ pedalboard_distortion_drive_db_min: 1.0
77
+ pedalboard_distortion_drive_db_max: 25.0
78
+
79
+ # pedalboard pitch shift block
80
+ pedalboard_pitch_shift: 0.01
81
+ pedalboard_pitch_shift_semitones_min: -7
82
+ pedalboard_pitch_shift_semitones_max: 7
83
+
84
+ # pedalboard resample block
85
+ pedalboard_resample: 0.01
86
+ pedalboard_resample_target_sample_rate_min: 4000
87
+ pedalboard_resample_target_sample_rate_max: 44100
88
+
89
+ # pedalboard bitcrash block
90
+ pedalboard_bitcrash: 0.01
91
+ pedalboard_bitcrash_bit_depth_min: 4
92
+ pedalboard_bitcrash_bit_depth_max: 16
93
+
94
+ # pedalboard mp3 compressor block
95
+ pedalboard_mp3_compressor: 0.01
96
+ pedalboard_mp3_compressor_pedalboard_mp3_compressor_min: 0
97
+ pedalboard_mp3_compressor_pedalboard_mp3_compressor_max: 9.999
98
+
99
+ vocals:
100
+ pitch_shift: 0.1
101
+ pitch_shift_min_semitones: -5
102
+ pitch_shift_max_semitones: 5
103
+ seven_band_parametric_eq: 0.25
104
+ seven_band_parametric_eq_min_gain_db: -9
105
+ seven_band_parametric_eq_max_gain_db: 9
106
+ tanh_distortion: 0.1
107
+ tanh_distortion_min: 0.1
108
+ tanh_distortion_max: 0.7
109
+ bass:
110
+ pitch_shift: 0.1
111
+ pitch_shift_min_semitones: -2
112
+ pitch_shift_max_semitones: 2
113
+ seven_band_parametric_eq: 0.25
114
+ seven_band_parametric_eq_min_gain_db: -3
115
+ seven_band_parametric_eq_max_gain_db: 6
116
+ tanh_distortion: 0.2
117
+ tanh_distortion_min: 0.1
118
+ tanh_distortion_max: 0.5
119
+ drums:
120
+ pitch_shift: 0.33
121
+ pitch_shift_min_semitones: -5
122
+ pitch_shift_max_semitones: 5
123
+ seven_band_parametric_eq: 0.25
124
+ seven_band_parametric_eq_min_gain_db: -9
125
+ seven_band_parametric_eq_max_gain_db: 9
126
+ tanh_distortion: 0.33
127
+ tanh_distortion_min: 0.1
128
+ tanh_distortion_max: 0.6
129
+ other:
130
+ pitch_shift: 0.1
131
+ pitch_shift_min_semitones: -4
132
+ pitch_shift_max_semitones: 4
133
+ gaussian_noise: 0.1
134
+ gaussian_noise_min_amplitude: 0.001
135
+ gaussian_noise_max_amplitude: 0.015
136
+ time_stretch: 0.01
137
+ time_stretch_min_rate: 0.8
138
+ time_stretch_max_rate: 1.25
139
+ ```
140
+
141
+ You can copy and paste it into your config to use augmentations.
142
+ Notes:
143
+ * To completely disable all augmentations you can either remove `augmentations` section from config or set `enable` to `false`.
144
+ * If you want to disable some augmentation, just set it to zero.
145
+ * Augmentations in `all` subsections applied to all stems
146
+ * Augmentations in `vocals`, `bass` etc subsections applied only to corresponding stems. You can create such subsections for all stems which are given in `training.instruments`.
docs/bs_roformer_info.md ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### Batch sizes for BSRoformer
2
+
3
+ You can use table below to choose BS Roformer `batch_size` parameter for training based on your GPUs. Batch size values provided for single GPU. If you have several GPUs you need to multiply value on number of GPUs.
4
+
5
+ | chunk_size | dim | depth | batch_size (A6000 48GB) | batch_size (3090/4090 24GB) | batch_size (16GB) |
6
+ |:----------:|:---:|:-----:|:-----------------------:|:---------------------------:|:-----------------:|
7
+ | 131584 | 128 | 6 | 10 | 5 | 3 |
8
+ | 131584 | 256 | 6 | 8 | 4 | 2 |
9
+ | 131584 | 384 | 6 | 7 | 3 | 2 |
10
+ | 131584 | 512 | 6 | 6 | 3 | 2 |
11
+ | 131584 | 256 | 8 | 6 | 3 | 2 |
12
+ | 131584 | 256 | 12 | 4 | 2 | 1 |
13
+ | 263168 | 128 | 6 | 4 | 2 | 1 |
14
+ | 263168 | 256 | 6 | 3 | 1 | 1 |
15
+ | 352800 | 128 | 6 | 2 | 1 | - |
16
+ | 352800 | 256 | 6 | 2 | 1 | - |
17
+ | 352800 | 384 | 12 | 1 | - | - |
18
+ | 352800 | 512 | 12 | - | - | - |
19
+
20
+
21
+ Parameters obtained with initial config:
22
+
23
+ ```
24
+ audio:
25
+ chunk_size: 131584
26
+ dim_f: 1024
27
+ dim_t: 515
28
+ hop_length: 512
29
+ n_fft: 2048
30
+ num_channels: 2
31
+ sample_rate: 44100
32
+ min_mean_abs: 0.000
33
+
34
+ model:
35
+ dim: 384
36
+ depth: 12
37
+ stereo: true
38
+ num_stems: 1
39
+ time_transformer_depth: 1
40
+ freq_transformer_depth: 1
41
+ linear_transformer_depth: 0
42
+ freqs_per_bands: !!python/tuple
43
+ - 2
44
+ - 2
45
+ - 2
46
+ - 2
47
+ - 2
48
+ - 2
49
+ - 2
50
+ - 2
51
+ - 2
52
+ - 2
53
+ - 2
54
+ - 2
55
+ - 2
56
+ - 2
57
+ - 2
58
+ - 2
59
+ - 2
60
+ - 2
61
+ - 2
62
+ - 2
63
+ - 2
64
+ - 2
65
+ - 2
66
+ - 2
67
+ - 4
68
+ - 4
69
+ - 4
70
+ - 4
71
+ - 4
72
+ - 4
73
+ - 4
74
+ - 4
75
+ - 4
76
+ - 4
77
+ - 4
78
+ - 4
79
+ - 12
80
+ - 12
81
+ - 12
82
+ - 12
83
+ - 12
84
+ - 12
85
+ - 12
86
+ - 12
87
+ - 24
88
+ - 24
89
+ - 24
90
+ - 24
91
+ - 24
92
+ - 24
93
+ - 24
94
+ - 24
95
+ - 48
96
+ - 48
97
+ - 48
98
+ - 48
99
+ - 48
100
+ - 48
101
+ - 48
102
+ - 48
103
+ - 128
104
+ - 129
105
+ dim_head: 64
106
+ heads: 8
107
+ attn_dropout: 0.1
108
+ ff_dropout: 0.1
109
+ flash_attn: false
110
+ dim_freqs_in: 1025
111
+ stft_n_fft: 2048
112
+ stft_hop_length: 512
113
+ stft_win_length: 2048
114
+ stft_normalized: false
115
+ mask_estimator_depth: 2
116
+ multi_stft_resolution_loss_weight: 1.0
117
+ multi_stft_resolutions_window_sizes: !!python/tuple
118
+ - 4096
119
+ - 2048
120
+ - 1024
121
+ - 512
122
+ - 256
123
+ multi_stft_hop_size: 147
124
+ multi_stft_normalized: False
125
+
126
+ training:
127
+ batch_size: 1
128
+ gradient_accumulation_steps: 1
129
+ grad_clip: 0
130
+ instruments:
131
+ - vocals
132
+ - other
133
+ lr: 3.0e-05
134
+ patience: 2
135
+ reduce_factor: 0.95
136
+ target_instrument: vocals
137
+ num_epochs: 1000
138
+ num_steps: 1000
139
+ q: 0.95
140
+ coarse_loss_clip: true
141
+ ema_momentum: 0.999
142
+ optimizer: adam
143
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
144
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
145
+ ```
docs/changes.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### Changes
2
+
3
+ #### v1.0.2
4
+
5
+ * Added multi GPU validation (earlier validation was performed on single GPU)
6
+ * `training.batch_size` in config now must be set for single GPU (if you use multiple GPUs it will be automatically multiplied by number of GPUs)
7
+
8
+ #### v1.0.3
9
+
10
+ * Added "spawn" fix for multiprocessing
11
+ * Function `get_model_from_config` now takes path of config as input.
12
+ * On latest version of pytorch some problems with torch.backends.cudnn.benchmark = True - big slow down. Fixed version 2.0.1 in requirements.txt
13
+ * `--valid_path` parameter for train.py now can accept several validation folders instead of one. Added warning if validation folder is empty.
14
+ * Small fix for AMP usage in Demucs models taken from config
15
+ * Support for Demucs3 mmi model was added
16
+ * GPU memory consumption was reduced during inference and validation.
17
+ * Some changes to repair click problems on the edges of segment.
18
+ * Added support to train on FLAC files. Some more error checks added.
19
+ * viperx's Roformer weights and configs added
20
+ * `--extract_instrumental` argument added to inference.py
docs/dataset_types.md ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### Dataset types for training
2
+
3
+ * **Type 1 (MUSDB)**: different folders. Each folder contains all needed stems in format _< stem name >.wav_. The same as in MUSDBHQ18 dataset. In latest code releases it's possible to use `flac` instead of `wav`.
4
+
5
+ Example:
6
+ ```
7
+ --- Song 1:
8
+ ------ vocals.wav
9
+ ------ bass.wav
10
+ ------ drums.wav
11
+ ------ other.wav
12
+ --- Song 2:
13
+ ------ vocals.wav
14
+ ------ bass.wav
15
+ ------ drums.wav
16
+ ------ other.wav
17
+ --- Song 3:
18
+ ...........
19
+ ```
20
+
21
+ * **Type 2 (Stems)**: each folder is named after a stem. The folder contains wav files which consist only of the required stem.
22
+ ```
23
+ --- vocals:
24
+ ------ vocals_1.wav
25
+ ------ vocals_2.wav
26
+ ------ vocals_3.wav
27
+ ------ vocals_4.wav
28
+ ------ ...
29
+ --- bass:
30
+ ------ bass_1.wav
31
+ ------ bass_2.wav
32
+ ------ bass_3.wav
33
+ ------ bass_4.wav
34
+ ------ ...
35
+ ...........
36
+ ```
37
+
38
+ * **Type 3 (CSV file)**:
39
+
40
+ You can provide CSV-file (or list of CSV-files) with following structure:
41
+ ```
42
+ instrum,path
43
+ vocals,/path/to/dataset/vocals_1.wav
44
+ vocals,/path/to/dataset2/vocals_v2.wav
45
+ vocals,/path/to/dataset3/vocals_some.wav
46
+ ...
47
+ drums,/path/to/dataset/drums_good.wav
48
+ ...
49
+ ```
50
+
51
+ * **Type 4 (MUSDB Aligned)**:
52
+
53
+ The same as Type 1, but during training all instruments will be from the same position of song.
54
+
55
+ ### Dataset for validation
56
+
57
+ * The validation dataset must be the same structure as type 1 datasets (regardless of what type of dataset you're using for training), but also each folder must include `mixture.wav` for each song. `mixture.wav` - is the sum of all stems for song.
58
+
59
+ Example:
60
+ ```
61
+ --- Song 1:
62
+ ------ vocals.wav
63
+ ------ bass.wav
64
+ ------ drums.wav
65
+ ------ other.wav
66
+ ------ mixture.wav
67
+ --- Song 2:
68
+ ------ vocals.wav
69
+ ------ bass.wav
70
+ ------ drums.wav
71
+ ------ other.wav
72
+ ------ mixture.wav
73
+ --- Song 3:
74
+ ...........
75
+ ```
docs/ensemble.md ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### Ensemble usage
2
+
3
+ The repository contains an `ensemble.py` script which can be used to ensemble the results of different algorithms.
4
+
5
+ Arguments:
6
+ * `--files` - Path to all audio-files to ensemble
7
+ * `--type` - Method to do ensemble. One of avg_wave, median_wave, min_wave, max_wave, avg_fft, median_fft, min_fft, max_fft. Default: avg_wave.
8
+ * `--weights` - Weights to create ensemble. Number of weights must be equal to number of files
9
+ * `--output` - Path to wav file where ensemble result will be stored (Default: res.wav)
10
+
11
+ Example:
12
+ ```
13
+ ensemble.py --files ./results_tracks/vocals1.wav ./results_tracks/vocals2.wav --weights 2 1 --type max_fft --output out.wav
14
+ ```
15
+
16
+ ### Ensemble types:
17
+
18
+ * `avg_wave` - ensemble on 1D variant, find average for every sample of waveform independently
19
+ * `median_wave` - ensemble on 1D variant, find median value for every sample of waveform independently
20
+ * `min_wave` - ensemble on 1D variant, find minimum absolute value for every sample of waveform independently
21
+ * `max_wave` - ensemble on 1D variant, find maximum absolute value for every sample of waveform independently
22
+ * `avg_fft` - ensemble on spectrogram (Short-time Fourier transform (STFT), 2D variant), find average for every pixel of spectrogram independently. After averaging use inverse STFT to obtain original 1D-waveform back.
23
+ * `median_fft` - the same as avg_fft but use median instead of mean (only useful for ensembling of 3 or more sources).
24
+ * `min_fft` - the same as avg_fft but use minimum function instead of mean (reduce aggressiveness).
25
+ * `max_fft` - the same as avg_fft but use maximum function instead of mean (the most aggressive).
26
+
27
+ ### Notes
28
+ * `min_fft` can be used to do more conservative ensemble - it will reduce influence of more aggressive models.
29
+ * It's better to ensemble models which are of equal quality - in this case it will give a gain. If one of the models is bad - it will reduce the overall quality.
30
+ * In my experiments `avg_wave` was always better or equal in SDR score comparing with other methods.