# De-limiter / prepro / save_musdb_XL_train_numpy.py
# jeonchangbin49's picture
# tenth commit
# 9e538da
import os
import glob
import argparse
import csv
import numpy as np
import librosa
import soundfile as sf
import tqdm
def _read_csv_rows(csv_path):
    """Return the data rows of ``csv_path`` as lists of strings.

    The first row is treated as a header and dropped. Empty rows (for
    example a trailing blank line) are skipped so that later ``row[0]``
    lookups cannot fail on them.
    """
    # newline="" is what the csv module asks for, so that line breaks
    # embedded in quoted fields are handed to the reader untouched.
    with open(csv_path, "r", encoding="utf-8", newline="") as f:
        rdr = csv.reader(f)
        next(rdr, None)  # header row
        return [row for row in rdr if row]


def _save_gain_ratio(limited_path, mixture, out_path):
    """Save the sample-wise ratio ``limited / mixture`` as a float16 ``.npy``.

    Args:
        limited_path: wav file of the limiter-applied mixture.
        mixture: array holding the sum of the un-limited stems.
        out_path: destination ``.npy`` path.

    Raises:
        ValueError: if the two signals cannot be divided element-wise
            (their shapes are not broadcast-compatible).
    """
    limited, _ = librosa.load(limited_path, sr=44100, mono=False)

    # Replace exact zeros in the denominator to avoid 'divided by zero'.
    eps = np.finfo(np.float32).eps
    safe_mixture = np.where(mixture == 0.0, eps, mixture)

    try:
        ratio = limited / safe_mixture
    except ValueError as err:
        # Same exception type as before, but say which file is at fault.
        raise ValueError(
            f"cannot compute gain ratio for {limited_path}: limited mixture "
            f"has shape {limited.shape}, summed stems have shape {mixture.shape}"
        ) from err

    # 16bit is enough...
    # NOTE(review): where the mixture is exactly zero but the limited signal
    # is not, the ratio can exceed the float16 range and be stored as inf --
    # confirm that consumers of these files tolerate that.
    np.save(out_path, ratio.astype(np.float16))


def main():
    """Compute and save sample-wise gain ratios for musdb-XL-train.

    For every entry of ``ozone_train_fixed.csv`` and of each
    ``ozone_train_random_*.csv`` under ``--musdb_XL_train_root``, the four
    stems are loaded from ``--root``, summed into a mixture, and the
    limiter-applied wav is divided by that mixture. The result is written
    as a float16 ``.npy`` file under ``--output``.
    """
    parser = argparse.ArgumentParser(
        description="Save sample-wise gain parameters for dataset distribution"
    )
    parser.add_argument(
        "--root",
        type=str,
        default="/path/to/musdb18hq",
        help="Root directory",
    )
    parser.add_argument(
        "--musdb_XL_train_root",
        type=str,
        default="/path/to/musdb-XL-train",
        help="Directory of musdb-XL-train dataset",
    )
    parser.add_argument(
        "--output",
        type=str,
        default="/path/to/musdb-XL-train/np_ratio",
        help="Directory to save sample-wise gain ratio",
    )

    args = parser.parse_args()

    sources = ["vocals", "bass", "drums", "other"]

    path_csv_fixed = f"{args.musdb_XL_train_root}/ozone_train_fixed.csv"
    list_path_csv_random = sorted(
        glob.glob(f"{args.musdb_XL_train_root}/ozone_train_random_*.csv")
    )

    # read ozone_train_fixed list
    # columns: song_name, max_threshold, max_character
    os.makedirs(f"{args.output}/ozone_train_fixed", exist_ok=True)
    fixed_list = _read_csv_rows(path_csv_fixed)

    # save numpy files of ozone_train_fixed
    # which is the limiter-applied version of 100 songs from musdb-HQ train set
    # each numpy file contain sample-wise gain ratio parameters
    for fixed_song in tqdm.tqdm(fixed_list):
        song_name = fixed_song[0]
        audio_sources = []
        for source in sources:
            audio, _ = librosa.load(
                f"{args.root}/train/{song_name}/{source}.wav", sr=44100, mono=False
            )
            audio_sources.append(audio)
        mixture = np.stack(audio_sources, axis=0).sum(0)

        _save_gain_ratio(
            f"{args.musdb_XL_train_root}/ozone_train_fixed/{song_name}.wav",
            mixture,
            f"{args.output}/ozone_train_fixed/{song_name}.npy",
        )

    # read ozone_train_random list
    # columns: song_name, max_threshold, max_character, then for each of
    # vocals / bass / drums / other (in that order):
    #   <inst>_name, <inst>_start_sec, <inst>_gain, <inst>_channelswap
    os.makedirs(f"{args.output}/ozone_train_random", exist_ok=True)
    random_list = []
    for path_csv_random in list_path_csv_random:
        random_list.extend(_read_csv_rows(path_csv_random))

    # save numpy files of ozone_train_random,
    # which is the limiter-applied version of 4-sec 300,000 segments randomly created from musdb-HQ train subset
    for random_song in tqdm.tqdm(random_list):
        audio_sources = []
        for k, source in enumerate(sources):
            base = 3 + k * 4  # first of this instrument's four columns
            audio, _ = librosa.load(
                f"{args.root}/train/{random_song[base]}/{source}.wav",
                sr=44100,
                mono=False,
                offset=float(random_song[base + 1]),  # 'inst_start_sec'
                duration=4.0,
            )
            audio = audio * float(random_song[base + 2])  # 'inst_gain'
            if random_song[base + 3].lower() == "true":  # 'inst_channelswap'
                audio = np.flip(audio, axis=0)
            audio_sources.append(audio)
        mixture = np.stack(audio_sources, axis=0).sum(0)

        _save_gain_ratio(
            f"{args.musdb_XL_train_root}/ozone_train_random/{random_song[0]}.wav",
            mixture,
            f"{args.output}/ozone_train_random/{random_song[0]}.npy",
        )
# Run the preprocessing only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()