Spaces:
Paused
Paused
File size: 5,513 Bytes
3ac2113 e6da193 3ac2113 37da229 3ac2113 37da229 3ac2113 37da229 3ac2113 ccc5378 3ac2113 37da229 e6da193 37da229 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 |
from scipy.ndimage import median_filter
import json
import numpy as np
from pathlib import Path
# Band split points: process() treats bins below LOW as "low", bins above
# HIGH as "high", and everything in between (inclusive) as "mid".
LOW = 250
HIGH = 4000

# Frames per second: used by process() both as the median-filter window size
# and to turn a frame count into a duration.
FPS = 100

# One frequency per bin of the last spectrogram axis, rounded to whole
# integers so they can be compared against LOW / HIGH.
# NOTE(review): presumably centre frequencies in Hz - confirm with the
# code that produces the spectrograms.
BIN_FREQS = np.asarray((
    43.06640625, 64.599609375, 86.1328125, 107.666015625,
    129.19921875, 150.732421875, 172.265625, 193.798828125,
    215.33203125, 236.865234375, 258.3984375, 279.931640625,
    301.46484375, 322.998046875, 344.53125, 366.064453125,
    387.59765625, 409.130859375, 430.6640625, 452.197265625,
    495.263671875, 516.796875, 538.330078125, 581.396484375,
    624.462890625, 645.99609375, 689.0625, 732.12890625,
    775.1953125, 839.794921875, 882.861328125, 925.927734375,
    990.52734375, 1055.126953125, 1098.193359375, 1184.326171875,
    1248.92578125, 1313.525390625, 1399.658203125, 1485.791015625,
    1571.923828125, 1658.056640625, 1765.72265625, 1873.388671875,
    1981.0546875, 2088.720703125, 2217.919921875, 2347.119140625,
    2497.8515625, 2627.05078125, 2799.31640625, 2950.048828125,
    3143.84765625, 3316.11328125, 3509.912109375, 3725.244140625,
    3940.576171875, 4177.44140625, 4435.83984375, 4694.23828125,
    4974.169921875, 5275.634765625, 5577.099609375, 5921.630859375,
    6266.162109375, 6653.759765625, 7041.357421875, 7450.48828125,
    7902.685546875, 8376.416015625, 8871.6796875, 9388.4765625,
    9948.33984375, 10551.26953125, 11175.732421875, 11843.26171875,
    12553.857421875, 13285.986328125, 14082.71484375, 14922.509765625,
    15805.37109375,
)).round().astype(int)
def to_uint8_list(arr):
    """Convert an array of unit-range values to a nested list of 0-255 ints.

    Each value is multiplied by 255 and truncated (not rounded) to an
    unsigned 8-bit integer, so 0.0 -> 0, 0.5 -> 127 and 1.0 -> 255.

    Values outside [0, 1] saturate at 0 or 255 and NaN becomes 0. Without
    this, the float -> uint8 cast would wrap around (or be undefined) and
    silently emit garbage.

    Args:
        arr: A numpy array (any shape) or array-like of numbers.

    Returns:
        A (possibly nested) list of Python ints with the same shape as *arr*.
    """
    # Keep the input dtype: the multiplication below must happen in the same
    # precision as before so truncation yields identical in-range results.
    values = np.nan_to_num(np.asarray(arr), nan=0.0, posinf=1.0, neginf=0.0)
    scaled_arr = (np.clip(values, 0, 1) * 255).astype(np.uint8)
    return scaled_arr.tolist()
def apply_to_dict(d, func):
    """Replace every non-dict value in a nested dict with ``func(value)``.

    The dictionary is modified in place and nothing is returned. Values
    that are themselves dicts are descended into rather than passed to
    *func*.
    """
    for key in d:
        child = d[key]
        if isinstance(child, dict):
            # Sub-dictionary: recurse and leave the container itself alone.
            apply_to_dict(child, func)
            continue
        # Leaf: overwrite it with the transformed value.
        d[key] = func(child)
def convert_segments(input_data):
    """Flatten a segment list into boundary times and labels.

    Args:
        input_data: Object with a ``segments`` sequence; every segment
            exposes ``start``, ``end`` and ``label`` attributes.

    Returns:
        A dict with two keys:
          * ``"segments"``: the ``start`` of every segment followed by the
            ``end`` of the last one (N + 1 values for N segments).
          * ``"labels"``: the ``label`` of every segment (N values).
        Both lists are empty when there are no segments.
    """
    segments = input_data.segments
    # Without segments there is no "last segment" to take the end time from;
    # indexing [-1] would raise IndexError.
    if len(segments) == 0:
        return {"segments": [], "labels": []}

    boundaries = [segment.start for segment in segments]
    # Close the final segment with its end time.
    boundaries.append(segments[-1].end)
    labels = [segment.label for segment in segments]
    return {"segments": boundaries, "labels": labels}
def _positive_peak(values):
    """Return the maximum of *values*, or 1.0 if that maximum is not positive.

    The result is used as a divisor; falling back to 1.0 keeps all-zero data
    at zero instead of turning it into NaN through a 0/0 division.
    """
    peak = np.max(values)
    # The comparison is False for zero, negative values and NaN alike.
    return peak if peak > 0 else 1.0


def _empty_scores():
    """Return the placeholder score table with every metric set to 0."""
    # NOTE(review): in the reviewed copy the six "@" keys had all been
    # collapsed into one mangled "[email protected]" literal. They are
    # restored here following mir_eval's segment metric names, which the
    # remaining keys match in order - confirm against the JSON consumer.
    segment_metrics = (
        "Precision@0.5",
        "Recall@0.5",
        "F-measure@0.5",
        "Precision@3.0",
        "Recall@3.0",
        "F-measure@3.0",
        "Ref-to-est deviation",
        "Est-to-ref deviation",
        "Pairwise Precision",
        "Pairwise Recall",
        "Pairwise F-measure",
        "Rand Index",
        "Adjusted Rand Index",
        "Mutual Information",
        "Adjusted Mutual Information",
        "Normalized Mutual Information",
        "NCE Over",
        "NCE Under",
        "NCE F-measure",
        "V Precision",
        "V Recall",
        "V-measure",
        "Accuracy",
    )
    timing_metrics = ("f1", "precision", "recall", "cmlt", "amlt")
    return {
        "segment": dict.fromkeys(segment_metrics, 0),
        "beat": dict.fromkeys(timing_metrics, 0),
        "downbeat": dict.fromkeys(timing_metrics, 0),
    }


def process(specs, struct, name):
    """Build the dissector JSON for one track and write it to disk.

    Args:
        specs: 3-D array indexed as ``[stem, frame, bin]``. The last axis is
            selected with indices derived from ``BIN_FREQS``, the frame axis
            is divided by ``FPS`` to obtain the duration, and the first axis
            is paired with the stem names bass, drum, other, vocal (extra
            stems, or extra names, are ignored).
        struct: Object providing ``beats``, ``downbeats`` and ``segments``
            (see ``convert_segments``).
            NOTE(review): ``beats`` / ``downbeats`` are written as-is and so
            must already be JSON-serialisable - confirm with the caller.
        name: Track identifier; stored as ``id`` and used in the file name.

    Returns:
        The name of the file written, ``dissector.<name>.json``, relative to
        the current working directory.
    """
    band_indices = (
        np.flatnonzero(BIN_FREQS < LOW),
        np.flatnonzero((LOW <= BIN_FREQS) & (BIN_FREQS <= HIGH)),
        np.flatnonzero(BIN_FREQS > HIGH),
    )

    # "wav" envelopes: per-frame mean energy of each band, scaled by the
    # band's maximum over all stems so stems remain comparable.
    wavs = []
    for indices in band_indices:
        band = specs[:, :, indices]
        envelope = band.mean(axis=-1)
        envelope /= _positive_peak(band)
        # Sanity check: a mean cannot exceed the maximum it was scaled by.
        assert envelope.max() <= 1.0
        wavs.append(envelope)

    # "nav" envelopes: median-smoothed over a window of FPS frames, then
    # stacked cumulatively (low, low + mid, low + mid + high).
    smoothed = [
        np.array([median_filter(row, size=FPS) for row in envelope])
        for envelope in wavs
    ]
    nav_low = smoothed[0]
    nav_mid = nav_low + smoothed[1]
    nav_high = nav_mid + smoothed[2]

    nav_peak = _positive_peak([nav_low.max(), nav_mid.max(), nav_high.max()])
    nav_low /= nav_peak
    nav_mid /= nav_peak
    nav_high /= nav_peak
    assert nav_high.max() <= 1.0

    stems = ('bass', 'drum', 'other', 'vocal')
    data = {
        'nav': {},
        'wav': {},
    }
    for inst, wav_l, wav_m, wav_h, nav_l, nav_m, nav_h in zip(
        stems, *wavs, nav_low, nav_mid, nav_high
    ):
        data['wav'][inst] = {'low': wav_l, 'mid': wav_m, 'high': wav_h}
        data['nav'][inst] = {'low': nav_l, 'mid': nav_m, 'high': nav_h}

    # Quantise every envelope to a plain list of 0-255 ints. This must run
    # before the non-array entries below are added to `data`.
    apply_to_dict(data, to_uint8_list)

    data['duration'] = specs.shape[1] / FPS
    data['scores'] = _empty_scores()
    data['id'] = name
    data['truths'] = {
        'beats': struct.beats,
        'downbeats': struct.downbeats,
        **convert_segments(struct),
    }
    # The same object is reused, so "inferences" is serialised as an exact
    # copy of "truths".
    data['inferences'] = data['truths']

    filename = f'dissector.{name}.json'
    with open(filename, 'w', encoding='utf-8') as file:
        json.dump(data, file)
    return filename
def generate_dissector_data(name, result):
    """Load the saved spectrogram for *name* and write its dissector JSON.

    Args:
        name: Track identifier; the spectrogram is read from
            ``./spec/<name>.npy`` relative to the current working directory.
        result: Structure-analysis object handed to ``process`` as its
            ``struct`` argument.

    Returns:
        The file name returned by ``process``.
    """
    # The original also built a ./struct/<name>.json path that was never
    # read; that dead local has been dropped.
    spec_path = Path(f'./spec/{name}.npy').resolve().as_posix()
    specs = np.load(spec_path)
    return process(specs, result, name)
|