Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 5,083 Bytes
338e293 00fa560 338e293 4c49639 338e293 4c49639 338e293 4c49639 338e293 00fa560 4c49639 338e293 00fa560 338e293 00fa560 338e293 00fa560 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
from inference import get_clap_embeddings_from_audio, get_clap_embeddings_from_text
from pedalboard import Pedalboard, Reverb, HighpassFilter, LowpassFilter, Distortion, Bitcrush
from sklearn.metrics.pairwise import cosine_similarity
import soundfile as sf
from skopt import gp_minimize
from skopt.space import Real
import librosa
import numpy as np
import os
# Default on-disk location for the temporary concatenated-audio file
# (matches the default output_path of concatenate_sounds below).
concat_file_path = "temp_concat.wav"
def concatenate_sounds(drum_kit, output_path="temp_concat.wav", sr=48000):
    """Stitch together all drum sounds into one audio file.

    Parameters
    ----------
    drum_kit : dict
        Maps instrument name -> list of sample file paths.
    output_path : str
        Where the concatenated .wav is written.
    sr : int
        Sample rate used for loading and writing (default 48 kHz).

    Returns
    -------
    str
        The path of the written file (``output_path``).
    """
    all_audio = []
    for instrument, samples in drum_kit.items():
        for sample in samples:
            # Resample every sample to the common rate so they can be joined.
            # Fix: use the `sr` variable instead of a duplicated 48000 literal.
            audio, _ = librosa.load(sample, sr=sr)
            all_audio.append(audio)
    # Concatenate all sounds with a small silence gap
    gap = np.zeros(int(sr * 0.2))  # 200ms silence between sounds
    full_audio = np.concatenate([item for audio in all_audio for item in (audio, gap)])
    # Save to temp file
    sf.write(output_path, full_audio, sr)
    return output_path
def evaluate_fitness(audio_path, text_embed):
    """Score how well the audio at *audio_path* matches the text query.

    Returns the cosine similarity between the CLAP embedding of the
    audio file and the pre-computed CLAP text embedding.
    """
    audio_embedding = get_clap_embeddings_from_audio(audio_path)
    similarity_matrix = cosine_similarity([text_embed], [audio_embedding])
    return similarity_matrix[0][0]
def apply_fx(audio_path, params, write_wav=True, output_dir="processed_audio"):
    """Apply the FX chain (filters, distortion, bitcrush, reverb) to one file.

    Parameters
    ----------
    audio_path : str
        Path to the input .wav file.
    params : dict
        Expected keys: 'lowpass', 'highpass', 'drive_db', 'bit_depth',
        'reverb_size', 'reverb_wet'.
    write_wav : bool
        If True, write the processed audio to disk and return its path;
        otherwise return the processed samples directly.
    output_dir : str
        Name of the output directory, created two levels above the input
        file (at the 'dataset' level).

    Returns
    -------
    str or numpy.ndarray
        Output file path when ``write_wav`` is True, else processed audio.
    """
    audio, sr = librosa.load(audio_path, sr=48000)
    board = Pedalboard([
        LowpassFilter(cutoff_frequency_hz=params['lowpass']),
        HighpassFilter(cutoff_frequency_hz=params['highpass']),
        Distortion(drive_db=params['drive_db']),
        Bitcrush(bit_depth=params['bit_depth']),
        Reverb(room_size=params['reverb_size'], wet_level=params['reverb_wet'])
    ])
    processed_audio = board(audio, sr)
    if not write_wav:
        return processed_audio
    # Determine output directory dynamically: two dirs up gets the
    # 'dataset' level; avoid shadowing the output_dir parameter.
    base_dir = os.path.dirname(os.path.dirname(audio_path))
    dest_dir = os.path.join(base_dir, output_dir)
    os.makedirs(dest_dir, exist_ok=True)
    # splitext only touches the extension, so a ".wav" substring elsewhere
    # in the filename is left intact (str.replace would rewrite all of them).
    stem, ext = os.path.splitext(os.path.basename(audio_path))
    output_path = os.path.join(dest_dir, f"{stem}_processed{ext}")
    # Save processed audio
    sf.write(output_path, processed_audio, sr)
    return output_path
def objective_function(params, audio_file, text_embedding):
    """Objective for Bayesian optimization on the concatenated file.

    Renders the FX chain described by the flat parameter vector onto
    *audio_file* and returns the negated CLAP similarity (gp_minimize
    minimizes, so lower means a better match to the text prompt).
    """
    param_names = ("lowpass", "highpass", "reverb_size",
                   "reverb_wet", "drive_db", "bit_depth")
    fx_settings = dict(zip(param_names, params))
    rendered_path = apply_fx(audio_file, fx_settings, write_wav=True)
    # Minimize negative similarity == maximize similarity.
    return -evaluate_fitness(rendered_path, text_embedding)
def get_params_dict(params_list):
    """Map a flat optimizer parameter vector to human-readable labels."""
    labels = (
        "lowpass cutoff (Hz)",
        "highpass cutoff (Hz)",
        "reverb size",
        "reverb mix",
        "distortion - gain_db",
        "bitcrush - bit depth",
    )
    return {label: params_list[i] for i, label in enumerate(labels)}
# Define parameter search space
# Continuous (Real) dimensions explored by gp_minimize; the positional
# order here must match the index order used in objective_function and
# get_params_dict (lowpass, highpass, reverb_size, reverb_wet,
# drive_db, bit_depth).
search_space = [
    Real(4000, 20000, name="lowpass"),   # low-pass cutoff, Hz
    Real(50, 1000, name="highpass"),     # high-pass cutoff, Hz
    Real(0.0, 0.8, name="reverb_size"),  # room size, 0..1 scale
    Real(0.2, 1.0, name="reverb_wet"),   # reverb wet mix
    Real(0.0, 10.0, name="drive_db"),    # distortion drive, dB
    Real(4.0, 32.0, name="bit_depth")    # bitcrusher depth, bits
]
##### Main function #####
def get_fx(drum_kit, fx_prompt):
    """Optimize FX settings for the entire drum kit against a text prompt.

    Concatenates every sample into one file, runs Bayesian optimization of
    the FX parameters against the CLAP embedding of *fx_prompt*, then
    applies the best parameters to each individual sound.

    Returns
    -------
    tuple
        (optimized_kit, readable_params, pre_fx_fitness, post_fx_fitness);
        both fitness values are *negative* CLAP similarities, so lower
        means a better match.
    """
    text_embedding = get_clap_embeddings_from_text(fx_prompt)
    # One long file lets the whole kit be scored with a single CLAP pass.
    concat_file = concatenate_sounds(drum_kit)

    # Objective closure over the concatenated file and text embedding.
    def obj_func(params):
        return objective_function(params, concat_file, text_embedding)

    # Baseline similarity without FX (negated to match obj_func's sign).
    # Fix: score the file actually produced above, not the module-level
    # concat_file_path global (they only coincided by default naming).
    pre_fx_fitness = -evaluate_fitness(concat_file, text_embedding)
    # Run Bayesian optimization
    res = gp_minimize(obj_func, search_space, n_calls=30, random_state=42)
    best_params = res.x
    # res.fun is obj_func(best_params) as already evaluated during the
    # optimization; reusing it avoids a redundant FX render + CLAP pass.
    post_fx_fitness = res.fun
    # Apply the best FX parameters to each individual sound.
    param_names = ("lowpass", "highpass", "reverb_size",
                   "reverb_wet", "drive_db", "bit_depth")
    best_settings = dict(zip(param_names, best_params))
    optimized_kit = {
        instrument: [apply_fx(sample, best_settings, write_wav=True)
                     for sample in samples]
        for instrument, samples in drum_kit.items()
    }
    return optimized_kit, get_params_dict(best_params), pre_fx_fitness, post_fx_fitness