Spaces:

arhanv
/

drum-kit-generator

Running on CPU Upgrade

App Files Community

arhanv committed 8 days ago

Commit

ac3dd61

1 Parent(s): ab0f8ba

fixed dataset and inference

Browse files

Files changed (4) hide show

.gitignore +3 -1
dataset/{one_shot_percussive_sounds.zip → all_sounds.zip} +2 -2
inference.py +38 -12
requirements.txt +7 -21

.gitignore CHANGED Viewed

@@ -1,3 +1,5 @@
 /.env
 ._*
-/dataset/unzipped

 /.env
 ._*
+/dataset/unzipped
+/dataset/all_sounds
+*.pyc

dataset/{one_shot_percussive_sounds.zip → all_sounds.zip} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c45401b3cbdd56606f0d9e5e494a18efbae1ca830f835504dccc316c1934720c
-size 112614838

 version https://git-lfs.github.com/spec/v1
+oid sha256:5d8773333b5f600f968c3d2c2c5fb09332440f19867ce69e9a20b76ab5aed618
+size 112639857

inference.py CHANGED Viewed

@@ -1,4 +1,3 @@
-import laion_clap
 import numpy as np
 import librosa
 import pickle
@@ -7,20 +6,23 @@ from sklearn.metrics.pairwise import cosine_similarity
 import pandas as pd
 import zipfile
 import json
-dataset_zip = "dataset/one_shot_percussive_sounds.zip"
-extracted_folder = "dataset/unzipped"
 metadata_path = "dataset/licenses.txt"
 audio_embeddings_path = "dataset/audio_embeddings.pkl"
 # Unzip if not already extracted
 if not os.path.exists(extracted_folder):
     with zipfile.ZipFile(dataset_zip, "r") as zip_ref:
-        zip_ref.extractall("dataset")
-# Load the model
-model = laion_clap.CLAP_Module(enable_fusion=True)
-model.load_ckpt(model_id=3)
 # Load dataset metadata
 with open(metadata_path, "r") as file:
@@ -30,14 +32,38 @@ with open(metadata_path, "r") as file:
 metadata = pd.DataFrame.from_dict(data, orient="index")
 metadata.index = metadata.index.astype(str) + '.wav'
 # Load precomputed audio embeddings (to avoid recomputing on every request)
 with open(audio_embeddings_path, "rb") as f:
     audio_embeddings = pickle.load(f)
 def get_clap_embeddings_from_text(text):
-    """Convert user text input to a CLAP embedding."""
-    text_embed = model.get_text_embedding([text])
-    return text_embed[0, :]
 def find_top_sounds(text_embed, instrument, top_N=4):
     """Finds the closest N sounds for an instrument."""
@@ -50,7 +76,7 @@ def find_top_sounds(text_embed, instrument, top_N=4):
     # Get top N matches
     top_indices = np.argsort(similarities)[-top_N:][::-1]
-    top_files = [valid_sounds[i] for i in top_indices]
     return top_files
@@ -62,4 +88,4 @@ def generate_drum_kit(prompt, kit_size=4):
     for instrument in ["Kick", "Snare", "Hi-Hat", "Tom", "Cymbal", "Clap", "Percussion", "Other"]:
         drum_kit[instrument] = find_top_sounds(text_embed, instrument, top_N=kit_size)
-    return drum_kit

 import numpy as np
 import librosa
 import pickle
 import pandas as pd
 import zipfile
 import json
+from transformers import ClapModel, ClapProcessor
+import torch
+import shutil
+dataset_zip = "dataset/all_sounds.zip"
+extracted_folder = "dataset/all_sounds"
 metadata_path = "dataset/licenses.txt"
 audio_embeddings_path = "dataset/audio_embeddings.pkl"
 # Unzip if not already extracted
 if not os.path.exists(extracted_folder):
     with zipfile.ZipFile(dataset_zip, "r") as zip_ref:
+        zip_ref.extractall(extracted_folder)
+# Load Hugging Face's CLAP model
+processor = ClapProcessor.from_pretrained("laion/clap-htsat-fused")
+model = ClapModel.from_pretrained("laion/clap-htsat-fused")
 # Load dataset metadata
 with open(metadata_path, "r") as file:
 metadata = pd.DataFrame.from_dict(data, orient="index")
 metadata.index = metadata.index.astype(str) + '.wav'
+instrument_categories = {
+    "Kick": ["kick", "bd", "bass", "808", "kd"],
+    "Snare": ["snare", "sd", "sn"],
+    "Hi-Hat": ["hihat", "hh", "hi_hat", "hi-hat"],
+    "Tom": ["tom"],
+    "Cymbal": ["crash", "ride", "splash", "cymbal"],
+    "Clap": ["clap"],
+    "Percussion": ["shaker", "perc", "tamb", "cowbell", "bongo", "conga", "egg"]
+}
+# Function to categorize filenames based on keywords
+def categorize_instrument(filename):
+    lower_filename = filename.lower()
+    for category, keywords in instrument_categories.items():
+        if any(keyword in lower_filename for keyword in keywords):
+            return category
+    return "Other"  # Default category if no match is found
+# Apply function to create a new 'Instrument' column
+metadata["Instrument"] = metadata["name"].apply(categorize_instrument)
+metadata["Instrument"].value_counts()
 # Load precomputed audio embeddings (to avoid recomputing on every request)
 with open(audio_embeddings_path, "rb") as f:
     audio_embeddings = pickle.load(f)
 def get_clap_embeddings_from_text(text):
+    """Convert user text input to a CLAP embedding using Hugging Face's CLAP."""
+    inputs = processor(text=text, return_tensors="pt")
+    with torch.no_grad():
+        text_embeddings = model.get_text_features(**inputs)
+    return text_embeddings.squeeze(0).numpy()
 def find_top_sounds(text_embed, instrument, top_N=4):
     """Finds the closest N sounds for an instrument."""
     # Get top N matches
     top_indices = np.argsort(similarities)[-top_N:][::-1]
+    top_files = [os.path.join(extracted_folder, valid_sounds[i]) for i in top_indices]
     return top_files
     for instrument in ["Kick", "Snare", "Hi-Hat", "Tom", "Cymbal", "Clap", "Percussion", "Other"]:
         drum_kit[instrument] = find_top_sounds(text_embed, instrument, top_N=kit_size)
+    return drum_kit

requirements.txt CHANGED Viewed

@@ -1,49 +1,41 @@
 altair==5.5.0
-annotated-types==0.7.0
 attrs==25.3.0
 audioread==3.0.1
 blinker==1.9.0
-braceexpand==0.1.7
 cachetools==5.5.2
 certifi==2025.1.31
 cffi==1.17.1
 charset-normalizer==3.4.1
 click==8.1.8
 decorator==5.2.1
-docker-pycreds==0.4.0
 filelock==3.17.0
 fsspec==2025.3.0
-ftfy==6.3.1
 gitdb==4.0.12
 GitPython==3.1.44
-h5py==3.13.0
 huggingface-hub==0.29.3
 idna==3.10
 Jinja2==3.1.6
 joblib==1.4.2
 jsonschema==4.23.0
 jsonschema-specifications==2024.10.1
-laion_clap==1.1.6
 lazy_loader==0.4
 librosa==0.11.0
-llvmlite==0.43.0
 MarkupSafe==3.0.2
 msgpack==1.1.0
 narwhals==1.30.0
-numba==0.60.0
-numpy==1.23.5
 packaging==24.2
 pandas==2.2.3
 pillow==11.1.0
 platformdirs==4.3.6
 pooch==1.8.2
-progressbar==2.5
 protobuf==5.29.3
-psutil==7.0.0
 pyarrow==19.0.1
 pycparser==2.22
-pydantic==2.10.6
-pydantic_core==2.27.2
 pydeck==0.9.1
 python-dateutil==2.9.0.post0
 pytz==2025.1
@@ -55,26 +47,20 @@ rpds-py==0.23.1
 safetensors==0.5.3
 scikit-learn==1.6.1
 scipy==1.15.2
-sentry-sdk==2.22.0
-setproctitle==1.3.5
 six==1.17.0
 smmap==5.0.2
 soundfile==0.13.1
 soxr==0.5.0.post1
 streamlit==1.43.2
 tenacity==9.0.0
 threadpoolctl==3.6.0
 tokenizers==0.21.1
 toml==0.10.2
-torchlibrosa==0.1.0
 tornado==6.4.2
 tqdm==4.67.1
 transformers==4.49.0
 typing_extensions==4.12.2
 tzdata==2025.1
 urllib3==2.3.0
-wandb==0.19.8
-wcwidth==0.2.13
-webdataset==0.2.111
-wget==3.2
-torch

 altair==5.5.0
 attrs==25.3.0
 audioread==3.0.1
 blinker==1.9.0
 cachetools==5.5.2
 certifi==2025.1.31
 cffi==1.17.1
 charset-normalizer==3.4.1
 click==8.1.8
 decorator==5.2.1
 filelock==3.17.0
 fsspec==2025.3.0
 gitdb==4.0.12
 GitPython==3.1.44
 huggingface-hub==0.29.3
 idna==3.10
 Jinja2==3.1.6
 joblib==1.4.2
 jsonschema==4.23.0
 jsonschema-specifications==2024.10.1
 lazy_loader==0.4
 librosa==0.11.0
+llvmlite==0.44.0
 MarkupSafe==3.0.2
+mpmath==1.3.0
 msgpack==1.1.0
 narwhals==1.30.0
+networkx==3.4.2
+numba==0.61.0
+numpy==2.1.3
 packaging==24.2
 pandas==2.2.3
 pillow==11.1.0
 platformdirs==4.3.6
 pooch==1.8.2
 protobuf==5.29.3
 pyarrow==19.0.1
 pycparser==2.22
 pydeck==0.9.1
 python-dateutil==2.9.0.post0
 pytz==2025.1
 safetensors==0.5.3
 scikit-learn==1.6.1
 scipy==1.15.2
 six==1.17.0
 smmap==5.0.2
 soundfile==0.13.1
 soxr==0.5.0.post1
 streamlit==1.43.2
+sympy==1.13.1
 tenacity==9.0.0
 threadpoolctl==3.6.0
 tokenizers==0.21.1
 toml==0.10.2
+torch==2.6.0
 tornado==6.4.2
 tqdm==4.67.1
 transformers==4.49.0
 typing_extensions==4.12.2
 tzdata==2025.1
 urllib3==2.3.0