First commit
- .gitattributes +1 -0
- Weights/pytorch_lora_weights.safetensors +3 -0
- flask/10.mp3 +0 -0
- flask/Beat_tracking_service.py +20 -0
- flask/Emotion_spotting_service.py +65 -0
- flask/Genre_classifier_model.h5 +3 -0
- flask/Genre_spotting_service.py +73 -0
- flask/Image_generation_service.py +20 -0
- flask/__pycache__/Beat_tracking_service.cpython-310.pyc +0 -0
- flask/__pycache__/Beat_tracking_service.cpython-311.pyc +0 -0
- flask/__pycache__/Emotion_spotting_service.cpython-310.pyc +0 -0
- flask/__pycache__/Emotion_spotting_service.cpython-311.pyc +0 -0
- flask/__pycache__/Genre_spotting_service.cpython-310.pyc +0 -0
- flask/__pycache__/Genre_spotting_service.cpython-311.pyc +0 -0
- flask/__pycache__/Image_generation_service.cpython-310.pyc +0 -0
- flask/__pycache__/Image_generation_service.cpython-311.pyc +0 -0
- flask/emotion_model.h5 +3 -0
- main.py +77 -0
- requirements.txt +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+flask/emotion_model.h5 filter=lfs diff=lfs merge=lfs -text
Weights/pytorch_lora_weights.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b1b19610541f9a2c6f235a1bac2690d04b98535f9f9f7790e9ad4d0fe8ac89b0
+size 3226184
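The three lines above are a Git LFS pointer: the repository stores only the spec version, content hash, and byte size, while the 3.2 MB safetensors blob itself lives in LFS storage. The .h5 model files added below use the same pointer format.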
flask/10.mp3
ADDED
Binary file (961 kB).
flask/Beat_tracking_service.py
ADDED
@@ -0,0 +1,20 @@
+import librosa
+
+class _Beat_tracking_service():
+    instance = None
+    def __init__(self):
+        self.instance = 1
+
+    def get_beat(self, file_path):
+        y, sr = librosa.load(file_path)
+        beat = librosa.beat.beat_track(y=y, sr=sr)
+        return beat[0][0]
+
+def Beat_tracking_service():
+    if _Beat_tracking_service.instance == None:
+        _Beat_tracking_service.instance = _Beat_tracking_service()
+    return _Beat_tracking_service.instance
+
+# beat_tracking_service = Beat_tracking_service()
+# predicted_beat = beat_tracking_service.get_beat("10.mp3")
+# print(predicted_beat)
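For context on get_beat: librosa.beat.beat_track returns a (tempo, beat_frames) pair, and beat[0][0] indexes into the tempo value, which newer librosa versions return as a one-element array. A minimal equivalent sketch, assuming the bundled 10.mp3:

import librosa

# Hypothetical local file path, for illustration only
y, sr = librosa.load("10.mp3")
tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
print(float(tempo))  # estimated global tempo in BPM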
flask/Emotion_spotting_service.py
ADDED
@@ -0,0 +1,65 @@
+import tensorflow.keras as keras
+import numpy as np
+import librosa
+import random
+#import tensorflow_addons
+SAMPLE_RATE = 22050
+
+class _Emotion_spotting_service():
+    model = None
+    #instance = None
+    mapping = [' amazement', ' solemnity', ' tenderness',
+               ' nostalgia', ' calmness', ' power',
+               ' joyfulness', ' tension', ' sadness']
+
+    def __init__(self, model_path):
+        self.model = keras.models.load_model(model_path)
+    def predict(self, file_path):
+        log_spectrogram = self.preprocess(file_path)
+        X = np.array(log_spectrogram).astype("float32")
+        X = np.expand_dims(X, axis=0)
+        # Do predictions
+        num_predictions = self.model.predict(X)
+        prediction = np.argmax(num_predictions)
+        predicted_keyword = self.mapping[prediction]
+        return predicted_keyword
+
+    # Split audio into 10 second excerpts
+    # Attain log spectrogram with following parameters
+    # sample rate = 22050, n_fft = 2048, hop_length = 512
+    # output, 1024*431
+    def preprocess(self, file_path):
+        signal, sr = librosa.load(file_path, sr=SAMPLE_RATE)
+        signal_normalized = librosa.util.normalize(signal)
+        len_to_check = 10 * 22050
+        # If audio is less than 10 seconds, we pad it with zeroes
+        # If audio is more than 10 seconds, we split into segments and randomly choose one
+        if len(signal_normalized) < len_to_check:
+            num_zeros = len_to_check - len(signal_normalized)
+            signal_normalized = signal_normalized + [0] * num_zeros
+        elif len(signal_normalized) > len_to_check:
+            num_segments = len(signal_normalized)//len_to_check
+            segments = []
+            for i in range(num_segments):
+                start = i * len_to_check
+                end = start + len_to_check
+                if len(signal[start:end]) != len_to_check:
+                    continue
+                else:
+                    segments.append(signal[start:end])
+            signal_normalized = random.choice(segments)
+        stft = librosa.stft(signal_normalized, n_fft=2048, hop_length=512)[:-1]
+        spectrogram = np.abs(stft)
+        log_spectrogram = librosa.amplitude_to_db(spectrogram)
+        return log_spectrogram
+
+# def Emotion_spotting_service():
+#     if _Emotion_spotting_service.instance == None:
+#         _Emotion_spotting_service.instance = _Emotion_spotting_service()
+#         _Emotion_spotting_service.model = keras.models.load_model("ERM.h5")
+#     return _Emotion_spotting_service.instance
+
+# if __name__ == "__main__":
+#     emotion_service = _Emotion_spotting_service("emotion_model.h5")
+#     predicted_word = emotion_service.predict("10.mp3")
+#     print(predicted_word)
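One caveat in preprocess: for clips shorter than 10 seconds, signal_normalized + [0] * num_zeros asks NumPy for element-wise addition of mismatched shapes and raises a broadcasting error rather than appending zeros. A sketch of padding by concatenation instead, under the same 22050 Hz assumption:

import numpy as np

signal = np.zeros(5 * 22050, dtype=np.float32)   # hypothetical 5-second clip
len_to_check = 10 * 22050
num_zeros = len_to_check - len(signal)
# Concatenation appends zeros; ndarray + list attempts element-wise addition
padded = np.concatenate([signal, np.zeros(num_zeros, dtype=signal.dtype)])
assert padded.shape[0] == len_to_check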
flask/Genre_classifier_model.h5
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6572829f9add78efffc0f17a8b0447ce50f84f751b942ba9c7dbf14703603f28
+size 6874520
flask/Genre_spotting_service.py
ADDED
@@ -0,0 +1,73 @@
+import librosa
+import numpy as np
+import tensorflow as tf
+#import argparse
+import json
+import tensorflow.keras as keras
+
+# Input dimension for model = 259*13
+
+mappings = ["blues","classical","country","disco","hip-hop","jazz","metal","pop","reggae","rock"]
+model_path = "Genre_classifier_model.h5"
+class _Genre_spotting_service():
+    instance = None
+    model = None
+    mappings = ["blues", "classical", "country", "disco", "hip-hop", "jazz", "metal", "pop", "reggae", "rock"]
+
+    def __init__(self, model_path):
+        self.model = tf.keras.models.load_model(model_path)
+    def predict(self, file_path):
+        input_data = self.preprocess_audio(file_path)
+        input_data = input_data[np.newaxis, ...]
+        predictions = self.model.predict(input_data)
+        text_predictions = [mappings[np.argmax(predictions, axis=1)[0]]]
+        return text_predictions
+
+    def preprocess_audio(self, file_path, target_frames=259, n_mfcc=13, sr=22050, n_fft=2048, hop_length=512):
+        # Load audio file
+        y, sr = librosa.load(file_path, sr=sr)
+
+        # Compute MFCCs
+        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)
+
+        # Pad or truncate MFCCs to ensure they have target_frames
+        if mfccs.shape[1] < target_frames:
+            pad_width = target_frames - mfccs.shape[1]
+            mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
+        else:
+            mfccs = mfccs[:, :target_frames]
+        return mfccs.T
+
+
+def Genre_spotting_service():
+    if _Genre_spotting_service.instance == None:
+        _Genre_spotting_service.instance = _Genre_spotting_service()
+        _Genre_spotting_service.model = keras.models.load_model(model_path)
+    return _Genre_spotting_service.instance
+
+# if __name__ == "__main__":
+#     genre_service = _Genre_spotting_service("Genre_classifier_model.h5")
+#     predicted_genre = genre_service.predict("10.mp3")
+#     print(predicted_genre)
+
+
+# file_path = "10.mp3"
+# target_frames=259
+# n_mfcc=13
+# sr=22050
+# n_fft=2048
+# hop_length=512
+# # Load audio file
+# y, sr = librosa.load(file_path, sr=sr)
+#
+# # Compute MFCCs
+# mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)
+# print(str(sr))
+#
+# # Pad or truncate MFCCs to ensure they have target_frames
+# if mfccs.shape[1] < target_frames:
+#     pad_width = target_frames - mfccs.shape[1]
+#     mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
+# else:
+#     mfccs = mfccs[:, :target_frames]
+# print(mfccs)
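Note that the factory as committed calls _Genre_spotting_service() with no arguments even though __init__ requires model_path, so invoking it would raise a TypeError; main.py sidesteps this by instantiating the class directly. A sketch of a working factory, assuming the module-level model_path defined above:

def Genre_spotting_service():
    if _Genre_spotting_service.instance is None:
        # Pass the module-level model_path; __init__ loads the model itself
        _Genre_spotting_service.instance = _Genre_spotting_service(model_path)
    return _Genre_spotting_service.instance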
flask/Image_generation_service.py
ADDED
@@ -0,0 +1,20 @@
+from diffusers import StableDiffusionPipeline
+import torch
+
+class _Image_generation_service():
+    instance = None
+    def get_image(self, prompt):
+        pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")
+        pipeline.load_lora_weights("../Weights/pytorch_lora_weights.safetensors", weight_name="pytorch_lora_weights.safetensors")
+        image = pipeline(prompt)
+        return image
+
+def Image_generation_service():
+    if _Image_generation_service.instance == None:
+        _Image_generation_service.instance = _Image_generation_service()
+    return _Image_generation_service.instance
+
+# if __name__ == "__main__":
+#     image_service = Image_generation_service()
+#     gen_image = image_service.get_image("A calm piece of music")
+#     print(gen_image)
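As written, get_image rebuilds the Stable Diffusion pipeline and reloads the LoRA weights on every call, and it requires a CUDA device. A sketch that builds the pipeline once, under the same model and weight paths:

from diffusers import StableDiffusionPipeline
import torch

class _Image_generation_service():
    instance = None
    def __init__(self):
        # Load the pipeline once; reused across get_image calls
        self.pipeline = StableDiffusionPipeline.from_pretrained(
            "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")
        self.pipeline.load_lora_weights("../Weights/pytorch_lora_weights.safetensors",
                                        weight_name="pytorch_lora_weights.safetensors")
    def get_image(self, prompt):
        # The pipeline returns an output object; .images[0] is the PIL image
        return self.pipeline(prompt).images[0]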
flask/__pycache__/Beat_tracking_service.cpython-310.pyc
ADDED
Binary file (946 Bytes).

flask/__pycache__/Beat_tracking_service.cpython-311.pyc
ADDED
Binary file (1.36 kB).

flask/__pycache__/Emotion_spotting_service.cpython-310.pyc
ADDED
Binary file (1.88 kB).

flask/__pycache__/Emotion_spotting_service.cpython-311.pyc
ADDED
Binary file (3.5 kB).

flask/__pycache__/Genre_spotting_service.cpython-310.pyc
ADDED
Binary file (1.91 kB).

flask/__pycache__/Genre_spotting_service.cpython-311.pyc
ADDED
Binary file (2.61 kB).

flask/__pycache__/Image_generation_service.cpython-310.pyc
ADDED
Binary file (1.16 kB).

flask/__pycache__/Image_generation_service.cpython-311.pyc
ADDED
Binary file (1.77 kB).
flask/emotion_model.h5
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f88238ef0a8671880b24d992e435aad8b169717f83231824229c0911253869fd
+size 246095376
main.py
ADDED
@@ -0,0 +1,77 @@
+import streamlit as st
+from flask.Emotion_spotting_service import _Emotion_spotting_service
+from flask.Genre_spotting_service import _Genre_spotting_service
+from flask.Beat_tracking_service import _Beat_tracking_service
+from diffusers import StableDiffusionPipeline
+import torch
+
+emo_list = []
+gen_list = []
+tempo_list = []
+@st.cache_resource
+def load_emo_model():
+    emo_service = _Emotion_spotting_service("flask/emotion_model.h5")
+    return emo_service
+@st.cache_resource
+def load_genre_model():
+    gen_service = _Genre_spotting_service("flask/Genre_classifier_model.h5")
+    return gen_service
+
+@st.cache_resource
+def load_beat_model():
+    beat_service = _Beat_tracking_service()
+    return beat_service
+
+@st.cache_resource
+def load_image_model():
+    pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")
+    pipeline.load_lora_weights("Weights/pytorch_lora_weights.safetensors", weight_name="pytorch_lora_weights.safetensors")
+    return pipeline
+
+
+if 'emotion' not in st.session_state:
+    st.session_state.emotion = None
+
+if 'genre' not in st.session_state:
+    st.session_state.genre = None
+
+if 'beat' not in st.session_state:
+    st.session_state.beat = None
+
+emotion_service = load_emo_model()
+genre_service = load_genre_model()
+beat_service = load_beat_model()
+image_service = load_image_model()
+
+st.title("Music2Image webpage")
+user_input = st.file_uploader("Upload your wav/mp3 files here", type=["wav","mp3"], key="file_uploader")
+st.caption("Generate images from your audio file")
+st.audio(user_input)
+c1,c2,c3 = st.columns([1,1,1])
+with c1:
+    if st.button("Generate emotion"):
+        emotion = emotion_service.predict(user_input)
+        st.session_state.emotion = emotion
+    st.text(st.session_state.emotion)
+with c2:
+    if st.button("Generate genre"):
+        genre = genre_service.predict(user_input)
+        st.session_state.genre = genre
+    st.text(st.session_state.genre)
+
+with c3:
+    if st.button("Generate beat"):
+        beat = beat_service.get_beat(user_input)
+        st.session_state.beat = beat
+    st.text(st.session_state.beat)
+
+if st.session_state.emotion != None and st.session_state.genre != None and st.session_state.beat != None:
+    text_output = None
+    if st.button("Generate text description to be fed into stable diffusion"):
+        st.caption("Text description of your music file")
+        text_output = "This piece of music falls under the " + st.session_state.genre[0] + " genre. It is of tempo " + str(int(st.session_state.beat)) + " and evokes a sense of" + st.session_state.emotion + "."
+        st.text(text_output)
+    if text_output:
+        if st.button("Generate image from text description"):
+            image = image_service(text_output)
+            st.image(image)
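As a Streamlit script, the app is launched with streamlit run main.py. One caveat worth noting: image_service(text_output) calls the diffusers pipeline, which returns an output object rather than an image, so the final display step would typically unwrap it:

result = image_service(text_output)   # diffusers pipeline output object
st.image(result.images[0])            # .images[0] is the generated PIL image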
requirements.txt
ADDED
Binary file (3.83 kB).