Monke64 committed
Commit 1580527 · 1 Parent(s): 18b4029

First commit
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+flask/emotion_model.h5 filter=lfs diff=lfs merge=lfs -text
Weights/pytorch_lora_weights.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b1b19610541f9a2c6f235a1bac2690d04b98535f9f9f7790e9ad4d0fe8ac89b0
+size 3226184
flask/10.mp3 ADDED
Binary file (961 kB)
flask/Beat_tracking_service.py ADDED
@@ -0,0 +1,21 @@
+import librosa
+import numpy as np
+
+class _Beat_tracking_service():
+    instance = None
+
+    def get_beat(self, file_path):
+        y, sr = librosa.load(file_path)
+        # beat_track returns (tempo, beat_frames); tempo may be a scalar or a 1-element array
+        tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
+        return float(np.atleast_1d(tempo)[0])
+
+def Beat_tracking_service():
+    # Module-level singleton: build the service once and reuse it
+    if _Beat_tracking_service.instance is None:
+        _Beat_tracking_service.instance = _Beat_tracking_service()
+    return _Beat_tracking_service.instance
+
+# beat_tracking_service = Beat_tracking_service()
+# predicted_beat = beat_tracking_service.get_beat("10.mp3")
+# print(predicted_beat)
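A minimal usage sketch for this service (assuming the committed flask/10.mp3 and that the script runs from the repo root, as main.py below does):

from flask.Beat_tracking_service import Beat_tracking_service

beat_service = Beat_tracking_service()
tempo = beat_service.get_beat("flask/10.mp3")  # estimated tempo in BPM
print(tempo)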
flask/Emotion_spotting_service.py ADDED
@@ -0,0 +1,64 @@
+import tensorflow.keras as keras
+import numpy as np
+import librosa
+import random
+
+SAMPLE_RATE = 22050
+
+class _Emotion_spotting_service():
+    model = None
+    # Labels keep the leading space they carry in the training data
+    mapping = [' amazement', ' solemnity', ' tenderness',
+               ' nostalgia', ' calmness', ' power',
+               ' joyfulness', ' tension', ' sadness']
+
+    def __init__(self, model_path):
+        self.model = keras.models.load_model(model_path)
+
+    def predict(self, file_path):
+        log_spectrogram = self.preprocess(file_path)
+        X = np.array(log_spectrogram).astype("float32")
+        X = np.expand_dims(X, axis=0)
+        # Run the model and map the most likely class index back to its label
+        num_predictions = self.model.predict(X)
+        prediction = np.argmax(num_predictions)
+        predicted_keyword = self.mapping[prediction]
+        return predicted_keyword
+
+    # Split audio into 10-second excerpts and attain a log spectrogram with
+    # sample rate = 22050, n_fft = 2048, hop_length = 512
+    # output: 1024*431
+    def preprocess(self, file_path):
+        signal, sr = librosa.load(file_path, sr=SAMPLE_RATE)
+        signal_normalized = librosa.util.normalize(signal)
+        len_to_check = 10 * SAMPLE_RATE
+        # If audio is shorter than 10 seconds, pad it with zeros;
+        # if longer, split it into 10-second segments and randomly choose one
+        if len(signal_normalized) < len_to_check:
+            num_zeros = len_to_check - len(signal_normalized)
+            signal_normalized = np.pad(signal_normalized, (0, num_zeros))
+        elif len(signal_normalized) > len_to_check:
+            num_segments = len(signal_normalized) // len_to_check
+            segments = []
+            for i in range(num_segments):
+                start = i * len_to_check
+                end = start + len_to_check
+                if len(signal_normalized[start:end]) == len_to_check:
+                    segments.append(signal_normalized[start:end])
+            signal_normalized = random.choice(segments)
+        # Drop the last frequency bin so the spectrogram has 1024 rows
+        stft = librosa.stft(signal_normalized, n_fft=2048, hop_length=512)[:-1]
+        spectrogram = np.abs(stft)
+        log_spectrogram = librosa.amplitude_to_db(spectrogram)
+        return log_spectrogram
+
+# def Emotion_spotting_service():
+#     if _Emotion_spotting_service.instance is None:
+#         _Emotion_spotting_service.instance = _Emotion_spotting_service()
+#         _Emotion_spotting_service.model = keras.models.load_model("ERM.h5")
+#     return _Emotion_spotting_service.instance
+
+# if __name__ == "__main__":
+#     emotion_service = _Emotion_spotting_service("emotion_model.h5")
+#     predicted_word = emotion_service.predict("10.mp3")
+#     print(predicted_word)
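As a sanity check on the 1024*431 shape noted in the comments (a sketch, not part of the commit): 10 s at 22050 Hz is 220500 samples, hop_length=512 gives 1 + 220500//512 = 431 frames, and n_fft=2048 gives 1025 frequency bins, trimmed to 1024 by the [:-1] slice:

import numpy as np
import librosa

signal = np.zeros(10 * 22050, dtype=np.float32)  # 10 s of silence
stft = librosa.stft(signal, n_fft=2048, hop_length=512)[:-1]
print(stft.shape)  # (1024, 431)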
flask/Genre_classifier_model.h5 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6572829f9add78efffc0f17a8b0447ce50f84f751b942ba9c7dbf14703603f28
+size 6874520
flask/Genre_spotting_service.py ADDED
@@ -0,0 +1,49 @@
+import librosa
+import numpy as np
+import tensorflow as tf
+
+# Input dimension for model = 259*13
+
+model_path = "Genre_classifier_model.h5"
+
+class _Genre_spotting_service():
+    instance = None
+    model = None
+    mappings = ["blues", "classical", "country", "disco", "hip-hop", "jazz", "metal", "pop", "reggae", "rock"]
+
+    def __init__(self, model_path):
+        self.model = tf.keras.models.load_model(model_path)
+
+    def predict(self, file_path):
+        input_data = self.preprocess_audio(file_path)
+        input_data = input_data[np.newaxis, ...]
+        predictions = self.model.predict(input_data)
+        # Map the highest-probability class index to its genre label
+        text_predictions = [self.mappings[np.argmax(predictions, axis=1)[0]]]
+        return text_predictions
+
+    def preprocess_audio(self, file_path, target_frames=259, n_mfcc=13, sr=22050, n_fft=2048, hop_length=512):
+        # Load audio file
+        y, sr = librosa.load(file_path, sr=sr)
+
+        # Compute MFCCs
+        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)
+
+        # Pad or truncate MFCCs to ensure they have target_frames
+        if mfccs.shape[1] < target_frames:
+            pad_width = target_frames - mfccs.shape[1]
+            mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
+        else:
+            mfccs = mfccs[:, :target_frames]
+        return mfccs.T
+
+def Genre_spotting_service():
+    # Module-level singleton: the constructor already loads the model
+    if _Genre_spotting_service.instance is None:
+        _Genre_spotting_service.instance = _Genre_spotting_service(model_path)
+    return _Genre_spotting_service.instance
+
+# if __name__ == "__main__":
+#     genre_service = _Genre_spotting_service("Genre_classifier_model.h5")
+#     predicted_genre = genre_service.predict("10.mp3")
+#     print(predicted_genre)
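A usage sketch for the factory (assuming the working directory is flask/, so the relative model and audio paths resolve, as in the commented main block above):

from Genre_spotting_service import Genre_spotting_service

genre_service = Genre_spotting_service()
print(genre_service.predict("10.mp3"))  # e.g. ['jazz'] -- output depends on the clip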
flask/Image_generation_service.py ADDED
@@ -0,0 +1,23 @@
+from diffusers import StableDiffusionPipeline
+import torch
+
+class _Image_generation_service():
+    instance = None
+
+    def get_image(self, prompt):
+        # Building the pipeline on every call is slow but mirrors the original flow
+        pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")
+        pipeline.load_lora_weights("../Weights", weight_name="pytorch_lora_weights.safetensors")
+        # The pipeline call returns an output object; the generated PIL image is in .images
+        image = pipeline(prompt).images[0]
+        return image
+
+def Image_generation_service():
+    if _Image_generation_service.instance is None:
+        _Image_generation_service.instance = _Image_generation_service()
+    return _Image_generation_service.instance
+
+# if __name__ == "__main__":
+#     image_service = Image_generation_service()
+#     gen_image = image_service.get_image("A calm piece of music")
+#     print(gen_image)
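For reference, a standalone sketch of the same diffusers flow (assumes a CUDA GPU and the committed LoRA weights; the pipeline call returns an output object whose .images list holds PIL images):

import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")
pipe.load_lora_weights("Weights", weight_name="pytorch_lora_weights.safetensors")
image = pipe("A calm piece of music").images[0]  # PIL.Image.Image
image.save("generated.png")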
flask/__pycache__/Beat_tracking_service.cpython-310.pyc ADDED
Binary file (946 Bytes)
flask/__pycache__/Beat_tracking_service.cpython-311.pyc ADDED
Binary file (1.36 kB)
flask/__pycache__/Emotion_spotting_service.cpython-310.pyc ADDED
Binary file (1.88 kB)
flask/__pycache__/Emotion_spotting_service.cpython-311.pyc ADDED
Binary file (3.5 kB)
flask/__pycache__/Genre_spotting_service.cpython-310.pyc ADDED
Binary file (1.91 kB)
flask/__pycache__/Genre_spotting_service.cpython-311.pyc ADDED
Binary file (2.61 kB)
flask/__pycache__/Image_generation_service.cpython-310.pyc ADDED
Binary file (1.16 kB)
flask/__pycache__/Image_generation_service.cpython-311.pyc ADDED
Binary file (1.77 kB)
flask/emotion_model.h5 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f88238ef0a8671880b24d992e435aad8b169717f83231824229c0911253869fd
+size 246095376
main.py ADDED
@@ -0,0 +1,72 @@
+import streamlit as st
+from flask.Emotion_spotting_service import _Emotion_spotting_service
+from flask.Genre_spotting_service import _Genre_spotting_service
+from flask.Beat_tracking_service import _Beat_tracking_service
+from diffusers import StableDiffusionPipeline
+import torch
+
+@st.cache_resource
+def load_emo_model():
+    return _Emotion_spotting_service("flask/emotion_model.h5")
+
+@st.cache_resource
+def load_genre_model():
+    return _Genre_spotting_service("flask/Genre_classifier_model.h5")
+
+@st.cache_resource
+def load_beat_model():
+    return _Beat_tracking_service()
+
+@st.cache_resource
+def load_image_model():
+    pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")
+    pipeline.load_lora_weights("Weights", weight_name="pytorch_lora_weights.safetensors")
+    return pipeline
+
+if 'emotion' not in st.session_state:
+    st.session_state.emotion = None
+if 'genre' not in st.session_state:
+    st.session_state.genre = None
+if 'beat' not in st.session_state:
+    st.session_state.beat = None
+if 'text_output' not in st.session_state:
+    st.session_state.text_output = None
+
+emotion_service = load_emo_model()
+genre_service = load_genre_model()
+beat_service = load_beat_model()
+image_service = load_image_model()
+
+st.title("Music2Image webpage")
+st.caption("Generate images from your audio file")
+user_input = st.file_uploader("Upload your wav/mp3 files here", type=["wav", "mp3"], key="file_uploader")
+if user_input is not None:
+    st.audio(user_input)
+c1, c2, c3 = st.columns([1, 1, 1])
+with c1:
+    if st.button("Generate emotion"):
+        st.session_state.emotion = emotion_service.predict(user_input)
+        st.text(st.session_state.emotion)
+with c2:
+    if st.button("Generate genre"):
+        st.session_state.genre = genre_service.predict(user_input)
+        st.text(st.session_state.genre)
+with c3:
+    if st.button("Generate beat"):
+        st.session_state.beat = beat_service.get_beat(user_input)
+        st.text(st.session_state.beat)
+
+if st.session_state.emotion is not None and st.session_state.genre is not None and st.session_state.beat is not None:
+    if st.button("Generate text description to be fed into stable diffusion"):
+        st.caption("Text description of your music file")
+        # The emotion labels carry a leading space, so none is added after "of";
+        # keep the description in session_state so it survives Streamlit reruns
+        st.session_state.text_output = ("This piece of music falls under the " + st.session_state.genre[0]
+                                        + " genre. It is of tempo " + str(int(st.session_state.beat))
+                                        + " and evokes a sense of" + st.session_state.emotion + ".")
+        st.text(st.session_state.text_output)
+    if st.session_state.text_output:
+        if st.button("Generate image from text description"):
+            # The pipeline returns an output object; the image itself is in .images
+            image = image_service(st.session_state.text_output).images[0]
+            st.image(image)
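With the services above in place, the app is started from the repo root with streamlit run main.py (assuming the packages in requirements.txt are installed and a CUDA GPU is available for the image model).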
requirements.txt ADDED
Binary file (3.83 kB)