kkngan committed on
Commit
f597603
·
verified ·
1 Parent(s): 9854ac0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -81
app.py CHANGED
@@ -3,120 +3,103 @@ from transformers import WhisperForConditionalGeneration, WhisperProcessor
3
  from transformers import pipeline
4
  import librosa
5
  import torch
6
- from spleeter.separator import Separator
7
  from pydub import AudioSegment
8
  from IPython.display import Audio
9
  import os
10
  import accelerate
11
 
12
 
13
- # steamlit setup
14
- st.set_page_config(page_title="Sentiment Analysis on Your Cantonese Song",)
15
- st.header("Cantonese Song Sentiment Analyzer")
16
- input_file = st.file_uploader("upload a song in mp3 format", type="mp3") # upload song
17
- if input_file is not None:
18
- st.write("File uploaded successfully!")
19
- st.write(input_file)
20
- else:
21
- st.write("No file uploaded.")
22
- button_click = st.button("Run Analysis", type="primary")
23
-
24
-
25
- # load song
26
- #input_file = os.path.isfile("test1.mp3")
27
- output_file = os.path.isdir("")
28
-
29
 
30
  # preprocess and crop audio file
31
  def audio_preprocess(file_name = '/test1/vocals.wav'):
32
- # separate music and vocal
33
- separator = Separator('spleeter:2stems')
34
- separator.separate_to_file(input_file, output_file)
35
-
36
-
37
- # Crop the audio
38
- start_time = 60000 # e.g. 30 seconds, 30000
39
- end_time = 110000 # e.g. 40 seconds, 40000
40
-
41
 
 
 
 
42
 
43
 
44
- audio = AudioSegment.from_file(file_name)
45
- cropped_audio = audio[start_time:end_time]
46
- processed_audio = cropped_audio
47
- # .export('cropped_vocals.wav', format='wav') # save vocal audio file
48
- return processed_audio
49
-
50
 
51
  # ASR transcription
52
  def asr_model(processed_audio):
53
- # load audio file
54
- y, sr = librosa.load(processed_audio, sr=16000)
55
-
56
-
57
- # ASR model
58
- MODEL_NAME = "RexChan/ISOM5240-whisper-small-zhhk_1"
59
- processor = WhisperProcessor.from_pretrained(MODEL_NAME)
60
- model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME, low_cpu_mem_usage=True)
61
-
62
-
63
- model.config.forced_decoder_ids = None
64
- model.config.suppress_tokens = []
65
- model.config.use_cache = False
66
 
 
 
 
 
67
 
68
- processed_in = processor(y, sampling_rate=sr, return_tensors="pt")
69
- gout = model.generate(
70
- input_features=processed_in.input_features,
71
- output_scores=True, return_dict_in_generate=True
72
- )
73
- transcription = processor.batch_decode(gout.sequences, skip_special_tokens=True)[0]
74
 
 
 
 
 
 
 
75
 
76
- # print result
77
- print(f"Song lyrics = {transcription}")
78
-
79
-
80
- return transcription
81
-
82
 
 
83
 
84
 
85
  # sentiment analysis
86
  def senti_model(transcription):
87
 
 
 
 
 
 
88
 
89
- pipe = pipeline("text-classification", model="lxyuan/distilbert-base-multilingual-cased-sentiments-student")
90
- final_result = pipe(transcription)
91
- display = f"Sentiment Analysis shows that this song is {final_result[0]['label']}. Confident level of this analysis is {final_result[0]['score']*100:.1f}%."
92
- print(display)
93
- return display
94
-
95
-
96
- # return final_result
97
-
98
-
99
 
100
 
101
  # main
102
  def main():
103
 
 
 
104
 
105
- processed_audio = audio_preprocess(input_file)
106
- transcription = asr_model(processed_audio)
107
- final_result = senti_model(transcription)
108
- st.write(final_result)
109
-
110
-
111
- if st.button("Play Audio"):
112
- st.audio(audio_data['audio'],
113
- format="audio/wav",
114
- start_time=0,
115
- sample_rate = audio_data['sampling_rate'])
116
-
117
 
 
 
 
 
 
118
 
119
 
120
  if __name__ == '__main__':
121
- if button_click:
122
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  from transformers import pipeline
4
  import librosa
5
  import torch
6
+ # from spleeter.separator import Separator
7
  from pydub import AudioSegment
8
  from IPython.display import Audio
9
  import os
10
  import accelerate
11
 
12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  # preprocess and crop audio file
15
# preprocess and crop audio file
def audio_preprocess(file_name='/test1/vocals.wav', start_time=60000, end_time=110000):
    """Load an audio file and crop it to the analysis window.

    Parameters
    ----------
    file_name : str or file-like
        Path (or file object) readable by pydub's ``AudioSegment.from_file``.
    start_time : int
        Crop start in milliseconds (default 60 s into the song).
    end_time : int
        Crop end in milliseconds (default 110 s into the song).

    Returns
    -------
    pydub.AudioSegment
        The cropped segment; callers may ``.export(...)`` it if a file on
        disk is needed.
    """
    # NOTE(review): the original body first ran spleeter's
    # Separator('spleeter:2stems').separate_to_file(input_file, output_file)
    # to split vocals from the backing track, but the
    # `from spleeter.separator import Separator` import is commented out at
    # the top of the file, so that path raised NameError as soon as this
    # function was called (it also read the globals input_file/output_file).
    # The separation step is dropped here; re-enable the import before
    # restoring it.

    audio = AudioSegment.from_file(file_name)
    cropped_audio = audio[start_time:end_time]  # pydub slices are in milliseconds
    # cropped_audio.export('cropped_vocals.wav', format='wav')  # save vocal audio file
    return cropped_audio
 
30
 
31
  # ASR transcription
32
# ASR transcription
def asr_model(processed_audio):
    """Transcribe Cantonese vocals to text with a fine-tuned Whisper model.

    ``processed_audio`` is a path or file-like object accepted by
    ``librosa.load``. Returns the decoded transcription string (also printed
    to stdout for debugging).
    """
    # Resample to 16 kHz — the rate Whisper models expect.
    waveform, sample_rate = librosa.load(processed_audio, sr=16000)

    model_id = "RexChan/ISOM5240-whisper-small-zhhk_1"
    feature_processor = WhisperProcessor.from_pretrained(model_id)
    whisper = WhisperForConditionalGeneration.from_pretrained(model_id, low_cpu_mem_usage=True)

    # Generate unconstrained: no forced decoder ids, no suppressed tokens,
    # and no KV cache.
    whisper.config.forced_decoder_ids = None
    whisper.config.suppress_tokens = []
    whisper.config.use_cache = False

    features = feature_processor(waveform, sampling_rate=sample_rate, return_tensors="pt")
    generated = whisper.generate(
        input_features=features.input_features,
        output_scores=True,
        return_dict_in_generate=True,
    )
    transcription = feature_processor.batch_decode(generated.sequences, skip_special_tokens=True)[0]

    # print result
    print(f"Song lyrics = {transcription}")

    return transcription
56
 
57
 
58
  # sentiment analysis
59
# sentiment analysis
def senti_model(transcription):
    """Classify the sentiment of the transcribed lyrics.

    Runs a multilingual DistilBERT sentiment pipeline over ``transcription``
    and returns a human-readable summary string (also printed to stdout).
    """
    classifier = pipeline(
        "text-classification",
        model="lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    )
    top = classifier(transcription)[0]
    display = (
        f"Sentiment Analysis shows that this song is {top['label']}. "
        f"Confident level of this analysis is {top['score']*100:.1f}%."
    )
    print(display)
    return display
 
 
 
 
 
 
 
 
 
68
 
69
 
70
  # main
71
# main
def main():
    """Run the full pipeline on the uploaded song and display the result.

    Reads the module-level ``input_file`` (Streamlit upload), transcribes it
    with ``asr_model``, classifies sentiment with ``senti_model``, and writes
    the summary to the page.
    """
    # Vocal separation is disabled (the spleeter import is commented out at
    # the top of the file), so the raw upload is fed straight to the ASR model.
    # processed_audio = audio_preprocess(input_file)
    processed_audio = input_file

    transcription = asr_model(processed_audio)
    final_result = senti_model(transcription)
    st.write(final_result)

    # Bug fix: the original referenced an undefined `audio_data` dict here
    # (st.audio(audio_data['audio'], ...)), which raised NameError as soon as
    # "Play Audio" was pressed. Play the uploaded mp3 instead.
    # NOTE(review): a st.button nested inside code that only runs after the
    # "Run Analysis" click will disappear on Streamlit's rerun when pressed —
    # confirm the intended UX.
    if st.button("Play Audio"):
        st.audio(input_file, format="audio/mpeg")
85
 
86
 
87
if __name__ == '__main__':

    # streamlit setup: page config, header, and the song uploader
    st.set_page_config(page_title="Sentiment Analysis on Your Cantonese Song",)
    st.header("Cantonese Song Sentiment Analyzer")
    input_file = st.file_uploader("upload a song in mp3 format", type="mp3")  # upload song
    if input_file is not None:
        st.write("File uploaded successfully!")
        st.write(input_file)
    else:
        st.write("No file uploaded.")
    button_click = st.button("Run Analysis", type="primary")

    # load song
    # input_file = os.path.isfile("test1.mp3")
    # Bug fix: the original assigned `output_file = os.path.isdir("")`, which
    # is always the boolean False — os.path.isdir is a predicate, not a path.
    # Use the current directory so audio_preprocess (if its separation step
    # is ever re-enabled) receives a real output directory.
    output_file = "."

    # Only run the pipeline once the user explicitly clicks "Run Analysis".
    if button_click:
        main()