File size: 3,235 Bytes
9854ac0
 
 
 
 
651716c
9854ac0
 
 
 
 
 
 
651716c
 
 
9854ac0
 
 
 
 
 
 
 
 
 
 
 
651716c
 
 
9854ac0
 
 
 
 
 
651716c
 
9854ac0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
651716c
9854ac0
 
28356eb
 
 
 
9854ac0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28356eb
 
 
 
 
651716c
28356eb
 
 
 
 
 
 
 
 
 
651716c
28356eb
 
9854ac0
651716c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import streamlit as st
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from transformers import pipeline
import librosa
import torch
# from spleeter.separator import Separator
from pydub import AudioSegment
from IPython.display import Audio
import os
import accelerate






# preprocess and crop audio file
def audio_preprocess(file_name='/test1/vocals.wav', input_file=None,
                     output_file=None, start_time=60000, end_time=110000):
    """Optionally separate vocals from a song, then crop the audio.

    The original implementation referenced ``Separator``, ``input_file`` and
    ``output_file`` without defining them, so every call raised ``NameError``.
    Separation is now opt-in through keyword arguments; calling the function
    with only ``file_name`` simply crops that file.

    Args:
        file_name: Path (or file-like object) of the audio to crop.
        input_file: Optional source song handed to Spleeter's
            ``separate_to_file``. Must be given together with ``output_file``.
        output_file: Optional destination handed to Spleeter's
            ``separate_to_file``. Must be given together with ``input_file``.
        start_time: Start of the crop window in milliseconds
            (default 60000, i.e. 60 seconds).
        end_time: End of the crop window in milliseconds
            (default 110000, i.e. 110 seconds).

    Returns:
        The cropped ``pydub.AudioSegment``.

    Raises:
        ValueError: If only one of ``input_file`` / ``output_file`` is given.
    """
    if (input_file is None) != (output_file is None):
        raise ValueError(
            "input_file and output_file must be provided together"
        )

    if input_file is not None:
        # Imported lazily: the module-level spleeter import is commented out,
        # so the dependency is only required when separation is requested.
        from spleeter.separator import Separator

        # Separate music and vocal ('2stems' model).
        separator = Separator('spleeter:2stems')
        separator.separate_to_file(input_file, output_file)

    # Slice the segment with the millisecond offsets to crop it.
    audio = AudioSegment.from_file(file_name)
    return audio[start_time:end_time]




# ASR transcription
def asr_model(processed_audio):
    """Transcribe an audio input with the fine-tuned Whisper checkpoint.

    Args:
        processed_audio: Audio source accepted by ``librosa.load``.

    Returns:
        The decoded transcription of the first generated sequence.
    """
    checkpoint = "RexChan/ISOM5240-whisper-small-zhhk_1"

    # Decode the audio, resampled to 16 kHz.
    waveform, sample_rate = librosa.load(processed_audio, sr=16000)

    # Load the processor and model for the checkpoint.
    whisper_processor = WhisperProcessor.from_pretrained(checkpoint)
    whisper = WhisperForConditionalGeneration.from_pretrained(
        checkpoint,
        low_cpu_mem_usage=True,
    )

    # Clear the forced decoder ids and suppressed tokens, disable the cache.
    whisper.config.forced_decoder_ids = None
    whisper.config.suppress_tokens = []
    whisper.config.use_cache = False

    features = whisper_processor(
        waveform,
        sampling_rate=sample_rate,
        return_tensors="pt",
    )
    generated = whisper.generate(
        input_features=features.input_features,
        output_scores=True,
        return_dict_in_generate=True,
    )
    decoded = whisper_processor.batch_decode(
        generated.sequences,
        skip_special_tokens=True,
    )
    transcription = decoded[0]

    # Echo the result to stdout.
    print(f"Song lyrics = {transcription}")
    return transcription




# sentiment analysis
def senti_model(transcription):
    """Classify the sentiment of a transcription and format a summary.

    Args:
        transcription: Text passed to the text-classification pipeline.

    Returns:
        A sentence reporting the label and score (as a percentage) of the
        first pipeline result.
    """
    classifier = pipeline(
        "text-classification",
        model="lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    )
    top = classifier(transcription)[0]
    label = top['label']
    percent = top['score'] * 100

    display = f"Sentiment Analysis shows that this song is {label}. Confident level of this analysis is {percent:.1f}%."
    print(display)
    return display


   # return final_result




# main
def main(input_file):
    """Transcribe the given song, show its sentiment, and offer playback.

    Args:
        input_file: The audio to analyse, as supplied by the caller (the
            script entry point passes the ``st.file_uploader`` result).
            ``None`` is reported to the user instead of being processed.
    """
    if input_file is None:
        st.warning("Please upload an mp3 file before running the analysis.")
        return

    # NOTE: vocal separation / cropping (audio_preprocess) is currently
    # bypassed; the input is transcribed as-is.
    transcription = asr_model(input_file)
    final_result = senti_model(transcription)
    st.write(final_result)

    # The previous "Play Audio" button referenced an undefined `audio_data`
    # and, being nested under the "Run Analysis" click, could never fire:
    # pressing it reruns the script with the outer button reset. The audio
    # widget has its own play control, so render it directly.
    if hasattr(input_file, "seek"):
        # Rewind in case the transcription step consumed the stream.
        input_file.seek(0)
    st.audio(input_file, format="audio/mp3")




if __name__ == '__main__':
    # Streamlit page setup.
    st.set_page_config(page_title="Sentiment Analysis on Your Cantonese Song",)
    st.header("Cantonese Song Sentiment Analyzer")

    # Song upload; the widget returns None until a file is provided.
    input_file = st.file_uploader("upload a song in mp3 format", type="mp3")
    if input_file is not None:
        st.write("File uploaded successfully!")
        st.write(input_file)
    else:
        st.write("No file uploaded.")
    button_click = st.button("Run Analysis", type="primary")

    if button_click:
        if input_file is None:
            # Without this guard the analysis would be started with None
            # and fail while trying to load the audio.
            st.error("Please upload an mp3 file before running the analysis.")
        else:
            main(input_file=input_file)