import os

import librosa
import streamlit as st
from pydub import AudioSegment
from spleeter.separator import Separator
from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline
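# Assumed runtime environment (not pinned in the source): streamlit,
# transformers, librosa, spleeter and pydub, plus ffmpeg on PATH for mp3
# decoding and the accelerate package, which transformers relies on when
# loading a model with low_cpu_mem_usage=True.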

# Streamlit setup
st.set_page_config(page_title="Sentiment Analysis on Your Cantonese Song")
st.header("Cantonese Song Sentiment Analyzer")
# Optional file upload, currently disabled in favour of a bundled test song:
# input_file = st.file_uploader("upload a song in mp3 format", type="mp3")
# if input_file is not None:
#     st.write("File uploaded successfully!")
# else:
#     st.write("No file uploaded.")
button_click = st.button("Run Analysis", type="primary")
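# Streamlit reruns this whole script on every interaction, so button_click is
# True only on the run triggered by the click; main() is gated on it at the
# bottom of the file.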


# load song (a bundled test file; re-enable the uploader above for user input)
BASE_DIR = os.path.dirname(__file__)
input_file = os.path.join(BASE_DIR, 'test1.mp3')
output_dir = BASE_DIR  # Spleeter writes its stems under <output_dir>/<track_name>/

# preprocess and crop audio file
def audio_preprocess():
    # separate vocals from accompaniment with Spleeter's 2-stem model
    separator = Separator('spleeter:2stems')
    separator.separate_to_file(input_file, output_dir)

    # Spleeter names the output folder after the input file, so the vocal
    # stem ends up at <output_dir>/test1/vocals.wav
    vocals_path = os.path.join(output_dir, 'test1', 'vocals.wav')

    # crop a 50-second window out of the vocal track (times in milliseconds)
    start_time = 60000   # 1:00
    end_time = 110000    # 1:50

    audio = AudioSegment.from_file(vocals_path)
    cropped_audio = audio[start_time:end_time]
    cropped_audio.export(os.path.join(BASE_DIR, 'cropped_vocals.wav'), format='wav')  # save vocal clip
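# For reference, the 'spleeter:2stems' model produces two files per track
# (illustrative paths, assuming input test1.mp3 and output_dir='.'):
#
#   ./test1/vocals.wav          # what this app keeps
#   ./test1/accompaniment.wav   # instrumental, unused here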


# ASR transcription
def asr_model():
    # load the cropped vocal clip at 16 kHz, the sampling rate Whisper expects
    y, sr = librosa.load(os.path.join(BASE_DIR, 'cropped_vocals.wav'), sr=16000)

    # ASR model
    MODEL_NAME = "RexChan/ISOM5240-whisper-small-zhhk_1"
    processor = WhisperProcessor.from_pretrained(MODEL_NAME)
    model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME, low_cpu_mem_usage=True)

    # clear the defaults inherited from the base Whisper config so decoding
    # follows the fine-tuned Cantonese model's own predictions
    model.config.forced_decoder_ids = None
    model.config.suppress_tokens = []
    model.config.use_cache = False

    processed_in = processor(y, sampling_rate=sr, return_tensors="pt")
    gout = model.generate(
        input_features=processed_in.input_features,
        output_scores=True, return_dict_in_generate=True
    )
    transcription = processor.batch_decode(gout.sequences, skip_special_tokens=True)[0]

    # print result
    print(f"Song lyrics = {transcription}")

    return transcription
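# Note: with output_scores=True and return_dict_in_generate=True, generate()
# returns a dict-like output whose .sequences field holds the token ids
# (decoded above) and whose .scores field holds the per-step logits; only
# .sequences is needed for plain transcription.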


# sentiment analysis
def senti_model(transcription):
    pipe = pipeline("text-classification", model="lxyuan/distilbert-base-multilingual-cased-sentiments-student")
    final_result = pipe(transcription)
    print(f"Sentiment Analysis shows that this song is {final_result[0]['label']}. "
          f"Confidence level of this analysis is {final_result[0]['score']*100:.1f}%.")

    return final_result
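# The text-classification pipeline returns one dict per input, e.g.
# [{'label': 'positive', 'score': 0.93}] (illustrative values); this model's
# labels are 'positive', 'neutral' and 'negative'.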


# main
def main():
    audio_preprocess()
    transcription = asr_model()
    final_result = senti_model(transcription)

    # show the results in the app, not just on the console
    st.write(f"Song lyrics: {transcription}")
    st.write(f"Sentiment Analysis shows that this song is {final_result[0]['label']}. "
             f"Confidence level of this analysis is {final_result[0]['score']*100:.1f}%.")

    # Render the player directly: a nested st.button here would trigger a
    # rerun and wipe the analysis before the audio could play.
    st.audio(os.path.join(BASE_DIR, 'cropped_vocals.wav'), format="audio/wav")


if __name__ == '__main__':
    if button_click:
        main()