# NOTE(review): the lines that were here ("Spaces:", "Sleeping", commit hashes,
# a pasted line-number gutter) were Hugging Face file-viewer chrome accidentally
# captured with the source. They were not valid Python and have been commented
# away so the module is importable.
import streamlit as st
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from transformers import pipeline
import librosa
import torch
# from spleeter.separator import Separator
from pydub import AudioSegment
from IPython.display import Audio
import os
import accelerate
# preprocess and crop audio file
def audio_preprocess(file_name='/test1/vocals.wav', start_ms=60000, end_ms=110000):
    """Crop a time window out of an audio file.

    Args:
        file_name: path (or file-like object) readable by pydub.
        start_ms: crop start in milliseconds (default 60 s).
        end_ms: crop end in milliseconds (default 110 s).

    Returns:
        A pydub ``AudioSegment`` covering ``[start_ms, end_ms)``.
    """
    # BUG FIX: the original body called an undefined `Separator` with undefined
    # `input_file`/`output_file` names (the spleeter import at the top of the
    # file is commented out), so every call raised NameError. The vocal/music
    # separation step is left disabled until spleeter is reinstated:
    #   separator = Separator('spleeter:2stems')
    #   separator.separate_to_file(file_name, output_dir)
    audio = AudioSegment.from_file(file_name)
    # pydub indexes audio in milliseconds, so slicing crops the segment.
    processed_audio = audio[start_ms:end_ms]
    # .export('cropped_vocals.wav', format='wav')  # save vocal audio file
    return processed_audio
# ASR transcription
@st.cache_resource
def _load_asr():
    """Load the Cantonese Whisper processor and model once per Streamlit session.

    Caching avoids re-downloading/re-instantiating the model on every rerun,
    which the original code did on each call.
    """
    model_name = "RexChan/ISOM5240-whisper-small-zhhk_1"
    processor = WhisperProcessor.from_pretrained(model_name)
    model = WhisperForConditionalGeneration.from_pretrained(model_name, low_cpu_mem_usage=True)
    # Disable forced/suppressed decoder tokens so generation is unconstrained.
    model.config.forced_decoder_ids = None
    model.config.suppress_tokens = []
    model.config.use_cache = False
    return processor, model


def asr_model(processed_audio):
    """Transcribe an audio file with the fine-tuned Cantonese Whisper model.

    Args:
        processed_audio: path or file-like object accepted by ``librosa.load``
            (e.g. a Streamlit ``UploadedFile``).

    Returns:
        The transcription string for the first (only) sequence in the batch.
    """
    # Whisper expects 16 kHz mono input; librosa resamples on load.
    y, sr = librosa.load(processed_audio, sr=16000)
    processor, model = _load_asr()
    processed_in = processor(y, sampling_rate=sr, return_tensors="pt")
    gout = model.generate(
        input_features=processed_in.input_features,
        output_scores=True, return_dict_in_generate=True
    )
    transcription = processor.batch_decode(gout.sequences, skip_special_tokens=True)[0]
    # print result
    print(f"Song lyrics = {transcription}")
    return transcription
# sentiment analysis
@st.cache_resource
def _load_senti_pipeline():
    """Build the multilingual sentiment pipeline once per Streamlit session."""
    return pipeline("text-classification",
                    model="lxyuan/distilbert-base-multilingual-cased-sentiments-student")


def senti_model(transcription):
    """Classify the sentiment of the transcribed lyrics.

    Args:
        transcription: lyrics text returned by ``asr_model``.

    Returns:
        A human-readable summary string with the predicted label and confidence.
    """
    pipe = _load_senti_pipeline()
    final_result = pipe(transcription)
    top = final_result[0]  # pipeline returns a list of {label, score} dicts
    # Typo fix vs. original: "Confident level" -> "Confidence level".
    display = (f"Sentiment Analysis shows that this song is {top['label']}. "
               f"Confidence level of this analysis is {top['score']*100:.1f}%.")
    print(display)
    return display
    # return final_result
# main
def main(input_file):
    """Run the full pipeline on an uploaded song and render the result.

    Args:
        input_file: the Streamlit ``UploadedFile`` for the mp3 upload.
    """
    # audio_preprocess() is currently skipped (spleeter separation disabled);
    # the raw upload is fed straight to the ASR model.
    # processed_audio = audio_preprocess(input_file)
    processed_audio = input_file
    transcription = asr_model(processed_audio)
    final_result = senti_model(transcription)
    st.write(final_result)
    if st.button("Play Audio"):
        # BUG FIX: the original referenced an undefined `audio_data` dict here,
        # raising NameError as soon as the button was pressed. Play the
        # uploaded mp3 file directly instead.
        st.audio(input_file, format="audio/mp3", start_time=0)
if __name__ == '__main__':
    # streamlit page setup
    st.set_page_config(page_title="Sentiment Analysis on Your Cantonese Song",)
    st.header("Cantonese Song Sentiment Analyzer")
    input_file = st.file_uploader("upload a song in mp3 format", type="mp3")  # upload song
    if input_file is not None:
        st.write("File uploaded successfully!")
        st.write(input_file)
    else:
        st.write("No file uploaded.")
    button_click = st.button("Run Analysis", type="primary")
    # load song
    # input_file = os.path.isfile("test1.mp3")
    # output_file = os.path.isdir("")
    if button_click:
        # BUG FIX: clicking "Run Analysis" with no upload previously passed
        # None into main(), crashing inside librosa.load. Guard and warn.
        if input_file is not None:
            main(input_file=input_file)
        else:
            st.warning("Please upload an mp3 file before running the analysis.")