import os
import subprocess
import json
import wave

import streamlit as st
import librosa
import numpy as np
import soundfile as sf
from vosk import Model, KaldiRecognizer
from transformers import pipeline
from huggingface_hub import snapshot_download
from pydub import AudioSegment
import noisereduce as nr
import plotly.graph_objects as go
import plotly.express as px
# 🎨 Apply custom dark-mode CSS
st.markdown(
    """
    <style>
    .stApp {
        background-color: #121212;
        color: white;
    }
    .title {
        font-size: 32px;
        text-align: center;
        color: #4CAF50;
        font-weight: bold;
    }
    .subheader {
        font-size: 20px;
        font-weight: bold;
        color: #BB86FC;
    }
    .stButton>button {
        background-color: #BB86FC !important;
        color: black !important;
        font-size: 18px !important;
        padding: 10px 24px !important;
        border-radius: 10px !important;
        border: none !important;
    }
    .stAudio {
        width: 100% !important;
    }
    .stMarkdown {
        font-size: 16px;
        color: #E0E0E0;
    }
    .stTextInput>div>div>input {
        background-color: #1E1E1E !important;
        color: white !important;
        border-radius: 10px !important;
    }
    </style>
    """,
    unsafe_allow_html=True,
)
# ✅ Auto-download Vosk model (speech-to-text)
VOSK_MODEL = "vosk-model-small-en-us-0.15"
if not os.path.exists(VOSK_MODEL):
    st.write("📥 Downloading Vosk model...")
    # Assumes a Linux environment (e.g. Hugging Face Spaces) with wget/unzip available
    subprocess.run(
        ["wget", "-O", "vosk.zip", "https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip"],
        check=True,
    )
    subprocess.run(["unzip", "vosk.zip"], check=True)
    subprocess.run(["rm", "vosk.zip"], check=True)

# Load the Vosk model
model = Model(VOSK_MODEL)
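
# Note: Streamlit reruns this whole script on every interaction, so module-level
# model loading repeats each time. A minimal sketch of the usual fix, assuming a
# Streamlit version that ships st.cache_resource (>= 1.18):
#
#     @st.cache_resource
#     def load_vosk_model(path: str) -> Model:
#         return Model(path)
#
#     model = load_vosk_model(VOSK_MODEL)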
# ✅ Auto-download Wav2Vec2 model (emotion detection)
WAV2VEC_MODEL = "facebook/wav2vec2-large-xlsr-53"
if not os.path.exists(WAV2VEC_MODEL):
    st.write(f"📥 Downloading {WAV2VEC_MODEL}...")
    snapshot_download(repo_id=WAV2VEC_MODEL, local_dir=WAV2VEC_MODEL)

# Load the emotion-detection pipeline.
# Caveat: wav2vec2-large-xlsr-53 is a self-supervised base checkpoint with no
# fine-tuned classification head, so the labels it emits here are not trained
# emotion classes.
emotion_model = pipeline("audio-classification", model=WAV2VEC_MODEL)
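
# A hedged alternative: a checkpoint actually fine-tuned for speech emotion
# recognition would return meaningful labels. Assumption: the
# "superb/wav2vec2-base-superb-er" model is still available on the Hugging Face
# Hub and emits labels such as "neu"/"hap"/"ang"/"sad":
#
#     emotion_model = pipeline("audio-classification", model="superb/wav2vec2-base-superb-er")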
# ✅ Streamlit UI
st.markdown("<div class='title'>🎙️ Speech Detection System</div>", unsafe_allow_html=True)
st.markdown("<div class='subheader'>📂 Upload an audio file for speech-to-text, noise filtering, and emotion analysis.</div>", unsafe_allow_html=True)
uploaded_file = st.file_uploader("Upload an MP3/WAV file", type=["mp3", "wav"])
if uploaded_file:
    # Save the upload to disk so downstream libraries can read it by path
    os.makedirs("temp", exist_ok=True)
    file_path = f"temp/{uploaded_file.name}"
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    # Convert MP3 to WAV if needed (Vosk and the wave module expect WAV/PCM)
    if file_path.endswith(".mp3"):
        wav_path = file_path.replace(".mp3", ".wav")
        audio = AudioSegment.from_mp3(file_path)
        audio.export(wav_path, format="wav")
        file_path = wav_path

    # Load audio resampled to 16 kHz mono, the rate the Vosk small model expects
    y, sr = librosa.load(file_path, sr=16000)
    # 🎼 Display waveform using Plotly
    st.markdown("<div class='subheader'>🎼 Interactive Audio Waveform:</div>", unsafe_allow_html=True)
    time_axis = np.linspace(0, len(y) / sr, num=len(y))
    fig_waveform = go.Figure()
    fig_waveform.add_trace(go.Scatter(
        x=time_axis,
        y=y,
        mode='lines',
        line=dict(color='cyan'),
        name="Waveform"
    ))
    fig_waveform.update_layout(
        title="Audio Waveform",
        xaxis_title="Time (seconds)",
        yaxis_title="Amplitude",
        template="plotly_dark"
    )
    st.plotly_chart(fig_waveform)
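
    # For long recordings, plotting one point per sample makes the figure heavy
    # in the browser. A minimal decimation sketch (assumption: a plain stride is
    # acceptable for display purposes):
    #
    #     step = max(1, len(y) // 100_000)
    #     fig_waveform = go.Figure(go.Scatter(x=time_axis[::step], y=y[::step], mode="lines"))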
    # ✅ Noise reduction
    st.markdown("<div class='subheader'>🔇 Applying Noise Reduction...</div>", unsafe_allow_html=True)
    y_denoised = nr.reduce_noise(y=y, sr=sr)
    denoised_path = file_path.replace(".wav", "_denoised.wav")
    sf.write(denoised_path, y_denoised, sr)
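
    # noisereduce works by spectral gating: it estimates a per-frequency noise
    # threshold and attenuates STFT bins that fall below it. For steady
    # background hum, the library's stationary mode is a hedged variant:
    #
    #     y_denoised = nr.reduce_noise(y=y, sr=sr, stationary=True)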
    # ✅ Spectrogram using Plotly
    st.markdown("<div class='subheader'>🎤 Spectrogram (Frequency Analysis):</div>", unsafe_allow_html=True)
    S = librosa.stft(y)
    S_db = librosa.amplitude_to_db(np.abs(S), ref=np.max)
    fig_spectrogram = px.imshow(
        S_db,
        aspect='auto',
        origin='lower',
        labels={"x": "Time (frames)", "y": "Frequency (bins)", "color": "Intensity (dB)"},
        color_continuous_scale="plasma"
    )
    fig_spectrogram.update_layout(
        title="Spectrogram",
        template="plotly_dark"
    )
    st.plotly_chart(fig_spectrogram)
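
    # The axes above are in STFT frames and FFT bins. They can be mapped to
    # seconds and Hz with librosa's helpers (stft defaults: n_fft=2048,
    # hop_length=512) and passed to px.imshow as x=/y=:
    #
    #     times = librosa.frames_to_time(np.arange(S_db.shape[1]), sr=sr)
    #     freqs = librosa.fft_frequencies(sr=sr)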
    # ✅ MFCCs using Plotly
    st.markdown("<div class='subheader'>🎵 MFCC Feature Extraction:</div>", unsafe_allow_html=True)
    # 13 coefficients per frame is the conventional compact summary of the
    # spectral envelope (timbre)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    fig_mfcc = px.imshow(
        mfccs,
        aspect='auto',
        origin='lower',
        labels={"x": "Time (frames)", "y": "MFCC Coefficients", "color": "Magnitude"},
        color_continuous_scale="viridis"
    )
    fig_mfcc.update_layout(
        title="Mel-Frequency Cepstral Coefficients (MFCC)",
        template="plotly_dark"
    )
    st.plotly_chart(fig_mfcc)
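
    # Delta-MFCCs (frame-to-frame derivatives) are a common companion feature;
    # librosa computes them directly from the matrix above:
    #
    #     mfcc_delta = librosa.feature.delta(mfccs)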
    # ✅ Speech-to-text using Vosk
    def transcribe_audio(audio_path):
        wf = wave.open(audio_path, "rb")
        rec = KaldiRecognizer(model, wf.getframerate())
        # Stream the audio in chunks, collecting every finalized segment
        # rather than returning after the first one
        segments = []
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                segments.append(json.loads(rec.Result()).get("text", ""))
        # Flush the recognizer to pick up the trailing audio
        segments.append(json.loads(rec.FinalResult()).get("text", ""))
        wf.close()
        return " ".join(s for s in segments if s)
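
    # Caveat: KaldiRecognizer expects 16-bit mono PCM, while an uploaded WAV may
    # be stereo. A hedged pre-conversion step using the pydub import above:
    #
    #     AudioSegment.from_wav(file_path).set_channels(1).set_frame_rate(16000) \
    #         .export(file_path, format="wav")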
    transcription = transcribe_audio(file_path)
    st.markdown("<div class='subheader'>📝 Transcribed Text:</div>", unsafe_allow_html=True)
    st.markdown(f"<div class='stMarkdown'>{transcription}</div>", unsafe_allow_html=True)
    # ✅ Emotion detection
    st.markdown("<div class='subheader'>😊 Emotion Analysis:</div>", unsafe_allow_html=True)
    emotion_result = emotion_model(file_path)
    # Illustrative mapping from generic pipeline labels to emotion names; a
    # fine-tuned model would define its own label set
    emotion_labels = {
        "LABEL_0": "Neutral",
        "LABEL_1": "Happy",
        "LABEL_2": "Sad",
        "LABEL_3": "Angry",
        "LABEL_4": "Surprised"
    }
    top_emotion = max(emotion_result, key=lambda x: x["score"])
    emotion_name = emotion_labels.get(top_emotion["label"], "Unknown")
    emotion_score = top_emotion["score"]
    st.markdown(
        f"""
        <div style="font-size:24px; color:#4CAF50; font-weight:bold;">
            {emotion_name} ({emotion_score:.2%} confidence)
        </div>
        """,
        unsafe_allow_html=True
    )
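
    # A hedged sketch for showing the full score distribution rather than only
    # the top label (reuses the plotly.express import above):
    #
    #     fig_scores = px.bar(
    #         x=[emotion_labels.get(r["label"], r["label"]) for r in emotion_result],
    #         y=[r["score"] for r in emotion_result],
    #         labels={"x": "Emotion", "y": "Score"},
    #     )
    #     st.plotly_chart(fig_scores)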
    # ✅ Play original & denoised audio
    st.markdown("<div class='subheader'>Original Audio:</div>", unsafe_allow_html=True)
    st.audio(file_path, format="audio/wav")
    st.markdown("<div class='subheader'>Denoised Audio:</div>", unsafe_allow_html=True)
    st.audio(denoised_path, format="audio/wav")