import os
import subprocess
import json
import wave

import streamlit as st
import librosa
import numpy as np
import soundfile as sf
from vosk import Model, KaldiRecognizer
from transformers import pipeline
from huggingface_hub import snapshot_download
from pydub import AudioSegment
import noisereduce as nr
import plotly.graph_objects as go
import plotly.express as px
# 🎨 Apply custom dark-mode CSS
st.markdown(
    """
    <style>
    .stApp {
        background-color: #121212;
        color: white;
    }
    .title {
        font-size: 32px;
        text-align: center;
        color: #4CAF50;
        font-weight: bold;
    }
    .subheader {
        font-size: 20px;
        font-weight: bold;
        color: #BB86FC;
    }
    .stButton>button {
        background-color: #BB86FC !important;
        color: black !important;
        font-size: 18px !important;
        padding: 10px 24px !important;
        border-radius: 10px !important;
        border: none !important;
    }
    .stAudio {
        width: 100% !important;
    }
    .stMarkdown {
        font-size: 16px;
        color: #E0E0E0;
    }
    .stTextInput>div>div>input {
        background-color: #1E1E1E !important;
        color: white !important;
        border-radius: 10px !important;
    }
    </style>
    """,
    unsafe_allow_html=True,
)
# ✅ Auto-download Vosk model (speech-to-text)
VOSK_MODEL = "vosk-model-small-en-us-0.15"
if not os.path.exists(VOSK_MODEL):
    st.write("📥 Downloading Vosk model...")
    # Assumes a Linux environment (e.g. Hugging Face Spaces) with wget/unzip available
    subprocess.run(
        ["wget", "-O", "vosk.zip", "https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip"],
        check=True,
    )
    subprocess.run(["unzip", "vosk.zip"], check=True)
    subprocess.run(["rm", "vosk.zip"], check=True)

# Load the Vosk model
model = Model(VOSK_MODEL)
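
# Note: Streamlit reruns this whole script on every interaction, so module-level
# model loading repeats each time. A minimal sketch of the usual fix, assuming a
# Streamlit version that ships st.cache_resource (>= 1.18):
#
#     @st.cache_resource
#     def load_vosk_model(path: str) -> Model:
#         return Model(path)
#
#     model = load_vosk_model(VOSK_MODEL)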
# ✅ Auto-download Wav2Vec2 model (emotion detection)
WAV2VEC_MODEL = "facebook/wav2vec2-large-xlsr-53"
if not os.path.exists(WAV2VEC_MODEL):
    st.write(f"📥 Downloading {WAV2VEC_MODEL}...")
    snapshot_download(repo_id=WAV2VEC_MODEL, local_dir=WAV2VEC_MODEL)

# Load the emotion-detection pipeline.
# Caveat: wav2vec2-large-xlsr-53 is a self-supervised base checkpoint with no
# fine-tuned classification head, so the labels it emits here are not trained
# emotion classes.
emotion_model = pipeline("audio-classification", model=WAV2VEC_MODEL)
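
# A hedged alternative: a checkpoint actually fine-tuned for speech emotion
# recognition would return meaningful labels. Assumption: the
# "superb/wav2vec2-base-superb-er" model is still available on the Hugging Face
# Hub and emits labels such as "neu"/"hap"/"ang"/"sad":
#
#     emotion_model = pipeline("audio-classification", model="superb/wav2vec2-base-superb-er")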
# ✅ Streamlit UI
st.markdown("<div class='title'>🎙️ Speech Detection System</div>", unsafe_allow_html=True)
st.markdown("<div class='subheader'>📂 Upload an audio file for speech-to-text, noise filtering, and emotion analysis.</div>", unsafe_allow_html=True)
uploaded_file = st.file_uploader("Upload an MP3/WAV file", type=["mp3", "wav"])
if uploaded_file:
    # Save the upload to disk so downstream libraries can read it by path
    os.makedirs("temp", exist_ok=True)
    file_path = f"temp/{uploaded_file.name}"
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    # Convert MP3 to WAV if needed (Vosk and the wave module expect WAV/PCM)
    if file_path.endswith(".mp3"):
        wav_path = file_path.replace(".mp3", ".wav")
        audio = AudioSegment.from_mp3(file_path)
        audio.export(wav_path, format="wav")
        file_path = wav_path

    # Load audio resampled to 16 kHz mono, the rate the Vosk small model expects
    y, sr = librosa.load(file_path, sr=16000)
    # 🎼 Display waveform using Plotly
    st.markdown("<div class='subheader'>🎼 Interactive Audio Waveform:</div>", unsafe_allow_html=True)
    time_axis = np.linspace(0, len(y) / sr, num=len(y))
    fig_waveform = go.Figure()
    fig_waveform.add_trace(go.Scatter(
        x=time_axis,
        y=y,
        mode='lines',
        line=dict(color='cyan'),
        name="Waveform"
    ))
    fig_waveform.update_layout(
        title="Audio Waveform",
        xaxis_title="Time (seconds)",
        yaxis_title="Amplitude",
        template="plotly_dark"
    )
    st.plotly_chart(fig_waveform)
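
    # For long recordings, plotting one point per sample makes the figure heavy
    # in the browser. A minimal decimation sketch (assumption: a plain stride is
    # acceptable for display purposes):
    #
    #     step = max(1, len(y) // 100_000)
    #     fig_waveform = go.Figure(go.Scatter(x=time_axis[::step], y=y[::step], mode="lines"))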
    # ✅ Noise reduction
    st.markdown("<div class='subheader'>🔇 Applying Noise Reduction...</div>", unsafe_allow_html=True)
    y_denoised = nr.reduce_noise(y=y, sr=sr)
    denoised_path = file_path.replace(".wav", "_denoised.wav")
    sf.write(denoised_path, y_denoised, sr)
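
    # noisereduce works by spectral gating: it estimates a per-frequency noise
    # threshold and attenuates STFT bins that fall below it. For steady
    # background hum, the library's stationary mode is a hedged variant:
    #
    #     y_denoised = nr.reduce_noise(y=y, sr=sr, stationary=True)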
    # ✅ Spectrogram using Plotly
    st.markdown("<div class='subheader'>🎤 Spectrogram (Frequency Analysis):</div>", unsafe_allow_html=True)
    S = librosa.stft(y)
    S_db = librosa.amplitude_to_db(np.abs(S), ref=np.max)
    fig_spectrogram = px.imshow(
        S_db,
        aspect='auto',
        origin='lower',
        labels={"x": "Time (frames)", "y": "Frequency (bins)", "color": "Intensity (dB)"},
        color_continuous_scale="plasma"
    )
    fig_spectrogram.update_layout(
        title="Spectrogram",
        template="plotly_dark"
    )
    st.plotly_chart(fig_spectrogram)
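
    # The axes above are in STFT frames and FFT bins. They can be mapped to
    # seconds and Hz with librosa's helpers (stft defaults: n_fft=2048,
    # hop_length=512) and passed to px.imshow as x=/y=:
    #
    #     times = librosa.frames_to_time(np.arange(S_db.shape[1]), sr=sr)
    #     freqs = librosa.fft_frequencies(sr=sr)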
    # ✅ MFCCs using Plotly
    st.markdown("<div class='subheader'>🎵 MFCC Feature Extraction:</div>", unsafe_allow_html=True)
    # 13 coefficients per frame is the conventional compact summary of the
    # spectral envelope (timbre)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    fig_mfcc = px.imshow(
        mfccs,
        aspect='auto',
        origin='lower',
        labels={"x": "Time (frames)", "y": "MFCC Coefficients", "color": "Magnitude"},
        color_continuous_scale="viridis"
    )
    fig_mfcc.update_layout(
        title="Mel-Frequency Cepstral Coefficients (MFCC)",
        template="plotly_dark"
    )
    st.plotly_chart(fig_mfcc)
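
    # Delta-MFCCs (frame-to-frame derivatives) are a common companion feature;
    # librosa computes them directly from the matrix above:
    #
    #     mfcc_delta = librosa.feature.delta(mfccs)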
    # ✅ Speech-to-text using Vosk
    def transcribe_audio(audio_path):
        wf = wave.open(audio_path, "rb")
        rec = KaldiRecognizer(model, wf.getframerate())
        # Stream the audio in chunks, collecting every finalized segment
        # rather than returning after the first one
        segments = []
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                segments.append(json.loads(rec.Result()).get("text", ""))
        # Flush the recognizer to pick up the trailing audio
        segments.append(json.loads(rec.FinalResult()).get("text", ""))
        wf.close()
        return " ".join(s for s in segments if s)
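
    # Caveat: KaldiRecognizer expects 16-bit mono PCM, while an uploaded WAV may
    # be stereo. A hedged pre-conversion step using the pydub import above:
    #
    #     AudioSegment.from_wav(file_path).set_channels(1).set_frame_rate(16000) \
    #         .export(file_path, format="wav")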
    transcription = transcribe_audio(file_path)
    st.markdown("<div class='subheader'>📝 Transcribed Text:</div>", unsafe_allow_html=True)
    st.markdown(f"<div class='stMarkdown'>{transcription}</div>", unsafe_allow_html=True)
    # ✅ Emotion detection
    st.markdown("<div class='subheader'>😊 Emotion Analysis:</div>", unsafe_allow_html=True)
    emotion_result = emotion_model(file_path)
    # Illustrative mapping from generic pipeline labels to emotion names; a
    # fine-tuned model would define its own label set
    emotion_labels = {
        "LABEL_0": "Neutral",
        "LABEL_1": "Happy",
        "LABEL_2": "Sad",
        "LABEL_3": "Angry",
        "LABEL_4": "Surprised"
    }
    top_emotion = max(emotion_result, key=lambda x: x["score"])
    emotion_name = emotion_labels.get(top_emotion["label"], "Unknown")
    emotion_score = top_emotion["score"]
    st.markdown(
        f"""
        <div style="font-size:24px; color:#4CAF50; font-weight:bold;">
            {emotion_name} ({emotion_score:.2%} confidence)
        </div>
        """,
        unsafe_allow_html=True
    )
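
    # A hedged sketch for showing the full score distribution rather than only
    # the top label (reuses the plotly.express import above):
    #
    #     fig_scores = px.bar(
    #         x=[emotion_labels.get(r["label"], r["label"]) for r in emotion_result],
    #         y=[r["score"] for r in emotion_result],
    #         labels={"x": "Emotion", "y": "Score"},
    #     )
    #     st.plotly_chart(fig_scores)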
    # ✅ Play original & denoised audio
    st.markdown("<div class='subheader'>Original Audio:</div>", unsafe_allow_html=True)
    st.audio(file_path, format="audio/wav")
    st.markdown("<div class='subheader'>Denoised Audio:</div>", unsafe_allow_html=True)
    st.audio(denoised_path, format="audio/wav")