import os
import json
import wave
import subprocess

import streamlit as st
import numpy as np
import matplotlib.pyplot as plt
import soundfile as sf
import noisereduce as nr
from pydub import AudioSegment
from vosk import Model, KaldiRecognizer
from transformers import pipeline

# Verify librosa is available, installing it on the fly as a fallback.
try:
    import librosa
    import librosa.display
    st.write("✅ Librosa is installed successfully!")
except ImportError:
    st.write("❌ Librosa is missing! Installing now...")
    subprocess.run(["pip", "install", "librosa"], check=True)
    import librosa
    import librosa.display
    st.write("✅ Librosa installed successfully!")

# Load the Vosk speech-recognition model.
MODEL_PATH = "vosk-model-small-en-us-0.15"
if not os.path.exists(MODEL_PATH):
    st.error("Vosk model not found! Please download and extract it.")
    st.stop()

model = Model(MODEL_PATH)

# Streamlit UI
st.title("🎙️ Speech Detection System using Mozilla Common Voice")
st.write("Upload an audio file and get real-time speech-to-text, noise filtering, and emotion analysis.")

uploaded_file = st.file_uploader("Upload an MP3/WAV file", type=["mp3", "wav"])

if uploaded_file:
    # Save the upload to disk, converting MP3 to WAV if needed.
    os.makedirs("temp", exist_ok=True)
    file_path = f"temp/{uploaded_file.name}"
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    if file_path.endswith(".mp3"):
        wav_path = file_path.replace(".mp3", ".wav")
        audio = AudioSegment.from_mp3(file_path)
        audio.export(wav_path, format="wav")
        file_path = wav_path

    # Vosk expects 16 kHz mono 16-bit PCM, so normalize the WAV in place.
    audio = AudioSegment.from_wav(file_path)
    audio = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2)
    audio.export(file_path, format="wav")

    # Load the audio for analysis.
    y, sr = librosa.load(file_path, sr=16000)

    # Display the waveform.
    fig, ax = plt.subplots(figsize=(10, 4))
    librosa.display.waveshow(y, sr=sr, ax=ax)
    st.pyplot(fig)

    # Noise reduction.
    y_denoised = nr.reduce_noise(y=y, sr=sr)
    denoised_path = file_path.replace(".wav", "_denoised.wav")
    sf.write(denoised_path, y_denoised, sr)

    # Speech-to-text using Vosk. Accumulate every recognized segment plus the
    # final partial result, rather than returning after the first utterance.
    def transcribe_audio(audio_path):
        wf = wave.open(audio_path, "rb")
        rec = KaldiRecognizer(model, wf.getframerate())
        segments = []
        while True:
            data = wf.readframes(4000)
            if len(data) == 0:
                break
            if rec.AcceptWaveform(data):
                segments.append(json.loads(rec.Result()).get("text", ""))
        segments.append(json.loads(rec.FinalResult()).get("text", ""))
        wf.close()
        return " ".join(s for s in segments if s)

    transcription = transcribe_audio(file_path)
    st.subheader("📝 Transcribed Text:")
    st.write(transcription)

    # Emotion detection. The original model id ("superb/wav2vec2-large-xlsr-53")
    # is not an audio classifier; superb/wav2vec2-base-superb-er is a SUPERB
    # emotion-recognition checkpoint used here as a working substitute.
    emotion_model = pipeline("audio-classification", model="superb/wav2vec2-base-superb-er")
    emotion_result = emotion_model(file_path)
    st.subheader("😊 Emotion Analysis:")
    st.write(emotion_result)

    # Play the original and denoised audio.
    st.audio(file_path, format="audio/wav", start_time=0)
    st.subheader("🔊 Denoised Audio:")
    st.audio(denoised_path, format="audio/wav", start_time=0)
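
# ---------------------------------------------------------------------------
# Usage (a minimal sketch; the filename "app.py" and the exact package list
# are assumptions, not pinned by this script):
#
#   pip install streamlit librosa soundfile vosk transformers pydub \
#       noisereduce matplotlib torch
#
#   # pydub needs ffmpeg on the PATH for MP3 decoding.
#   # Download and unzip the small English model next to this file, from
#   # https://alphacephei.com/vosk/models -> vosk-model-small-en-us-0.15
#
#   streamlit run app.py
# ---------------------------------------------------------------------------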