# speech_model.py
import numpy as np
import whisper
from transformers import pipeline
from typing import Optional, Tuple


class SpeechEmotionAnalyzer:
    """
    Transcribes audio and classifies the emotion conveyed by the speech.
    """

    def __init__(self, whisper_model="tiny", emotion_model="prithivMLmods/Speech-Emotion-Classification"):
        """
        Initializes the SpeechEmotionAnalyzer.

        Args:
            whisper_model (str): The name of the Whisper model to use for transcription.
            emotion_model (str): The Hugging Face model to use for speech emotion classification.
        """
        # Load the Whisper model for speech-to-text.
        print("Loading Whisper model...")
        self.whisper_model = whisper.load_model(whisper_model)

        # Load the pipeline for audio classification.
        print("Loading speech emotion classification model...")
        self.emotion_classifier = pipeline(
            "audio-classification",
            model=emotion_model
        )
        print("SpeechEmotionAnalyzer initialized successfully.")

    def process_audio(self, audio_data: np.ndarray, sample_rate: int) -> Tuple[str, Optional[str]]:
        """
        Transcribes audio and classifies its emotion.

        Args:
            audio_data (np.ndarray): The raw audio data as a NumPy array
                (mono or stereo, integer PCM or float).
            sample_rate (int): The sample rate of the audio data in Hz.

        Returns:
            A tuple containing:
                - The transcribed text.
                - The detected emotion label (e.g., 'SAD', 'HAPPY'), or None if
                  classification fails.
        """
        # Downmix stereo (or multi-channel) audio to mono.
        if audio_data.ndim > 1:
            audio_data = audio_data.mean(axis=1)

        # Convert to float32 in [-1.0, 1.0], which both models expect.
        # Integer PCM (e.g., int16 from a WAV file) is scaled by its max value.
        if np.issubdtype(audio_data.dtype, np.integer):
            audio_data = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max
        elif audio_data.dtype != np.float32:
            audio_data = audio_data.astype(np.float32)

        # Whisper assumes 16 kHz input when given a raw array, so resample if
        # needed. Linear interpolation is crude but dependency-free; a proper
        # resampler (e.g., librosa or torchaudio) would give better quality.
        if sample_rate != 16000:
            target_length = int(round(len(audio_data) * 16000 / sample_rate))
            audio_16k = np.interp(
                np.linspace(0.0, len(audio_data) - 1, num=target_length),
                np.arange(len(audio_data)),
                audio_data,
            ).astype(np.float32)
        else:
            audio_16k = audio_data

        # 1. Transcribe audio to text using Whisper.
        print("Transcribing audio...")
        transcription_result = self.whisper_model.transcribe(audio_16k)
        text = transcription_result.get("text", "").strip()

        # 2. Classify emotion from the audio.
        print("Classifying speech emotion...")
        try:
            # The pipeline accepts a dictionary with 'raw' audio data and its
            # 'sampling_rate'; it resamples internally if the model needs it.
            audio_input = {"raw": audio_data, "sampling_rate": sample_rate}
            emotion_results = self.emotion_classifier(audio_input, top_k=1)
            # For a single input the pipeline returns a list of dicts; for a
            # batch, a list of lists. Handle both and take the top result.
            top_result = emotion_results[0] if emotion_results else None
            if isinstance(top_result, list):
                top_result = top_result[0] if top_result else None
            emotion = top_result['label'] if top_result else None
        except Exception as e:
            print(f"Could not classify speech emotion: {e}")
            emotion = None

        return text, emotion


if __name__ == '__main__':
    # Example usage: this is hard to test standalone without an audio file.
    # The main.py script handles live microphone input.
    # Uncomment and modify the following to test with a local audio file.
    # from scipy.io.wavfile import read
    # try:
    #     analyzer = SpeechEmotionAnalyzer()
    #     # Make sure you have a 'test_audio.wav' file in the same directory.
    #     sample_rate, audio_data = read("test_audio.wav")
    #     text, emotion = analyzer.process_audio(audio_data, sample_rate)
    #     print("--- Analysis Result ---")
    #     print(f"Transcription: {text}")
    #     print(f"Vocal Emotion: {emotion}")
    # except FileNotFoundError:
    #     print("Could not find 'test_audio.wav'. Skipping standalone test.")
    # except Exception as e:
    #     print(f"An error occurred during standalone test: {e}")
    pass
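
# A minimal sketch of the live-microphone path that main.py is described as
# handling. This is an assumption, not the project's actual main.py: it uses
# the third-party `sounddevice` library (not a dependency of this module) to
# record a short clip and feed it through process_audio.
#
#   import sounddevice as sd
#
#   analyzer = SpeechEmotionAnalyzer()
#   sample_rate = 16000
#   seconds = 5
#   print(f"Recording {seconds} s of audio...")
#   recording = sd.rec(int(seconds * sample_rate), samplerate=sample_rate,
#                      channels=1, dtype="float32")
#   sd.wait()  # block until the recording finishes
#   text, emotion = analyzer.process_audio(recording.squeeze(), sample_rate)
#   print(f"Transcription: {text}")
#   print(f"Vocal Emotion: {emotion}")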