# main.py
import cv2
import sounddevice as sd
import numpy as np
import threading
import queue
import time

# Import the model classes from the other files
from face_model import FacialEmotionDetector
from speech_model import SpeechEmotionAnalyzer
from text_model import TextSummarizer

# --- Configuration ---
SAMPLE_RATE = 16000       # Sample rate for audio recording (16 kHz is standard for speech models)
CHANNELS = 1              # Mono audio
SILENCE_THRESHOLD = 0.01  # Amplitude threshold to detect silence
SILENCE_DURATION = 1.5    # Seconds of silence to mark the end of a sentence
AUDIO_CHUNK_SIZE = 1024   # Number of frames per buffer

# --- Global Variables ---
audio_queue = queue.Queue()
is_recording = True


def audio_callback(indata, frames, time_info, status):
    """Called (from a separate thread) for each incoming audio block."""
    # The third argument is named 'time_info' to avoid shadowing the 'time' module.
    if status:
        print(status)
    audio_queue.put(indata.copy())


def record_audio_thread():
    """
    Continuously records audio from the microphone and puts the
    data into a queue, until is_recording is cleared.
    """
    with sd.InputStream(samplerate=SAMPLE_RATE, channels=CHANNELS,
                        blocksize=AUDIO_CHUNK_SIZE, callback=audio_callback):
        print("Audio recording thread started. Recording...")
        while is_recording:
            time.sleep(0.1)
    print("Audio recording thread finished.")


def main():
    global is_recording

    # --- Model Initialization ---
    try:
        print("Initializing models...")
        # IMPORTANT: Make sure you have the 'yolov11n.pt' model file in this directory.
        face_detector = FacialEmotionDetector(model_path='yolov11n.pt')
        speech_analyzer = SpeechEmotionAnalyzer()
        text_summarizer = TextSummarizer()
        print("\nAll models initialized successfully!")
    except FileNotFoundError as e:
        print(f"ERROR: {e}")
        print("Please make sure the required model files are present.")
        return
    except Exception as e:
        print(f"An error occurred during model initialization: {e}")
        return

    # --- Start Audio Recording ---
    audio_thread = threading.Thread(target=record_audio_thread)
    audio_thread.daemon = True
    audio_thread.start()

    # --- Start Webcam ---
    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        print("Error: Could not open webcam.")
        return

    # --- Main Application Loop ---
    print("\nApplication running. Speak into the microphone. Press 'q' to quit.")

    sentence_audio = []
    silence_start_time = None
    last_facial_emotion = "neutral"

    try:
        while True:
            # 1. Process Video Frame
            ret, frame = cap.read()
            if not ret:
                break

            # Detect facial emotion
            annotated_frame, detected_emotion = face_detector.detect_emotion(frame)
            if detected_emotion:
                last_facial_emotion = detected_emotion

            cv2.imshow('Real-time Analysis', annotated_frame)

            # 2. Process Audio Stream
            audio_buffer = []
            while not audio_queue.empty():
                audio_buffer.append(audio_queue.get())

            if audio_buffer:
                # Concatenate all audio chunks from the queue
                current_audio_chunk = np.concatenate(audio_buffer)
                sentence_audio.append(current_audio_chunk)

                # Check for silence
                if np.abs(current_audio_chunk).mean() < SILENCE_THRESHOLD:
                    if silence_start_time is None:
                        silence_start_time = time.time()
                    elif time.time() - silence_start_time > SILENCE_DURATION:
                        # Silence detected, process the sentence
                        full_sentence_audio = np.concatenate(sentence_audio)
                        print("\n" + "=" * 50)
                        print("End of sentence detected. Processing...")

                        # Process in a separate thread to avoid freezing the webcam feed
                        processing_thread = threading.Thread(
                            target=process_and_summarize,
                            args=(full_sentence_audio, speech_analyzer,
                                  text_summarizer, last_facial_emotion)
                        )
                        processing_thread.start()

                        # Reset for the next sentence
                        sentence_audio = []
                        silence_start_time = None
                else:
                    # Sound detected, reset silence timer
                    silence_start_time = None

            # Exit condition
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Cleanup
        print("Shutting down...")
        is_recording = False
        audio_thread.join(timeout=2)
        cap.release()
        cv2.destroyAllWindows()


def process_and_summarize(audio_data, speech_analyzer, text_summarizer, facial_emotion):
    """
    Runs in a worker thread: transcribes the audio, reads the vocal
    emotion, and generates a context-aware summary.
    """
    # 1. Get transcription and vocal emotion
    transcribed_text, vocal_emotion = speech_analyzer.process_audio(audio_data, SAMPLE_RATE)

    print(f"Transcription: '{transcribed_text}'")
    print(f"Vocal Emotion: {vocal_emotion} | Facial Emotion: {facial_emotion}")

    if not transcribed_text:
        print("Summary: Could not transcribe audio.")
        print("=" * 50 + "\n")
        return

    # 2. Get summary from Llama 3
    summary = text_summarizer.summarize_with_context(
        transcribed_text,
        facial_emotion if facial_emotion else "unknown",
        vocal_emotion if vocal_emotion else "unknown"
    )

    print(f"\nAI Summary: {summary}")
    print("=" * 50 + "\n")


if __name__ == '__main__':
    main()
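
# --- Assumed interfaces (sketch) ---
# The three classes imported above live in face_model.py, speech_model.py, and
# text_model.py, which are not shown in this file. The stubs below are a minimal
# sketch of the call signatures main.py relies on, inferred from the usage above;
# the real implementations may differ.
#
#   class FacialEmotionDetector:
#       def __init__(self, model_path): ...
#       def detect_emotion(self, frame):
#           # returns (annotated_frame, emotion_label_or_None)
#
#   class SpeechEmotionAnalyzer:
#       def process_audio(self, audio_data, sample_rate):
#           # returns (transcribed_text, vocal_emotion)
#
#   class TextSummarizer:
#       def summarize_with_context(self, text, facial_emotion, vocal_emotion):
#           # returns a summary string
#
# To run (assumed dependencies, beyond whatever the three model files need):
#   pip install opencv-python sounddevice numpy
#   python main.py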