import os
from collections import Counter

import gradio as gr
import numpy as np
import librosa
import cv2
import ffmpeg
import speech_recognition as sr
from transformers import AutoModelForCausalLM, AutoTokenizer
import tensorflow as tf
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.image import img_to_array

# Load necessary models and files
text_model = load_model('model_for_text_emotion_updated(1).keras')  # Text emotion model

with open('tokenizer.json') as json_file:
    # tokenizer_from_json expects the raw JSON string, not a parsed dict
    tokenizer = tokenizer_from_json(json_file.read())  # Tokenizer for text emotion

audio_model = load_model('my_model.h5')      # Audio emotion model
image_model = load_model('model_emotion.h5')  # Image (facial) emotion model

# Load LLM from Hugging Face (a small OPT model is used here as an example)
llama_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
llama_tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")

# Emotion index -> label mapping shared by the emotion models
emotion_mapping = {0: "anger", 1: "disgust", 2: "fear", 3: "joy", 4: "neutral", 5: "sadness", 6: "surprise"}

# Preprocess text for emotion prediction
def preprocess_text(text):
    tokens = [word for word in text.lower().split() if word.isalnum()]
    return ' '.join(tokens)

# Predict emotion from text
def predict_text_emotion(text):
    preprocessed_text = preprocess_text(text)
    seq = tokenizer.texts_to_sequences([preprocessed_text])
    padded_seq = pad_sequences(seq, maxlen=35)
    prediction = text_model.predict(padded_seq)
    emotion_index = np.argmax(prediction)
    return emotion_mapping[emotion_index]

# Extract audio features (mean MFCCs) and predict emotion
def extract_audio_features(audio_data, sample_rate):
    mfcc = np.mean(librosa.feature.mfcc(y=audio_data, sr=sample_rate).T, axis=0)
    return np.expand_dims(mfcc, axis=0)

def predict_audio_emotion(audio_data, sample_rate):
    features = extract_audio_features(audio_data, sample_rate)
    prediction = audio_model.predict(features)
    emotion_index = np.argmax(prediction)
    return emotion_mapping[emotion_index]

# Process the video, classifying roughly one frame per second,
# and return the most frequently predicted emotion
def process_video(video_path):
    cap = cv2.VideoCapture(video_path)
    frame_rate = cap.get(cv2.CAP_PROP_FPS)
    predictions = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if int(cap.get(cv2.CAP_PROP_POS_FRAMES)) % int(frame_rate) == 0:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            frame = cv2.resize(frame, (48, 48))
            frame = img_to_array(frame) / 255.0
            frame = np.expand_dims(frame, axis=0)
            prediction = image_model.predict(frame)
            predictions.append(np.argmax(prediction))
    cap.release()
    most_common_emotion = Counter(predictions).most_common(1)[0][0]
    return emotion_mapping[most_common_emotion]

# Extract the audio track from the video using ffmpeg-python
def extract_audio_from_video(video_path):
    audio_file = 'audio.wav'
    (
        ffmpeg
        .input(video_path)
        .output(audio_file, format='wav', acodec='pcm_s16le')
        .run(overwrite_output=True)
    )
    return audio_file

# Transcribe the extracted audio with the Google Web Speech API
def transcribe_audio(audio_file):
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_file) as source:
        audio_record = recognizer.record(source)
    return recognizer.recognize_google(audio_record)

# Query the LLM, adjusting the response based on the detected emotion
def interact_with_llm(emotion, user_input):
    prompt = (
        f"The user is feeling {emotion}. "
        f"Respond to their question in an empathetic and appropriate manner: {user_input}"
    )
    inputs = llama_tokenizer(prompt, return_tensors="pt")
    outputs = llama_model.generate(**inputs, max_length=200)
    response = llama_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Main function: process the video, predict emotions, and get an LLM response
def transcribe_and_predict_video(video_path):
    # Extract audio from the video, transcribe it, and predict text-based emotion
    audio_file = extract_audio_from_video(video_path)
    text = transcribe_audio(audio_file)
    text_emotion = predict_text_emotion(text)

    # Predict emotion from video frames (image-based)
    image_emotion = process_video(video_path)

    # Predict emotion from the audio signal (sound-based)
    # Note: librosa.load returns (samples, sample_rate), in that order
    audio_data, sample_rate = librosa.load(audio_file, sr=None)
    audio_emotion = predict_audio_emotion(audio_data, sample_rate)

    # Combine the detected emotions for the final output. Here the image-based
    # prediction is used; a majority vote across the three modalities is also
    # possible (see the sketch at the end of this file).
    final_emotion = image_emotion

    # Get response from LLM
    llm_response = interact_with_llm(final_emotion, text)

    return f"Emotion Detected: {final_emotion}\nLLM Response: {llm_response}"

# Create Gradio interface
iface = gr.Interface(
    fn=transcribe_and_predict_video,
    inputs=gr.Video(),
    outputs="text",
    title="Emotion-Responsive LLM for Video",
    description="Upload a video to get emotion predictions and LLM responses based on detected emotions."
)

iface.launch()
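
# --- Optional: majority-vote fusion (illustrative sketch, not wired into the app) ---
# A minimal sketch of how the three per-modality labels could be combined instead of
# relying on the image-based prediction alone. This helper is an added example, not
# part of the original pipeline; to use it, call it inside transcribe_and_predict_video
# and assign its result to final_emotion.
def majority_vote_emotion(text_emotion, audio_emotion, image_emotion):
    votes = Counter([text_emotion, audio_emotion, image_emotion])
    label, count = votes.most_common(1)[0]
    # If all three modalities disagree there is no majority; fall back to the
    # image-based prediction, which the pipeline above already treats as final.
    if count == 1:
        return image_emotion
    return label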