Spaces:

Garvitj
/

emotion-llm

Running

App Files Files Community

emotion-llm / app.py

Garvitj

Update app.py

b431c05 verified about 1 month ago

raw

history blame

5.31 kB

	import gradio as gr
	import numpy as np
	import librosa
	import cv2
	import json
	import ffmpeg
	import speech_recognition as sr
	from transformers import AutoModelForCausalLM, AutoTokenizer
	import tensorflow as tf
	from tensorflow.keras.preprocessing.text import tokenizer_from_json
	from tensorflow.keras.models import load_model
	from tensorflow.keras.preprocessing.sequence import pad_sequences
	from tensorflow.keras.preprocessing.image import img_to_array
	from collections import Counter
	import os

	# Emotion classifiers and the text tokenizer are loaded once at import time.
	text_model = load_model('model_for_text_emotion_updated(1).keras')  # text emotion model
	# The tokenizer is stored as JSON; parse the file first, then rebuild the
	# Keras tokenizer from the parsed content.
	with open('tokenizer.json') as json_file:
	    tokenizer_config = json.load(json_file)
	tokenizer = tokenizer_from_json(tokenizer_config)
	audio_model = load_model('my_model.h5')  # audio emotion model
	image_model = load_model('model_emotion.h5')  # image (frame) emotion model

	# Causal language model and its tokenizer, fetched from the Hugging Face hub.
	# "facebook/opt-125m" is a small example checkpoint.
	llama_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
	llama_tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")

	# Class index -> emotion label; the tuple position is the class index.
	emotion_mapping = dict(enumerate(
	    ("anger", "disgust", "fear", "joy", "neutral", "sadness", "surprise")
	))

	# Preprocess text for emotion prediction
	def preprocess_text(text):
	    """Normalise raw text before it is tokenized for emotion prediction.

	    The text is lower-cased and split on whitespace; only tokens made up
	    entirely of alphanumeric characters are kept, so a word with attached
	    punctuation (e.g. ``"happy!"``) is dropped as a whole.

	    Args:
	        text: The input string.

	    Returns:
	        The surviving tokens joined by single spaces.
	    """
	    lowered_words = text.lower().split()
	    return ' '.join(filter(str.isalnum, lowered_words))

	# Predict emotion from text
	def predict_text_emotion(text):
	    """Classify the emotion expressed in a piece of text.

	    The text is cleaned with ``preprocess_text``, converted to an integer
	    sequence with the module-level ``tokenizer``, padded to length 35 and
	    scored by ``text_model``.

	    Args:
	        text: The input string.

	    Returns:
	        The ``emotion_mapping`` label of the highest-scoring class.
	    """
	    cleaned = preprocess_text(text)
	    sequences = tokenizer.texts_to_sequences([cleaned])
	    model_input = pad_sequences(sequences, maxlen=35)
	    scores = text_model.predict(model_input)
	    return emotion_mapping[np.argmax(scores)]

	# Extract audio features and predict emotion
	def extract_audio_features(audio_data, sample_rate):
	    """Build the feature array for the audio emotion model.

	    MFCCs are computed with ``librosa.feature.mfcc``, the transposed MFCC
	    matrix is averaged along its first axis, and a leading axis of size 1
	    is added to the result.

	    Args:
	        audio_data: Audio samples, passed to librosa as ``y``.
	        sample_rate: Sampling rate, passed to librosa as ``sr``.

	    Returns:
	        The averaged MFCC array with an extra leading axis.
	    """
	    coefficients = librosa.feature.mfcc(y=audio_data, sr=sample_rate)
	    averaged = coefficients.T.mean(axis=0)
	    return averaged[np.newaxis, ...]

	def predict_audio_emotion(audio_data, sample_rate):
	    """Classify the emotion carried by an audio signal.

	    Args:
	        audio_data: Audio samples forwarded to ``extract_audio_features``.
	        sample_rate: Sampling rate forwarded to ``extract_audio_features``.

	    Returns:
	        The ``emotion_mapping`` label of the class ``audio_model`` scores
	        highest.
	    """
	    model_input = extract_audio_features(audio_data, sample_rate)
	    scores = audio_model.predict(model_input)
	    return emotion_mapping[np.argmax(scores)]

	# Process video and predict emotions from frames
	def process_video(video_path):
	    """Predict the dominant facial emotion across the frames of a video.

	    Frames whose capture position is a multiple of the (integer) frame
	    rate are sampled, i.e. about one frame per second. Each sampled frame
	    is converted to grayscale, resized to 48x48, divided by 255 and
	    classified with ``image_model``. The label predicted most often across
	    the sampled frames is returned.

	    Args:
	        video_path: Path of the video file handed to ``cv2.VideoCapture``.

	    Returns:
	        The ``emotion_mapping`` label that was predicted most often.

	    Raises:
	        ValueError: If no frame could be read and classified, for example
	            because the file could not be opened or contains no frames.
	    """
	    cap = cv2.VideoCapture(video_path)
	    predictions = []
	    try:
	        frame_rate = cap.get(cv2.CAP_PROP_FPS)
	        # A reported FPS below 1 (including 0 or NaN) would make the modulo
	        # below divide by zero; fall back to classifying every frame.
	        frame_step = int(frame_rate) if frame_rate >= 1 else 1

	        while cap.isOpened():
	            ret, frame = cap.read()
	            if not ret:
	                break
	            if int(cap.get(cv2.CAP_PROP_POS_FRAMES)) % frame_step != 0:
	                continue
	            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
	            gray = cv2.resize(gray, (48, 48))
	            pixels = img_to_array(gray) / 255.0
	            batch = np.expand_dims(pixels, axis=0)
	            prediction = image_model.predict(batch)
	            predictions.append(int(np.argmax(prediction)))
	    finally:
	        # Release the capture even if decoding or prediction raised.
	        cap.release()

	    if not predictions:
	        raise ValueError(
	            f"No frames could be read and classified from video: {video_path!r}"
	        )

	    most_common_emotion = Counter(predictions).most_common(1)[0][0]
	    return emotion_mapping[most_common_emotion]

	# Extract audio from video using ffmpeg-python
	def extract_audio_from_video(video_path):
	    """Write the audio track of a video to ``audio.wav`` using ffmpeg-python.

	    The output is requested as WAV with the ``pcm_s16le`` codec, and an
	    existing ``audio.wav`` is overwritten.

	    Args:
	        video_path: Path of the source video file.

	    Returns:
	        The path of the written audio file (``'audio.wav'``).
	    """
	    audio_file = 'audio.wav'
	    source_stream = ffmpeg.input(video_path)
	    wav_stream = source_stream.output(audio_file, format='wav', acodec='pcm_s16le')
	    wav_stream.run(overwrite_output=True)
	    return audio_file

	def transcribe_audio(audio_file):
	    """Transcribe the speech in an audio file to text.

	    The whole file is read through ``speech_recognition`` and passed to
	    the recognizer's ``recognize_google`` method.

	    Args:
	        audio_file: Path of the audio file to transcribe.

	    Returns:
	        The transcription returned by ``recognize_google``.
	    """
	    speech_recognizer = sr.Recognizer()
	    with sr.AudioFile(audio_file) as audio_source:
	        recording = speech_recognizer.record(audio_source)
	    return speech_recognizer.recognize_google(recording)

	# Integrating with LLM to adjust responses based on detected emotion
	def interact_with_llm(emotion, user_input):
	    """Ask the language model for a reply that takes the user's emotion into account.

	    A prompt stating the detected emotion and the user's input is
	    tokenized with ``llama_tokenizer`` and continued by ``llama_model``.
	    Only the newly generated continuation is returned; the instruction
	    prompt is not echoed back.

	    Args:
	        emotion: Label of the detected emotion, interpolated into the prompt.
	        user_input: The user's text, appended to the prompt.

	    Returns:
	        The decoded continuation with surrounding whitespace stripped.
	    """
	    prompt = f"The user is feeling {emotion}. Respond to their question in an empathetic and appropriate manner: {user_input}"

	    inputs = llama_tokenizer(prompt, return_tensors="pt")
	    # max_new_tokens limits only the continuation. The previous max_length
	    # budget also counted the prompt tokens, so a long transcript left no
	    # room to generate anything.
	    outputs = llama_model.generate(**inputs, max_new_tokens=200)

	    # The generated sequence starts with the prompt tokens; skip them so
	    # the reply does not repeat the instruction text.
	    prompt_length = inputs["input_ids"].shape[-1]
	    response = llama_tokenizer.decode(
	        outputs[0][prompt_length:], skip_special_tokens=True
	    )

	    return response.strip()

	# Main function to process video and predict emotions
	def transcribe_and_predict_video(video_path):
	    """Run the full pipeline on a video and format the result for display.

	    Steps: extract the audio track, transcribe it, predict an emotion from
	    the transcript, the video frames and the raw audio, then ask the
	    language model for a reply to the transcript.

	    Args:
	        video_path: Path of the uploaded video file.

	    Returns:
	        A two-line string with the detected emotion and the LLM response.
	    """
	    # Extract audio from the video and predict the text-based emotion.
	    audio_file = extract_audio_from_video(video_path)
	    text = transcribe_audio(audio_file)
	    text_emotion = predict_text_emotion(text)

	    # Predict emotion from video frames (image-based).
	    image_emotion = process_video(video_path)

	    # Predict emotion from audio (sound-based). librosa.load returns the
	    # samples first and the sampling rate second; sr=None keeps the file's
	    # native sampling rate.
	    audio_data, sample_rate = librosa.load(audio_file, sr=None)
	    audio_emotion = predict_audio_emotion(audio_data, sample_rate)

	    # The image-based prediction is used as the final emotion. The text and
	    # audio predictions are computed but not yet combined into the result
	    # (e.g. by majority vote).
	    final_emotion = image_emotion

	    # Get response from the LLM.
	    llm_response = interact_with_llm(final_emotion, text)

	    return f"Emotion Detected: {final_emotion}\nLLM Response: {llm_response}"

	# Gradio front end: a single video upload in, a single text report out.
	APP_TITLE = "Emotion-Responsive LLM for Video"
	APP_DESCRIPTION = "Upload a video to get emotion predictions and LLM responses based on detected emotions."

	iface = gr.Interface(
	    fn=transcribe_and_predict_video,
	    inputs=gr.Video(),
	    outputs="text",
	    title=APP_TITLE,
	    description=APP_DESCRIPTION,
	)

	# Start serving the interface.
	iface.launch()