Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -187,17 +187,19 @@ def process_video(video_path):
|
|
187 |
|
188 |
|
189 |
|
190 |
-
# Process audio from video and predict emotions
|
191 |
def process_audio_from_video(video_path):
|
192 |
-
|
193 |
|
194 |
try:
|
195 |
-
#
|
196 |
-
ffmpeg
|
|
|
|
|
|
|
197 |
|
198 |
recognizer = sr.Recognizer()
|
199 |
|
200 |
-
with sr.AudioFile(
|
201 |
audio_record = recognizer.record(source)
|
202 |
text = recognizer.recognize_google(audio_record)
|
203 |
pre_text = preprocess_text(text)
|
@@ -206,24 +208,45 @@ def process_audio_from_video(video_path):
|
|
206 |
inp1 = np.array(padded_title_seq)
|
207 |
text_prediction = text_model.predict(inp1)
|
208 |
|
209 |
-
os.remove(
|
210 |
|
211 |
max_index = text_prediction.argmax()
|
212 |
text_emotion = {0: "anger", 1: "disgust", 2: "fear", 3: "joy", 4: "neutral", 5: "sadness", 6: "surprise"}[max_index]
|
213 |
|
214 |
-
|
215 |
-
|
216 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
217 |
|
218 |
-
|
219 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
220 |
|
221 |
except Exception as e:
|
222 |
-
print(f"Error processing audio: {e}")
|
223 |
audio_emotion = "Error in audio processing"
|
224 |
|
225 |
return text_emotion, audio_emotion
|
226 |
|
|
|
227 |
# Main function to handle video emotion recognition
|
228 |
def transcribe_and_predict_video(video):
|
229 |
image_emotion = process_video(video)
|
|
|
187 |
|
188 |
|
189 |
|
|
|
190 |
def process_audio_from_video(video_path):
|
191 |
+
text_emotion = "Error in text processing" # Initialize text_emotion
|
192 |
|
193 |
try:
|
194 |
+
# Extract the video's audio track into a temporary WAV file using ffmpeg
|
195 |
+
import ffmpeg
|
196 |
+
|
197 |
+
audio_output = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
|
198 |
+
ffmpeg.input(video_path).output(audio_output, format="wav").run(quiet=True)
|
199 |
|
200 |
recognizer = sr.Recognizer()
|
201 |
|
202 |
+
with sr.AudioFile(audio_output) as source:
|
203 |
audio_record = recognizer.record(source)
|
204 |
text = recognizer.recognize_google(audio_record)
|
205 |
pre_text = preprocess_text(text)
|
|
|
208 |
inp1 = np.array(padded_title_seq)
|
209 |
text_prediction = text_model.predict(inp1)
|
210 |
|
211 |
+
os.remove(audio_output)
|
212 |
|
213 |
max_index = text_prediction.argmax()
|
214 |
text_emotion = {0: "anger", 1: "disgust", 2: "fear", 3: "joy", 4: "neutral", 5: "sadness", 6: "surprise"}[max_index]
|
215 |
|
216 |
+
except Exception as e:
|
217 |
+
print(f"Error processing text from audio: {e}")
|
218 |
+
text_emotion = "Error in text processing"
|
219 |
+
|
220 |
+
try:
|
221 |
+
# Extract audio features for emotion recognition
|
222 |
+
sample_rate, data = librosa.load(video_path, sr=None, mono=True)
|
223 |
+
data = data.flatten()
|
224 |
+
|
225 |
+
if data.dtype != np.float32:
|
226 |
+
data = data.astype(np.float32)
|
227 |
+
data = data / np.max(np.abs(data))
|
228 |
|
229 |
+
features = extract_features(data, sample_rate)
|
230 |
+
features = np.expand_dims(features, axis=0)
|
231 |
+
scaled_features = scaler.transform(features)
|
232 |
+
scaled_features = np.expand_dims(scaled_features, axis=2)
|
233 |
+
|
234 |
+
prediction = audio_model.predict(scaled_features)
|
235 |
+
emotion_index = np.argmax(prediction)
|
236 |
+
|
237 |
+
num_classes = len(encoder.categories_[0])
|
238 |
+
emotion_array = np.zeros((1, num_classes))
|
239 |
+
emotion_array[0, emotion_index] = 1
|
240 |
+
|
241 |
+
audio_emotion = encoder.inverse_transform(emotion_array)[0]
|
242 |
|
243 |
except Exception as e:
|
244 |
+
print(f"Error processing audio features: {e}")
|
245 |
audio_emotion = "Error in audio processing"
|
246 |
|
247 |
return text_emotion, audio_emotion
|
248 |
|
249 |
+
|
250 |
# Main function to handle video emotion recognition
|
251 |
def transcribe_and_predict_video(video):
|
252 |
image_emotion = process_video(video)
|