Kevin King committed
Commit 2ae282b · 1 Parent(s): 1e773b8

TEST: Deploy minimal app to isolate moviepy installation issue

requirements.txt CHANGED
@@ -1,27 +1,2 @@
- --extra-index-url https://download.pytorch.org/whl/cpu
-
- # Pin the main UI components to recent, stable versions
- streamlit==1.35.0
- # streamlit-camera removed as st.camera_input is native
- streamlit-autorefresh==1.0.1
-
- # Library for video/audio file handling
- moviepy
-
- # Pin ML/AI libraries to modern, known-good versions
- transformers==4.40.1
- deepface==0.0.94
- openai-whisper==20231117
-
- # Pin frameworks to ensure CPU versions and prevent build timeouts
- tensorflow-cpu==2.16.1
- tf-keras==2.16.0
- torch==2.7.0
- torchaudio==2.7.0
-
- # Pin data/audio libraries for stability
- pandas==2.2.2
- numpy==1.26.4
- soundfile==0.12.1
- librosa==0.10.1
- scipy==1.13.0
+ streamlit
+ moviepy
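
A side note, not part of this commit: `moviepy` is the only unpinned dependency in both requirements files. If the resolver picks a moviepy 2.x release, the `moviepy.editor` module no longer exists and `from moviepy.editor import VideoFileClip` fails at import time, which is one plausible cause of the installation issue this test is isolating. The sketch below is a hypothetical local diagnostic, not committed code, that reports which moviepy layout is installed.

```python
# Hypothetical diagnostic (not in this commit): report the installed moviepy
# version and which import layout works. moviepy 1.x exposes `moviepy.editor`;
# moviepy 2.x removed that module and exposes VideoFileClip at the top level.
from importlib.metadata import version, PackageNotFoundError

try:
    print("moviepy version:", version("moviepy"))
except PackageNotFoundError:
    print("moviepy is not installed")

try:
    from moviepy.editor import VideoFileClip  # succeeds on moviepy 1.x
    print("`moviepy.editor` import OK (1.x-style layout)")
except ImportError:
    try:
        from moviepy import VideoFileClip  # succeeds on moviepy 2.x
        print("Top-level import OK (2.x-style layout); `moviepy.editor` is gone")
    except ImportError as e:
        print("moviepy import failed entirely:", e)
```
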
requirements_full.txt ADDED
@@ -0,0 +1,26 @@
+ --extra-index-url https://download.pytorch.org/whl/cpu
+
+ # Pin the main UI components to recent, stable versions
+ streamlit==1.35.0
+ streamlit-autorefresh==1.0.1
+
+ # Library for video/audio file handling
+ moviepy
+
+ # Pin ML/AI libraries to modern, known-good versions
+ transformers==4.40.1
+ deepface==0.0.94
+ openai-whisper==20231117
+
+ # Pin frameworks to ensure CPU versions and prevent build timeouts
+ tensorflow-cpu==2.16.1
+ tf-keras==2.16.0
+ torch==2.7.0
+ torchaudio==2.7.0
+
+ # Pin data/audio libraries for stability
+ pandas==2.2.2
+ numpy==1.26.4
+ soundfile==0.12.1
+ librosa==0.10.1
+ scipy==1.13.0
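
The archived full requirements rely on the PyTorch CPU wheel index to keep the build small and avoid GPU packages. A quick sanity check that CPU-only builds of the frameworks actually landed might look like the hypothetical sketch below (assumed verification step, not part of the commit).

```python
# Hypothetical sanity check (not in this commit): confirm CPU-only builds of
# torch and tensorflow were installed, as intended by requirements_full.txt.
import torch
import tensorflow as tf

print("torch:", torch.__version__, "| CUDA build:", torch.version.cuda)  # expect None for CPU wheels
print("torch CUDA available:", torch.cuda.is_available())                # expect False
print("tensorflow:", tf.__version__, "| GPUs visible:", tf.config.list_physical_devices("GPU"))  # expect []
```
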
src/streamlit_app.py CHANGED
@@ -1,178 +1,12 @@
- import os
  import streamlit as st
-
- # Set home directories for model caching to the writable /tmp folder
- os.environ['DEEPFACE_HOME'] = '/tmp/.deepface'
- os.environ['HF_HOME'] = '/tmp/huggingface'
-
- import numpy as np
- import torch
- import whisper
- from transformers import pipeline, AutoModelForAudioClassification, AutoFeatureExtractor
- from deepface import DeepFace
- import logging
- import soundfile as sf
- from scipy.io.wavfile import write as write_wav
- import tempfile
- from PIL import Image
- import cv2
  from moviepy.editor import VideoFileClip
  
- # Set home directories for model caching inside the app's writable directory
- os.environ['DEEPFACE_HOME'] = '/tmp/.deepface'
- os.environ['HF_HOME'] = '/tmp/huggingface'
-
- # --- Page Configuration ---
- st.set_page_config(
-     page_title="AffectLink Batch Demo",
-     page_icon="😊",
-     layout="wide"
- )
-
- st.title("AffectLink: Post-Hoc Emotion Analysis")
- st.write("Upload a short video clip to analyze facial expressions, speech-to-text, and the emotional tone of the audio.")
-
- # --- Logger Configuration ---
- logging.basicConfig(level=logging.INFO)
- logging.getLogger('deepface').setLevel(logging.ERROR)
- logging.getLogger('huggingface_hub').setLevel(logging.WARNING)
- logging.getLogger('moviepy').setLevel(logging.ERROR)
-
-
- # --- Emotion Mappings ---
- UNIFIED_EMOTIONS = ['neutral', 'happy', 'sad', 'angry']
- TEXT_TO_UNIFIED = {
-     'neutral': 'neutral', 'joy': 'happy', 'sadness': 'sad', 'anger': 'angry',
-     'fear': None, 'surprise': None, 'disgust': None
- }
- SER_TO_UNIFIED = {
-     'neu': 'neutral', 'hap': 'happy', 'sad': 'sad', 'ang': 'angry'
- }
- AUDIO_SAMPLE_RATE = 16000
-
- # --- Model Loading ---
- @st.cache_resource
- def load_models():
-     with st.spinner("Loading AI models, this may take a moment..."):
-         whisper_model = whisper.load_model("base", download_root="/tmp/whisper_cache")
-         text_classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", top_k=None)
-         ser_model_name = "superb/hubert-large-superb-er"
-         ser_feature_extractor = AutoFeatureExtractor.from_pretrained(ser_model_name)
-         ser_model = AutoModelForAudioClassification.from_pretrained(ser_model_name)
-     return whisper_model, text_classifier, ser_model, ser_feature_extractor
-
- whisper_model, text_classifier, ser_model, ser_feature_extractor = load_models()
-
-
- # --- UI and Processing Logic ---
- uploaded_file = st.file_uploader("Choose a video file...", type=["mp4", "mov", "avi"])
-
- if uploaded_file is not None:
-     # Save the uploaded file to a temporary location
-     with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as tfile:
-         tfile.write(uploaded_file.read())
-         temp_video_path = tfile.name
-
-     st.video(temp_video_path)
-
-     if st.button("Analyze Video"):
-         facial_analysis_results = []
-         audio_analysis_results = {}
-
-         # --- Video Processing for Facial Emotion ---
-         with st.spinner("Analyzing video for facial expressions..."):
-             try:
-                 cap = cv2.VideoCapture(temp_video_path)
-                 fps = cap.get(cv2.CAP_PROP_FPS)
-                 frame_count = 0
-                 while cap.isOpened():
-                     ret, frame = cap.read()
-                     if not ret:
-                         break
-
-                     # Process one frame per second
-                     if frame_count % int(fps) == 0:
-                         timestamp = frame_count / fps
-                         analysis = DeepFace.analyze(frame, actions=['emotion'], enforce_detection=False, silent=True)
-                         if isinstance(analysis, list) and len(analysis) > 0:
-                             dominant_emotion = analysis[0]['dominant_emotion']
-                             facial_analysis_results.append((timestamp, dominant_emotion.capitalize()))
-
-                     frame_count += 1
-                 cap.release()
-             except Exception as e:
-                 st.error(f"An error occurred during facial analysis: {e}")
-
-
-         # --- Audio Extraction and Processing ---
-         with st.spinner("Extracting and analyzing audio..."):
-             try:
-                 # Extract audio using moviepy
-                 video_clip = VideoFileClip(temp_video_path)
-                 with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as taudio:
-                     video_clip.audio.write_audiofile(taudio.name, fps=AUDIO_SAMPLE_RATE, logger=None)
-                     temp_audio_path = taudio.name
-
-                 # 1. Speech-to-Text (Whisper)
-                 result = whisper_model.transcribe(temp_audio_path, fp16=False)
-                 transcribed_text = result['text']
-                 audio_analysis_results['Transcription'] = transcribed_text
-
-                 # 2. Text-based Emotion
-                 if transcribed_text:
-                     text_emotions = text_classifier(transcribed_text)[0]
-                     unified_text_scores = {e: 0.0 for e in UNIFIED_EMOTIONS}
-                     for emo in text_emotions:
-                         unified_emo = TEXT_TO_UNIFIED.get(emo['label'])
-                         if unified_emo:
-                             unified_text_scores[unified_emo] += emo['score']
-                     dominant_text_emotion = max(unified_text_scores, key=unified_text_scores.get)
-                     audio_analysis_results['Text Emotion'] = dominant_text_emotion.capitalize()
-
-                 # 3. Speech Emotion Recognition (SER)
-                 audio_array, _ = sf.read(temp_audio_path)
-                 inputs = ser_feature_extractor(audio_array, sampling_rate=AUDIO_SAMPLE_RATE, return_tensors="pt", padding=True)
-                 with torch.no_grad():
-                     logits = ser_model(**inputs).logits
-                 scores = torch.nn.functional.softmax(logits, dim=1).squeeze()
-                 unified_ser_scores = {e: 0.0 for e in UNIFIED_EMOTIONS}
-                 for i, score in enumerate(scores):
-                     raw_emo = ser_model.config.id2label[i]
-                     unified_emo = SER_TO_UNIFIED.get(raw_emo)
-                     if unified_emo:
-                         unified_ser_scores[unified_emo] += score.item()
-                 dominant_ser_emotion = max(unified_ser_scores, key=unified_ser_scores.get)
-                 audio_analysis_results['Speech Emotion'] = dominant_ser_emotion.capitalize()
-
-                 # Clean up temp audio file
-                 os.unlink(temp_audio_path)
-
-             except Exception as e:
-                 st.error(f"An error occurred during audio analysis: {e}")
-             finally:
-                 video_clip.close()
-
-
-         # --- Display Results ---
-         st.header("Analysis Results")
-         col1, col2 = st.columns(2)
-
-         with col1:
-             st.subheader("Audio Analysis")
-             if audio_analysis_results:
-                 st.write(f"**Transcription:** \"{audio_analysis_results.get('Transcription', 'N/A')}\"")
-                 st.metric("Emotion from Text", audio_analysis_results.get('Text Emotion', 'N/A'))
-                 st.metric("Emotion from Speech", audio_analysis_results.get('Speech Emotion', 'N/A'))
-             else:
-                 st.write("No audio results to display.")
-
-         with col2:
-             st.subheader("Facial Expression Timeline")
-             if facial_analysis_results:
-                 for timestamp, emotion in facial_analysis_results:
-                     st.write(f"**Time {int(timestamp // 60):02d}:{int(timestamp % 60):02d}:** {emotion}")
-             else:
-                 st.write("No faces detected or video processing failed.")
-
-     # Clean up temp video file
-     os.unlink(temp_video_path)
+ st.set_page_config(page_title="MoviePy Test")
+ st.title("Testing `moviepy` Installation")
+
+ try:
+     # This line will only succeed if moviepy is installed correctly
+     st.success("Successfully imported `VideoFileClip` from `moviepy.editor`!")
+     st.write("This confirms that the `moviepy` library was installed correctly.")
+ except ImportError as e:
+     st.error(f"Failed to import `moviepy`. Error: {e}")
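
Worth noting about the minimal test app as committed: the `from moviepy.editor import VideoFileClip` statement sits at module level, above the try block, so a failed moviepy install raises ImportError before the try is ever entered and the app crashes on startup instead of showing `st.error`. The sketch below is a hypothetical restructuring of the probe (not the committed code) that keeps the import inside the try so the failure is actually caught and reported in the UI.

```python
# Hypothetical variant of src/streamlit_app.py (not the committed code): the
# moviepy import is performed inside the try block, so an ImportError is
# caught and surfaced in the Streamlit UI rather than crashing the app.
import streamlit as st

st.set_page_config(page_title="MoviePy Test")
st.title("Testing `moviepy` Installation")

try:
    # The import only succeeds if moviepy (with the 1.x `moviepy.editor` layout) installed correctly
    from moviepy.editor import VideoFileClip
    st.success("Successfully imported `VideoFileClip` from `moviepy.editor`!")
except ImportError as e:
    st.error(f"Failed to import `moviepy`. Error: {e}")
```
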
src/streamlit_app_full.py ADDED
@@ -0,0 +1,178 @@
+ import os
+ import streamlit as st
+
+ # Set home directories for model caching to the writable /tmp folder
+ os.environ['DEEPFACE_HOME'] = '/tmp/.deepface'
+ os.environ['HF_HOME'] = '/tmp/huggingface'
+
+ import numpy as np
+ import torch
+ import whisper
+ from transformers import pipeline, AutoModelForAudioClassification, AutoFeatureExtractor
+ from deepface import DeepFace
+ import logging
+ import soundfile as sf
+ from scipy.io.wavfile import write as write_wav
+ import tempfile
+ from PIL import Image
+ import cv2
+ from moviepy.editor import VideoFileClip
+
+ # Set home directories for model caching inside the app's writable directory
+ os.environ['DEEPFACE_HOME'] = '/tmp/.deepface'
+ os.environ['HF_HOME'] = '/tmp/huggingface'
+
+ # --- Page Configuration ---
+ st.set_page_config(
+     page_title="AffectLink Batch Demo",
+     page_icon="😊",
+     layout="wide"
+ )
+
+ st.title("AffectLink: Post-Hoc Emotion Analysis")
+ st.write("Upload a short video clip to analyze facial expressions, speech-to-text, and the emotional tone of the audio.")
+
+ # --- Logger Configuration ---
+ logging.basicConfig(level=logging.INFO)
+ logging.getLogger('deepface').setLevel(logging.ERROR)
+ logging.getLogger('huggingface_hub').setLevel(logging.WARNING)
+ logging.getLogger('moviepy').setLevel(logging.ERROR)
+
+
+ # --- Emotion Mappings ---
+ UNIFIED_EMOTIONS = ['neutral', 'happy', 'sad', 'angry']
+ TEXT_TO_UNIFIED = {
+     'neutral': 'neutral', 'joy': 'happy', 'sadness': 'sad', 'anger': 'angry',
+     'fear': None, 'surprise': None, 'disgust': None
+ }
+ SER_TO_UNIFIED = {
+     'neu': 'neutral', 'hap': 'happy', 'sad': 'sad', 'ang': 'angry'
+ }
+ AUDIO_SAMPLE_RATE = 16000
+
+ # --- Model Loading ---
+ @st.cache_resource
+ def load_models():
+     with st.spinner("Loading AI models, this may take a moment..."):
+         whisper_model = whisper.load_model("base", download_root="/tmp/whisper_cache")
+         text_classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", top_k=None)
+         ser_model_name = "superb/hubert-large-superb-er"
+         ser_feature_extractor = AutoFeatureExtractor.from_pretrained(ser_model_name)
+         ser_model = AutoModelForAudioClassification.from_pretrained(ser_model_name)
+     return whisper_model, text_classifier, ser_model, ser_feature_extractor
+
+ whisper_model, text_classifier, ser_model, ser_feature_extractor = load_models()
+
+
+ # --- UI and Processing Logic ---
+ uploaded_file = st.file_uploader("Choose a video file...", type=["mp4", "mov", "avi"])
+
+ if uploaded_file is not None:
+     # Save the uploaded file to a temporary location
+     with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as tfile:
+         tfile.write(uploaded_file.read())
+         temp_video_path = tfile.name
+
+     st.video(temp_video_path)
+
+     if st.button("Analyze Video"):
+         facial_analysis_results = []
+         audio_analysis_results = {}
+
+         # --- Video Processing for Facial Emotion ---
+         with st.spinner("Analyzing video for facial expressions..."):
+             try:
+                 cap = cv2.VideoCapture(temp_video_path)
+                 fps = cap.get(cv2.CAP_PROP_FPS)
+                 frame_count = 0
+                 while cap.isOpened():
+                     ret, frame = cap.read()
+                     if not ret:
+                         break
+
+                     # Process one frame per second
+                     if frame_count % int(fps) == 0:
+                         timestamp = frame_count / fps
+                         analysis = DeepFace.analyze(frame, actions=['emotion'], enforce_detection=False, silent=True)
+                         if isinstance(analysis, list) and len(analysis) > 0:
+                             dominant_emotion = analysis[0]['dominant_emotion']
+                             facial_analysis_results.append((timestamp, dominant_emotion.capitalize()))
+
+                     frame_count += 1
+                 cap.release()
+             except Exception as e:
+                 st.error(f"An error occurred during facial analysis: {e}")
+
+
+         # --- Audio Extraction and Processing ---
+         with st.spinner("Extracting and analyzing audio..."):
+             try:
+                 # Extract audio using moviepy
+                 video_clip = VideoFileClip(temp_video_path)
+                 with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as taudio:
+                     video_clip.audio.write_audiofile(taudio.name, fps=AUDIO_SAMPLE_RATE, logger=None)
+                     temp_audio_path = taudio.name
+
+                 # 1. Speech-to-Text (Whisper)
+                 result = whisper_model.transcribe(temp_audio_path, fp16=False)
+                 transcribed_text = result['text']
+                 audio_analysis_results['Transcription'] = transcribed_text
+
+                 # 2. Text-based Emotion
+                 if transcribed_text:
+                     text_emotions = text_classifier(transcribed_text)[0]
+                     unified_text_scores = {e: 0.0 for e in UNIFIED_EMOTIONS}
+                     for emo in text_emotions:
+                         unified_emo = TEXT_TO_UNIFIED.get(emo['label'])
+                         if unified_emo:
+                             unified_text_scores[unified_emo] += emo['score']
+                     dominant_text_emotion = max(unified_text_scores, key=unified_text_scores.get)
+                     audio_analysis_results['Text Emotion'] = dominant_text_emotion.capitalize()
+
+                 # 3. Speech Emotion Recognition (SER)
+                 audio_array, _ = sf.read(temp_audio_path)
+                 inputs = ser_feature_extractor(audio_array, sampling_rate=AUDIO_SAMPLE_RATE, return_tensors="pt", padding=True)
+                 with torch.no_grad():
+                     logits = ser_model(**inputs).logits
+                 scores = torch.nn.functional.softmax(logits, dim=1).squeeze()
+                 unified_ser_scores = {e: 0.0 for e in UNIFIED_EMOTIONS}
+                 for i, score in enumerate(scores):
+                     raw_emo = ser_model.config.id2label[i]
+                     unified_emo = SER_TO_UNIFIED.get(raw_emo)
+                     if unified_emo:
+                         unified_ser_scores[unified_emo] += score.item()
+                 dominant_ser_emotion = max(unified_ser_scores, key=unified_ser_scores.get)
+                 audio_analysis_results['Speech Emotion'] = dominant_ser_emotion.capitalize()
+
+                 # Clean up temp audio file
+                 os.unlink(temp_audio_path)
+
+             except Exception as e:
+                 st.error(f"An error occurred during audio analysis: {e}")
+             finally:
+                 video_clip.close()
+
+
+         # --- Display Results ---
+         st.header("Analysis Results")
+         col1, col2 = st.columns(2)
+
+         with col1:
+             st.subheader("Audio Analysis")
+             if audio_analysis_results:
+                 st.write(f"**Transcription:** \"{audio_analysis_results.get('Transcription', 'N/A')}\"")
+                 st.metric("Emotion from Text", audio_analysis_results.get('Text Emotion', 'N/A'))
+                 st.metric("Emotion from Speech", audio_analysis_results.get('Speech Emotion', 'N/A'))
+             else:
+                 st.write("No audio results to display.")
+
+         with col2:
+             st.subheader("Facial Expression Timeline")
+             if facial_analysis_results:
+                 for timestamp, emotion in facial_analysis_results:
+                     st.write(f"**Time {int(timestamp // 60):02d}:{int(timestamp % 60):02d}:** {emotion}")
+             else:
+                 st.write("No faces detected or video processing failed.")
+
+     # Clean up temp video file
+     os.unlink(temp_video_path)
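
One fragility in the archived full app worth flagging: the per-second frame sampling uses `frame_count % int(fps)`, which assumes OpenCV reports a non-zero frame rate. For some containers or streams `CAP_PROP_FPS` comes back as 0.0, and the modulo then raises ZeroDivisionError. The sketch below is a hypothetical guard (not in the commit) showing one way the sampling loop could tolerate missing FPS metadata.

```python
# Hypothetical guard (not in the commit): fall back to a nominal frame rate
# when OpenCV cannot read FPS metadata, so the per-second sampling in the
# facial-analysis loop never performs a modulo by zero.
import cv2

cap = cv2.VideoCapture("input.mp4")  # illustrative path
fps = cap.get(cv2.CAP_PROP_FPS)
frames_per_sample = int(fps) if fps and fps > 0 else 30  # assume ~30 fps if unknown

frame_count = 0
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    if frame_count % frames_per_sample == 0:
        pass  # analyze this frame here (e.g., DeepFace.analyze(frame, ...))
    frame_count += 1
cap.release()
```
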