Commit 360ddab · Update app
Parent(s): 6828b68

app.py CHANGED
@@ -35,7 +35,7 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 use_cuda = torch.cuda.is_available()
 n_negative_samples = 100
 
-def preprocess_video(path, result_folder, padding=20):
+def preprocess_video(path, result_folder, apply_preprocess, padding=20):
 
     '''
     This function preprocesses the input video to extract the audio and crop the frames using YOLO model
@@ -60,62 +60,10 @@ def preprocess_video(path, result_folder, padding=20):
         msg = "Oops! Could not load the video. Please check the input video and try again."
         return None, None, None, msg
 
-    all_frames = []
-    for k in range(len(vr)):
-        all_frames.append(vr[k].asnumpy())
-    all_frames = np.asarray(all_frames)
-    print("Extracted the frames for pre-processing")
-
-    # Load YOLOv9 model (pre-trained on COCO dataset)
-    yolo_model = YOLO("yolov9s.pt")
-    print("Loaded the YOLO model")
-
     if frame_count < 25:
         msg = "Not enough frames to process! Please give a longer video as input"
         return None, None, None, msg
 
-    person_videos = {}
-    person_tracks = {}
-
-    print("Processing the frames...")
-    for frame_idx in tqdm(range(frame_count)):
-
-        frame = all_frames[frame_idx]
-
-        # Perform person detection
-        results = yolo_model(frame, verbose=False)
-        detections = results[0].boxes
-
-        for i, det in enumerate(detections):
-            x1, y1, x2, y2 = det.xyxy[0]
-            cls = det.cls[0]
-            if int(cls) == 0: # Class 0 is 'person' in COCO dataset
-
-                x1 = max(0, int(x1) - padding)
-                y1 = max(0, int(y1) - padding)
-                x2 = min(frame.shape[1], int(x2) + padding)
-                y2 = min(frame.shape[0], int(y2) + padding)
-
-                if i not in person_videos:
-                    person_videos[i] = []
-                    person_tracks[i] = []
-
-                person_videos[i].append(frame)
-                person_tracks[i].append([x1,y1,x2,y2])
-
-    num_persons = 0
-    for i in person_videos.keys():
-        if len(person_videos[i]) >= frame_count//2:
-            num_persons+=1
-
-    if num_persons==0:
-        msg = "No person detected in the video! Please give a video with one person as input"
-        return None, None, None, msg
-    if num_persons>1:
-        msg = "More than one person detected in the video! Please give a video with only one person as input"
-        return None, None, None, msg
-
     # Extract the audio from the input video file using ffmpeg
     wav_file = os.path.join(result_folder, "audio.wav")
 
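For reference, the detection loop removed above (and re-added below under the new apply_preprocess branch) relies on the ultralytics API: each result exposes boxes with xyxy coordinates and a cls index, and class 0 is 'person' in COCO. A minimal stand-alone sketch of that filtering, assuming the same yolov9s.pt weights and using a blank frame as a stand-in input:

    from ultralytics import YOLO
    import numpy as np

    model = YOLO("yolov9s.pt")                       # same weights the app loads
    frame = np.zeros((480, 640, 3), dtype=np.uint8)  # stand-in for one video frame

    results = model(frame, verbose=False)
    for det in results[0].boxes:
        if int(det.cls[0]) == 0:                     # class 0 == 'person' in COCO
            x1, y1, x2, y2 = map(int, det.xyxy[0])
            print("person box:", x1, y1, x2, y2)
    # (a blank frame yields no detections; replace it with a real image to see output)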
@@ -125,50 +73,109 @@ def preprocess_video(path, result_folder, padding=20):
     if status != 0:
         msg = "Oops! Could not load the audio file. Please check the input video and try again."
         return None, None, None, msg
-
     print("Extracted the audio from the video")
 
-        [... several removed lines of the old cropping block were not captured in this view ...]
-        no_sound_video = crop_filename.split('.')[0] + '_nosound.mp4'
-        status = subprocess.call('ffmpeg -hide_banner -loglevel panic -y -i %s -c copy -an -strict -2 %s' % (crop_filename, no_sound_video), shell=True)
-        if status != 0:
-            msg = "Oops! Could not preprocess the video. Please check the input video and try again."
-            return None, None, None, msg
-        [...]
-            return None, None, None, msg
-        os.remove(crop_filename)
-        os.remove(no_sound_video)
-    else:
-        [...]
-        return None, None, None, msg
+    if apply_preprocess=="True":
+        all_frames = []
+        for k in range(len(vr)):
+            all_frames.append(vr[k].asnumpy())
+        all_frames = np.asarray(all_frames)
+        print("Extracted the frames for pre-processing")
+
+        # Load YOLOv9 model (pre-trained on COCO dataset)
+        yolo_model = YOLO("yolov9s.pt")
+        print("Loaded the YOLO model")
+
+        person_videos = {}
+        person_tracks = {}
+
+        print("Processing the frames...")
+        for frame_idx in tqdm(range(frame_count)):
+
+            frame = all_frames[frame_idx]
+
+            # Perform person detection
+            results = yolo_model(frame, verbose=False)
+            detections = results[0].boxes
+
+            for i, det in enumerate(detections):
+                x1, y1, x2, y2 = det.xyxy[0]
+                cls = det.cls[0]
+                if int(cls) == 0: # Class 0 is 'person' in COCO dataset
+
+                    x1 = max(0, int(x1) - padding)
+                    y1 = max(0, int(y1) - padding)
+                    x2 = min(frame.shape[1], int(x2) + padding)
+                    y2 = min(frame.shape[0], int(y2) + padding)
+
+                    if i not in person_videos:
+                        person_videos[i] = []
+                        person_tracks[i] = []
+
+                    person_videos[i].append(frame)
+                    person_tracks[i].append([x1,y1,x2,y2])
+
+        num_persons = 0
+        for i in person_videos.keys():
+            if len(person_videos[i]) >= frame_count//2:
+                num_persons+=1
+
+        if num_persons==0:
+            msg = "No person detected in the video! Please give a video with one person as input"
+            return None, None, None, msg
+        if num_persons>1:
+            msg = "More than one person detected in the video! Please give a video with only one person as input"
+            return None, None, None, msg
+
+        # For the person detected, crop the frame based on the bounding box
+        if len(person_videos[0]) > frame_count-10:
+            crop_filename = os.path.join(result_folder, "preprocessed_video.avi")
+            fourcc = cv2.VideoWriter_fourcc(*'DIVX')
+
+            # Get bounding box coordinates based on person_tracks[i]
+            max_x1 = min([track[0] for track in person_tracks[0]])
+            max_y1 = min([track[1] for track in person_tracks[0]])
+            max_x2 = max([track[2] for track in person_tracks[0]])
+            max_y2 = max([track[3] for track in person_tracks[0]])
+
+            max_width = max_x2 - max_x1
+            max_height = max_y2 - max_y1
+
+            out = cv2.VideoWriter(crop_filename, fourcc, fps, (max_width, max_height))
+            for frame in person_videos[0]:
+                crop = frame[max_y1:max_y2, max_x1:max_x2]
+                crop = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
+                out.write(crop)
+            out.release()
+
+            no_sound_video = crop_filename.split('.')[0] + '_nosound.mp4'
+            status = subprocess.call('ffmpeg -hide_banner -loglevel panic -y -i %s -c copy -an -strict -2 %s' % (crop_filename, no_sound_video), shell=True)
+            if status != 0:
+                msg = "Oops! Could not preprocess the video. Please check the input video and try again."
+                return None, None, None, msg
+
+            video_output = crop_filename.split('.')[0] + '.mp4'
+            status = subprocess.call('ffmpeg -hide_banner -loglevel panic -y -i %s -i %s -strict -2 -q:v 1 %s' %
+                                     (wav_file, no_sound_video, video_output), shell=True)
+            if status != 0:
+                msg = "Oops! Could not preprocess the video. Please check the input video and try again."
+                return None, None, None, msg
+
+            os.remove(crop_filename)
+            os.remove(no_sound_video)
+
+            print("Successfully saved the pre-processed video: ", video_output)
+        else:
+            msg = "Could not track the person in the full video! Please give a single-speaker video as input"
+            return None, None, None, msg
+
+    else:
+        video_output = path
 
     return wav_file, fps, video_output, "success"
 
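In the new cropping branch, the crop window is the union of all tracked person boxes: despite the max_x1/max_y1 names, the code takes the minimum of the top-left corners and the maximum of the bottom-right corners, so every per-frame detection fits inside one fixed-size crop. A small self-contained sketch of that computation on made-up tracks:

    # Hypothetical per-frame boxes in [x1, y1, x2, y2] form, standing in for person_tracks[0].
    tracks = [
        [110, 60, 300, 420],
        [100, 55, 310, 430],
        [105, 58, 305, 425],
    ]

    # Union window: smallest top-left corner, largest bottom-right corner.
    x1 = min(t[0] for t in tracks)
    y1 = min(t[1] for t in tracks)
    x2 = max(t[2] for t in tracks)
    y2 = max(t[3] for t in tracks)

    print((x1, y1, x2, y2))     # (100, 55, 310, 430)
    print((x2 - x1, y2 - y1))   # crop width and height: (210, 375)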
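The same branch shells out to ffmpeg twice: first to stream-copy the cropped AVI while dropping its audio (-c copy -an), then to mux the extracted WAV back onto the silent clip. An equivalent stand-alone sketch using argument lists and hypothetical file names:

    import subprocess

    # Hypothetical paths; in the app these live under result_folder.
    crop_filename = "results/preprocessed_video.avi"
    wav_file = "results/audio.wav"
    no_sound_video = "results/preprocessed_video_nosound.mp4"
    video_output = "results/preprocessed_video.mp4"

    # 1) Copy the video stream untouched and drop the audio track.
    subprocess.call(["ffmpeg", "-hide_banner", "-loglevel", "panic", "-y",
                     "-i", crop_filename, "-c", "copy", "-an", "-strict", "-2",
                     no_sound_video])

    # 2) Mux the extracted WAV back onto the silent video.
    subprocess.call(["ffmpeg", "-hide_banner", "-loglevel", "panic", "-y",
                     "-i", wav_file, "-i", no_sound_video,
                     "-strict", "-2", "-q:v", "1", video_output])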
@@ -649,7 +656,7 @@ class Logger:
         return False
 
 
-def process_video(video_path, num_avg_frames):
+def process_video(video_path, num_avg_frames, apply_preprocess):
     try:
         # Extract the video filename
         video_fname = os.path.basename(video_path.split(".")[0])
@@ -668,7 +675,8 @@ def process_video(video_path, num_avg_frames):
 
 
         # Preprocess the video
-        wav_file, fps, vid_path_processed, status = preprocess_video(video_path, result_folder_input)
+        print("Applying preprocessing: ", apply_preprocess)
+        wav_file, fps, vid_path_processed, status = preprocess_video(video_path, result_folder_input, apply_preprocess)
         if status != "success":
             return status, None
         print("Successfully preprocessed the video")
@@ -902,6 +910,7 @@ if __name__ == "__main__":
                     value=75,
                     label="Number of Average Frames",
                 )
+                apply_preprocess = gr.Checkbox(label="Apply Preprocessing", value=False)
                 video_input = gr.Video(label="Upload Video", height=400)
 
             with gr.Column():
@@ -914,12 +923,12 @@ if __name__ == "__main__":
 
         submit_button.click(
             fn=process_video,
-            inputs=[video_input, num_avg_frames],
+            inputs=[video_input, num_avg_frames, apply_preprocess],
             outputs=[result_text, output_video]
         )
 
         clear_button.click(
-            fn=lambda: (None, 75, "", None),
+            fn=lambda: (None, 75, False, "", None),
             inputs=[],
             outputs=[video_input, num_avg_frames, result_text, output_video]
         )
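For reference, a minimal sketch of the UI wiring this commit adds: a gr.Checkbox whose state is passed to the click handler alongside the other inputs. Component and handler names here are illustrative, not the app's full layout, and Gradio delivers the checkbox state to the handler as a Python bool.

    import gradio as gr

    def handler(video_path, num_avg_frames, apply_preprocess):
        # Stand-in for process_video: just echo what the UI passed in.
        return f"preprocess={apply_preprocess}, frames={num_avg_frames}, file={video_path}"

    with gr.Blocks() as demo:
        num_avg_frames = gr.Slider(25, 100, value=75, label="Number of Average Frames")
        apply_preprocess = gr.Checkbox(label="Apply Preprocessing", value=False)
        video_input = gr.Video(label="Upload Video")
        result_text = gr.Textbox(label="Result")
        submit = gr.Button("Submit")
        submit.click(fn=handler,
                     inputs=[video_input, num_avg_frames, apply_preprocess],
                     outputs=[result_text])

    if __name__ == "__main__":
        demo.launch()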