sindhuhegde committed
Commit 360ddab · 1 Parent(s): 6828b68

Update app

Files changed (1):
  app.py  +103 −94
app.py CHANGED
@@ -35,7 +35,7 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 use_cuda = torch.cuda.is_available()
 n_negative_samples = 100
 
-def preprocess_video(path, result_folder, padding=20):
+def preprocess_video(path, result_folder, apply_preprocess, padding=20):
 
     '''
    This function preprocesses the input video to extract the audio and crop the frames using YOLO model
@@ -60,62 +60,10 @@ def preprocess_video(path, result_folder, padding=20):
         msg = "Oops! Could not load the video. Please check the input video and try again."
         return None, None, None, msg
 
-    all_frames = []
-    for k in range(len(vr)):
-        all_frames.append(vr[k].asnumpy())
-    all_frames = np.asarray(all_frames)
-    print("Extracted the frames for pre-processing")
-
-    # Load YOLOv9 model (pre-trained on COCO dataset)
-    yolo_model = YOLO("yolov9s.pt")
-    print("Loaded the YOLO model")
-
     if frame_count < 25:
         msg = "Not enough frames to process! Please give a longer video as input"
         return None, None, None, msg
 
-    person_videos = {}
-    person_tracks = {}
-
-    print("Processing the frames...")
-    for frame_idx in tqdm(range(frame_count)):
-
-        frame = all_frames[frame_idx]
-
-        # Perform person detection
-        results = yolo_model(frame, verbose=False)
-        detections = results[0].boxes
-
-        for i, det in enumerate(detections):
-            x1, y1, x2, y2 = det.xyxy[0]
-            cls = det.cls[0]
-            if int(cls) == 0:  # Class 0 is 'person' in COCO dataset
-
-                x1 = max(0, int(x1) - padding)
-                y1 = max(0, int(y1) - padding)
-                x2 = min(frame.shape[1], int(x2) + padding)
-                y2 = min(frame.shape[0], int(y2) + padding)
-
-                if i not in person_videos:
-                    person_videos[i] = []
-                    person_tracks[i] = []
-
-                person_videos[i].append(frame)
-                person_tracks[i].append([x1,y1,x2,y2])
-
-
-    num_persons = 0
-    for i in person_videos.keys():
-        if len(person_videos[i]) >= frame_count//2:
-            num_persons+=1
-
-    if num_persons==0:
-        msg = "No person detected in the video! Please give a video with one person as input"
-        return None, None, None, msg
-    if num_persons>1:
-        msg = "More than one person detected in the video! Please give a video with only one person as input"
-        return None, None, None, msg
-
     # Extract the audio from the input video file using ffmpeg
     wav_file = os.path.join(result_folder, "audio.wav")
 
@@ -125,50 +73,109 @@ def preprocess_video(path, result_folder, padding=20):
     if status != 0:
         msg = "Oops! Could not load the audio file. Please check the input video and try again."
         return None, None, None, msg
-
     print("Extracted the audio from the video")
 
-    # For the person detected, crop the frame based on the bounding box
-    if len(person_videos[0]) > frame_count-10:
-        crop_filename = os.path.join(result_folder, "preprocessed_video.avi")
-        fourcc = cv2.VideoWriter_fourcc(*'DIVX')
-
-        # Get bounding box coordinates based on person_tracks[i]
-        max_x1 = min([track[0] for track in person_tracks[0]])
-        max_y1 = min([track[1] for track in person_tracks[0]])
-        max_x2 = max([track[2] for track in person_tracks[0]])
-        max_y2 = max([track[3] for track in person_tracks[0]])
-
-        max_width = max_x2 - max_x1
-        max_height = max_y2 - max_y1
-
-        out = cv2.VideoWriter(crop_filename, fourcc, fps, (max_width, max_height))
-        for frame in person_videos[0]:
-            crop = frame[max_y1:max_y2, max_x1:max_x2]
-            crop = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
-            out.write(crop)
-        out.release()
-
-        no_sound_video = crop_filename.split('.')[0] + '_nosound.mp4'
-        status = subprocess.call('ffmpeg -hide_banner -loglevel panic -y -i %s -c copy -an -strict -2 %s' % (crop_filename, no_sound_video), shell=True)
-        if status != 0:
-            msg = "Oops! Could not preprocess the video. Please check the input video and try again."
-            return None, None, None, msg
-
-        video_output = crop_filename.split('.')[0] + '.mp4'
-        status = subprocess.call('ffmpeg -hide_banner -loglevel panic -y -i %s -i %s -strict -2 -q:v 1 %s' %
-                                 (wav_file, no_sound_video, video_output), shell=True)
-        if status != 0:
-            msg = "Oops! Could not preprocess the video. Please check the input video and try again."
-            return None, None, None, msg
-
-        os.remove(crop_filename)
-        os.remove(no_sound_video)
-
-        print("Successfully saved the pre-processed video: ", video_output)
+    if apply_preprocess=="True":
+        all_frames = []
+        for k in range(len(vr)):
+            all_frames.append(vr[k].asnumpy())
+        all_frames = np.asarray(all_frames)
+        print("Extracted the frames for pre-processing")
+
+        # Load YOLOv9 model (pre-trained on COCO dataset)
+        yolo_model = YOLO("yolov9s.pt")
+        print("Loaded the YOLO model")
+
+        person_videos = {}
+        person_tracks = {}
+
+        print("Processing the frames...")
+        for frame_idx in tqdm(range(frame_count)):
+
+            frame = all_frames[frame_idx]
+
+            # Perform person detection
+            results = yolo_model(frame, verbose=False)
+            detections = results[0].boxes
+
+            for i, det in enumerate(detections):
+                x1, y1, x2, y2 = det.xyxy[0]
+                cls = det.cls[0]
+                if int(cls) == 0:  # Class 0 is 'person' in COCO dataset
+
+                    x1 = max(0, int(x1) - padding)
+                    y1 = max(0, int(y1) - padding)
+                    x2 = min(frame.shape[1], int(x2) + padding)
+                    y2 = min(frame.shape[0], int(y2) + padding)
+
+                    if i not in person_videos:
+                        person_videos[i] = []
+                        person_tracks[i] = []
+
+                    person_videos[i].append(frame)
+                    person_tracks[i].append([x1,y1,x2,y2])
+
+        num_persons = 0
+        for i in person_videos.keys():
+            if len(person_videos[i]) >= frame_count//2:
+                num_persons+=1
+
+        if num_persons==0:
+            msg = "No person detected in the video! Please give a video with one person as input"
+            return None, None, None, msg
+        if num_persons>1:
+            msg = "More than one person detected in the video! Please give a video with only one person as input"
+            return None, None, None, msg
+
+        # For the person detected, crop the frame based on the bounding box
+        if len(person_videos[0]) > frame_count-10:
+            crop_filename = os.path.join(result_folder, "preprocessed_video.avi")
+            fourcc = cv2.VideoWriter_fourcc(*'DIVX')
+
+            # Get bounding box coordinates based on person_tracks[i]
+            max_x1 = min([track[0] for track in person_tracks[0]])
+            max_y1 = min([track[1] for track in person_tracks[0]])
+            max_x2 = max([track[2] for track in person_tracks[0]])
+            max_y2 = max([track[3] for track in person_tracks[0]])
+
+            max_width = max_x2 - max_x1
+            max_height = max_y2 - max_y1
+
+            out = cv2.VideoWriter(crop_filename, fourcc, fps, (max_width, max_height))
+            for frame in person_videos[0]:
+                crop = frame[max_y1:max_y2, max_x1:max_x2]
+                crop = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
+                out.write(crop)
+            out.release()
+
+            no_sound_video = crop_filename.split('.')[0] + '_nosound.mp4'
+            status = subprocess.call('ffmpeg -hide_banner -loglevel panic -y -i %s -c copy -an -strict -2 %s' % (crop_filename, no_sound_video), shell=True)
+            if status != 0:
+                msg = "Oops! Could not preprocess the video. Please check the input video and try again."
+                return None, None, None, msg
+
+            video_output = crop_filename.split('.')[0] + '.mp4'
+            status = subprocess.call('ffmpeg -hide_banner -loglevel panic -y -i %s -i %s -strict -2 -q:v 1 %s' %
+                                     (wav_file, no_sound_video, video_output), shell=True)
+            if status != 0:
+                msg = "Oops! Could not preprocess the video. Please check the input video and try again."
+                return None, None, None, msg
+
+            os.remove(crop_filename)
+            os.remove(no_sound_video)
+
+            print("Successfully saved the pre-processed video: ", video_output)
+        else:
+            msg = "Could not track the person in the full video! Please give a single-speaker video as input"
+            return None, None, None, msg
+
     else:
-        msg = "Could not track the person in the full video! Please give a single-speaker video as input"
-        return None, None, None, msg
+        video_output = path
 
     return wav_file, fps, video_output, "success"
 
@@ -649,7 +656,7 @@ class Logger:
         return False
 
 
-def process_video(video_path, num_avg_frames):
+def process_video(video_path, num_avg_frames, apply_preprocess):
    try:
        # Extract the video filename
        video_fname = os.path.basename(video_path.split(".")[0])
@@ -668,7 +675,8 @@ def process_video(video_path, num_avg_frames):
 
 
        # Preprocess the video
-       wav_file, fps, vid_path_processed, status = preprocess_video(video_path, result_folder_input)
+       print("Applying preprocessing: ", apply_preprocess)
+       wav_file, fps, vid_path_processed, status = preprocess_video(video_path, result_folder_input, apply_preprocess)
        if status != "success":
            return status, None
        print("Successfully preprocessed the video")
@@ -902,6 +910,7 @@ if __name__ == "__main__":
                    value=75,
                    label="Number of Average Frames",
                )
+               apply_preprocess = gr.Checkbox(label="Apply Preprocessing", value=False)
                video_input = gr.Video(label="Upload Video", height=400)
 
            with gr.Column():
@@ -914,12 +923,12 @@ if __name__ == "__main__":
 
        submit_button.click(
            fn=process_video,
-           inputs=[video_input, num_avg_frames],
+           inputs=[video_input, num_avg_frames, apply_preprocess],
            outputs=[result_text, output_video]
        )
 
        clear_button.click(
-           fn=lambda: (None, 75, "", None),
+           fn=lambda: (None, 75, False, "", None),
            inputs=[],
            outputs=[video_input, num_avg_frames, result_text, output_video]
        )
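
Review note: gr.Checkbox passes a Python bool to the click handler, so the new `if apply_preprocess=="True":` comparison never holds and the YOLO crop path is skipped even when the box is ticked. A minimal sketch of a more robust check follows; the `to_bool` helper is hypothetical and not part of this commit.

    # Hypothetical helper (not in the commit): accept either a bool,
    # as gr.Checkbox sends, or a "True"/"False" string.
    def to_bool(flag):
        if isinstance(flag, str):
            return flag.strip().lower() == "true"
        return bool(flag)

    # Inside preprocess_video, the branch would then read:
    #     if to_bool(apply_preprocess):
    #         ... run the YOLO crop pipeline ...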
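Review note on the ffmpeg steps: the crop is written as a silent AVI, one ffmpeg call strips any audio track (`-c copy -an`), and a second call muxes the extracted wav back in, after which the intermediates are deleted. Since `_nosound.mp4` exists only to feed the second call, the two calls could be collapsed with stream mapping. A sketch, reusing the commit's own filenames and flags (the `-map` selectors are the only addition):

    # Sketch (not the commit's code): mux cropped video and extracted
    # audio in a single pass, selecting video from input 0 and audio
    # from input 1 instead of stripping audio in a separate step.
    status = subprocess.call(
        'ffmpeg -hide_banner -loglevel panic -y -i %s -i %s -map 0:v -map 1:a -q:v 1 -strict -2 %s'
        % (crop_filename, wav_file, video_output), shell=True)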
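Review note on the Clear wiring: the updated reset lambda returns five values, `(None, 75, False, "", None)`, but `outputs` still lists four components, so the returned `False` has no target and Gradio will report a mismatch. A sketch of the consistent hookup, adding `apply_preprocess` to the outputs:

    # Sketch: five returned values, five output components.
    clear_button.click(
        fn=lambda: (None, 75, False, "", None),
        inputs=[],
        outputs=[video_input, num_avg_frames, apply_preprocess, result_text, output_video]
    )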