Commit: c6e1104
Parent(s): 8eab689
Update app
app.py CHANGED

@@ -647,6 +647,7 @@ def process_video(video_path, num_avg_frames):
     wav_file, fps, vid_path_processed, status = preprocess_video(video_path, result_folder_input)
     if status != "success":
         return status, None
+    print("Successfully preprocessed the video")
 
     # Resample the video to 25 fps if it is not already 25 fps
     print("FPS of video: ", fps)
@@ -666,7 +667,7 @@ def process_video(video_path, num_avg_frames):
     frames, status = load_video_frames(vid_path)
     if status != "success":
         return status, None
-
+    print("Successfully extracted the video frames")
 
     if len(frames) < num_avg_frames:
         return "Error: The input video is too short. Please use a longer input video.", None
@@ -675,6 +676,7 @@ def process_video(video_path, num_avg_frames):
     kp_dict, status = get_keypoints(frames)
     if status != "success":
         return status, None
+    print("Successfully extracted the keypoints")
 
     status = check_visible_gestures(kp_dict)
     if status != "success":
@@ -689,12 +691,14 @@ def process_video(video_path, num_avg_frames):
     rgb_frames = np.transpose(rgb_frames, (4, 0, 1, 2, 3))
     rgb_frames = torch.FloatTensor(np.array(rgb_frames)).unsqueeze(0)
     B = rgb_frames.size(0)
+    print("Successfully converted the frames to tensor")
 
     # Load spectrograms
     spec, orig_spec, status = load_spectrograms(wav_file, num_frames, window_frames=25)
     if status != "success":
         return status, None
     spec = torch.FloatTensor(spec).unsqueeze(0).unsqueeze(0).permute(0, 1, 2, 4, 3)
+    print("Successfully loaded the spectrograms")
 
     # Create input windows
     video_sequences = torch.cat([rgb_frames[:, :, i] for i in range(rgb_frames.size(2))], dim=0)
@@ -703,6 +707,7 @@ def process_video(video_path, num_avg_frames):
     # Load the trained model
     model = Transformer_RGB()
     model = load_checkpoint(CHECKPOINT_PATH, model)
+    print("Successfully loaded the model")
 
     # Process in batches
     batch_size = 12
@@ -737,6 +742,7 @@ def process_video(video_path, num_avg_frames):
     video_emb = torch.split(video_emb, B, dim=0)
     video_emb = torch.stack(video_emb, dim=2)
     video_emb = video_emb.squeeze(3)
+    print("Successfully extracted GestSync embeddings")
 
     # Calculate sync offset
     pred_offset, status = calc_optimal_av_offset(video_emb, audio_emb, num_avg_frames, model)
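For reference, every addition in this commit follows the same pattern inside process_video: once a stage's status check passes, a progress message is printed. A minimal, self-contained sketch of that pattern is shown below; the run_stage helper and fake_preprocess function are illustrative stand-ins and are not part of app.py.

def run_stage(name, stage_fn, *args):
    # Mirror the status-check pattern used in process_video: run the stage,
    # bail out on failure, and print a progress message on success.
    result, status = stage_fn(*args)
    if status != "success":
        return None, status
    print(f"Successfully {name}")
    return result, status


# Hypothetical stand-in for a real stage such as preprocess_video.
def fake_preprocess(video_path):
    return {"wav_file": video_path + ".wav"}, "success"


if __name__ == "__main__":
    out, status = run_stage("preprocessed the video", fake_preprocess, "input.mp4")
    print(status, out)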
|