# HuggingFace Spaces: Running on Zero
| import os | |
| import torch | |
| import numpy as np | |
| import decord | |
| import torch.nn as nn | |
| import json | |
| import cv2 | |
| from kpe_mediapipe import video_holistic | |
| from crop_hands import HandExtractor | |
| from crop_face import FaceExtractor | |
| from dinov2_features import extract_embeddings_from_frames | |
| from body_features import process_pose_landmarks | |
| # from shubert import SignHubertModel, SignHubertConfig | |
| from inference import test | |
| import subprocess | |
class SHuBERTProcessor:
    """End-to-end sign-language-translation inference pipeline.

    Given a video file, the pipeline:
      1. decodes frames with decord,
      2. extracts MediaPipe holistic landmarks (face + hands),
      3. crops hand and face regions and embeds each stream with DINOv2,
      4. converts pose landmarks to body features,
      5. feeds the four feature streams to the SLT model to produce
         English text.
    """

    def __init__(self, config):
        """Store the configuration dict of model/checkpoint/tokenizer paths.

        Args:
            config: mapping of path keys (e.g. ``'dino_face_model_path'``,
                ``'slt_model_checkpoint'``) to filesystem locations.
        """
        self.config = config

    def process_video(self, video_path, stride=1):
        """Translate the sign-language video at ``video_path`` into English.

        Args:
            video_path: path to a video file readable by decord.
            stride: keep every ``stride``-th frame. Default 1 (all frames)
                preserves the original behavior; callers with high-fps input
                may pass a larger value to downsample.

        Returns:
            The English translation string produced by ``test``.
        """
        # Step 1: decode the selected frames into a (T, H, W, C) numpy array.
        # NOTE(review): frame-rate normalization (the removed ffmpeg re-encode
        # to ~15 fps) is assumed to happen upstream -- confirm for raw inputs.
        reader = decord.VideoReader(video_path)
        frame_indices = list(range(0, len(reader), stride))
        signer_video = reader.get_batch(frame_indices).asnumpy()

        # Step 2: per-frame face/hand landmarks via MediaPipe holistic.
        landmarks = video_holistic(
            video_input=signer_video,
            face_model_path=self.config['mediapipe_face_model_path'],
            hand_model_path=self.config['mediapipe_hands_model_path'],
        )

        # Step 3: per-stream DINOv2 embeddings. Intermediate frame crops are
        # deleted eagerly to keep peak memory down on long clips.
        hand_extractor = HandExtractor()
        left_hand_frames, right_hand_frames = hand_extractor.extract_hand_frames(signer_video, landmarks)
        left_hand_embeddings = extract_embeddings_from_frames(left_hand_frames, self.config['dino_hands_model_path'])
        right_hand_embeddings = extract_embeddings_from_frames(right_hand_frames, self.config['dino_hands_model_path'])
        del left_hand_frames, right_hand_frames

        face_extractor = FaceExtractor()
        face_frames = face_extractor.extract_face_frames(signer_video, landmarks)
        face_embeddings = extract_embeddings_from_frames(face_frames, self.config['dino_face_model_path'])
        del face_frames, signer_video

        pose_embeddings = process_pose_landmarks(landmarks)
        del landmarks

        # Step 4: run the sign-language-translation model on the four streams.
        output_text = test(face_embeddings,
                           left_hand_embeddings,
                           right_hand_embeddings,
                           pose_embeddings,
                           self.config['slt_model_config'],
                           self.config['slt_model_checkpoint'],
                           self.config['slt_tokenizer_checkpoint'],
                           self.config['temp_dir'])
        return output_text
if __name__ == "__main__":
    # Filesystem locations of every model artifact the pipeline needs.
    config = {
        'yolov8_model_path': '/share/data/pals/shester/inference/models/yolov8n.pt',
        'dino_face_model_path': '/share/data/pals/shester/inference/models/dinov2face.pth',
        'dino_hands_model_path': '/share/data/pals/shester/inference/models/dinov2hand.pth',
        'mediapipe_face_model_path': '/share/data/pals/shester/inference/models/face_landmarker_v2_with_blendshapes.task',
        'mediapipe_hands_model_path': '/share/data/pals/shester/inference/models/hand_landmarker.task',
        'shubert_model_path': '/share/data/pals/shester/inference/models/checkpoint_836_400000.pt',
        'temp_dir': '/share/data/pals/shester/inference',
        'slt_model_config': '/share/data/pals/shester/inference/models/byt5_base/config.json',
        'slt_model_checkpoint': '/share/data/pals/shester/inference/models/checkpoint-11625',
        'slt_tokenizer_checkpoint': '/share/data/pals/shester/inference/models/byt5_base',
    }

    # Clip to translate.
    input_clip = "/share/data/pals/shester/inference/recordings/sample_sabrina.mp4"

    # Build the pipeline and print the resulting English translation.
    translator = SHuBERTProcessor(config)
    translation = translator.process_video(input_clip)
    print(f"The English translation is: {translation}")
| # /home-nfs/shesterg/.cache/torch/hub/facebookresearch_dinov2_main/dinov2/layers/attention.py | |
| # /home-nfs/shesterg/.cache/torch/hub/facebookresearch_dinov2_main/dinov2/layers/block.py |