Upload 4 files
- config.json +2 -2
- media.py +47 -6
- media_encoder.py +2 -6
- modeling_vila.py +31 -1
config.json
CHANGED

@@ -6,7 +6,7 @@
   ],
   "chat_template": null,
   "drop_path_rate": 0.0,
-  "fps":
+  "fps": 2.0,
   "hidden_size": 3584,
   "image_aspect_ratio": "resize",
   "image_encoder": {
@@ -177,7 +177,7 @@
   "model_name_or_path": "./LongVILA-R1-7B",
   "model_type": "vila",
   "num_time_tokens": 0,
-  "num_video_frames":
+  "num_video_frames": 2048,
   "resume_path": "./LongVILA-R1-7B",
   "s2": false,
   "s2_max_split_size": 336,
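For reference, a minimal sketch of how the two new fields can be read back from the updated config; the local checkpoint path here is hypothetical:

```python
import json

# Hypothetical local path to the updated checkpoint directory.
with open("LongVILA-R1-7B/config.json") as f:
    config = json.load(f)

# New fields introduced by this change: sample at 2 fps,
# with an overall budget of up to 2048 frames per video.
print(config["fps"])               # 2.0
print(config["num_video_frames"])  # 2048
```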
media.py
CHANGED

@@ -10,11 +10,6 @@ import PIL.Image
 import requests
 from transformers import PretrainedConfig
 
-# from llava.constants import MEDIA_TOKENS
-# from llava.media import Image, Video
-# from llava.utils import make_list
-# from llava.utils.logging import logger
-
 MEDIA_TOKENS = {
     "image": "<image>",
     "video": "<vila/video>",
@@ -86,11 +81,57 @@ def _load_video(video_path: str, *, num_frames: int) -> List[PIL.Image.Image]:
         frames[index] = PIL.Image.fromarray(frame)
     return [frames[index] for index in indices if index in frames]
 
+def _load_video_with_fps(video_path: str, *, num_frames: int, fps: float) -> List[PIL.Image.Image]:
+    # Load video frames from a directory
+    if os.path.isdir(video_path):
+        frame_paths = sorted(glob.glob(os.path.join(video_path, "*")))
+        indices = np.round(np.linspace(0, len(frame_paths) - 1, min(num_frames, len(frame_paths)))).astype(int)
+        return [PIL.Image.open(frame_paths[index]) for index in indices]
+
+    # Load video frames from a video file
+    vidcap = cv2.VideoCapture(video_path)
+    if not vidcap.isOpened():
+        raise ValueError(f"Cannot open video file: {video_path}")
+
+    orig_fps = vidcap.get(cv2.CAP_PROP_FPS)
+    frame_count = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
+
+    # Estimate video duration in seconds
+    duration_sec = frame_count / orig_fps if orig_fps > 0 else 0
+
+    if duration_sec == 0:
+        raise ValueError(f"Video '{video_path}' seems to be empty or corrupted.")
+
+    # Compute total frames to sample based on desired fps
+    sampled_frame_count = int(duration_sec * fps)
+    sampled_frame_count = ((sampled_frame_count + 127) // 128) * 128
+    sampled_frame_count = min(sampled_frame_count, num_frames)
+
+    # Compute which frame indices to sample
+    indices = np.linspace(0, frame_count - 1, sampled_frame_count).astype(int)
+    frames = {}
+    for index in indices:
+        if index in frames:
+            continue
+        vidcap.set(cv2.CAP_PROP_POS_FRAMES, index)
+        success, frame = vidcap.read()
+        if not success:
+            print(f"Failed to read frame {index} from video '{video_path}'. Skipped.")
+            continue
+        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+        frames[index] = PIL.Image.fromarray(frame)
+
+    vidcap.release()
+    return [frames[index] for index in indices if index in frames]
+
 
 def _extract_video(video, config: PretrainedConfig) -> List[PIL.Image.Image]:
     num_frames = config.num_video_frames
     video_path = video.path if isinstance(video, Video) else video["path"]
-    frames = _load_video(video_path, num_frames=num_frames)
+    if getattr(config, "fps") > 0:
+        frames = _load_video_with_fps(video_path, num_frames=num_frames, fps=config.fps)
+    else:
+        frames = _load_video(video_path, num_frames=num_frames)
     return frames
 
 
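To illustrate the frame-budget arithmetic in `_load_video_with_fps` above, a small worked example with assumed numbers (a 10-minute clip, with the `fps` and `num_video_frames` values from config.json):

```python
# Hypothetical clip: 10 minutes long; fps and num_frames mirror config.json.
duration_sec = 600
fps = 2.0
num_frames = 2048

sampled_frame_count = int(duration_sec * fps)                     # 1200 frames at 2 fps
sampled_frame_count = ((sampled_frame_count + 127) // 128) * 128  # 1280, rounded up to a multiple of 128
sampled_frame_count = min(sampled_frame_count, num_frames)        # 1280, still under the 2048 cap
print(sampled_frame_count)
```

Clips of roughly 16 minutes or longer at 2 fps hit the 2048-frame cap.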
media_encoder.py
CHANGED

@@ -101,22 +101,18 @@ class BasicVideoEncoder(BaseEncoder):
         return [process_features(f) for f in features]
 
 def pool(x: torch.Tensor, size: int, dim: int) -> torch.Tensor:
-
-        return x.view(x.shape[:dim] + (-1, size) + x.shape[dim + 1 :]).mean(dim + 1)
-    else:
-        return x.narrow(dim, start=0, length=1)
+    return x.view(x.shape[:dim] + (-1, size) + x.shape[dim + 1 :]).mean(dim + 1)
 
 class TSPVideoEncoder(BasicVideoEncoder):
     def __init__(
         self,
         parent: torch.nn.Module,
-        #pool_sizes: List[Tuple[int, int, int]],
         start_tokens: Optional[str] = None,
         end_tokens: Optional[str] = "\n",
         sep_tokens: Optional[str] = None,
     ) -> None:
         super().__init__(parent, start_tokens=start_tokens, end_tokens=end_tokens)
-        self.pool_sizes = [[8, 1, 1]]
+        self.pool_sizes = [[8, 1, 1]]
         self.sep_tokens = sep_tokens
 
     def _process_features(
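The simplified `pool` above mean-pools groups of `size` consecutive entries along `dim` (and now requires that dimension to be divisible by `size`). A minimal sketch with made-up toy shapes, assuming the default `pool_sizes = [[8, 1, 1]]`:

```python
import torch

def pool(x: torch.Tensor, size: int, dim: int) -> torch.Tensor:
    # Same one-liner as in the diff: group `size` consecutive entries
    # along `dim` and average them.
    return x.view(x.shape[:dim] + (-1, size) + x.shape[dim + 1 :]).mean(dim + 1)

# Toy features: 16 frames, 4 tokens per frame, hidden size 8 (hypothetical shapes).
x = torch.randn(16, 4, 8)
y = pool(x, size=8, dim=0)  # pool_sizes[0][0] == 8 averages every 8 frames into 1
print(y.shape)              # torch.Size([2, 4, 8])
```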
modeling_vila.py
CHANGED

@@ -725,7 +725,37 @@ class VILAForCausalLM(VILAPretrainedModel):
                     dummy = torch.zeros(infos[0]["shape"], dtype=infos[0]["dtype"], device=self.device)
                     embeds["dummy"].extend(self.encoders[name]([dummy], media_config[name]))
                     continue
-            embeds[name] = deque(self.encoders[name](media[name], media_config[name]))
+            def round_up_to_bucket(x: int) -> int:
+                bucket = 1
+                total = 8
+                while bucket < total:
+                    if x <= bucket:
+                        return bucket
+                    bucket *= 2
+                return total
+
+            if "video" in name:
+                num_video_frames = max([video.shape[0] for video in media[name]])
+                if isinstance(self.encoders[name], TSPVideoEncoder):
+                    self.encoders[name].pool_sizes[0][0] = 4 * round_up_to_bucket(num_video_frames / 256)
+
+                if num_video_frames > 512:
+                    media_split = []
+                    frames_split = 4
+                    for video in media[name]:
+                        media_split += video.tensor_split(frames_split, dim=0)
+                    embeds_split = []
+                    for video in media_split:
+                        embeds_split += self.encoders[name]([video], media_config[name])
+                    embeds_merged = [
+                        torch.cat(embeds_split[i * frames_split: (i + 1) * frames_split], dim=0)
+                        for i in range(len(media[name]))
+                    ]
+                    embeds[name] = deque(embeds_merged)
+                else:
+                    embeds[name] = deque(self.encoders[name](media[name], media_config[name]))
+            else:
+                embeds[name] = deque(self.encoders[name](media[name], media_config[name]))
         return embeds
 
     def __truncate_sequence(
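As a quick sanity check of the new bucketing logic (this mirrors the diff above, it is not additional code in the model): `round_up_to_bucket` rounds `num_video_frames / 256` up to the next power of two, capped at 8, so the first temporal pool size grows with the number of frames; videos longer than 512 frames are additionally split into 4 chunks, encoded separately, and concatenated.

```python
def round_up_to_bucket(x: int) -> int:
    # Same helper as in the diff: round up to the next power of two, capped at 8.
    bucket = 1
    total = 8
    while bucket < total:
        if x <= bucket:
            return bucket
        bucket *= 2
    return total

# Resulting temporal pool size (pool_sizes[0][0] = 4 * bucket) for a few frame counts:
for num_video_frames in (256, 512, 1024, 2048):
    print(num_video_frames, 4 * round_up_to_bucket(num_video_frames / 256))
# 256 -> 4, 512 -> 8, 1024 -> 16, 2048 -> 32
```

With the 2048-frame budget from config.json, the TSP encoder therefore pools every 32 frames into one temporal group.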