| # Adopted from https://github.com/haotian-liu/LLaVA. Below is the original copyright: | |
| # Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright: | |
| # Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright: | |
| # Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| import math | |
| import copy | |
| import json | |
| import os | |
| import pathlib | |
| import random | |
| import re | |
| import sys | |
| import warnings | |
| import traceback | |
| from packaging import version | |
| from dataclasses import dataclass, field | |
| from typing import Dict, List, Optional, Sequence | |
| import numpy as np | |
| # torch-related packages | |
| # NOTE: torch must be imported before transformers. Otherwise, `Segmentation fault (core dumped)` will occur. | |
| import torch | |
| import transformers | |
| from datasets import load_dataset, concatenate_datasets | |
| from torch.utils.data import Dataset | |
| from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock | |
| sys.path.append('./') | |
| from videollama3.constants import (IGNORE_INDEX, MODAL_INDEX_MAP, | |
| NUM_FRAMES, DEFAULT_IMAGE_TOKEN, STREAM_MAX_FRAMES, | |
| STREAM_DOWNSAMPLING, STREAM_FPS, STREAM_IMAGE_SIZE, | |
| STREAM_START_TOKEN, STREAM_END_TOKEN, REGION_TOKEN) | |
| from videollama3.mm_utils import (load_images, load_video, | |
| tokenizer_multimodal_token, annToMask, resize_image_mask) | |
| from videollama3.model import * | |
| from videollama3.videollama3_trainer import ( | |
| VideoLLaMA3Trainer, find_all_linear_names, get_peft_state_maybe_zero_3, | |
| get_peft_state_non_lora_maybe_zero_3, safe_save_model_for_hf_trainer) | |
| from videollama3.model.processor import Videollama3Processor | |
| # NOTE: fast tokenizer warning issue: https://github.com/huggingface/transformers/issues/5486 | |
| os.environ["TOKENIZERS_PARALLELISM"] = "true" | |
| local_rank = None | |
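| # `local_rank` is filled in by train() from TrainingArguments, so rank0_print below only logs on the main process. | |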
| def rank0_print(*args): | |
| if local_rank == 0: | |
| print(*args) | |
| def set_seed(seed=42): | |
| """ | |
| Set the random seed for reproducible results. | |
| :param seed: An integer value to be used as the random seed. | |
| """ | |
| torch.manual_seed(seed) | |
| torch.cuda.manual_seed(seed) | |
| torch.cuda.manual_seed_all(seed) # for multi-GPU setups | |
| torch.backends.cudnn.deterministic = True | |
| torch.backends.cudnn.benchmark = False | |
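| # CLI helper for HfArgumentParser: converts the literal string 'None' to Python None (used as the converter for DataArguments.max_frames below). | |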
| def int_with_none(value): | |
| if value == 'None': | |
| return None | |
| return int(value) | |
| @dataclass | |
| class ModelArguments: | |
| # LLM Arguments | |
| model_type: Optional[str] = field(default="videollama3", metadata={"help": "Model type selected in the list: " + ", ".join(VLLMs.keys())}) | |
| model_path: Optional[str] = field(default="lmsys/vicuna-7b-v1.5") | |
| version: Optional[str] = field(default="v1", metadata={"help": "Version of the conversation template."}) | |
| freeze_backbone: bool = field(default=False, metadata={"help": "Whether to freeze the LLM backbone."}) | |
| # Connector Arguments | |
| mm_projector_type: Optional[str] = field(default='linear') | |
| pretrain_mm_projector: Optional[str] = field(default=None) | |
| # Vision tower Arguments | |
| vision_encoder: Optional[str] = field(default=None) | |
| mm_vision_select_layer: Optional[int] = field(default=-1) | |
| mm_vision_select_feature: Optional[str] = field(default="patch") | |
| mm_attn_implementation: Optional[str] = field(default="flash_attention_2") | |
| # Token downsampling Arguments | |
| spatial_merge_size: Optional[int] = field(default=1) | |
| mm_max_length: Optional[int] = field(default=9477) | |
| use_token_compression: Optional[bool] = field(default=False) | |
| @dataclass | |
| class DataArguments: | |
| # Path Arguments | |
| data_path: List[str] = field(default=None, metadata={"help": "Path to the training data."}) | |
| # image_folder: Optional[str] = field(default=None) | |
| # video_folder: Optional[str] = field(default=None) | |
| data_folder: Optional[str] = field(default=None) | |
| # Loading Arguments | |
| is_multimodal: bool = False | |
| fps: Optional[int] = field(default=None) | |
| max_frames: Optional[int_with_none] = field(default=None) | |
| # Preprocess Arguments | |
| image_aspect_ratio: str = 'square' | |
| use_batch_flattening: bool = field(default=True, metadata={"help": "Whether to flatten the in-batch sequences of variable lengths."}) | |
| dataset_cache_dir: Optional[str] = field(default=None) | |
| @dataclass | |
| class TrainingArguments(transformers.TrainingArguments): | |
| # shut auto processing (_remove_unused_columns) of transformers Trainer | |
| remove_unused_columns: bool = field(default=False) | |
| optim: str = field(default="adamw_torch") | |
| # Training learning rate Arguments | |
| vision_encoder_lr: Optional[float] = None | |
| mm_projector_lr: Optional[float] = None | |
| llm_lr: Optional[float] = None | |
| region_encoder_lr: Optional[float] = None | |
| # Training Data Arguments | |
| group_by_modality_length: bool = field(default=False) | |
| model_max_length: int = field( | |
| default=512, | |
| metadata={ | |
| "help": | |
| "Maximum sequence length. Sequences will be right padded (and possibly truncated)." | |
| }, | |
| ) | |
| # Lora or Quant Arguments | |
| double_quant: bool = field( | |
| default=True, | |
| metadata={"help": "Compress the quantization statistics through double quantization."} | |
| ) | |
| quant_type: str = field( | |
| default="nf4", | |
| metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."} | |
| ) | |
| bits: int = field( | |
| default=16, | |
| metadata={"help": "How many bits to use."} | |
| ) | |
| lora_enable: bool = False | |
| lora_r: int = 64 | |
| lora_alpha: int = 16 | |
| lora_dropout: float = 0.05 | |
| lora_weight_path: str = "" | |
| lora_bias: str = "none" | |
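| # LazySupervisedDataset keeps only the annotation list in memory and defers all image/video decoding to __getitem__. | |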
| class LazySupervisedDataset(Dataset): | |
| """Dataset for supervised fine-tuning.""" | |
| def __init__(self, data_path: List[str], vlprocessor, data_args: DataArguments): | |
| super(LazySupervisedDataset, self).__init__() | |
| data_objs = [] | |
| # try: | |
| # for data in data_path: | |
| # # NOTE: load_dataset can process both json or jsonl files | |
| # if data.endswith(".json") or data.endswith(".jsonl"): | |
| # data_objs.append(load_dataset("json", data_files=data, cache_dir=data_args.dataset_cache_dir)["train"]) | |
| # else: | |
| # raise Exception(f"Unsupported file format (<{data}>)!") | |
| # list_data_dict = concatenate_datasets(data_objs) | |
| # except: | |
| # traceback.print_exc() | |
| # NOTE: compatible with the old version | |
| list_data_dict = [] | |
| for data in data_path: | |
| if data.endswith(".json"): | |
| data = json.load(open(data, "r")) | |
| for i in data: | |
| i['id'] = len(list_data_dict) | |
| list_data_dict.append(i) | |
| elif data.endswith(".jsonl"): | |
| with open(data, "r", encoding="utf-8") as fp: | |
| for line in fp: | |
| line = line.strip() | |
| obj = json.loads(line) | |
| obj["id"] = len(list_data_dict) | |
| list_data_dict.append(obj) | |
| else: | |
| raise Exception(f"Unsupported file format (<{data}>)!!!") | |
| rank0_print("Formatting inputs...Skip in lazy mode") | |
| self.vlprocessor = vlprocessor | |
| self.list_data_dict = list_data_dict | |
| self.data_args = data_args | |
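| # Illustrative record layout (field names taken from the loaders below; paths and values are placeholders): | |
| # { | |
| #   "video": ["clips/example.mp4"],          # or "image": "images/example.jpg" | |
| #   "masks": [...],                          # optional region annotations (frame-indexed dicts for video) | |
| #   "height": 480, "width": 640,             # optional, used when decoding masks | |
| #   "conversations": [ | |
| #     {"from": "human", "value": "<video>\nDescribe the marked <region>."}, | |
| #     {"from": "gpt", "value": "..."} | |
| #   ] | |
| # } | |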
| def __len__(self): | |
| return len(self.list_data_dict) | |
| @property | |
| def lengths(self): | |
| length_list = [] | |
| for sample in self.list_data_dict: | |
| img_tokens = 576 if 'image' in sample else 0 | |
| length_list.append(sum(len(conv['value'].split()) for conv in sample['conversations']) + img_tokens) | |
| return length_list | |
| @property | |
| def modality_lengths(self): | |
| length_list = [] | |
| for sample in self.list_data_dict: | |
| cur_len = sum(len(conv['value'].split()) for conv in sample['conversations']) | |
| cur_len = cur_len if 'image' in sample else -cur_len | |
| length_list.append(cur_len) | |
| return length_list | |
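| # NOTE: the lengths / modality_lengths properties above are intended for the Trainer's length-grouped sampler when group_by_modality_length is set; the sign encodes whether a sample is multimodal. | |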
| def _convert_normal(self, data_dict): | |
| data_folder = self.data_args.data_folder | |
| conversation = copy.deepcopy(data_dict["conversations"]) | |
| # data sanity check and repair | |
| start_idx = 0 | |
| for sentence in conversation: | |
| if sentence["from"] == "human" or sentence["from"] == "system": | |
| break | |
| start_idx += 1 | |
| if start_idx > 0: | |
| warnings.warn(f"Find {start_idx} non-user sentences at the beginning of the conversation, remove them automatically!") | |
| conversation = conversation[start_idx:] | |
| assert len(conversation) > 1, "Invalid conversation: fewer than two turns remain after cleanup" | |
| additional_frames = [] | |
| mask_ids = [] | |
| if 'image' in data_dict and data_dict['image'] is not None: | |
| modal = 'image' | |
| if all(not "<image>" in sentence["value"] for sentence in conversation): | |
| warnings.warn(f"Image tag not found in the conversation, add it automatically at the beginning!") | |
| conversation[0]["value"] = "<image>" + conversation[0]["value"] | |
| image_file = data_dict['image'] | |
| if isinstance(image_file, list): | |
| image_file = [os.path.join(data_folder, f) for f in image_file] | |
| else: | |
| image_file = os.path.join(data_folder, image_file) | |
| images = load_images(image_file) | |
| masks = [] | |
| if 'masks' in data_dict and data_dict['masks'] is not None and len(data_dict['masks'])>0: | |
| if 'height' in data_dict: | |
| h = data_dict['height'] | |
| w = data_dict['width'] | |
| else: | |
| h = None | |
| w = None | |
| for ann in data_dict['masks']: | |
| mask = annToMask(ann, h, w) | |
| masks.append(mask) | |
| mask_ids.append(0) | |
| masks = np.stack(masks, axis=0) | |
| masks = torch.from_numpy(masks) | |
| additional_frames = images.copy() | |
| else: | |
| masks = None | |
| elif 'video' in data_dict and data_dict['video'] is not None: | |
| modal = 'video' | |
| if all(not "<video>" in sentence["value"] for sentence in conversation): | |
| warnings.warn(f"Video tag not found in the conversation, add it automatically at the beginning!") | |
| conversation[0]["value"] = "<video>" + conversation[0]["value"] | |
| video_file = data_dict['video'] | |
| masks = [] | |
| frame_ids = [] | |
| if 'masks' in data_dict and data_dict['masks'] is not None: | |
| if 'height' in data_dict: | |
| h = data_dict['height'] | |
| w = data_dict['width'] | |
| else: | |
| h = None | |
| w = None | |
| for ann in data_dict['masks']: | |
| for k in ann.keys(): | |
| if int(k) not in frame_ids: | |
| frame_ids.append(int(k)) | |
| mask_ids.append(frame_ids.index(int(k))) | |
| mask = annToMask(ann[k], h, w) | |
| masks.append(mask) | |
| masks = np.stack(masks, axis=0) | |
| masks = torch.from_numpy(masks) | |
| else: | |
| masks = None | |
| if isinstance(video_file, list) and len(video_file) == 1: | |
| video_file = os.path.join(data_folder, video_file[0]) | |
| images, timestamps, additional_frames = load_video(video_file, fps=self.data_args.fps, max_frames=self.data_args.max_frames, frame_ids=frame_ids) | |
| elif isinstance(video_file, list) and len(video_file)>1: #images | |
| images = [] | |
| for vf in video_file: | |
| images+=load_images(os.path.join(data_folder, vf)) | |
| timestamps = data_dict['timestamps'] | |
| additional_frames = [] | |
| for mv in data_dict['masked_video']: | |
| additional_frames+=load_images(os.path.join(data_folder, mv)) | |
| else: | |
| raise ValueError(f"Unsupported video format: {video_file}") | |
| else: | |
| modal = 'text' | |
| images = [] | |
| masks = None | |
| if masks is not None and len(masks)>0: | |
| additional_frames, masks, mask_nums = resize_image_mask(additional_frames, masks, mask_ids) | |
| conv_i = 0 | |
| for idx in range(len(mask_nums)): | |
| while '<region>' not in conversation[conv_i]['value']: | |
| conv_i+=1 | |
| conversation[conv_i]['value'] = conversation[conv_i]['value'].replace('<region>', "["+REGION_TOKEN*mask_nums[idx]+"]", 1) | |
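| # Convert the cleaned conversation into processor messages: each user turn is split on the modal tag, so the text pieces become "text" chunks and the gaps between them become image/video placeholders. | |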
| messages = [] | |
| for conv in conversation: | |
| if conv["from"] == "human": | |
| # replace video tag to image tag for unified processing | |
| # conv["value"] = conv["value"].replace("<video>", "<image>" * len(images)) | |
| chunks = conv["value"].split("<image>" if modal == 'image' else "<video>") | |
| messages.append({ | |
| "role": "user", | |
| "content": [] | |
| }) | |
| for chunk_idx in range(1, 2 * len(chunks)): | |
| if chunk_idx % 2 == 1: | |
| chunk = chunks[chunk_idx // 2].strip() | |
| if chunk: | |
| messages[-1]["content"].append({"type": "text", "text": chunk}) | |
| else: | |
| if modal == 'image': | |
| messages[-1]["content"].append({"type": "image"}) | |
| elif modal == 'video': | |
| messages[-1]["content"].append({"type": "video", "num_frames": len(images), "time": timestamps}) | |
| else: | |
| messages.append({ | |
| "role": "assistant", | |
| "content": conv['value'] | |
| }) | |
| # TODO: dynamic downsampling | |
| # image_downsampling = self.data_args.spatial_merge_size | |
| image_downsampling = self.data_args.spatial_merge_size if modal == "video" else 1 | |
| # if modal == 'video': | |
| # image_downsampling = 2 | |
| # else: | |
| # # image/text | |
| # image_downsampling = 1 | |
| return modal, images, messages, image_downsampling, masks, additional_frames | |
| def _convert_stream(self, data_dict): | |
| video_path = os.path.join(self.data_args.data_folder, data_dict['video'][0]) | |
| frames, timestamps = load_video( | |
| video_path=video_path, | |
| start_time=data_dict["start_time"], | |
| end_time=data_dict["end_time"], | |
| fps=self.data_args.fps, | |
| max_frames=self.data_args.max_frames, | |
| size=STREAM_IMAGE_SIZE, | |
| # size_divisible=14 * STREAM_DOWNSAMPLING, | |
| ) | |
| if len(frames) > STREAM_MAX_FRAMES: | |
| max_time = timestamps[STREAM_MAX_FRAMES] | |
| frames = frames[:STREAM_MAX_FRAMES] | |
| timestamps = timestamps[:STREAM_MAX_FRAMES] | |
| else: | |
| max_time = float("inf") | |
| messages = [] | |
| frame_idx = 0 | |
| conversation = copy.deepcopy(data_dict["conversation"]) | |
| for message in conversation: | |
| if message["time"] >= max_time: | |
| break | |
| while frame_idx < len(timestamps) and timestamps[frame_idx] <= message["time"]: | |
| messages.append({ | |
| "role": "stream", | |
| "content": [{"type": "image", "time": timestamps[frame_idx] - data_dict["start_time"]}], | |
| }) | |
| frame_idx += 1 | |
| messages.append(message) | |
| frames = frames[:frame_idx] | |
| # return "video", frames, messages, STREAM_DOWNSAMPLING | |
| return "video", frames, messages, self.data_args.spatial_merge_size | |
| def __getitem__(self, i) -> Dict[str, torch.Tensor]: | |
| data_dict = self.list_data_dict[i] | |
| try: | |
| if "stream" in data_dict and data_dict["stream"]: | |
| modal, images, messages, image_downsampling = self._convert_stream(data_dict) | |
| masks = None | |
| additional_frames = [] | |
| else: | |
| modal, images, messages, image_downsampling, masks, additional_frames = self._convert_normal(data_dict) | |
| data_dict = self.vlprocessor( | |
| images=images, | |
| text=messages, | |
| image_downsampling=image_downsampling, | |
| return_labels=True, | |
| return_tensors="pt", | |
| ) | |
| if len(additional_frames)>0: | |
| additional_images_dict = self.vlprocessor._process_image(additional_frames, num_images=1, image_downsampling=1) | |
| additional_images = additional_images_dict['images'] | |
| additional_images_thws = additional_images_dict['grid_thws'] | |
| else: | |
| additional_images = [] | |
| additional_images_thws = [] | |
| if modal == 'text': | |
| unit_size = self.vlprocessor.image_processor.patch_size**2 * 3 * self.vlprocessor.image_processor.temporal_patch_size | |
| data_dict['images'] = [torch.zeros(self.data_args.spatial_merge_size**2, unit_size)] | |
| data_dict['grid_thws'] = [torch.tensor([[1, self.data_args.spatial_merge_size, self.data_args.spatial_merge_size]])] | |
| elif modal == 'image' or modal == 'video': | |
| assert len(data_dict['images']) > 0 and len(data_dict['grid_thws']) > 0, f"Invalid image data: {data_dict['images']}, {data_dict['grid_thws']}" | |
| data_dict['modal'] = modal | |
| data_dict['masks'] = masks | |
| data_dict['additional_images'] = additional_images | |
| data_dict['additional_images_thws'] = additional_images_thws | |
| except Exception as e: | |
| traceback.print_exc() | |
| backup_idx = random.randint(0, len(self.list_data_dict) - 1) | |
| print(f"Encounted error when process {i}-th example: {data_dict}, use {backup_idx}-th example instead!!!") | |
| return self.__getitem__(backup_idx) | |
| return data_dict | |
| @dataclass | |
| class DataCollatorForSupervisedDataset(object): | |
| """Collate examples for supervised fine-tuning.""" | |
| vlprocessor: transformers.ProcessorMixin | |
| def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: | |
| input_ids, labels = tuple([instance[key] for instance in instances] | |
| for key in ("input_ids", "labels")) | |
| input_ids = torch.nn.utils.rnn.pad_sequence( | |
| input_ids, | |
| batch_first=True, | |
| padding_value=self.vlprocessor.tokenizer.pad_token_id) | |
| labels = torch.nn.utils.rnn.pad_sequence(labels, | |
| batch_first=True, | |
| padding_value=IGNORE_INDEX) | |
| input_ids = input_ids[:, :self.vlprocessor.tokenizer.model_max_length] | |
| labels = labels[:, :self.vlprocessor.tokenizer.model_max_length] | |
| batch = dict( | |
| input_ids=input_ids, | |
| labels=labels, | |
| attention_mask=input_ids.ne(self.vlprocessor.tokenizer.pad_token_id), | |
| ) | |
| # work for 'images' argument in `prepare_inputs_labels_for_multimodal` | |
| batch['images'] = [] | |
| batch['additional_images'] = [] | |
| batch["masks"] = [] | |
| mask_idx_start = 0 | |
| for instance in instances: | |
| # for modal_token in MODAL_INDEX_MAP.keys(): | |
| # modal_token = modal_token.lower() | |
| # # MODAL_TOKEN shape like: <image>, <video>, ... | |
| # modal_name = re.findall(f'[<](.*)[>]', modal_token) | |
| # assert len(modal_name) == 1 | |
| # modal_name = modal_name[0] | |
| batch['images'].append((instance['modal'], instance['images'], instance['grid_thws'])) | |
| if len(instance['additional_images'])>0: | |
| batch['additional_images'].append((instance['additional_images'], instance['additional_images_thws'])) | |
| if instance["masks"] is not None: | |
| batch["masks"].append(instance["masks"]) | |
| mask_idx_start+=len(instance['additional_images']) | |
| return batch | |
| def make_supervised_data_module(vlprocessor, data_args) -> Dict: | |
| """Make dataset and collator for supervised fine-tuning.""" | |
| train_dataset = LazySupervisedDataset( | |
| vlprocessor=vlprocessor, | |
| data_path=data_args.data_path, | |
| data_args=data_args | |
| ) | |
| data_collator = DataCollatorForSupervisedDataset(vlprocessor=vlprocessor) | |
| return dict(train_dataset=train_dataset, | |
| eval_dataset=None, | |
| data_collator=data_collator) | |
| @dataclass | |
| class DataCollatorWithFlatteningForSupervisedDataset(object): | |
| """Collate examples for batch flattened supervised fine-tuning.""" | |
| vlprocessor: transformers.ProcessorMixin | |
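| # Instead of padding, this collator concatenates all sequences into a single row and restarts position_ids per sample; the first label of each sequence is set to the separator so the loss never crosses sample boundaries (requires flash_attention_2, checked in train()). | |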
| def __call__(self, instances: Sequence[Dict], separator_id=-100) -> Dict[str, torch.Tensor]: | |
| input_ids, labels = tuple([instance[key] for instance in instances] | |
| for key in ("input_ids", "labels")) | |
| new_input_ids = [] | |
| new_labels = [] | |
| position_ids = [] | |
| for idx in range(0, len(input_ids)): | |
| new_input_ids.append(input_ids[idx][:self.vlprocessor.tokenizer.model_max_length]) | |
| temp_label = labels[idx][:self.vlprocessor.tokenizer.model_max_length] | |
| temp_label[0] = separator_id | |
| new_labels.append(temp_label) | |
| position_ids.append(torch.tensor(list(range(len(input_ids[idx][:self.vlprocessor.tokenizer.model_max_length]))))) | |
| new_input_ids = torch.cat(new_input_ids) | |
| new_labels = torch.cat(new_labels) | |
| position_ids = torch.cat(position_ids) | |
| batch = dict( | |
| input_ids=new_input_ids.unsqueeze(0), | |
| labels=new_labels.unsqueeze(0), | |
| position_ids=position_ids.unsqueeze(0), | |
| ) | |
| # work for 'images' argument in `prepare_inputs_labels_for_multimodal` | |
| batch['images'] = [] | |
| batch['additional_images'] = [] | |
| # mask_idx_start = 0 | |
| for instance in instances: | |
| batch['images'].append((instance['modal'], instance['images'], instance['grid_thws'])) | |
| if len(instance['additional_images'])>0: | |
| batch['additional_images'].append((instance['additional_images'], instance['additional_images_thws'])) | |
| # mask_idx_start+=len(instance['additional_images']) | |
| batch["masks"] = [x["masks"] for x in instances] | |
| return batch | |
| def make_flattening_supervised_data_module(vlprocessor: transformers.ProcessorMixin, data_args) -> Dict: | |
| """Make batch flattened dataset and collator for supervised fine-tuning.""" | |
| train_dataset = LazySupervisedDataset( | |
| vlprocessor=vlprocessor, | |
| data_path=data_args.data_path, | |
| data_args=data_args | |
| ) | |
| data_collator = DataCollatorWithFlatteningForSupervisedDataset(vlprocessor=vlprocessor) | |
| return dict(train_dataset=train_dataset, | |
| eval_dataset=None, | |
| data_collator=data_collator) | |
| def train(attn_implementation=None): | |
| global local_rank | |
| set_seed(42) | |
| parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments)) | |
| model_args, data_args, training_args = parser.parse_args_into_dataclasses() | |
| local_rank = training_args.local_rank | |
| if local_rank == 0: | |
| print('------model args------') | |
| print(model_args) | |
| print('------data args------') | |
| print(data_args) | |
| print('------training args------') | |
| print(training_args) | |
| compute_dtype = (torch.float16 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32)) | |
| bnb_model_from_pretrained_args = {} | |
| if training_args.bits in [4, 8]: | |
| from transformers import BitsAndBytesConfig | |
| bnb_model_from_pretrained_args.update(dict( | |
| # device_map={"": training_args.device}, | |
| # BUG: High version transformers report error: | |
| # ValueError: You can't pass `load_in_4bit`or `load_in_8bit` as a kwarg when passing `quantization_config` argument at the same time | |
| # load_in_4bit=training_args.bits == 4, | |
| # load_in_8bit=training_args.bits == 8, | |
| quantization_config=BitsAndBytesConfig( | |
| load_in_4bit=training_args.bits == 4, | |
| load_in_8bit=training_args.bits == 8, | |
| llm_int8_skip_modules=["mm_projector"], | |
| llm_int8_threshold=6.0, | |
| llm_int8_has_fp16_weight=False, | |
| bnb_4bit_compute_dtype=compute_dtype, | |
| bnb_4bit_use_double_quant=training_args.double_quant, | |
| bnb_4bit_quant_type=training_args.quant_type, # {'fp4', 'nf4'} | |
| bnb_4bit_quant_storage=compute_dtype, | |
| ) | |
| )) | |
| config = VLLMConfigs[model_args.model_type].from_pretrained(model_args.model_path) | |
| config._attn_implementation = attn_implementation | |
| # NOTE: active spatial_merge_size arguments | |
| config.spatial_merge_size = model_args.spatial_merge_size | |
| config.mm_max_length = model_args.mm_max_length | |
| config.use_token_compression = model_args.use_token_compression | |
| if model_args.vision_encoder is not None: | |
| model = VLLMs[model_args.model_type].from_pretrained( | |
| model_args.model_path, | |
| config=config, | |
| torch_dtype=compute_dtype, | |
| do_sample=True, | |
| **bnb_model_from_pretrained_args | |
| ) | |
| if 'mixtral' in model_args.model_type: | |
| import deepspeed | |
| deepspeed.utils.set_z3_leaf_modules(model, [MixtralSparseMoeBlock]) | |
| else: | |
| model = transformers.LlamaForCausalLM.from_pretrained( | |
| model_args.model_path, | |
| config=config, | |
| torch_dtype=compute_dtype, | |
| do_sample=True, | |
| **bnb_model_from_pretrained_args | |
| ) | |
| model.config.use_cache = False | |
| if model_args.freeze_backbone: | |
| model.model.requires_grad_(False) | |
| if training_args.bits in [4, 8]: | |
| from peft import prepare_model_for_kbit_training | |
| model.config.torch_dtype=(torch.float32 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32)) | |
| model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=training_args.gradient_checkpointing) | |
| if training_args.gradient_checkpointing: | |
| if hasattr(model, "enable_input_require_grads"): | |
| model.enable_input_require_grads() | |
| else: | |
| def make_inputs_require_grad(module, input, output): | |
| output.requires_grad_(True) | |
| model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) | |
| if training_args.lora_enable: | |
| from peft import LoraConfig, get_peft_model | |
| lora_config = LoraConfig( | |
| r=training_args.lora_r, | |
| lora_alpha=training_args.lora_alpha, | |
| target_modules=find_all_linear_names(model), | |
| lora_dropout=training_args.lora_dropout, | |
| bias=training_args.lora_bias, | |
| task_type="CAUSAL_LM", | |
| ) | |
| if training_args.bits == 16: | |
| if training_args.bf16: | |
| model.to(torch.bfloat16) | |
| if training_args.fp16: | |
| model.to(torch.float16) | |
| rank0_print("Adding LoRA adapters...") | |
| model = get_peft_model(model, lora_config) | |
| tokenizer = transformers.AutoTokenizer.from_pretrained( | |
| model_args.model_path, | |
| model_max_length=training_args.model_max_length, | |
| padding_side="right", | |
| use_fast=True, | |
| ) | |
| if tokenizer.pad_token is None: | |
| tokenizer.pad_token = tokenizer.unk_token | |
| if model_args.vision_encoder is not None: | |
| # initialize vision encoder + multi-modal projector | |
| model.get_model().initialize_vision_modules(model_args=model_args, fsdp=training_args.fsdp) | |
| vision_encoder = model.get_vision_encoder() | |
| vision_encoder.to(dtype=compute_dtype, device=training_args.device) | |
| mm_projector = model.get_mm_projector() | |
| mm_projector.to(dtype=compute_dtype if training_args.bf16 else torch.float16, device=training_args.device) | |
| data_args.is_multimodal = True | |
| model.config.tokenizer_padding_side = tokenizer.padding_side | |
| model.config.tokenizer_model_max_length = tokenizer.model_max_length | |
| if training_args.bits in [4, 8]: | |
| model.get_model().mm_projector.to(dtype=compute_dtype, device=training_args.device) | |
| # decoupled learning rate | |
| model.config.llm_lr = training_args.llm_lr | |
| model.config.vision_encoder_lr = training_args.vision_encoder_lr | |
| model.config.mm_projector_lr = training_args.mm_projector_lr | |
| model.config.region_encoder_lr = training_args.region_encoder_lr | |
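| # Freezing policy: a submodule stays trainable only if its learning rate is set. llm_lr=None freezes the whole backbone below, after which the vision encoder, projector, and region encoder are re-enabled or re-frozen according to their own learning rates. | |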
| if model.config.llm_lr is None: | |
| for p in model.get_model().parameters(): | |
| p.requires_grad = False | |
| for p in model.get_model().vision_encoder.parameters(): | |
| p.requires_grad = True | |
| for p in model.get_model().mm_projector.parameters(): | |
| p.requires_grad = True | |
| for p in model.get_model().region_encoder.parameters(): | |
| p.requires_grad = True | |
| if model.config.vision_encoder_lr is None: | |
| for p in model.get_model().vision_encoder.parameters(): | |
| p.requires_grad = False | |
| if model.config.mm_projector_lr is None: | |
| for p in model.get_model().mm_projector.parameters(): | |
| p.requires_grad = False | |
| if model.config.region_encoder_lr is None: | |
| for p in model.get_model().region_encoder.parameters(): | |
| p.requires_grad = False | |
| model.config.max_frames = getattr(data_args, 'max_frames', NUM_FRAMES) | |
| model.config.image_aspect_ratio = data_args.image_aspect_ratio if 'qwen2vl' not in model_args.vision_encoder else 'qwen2vl' | |
| # NOTE: complement data_args via model hyperparameters | |
| # 1. acquire image size | |
| model.config.image_size = data_args.image_size = vision_encoder.image_size | |
| # 2. calculate the number of tokens in the image | |
| model.config.image_token_length = data_args.image_token_length = mm_projector.cal_proj_size(vision_encoder.num_patches_per_side) | |
| # 3. check if alignment | |
| model.config.is_alignment = training_args.is_alignment = data_args.is_alignment = ( | |
| model.config.mm_projector_lr is not None and | |
| model.config.llm_lr is None and | |
| model.config.vision_encoder_lr is None | |
| ) | |
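| # "Alignment" here means projector-only training: the projector lr is set while the LLM and vision encoder lrs are unset. | |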
| # 4. set spatial merge size as default | |
| model.config.spatial_merge_size = data_args.spatial_merge_size = model_args.spatial_merge_size | |
| tokenizer.add_tokens([DEFAULT_IMAGE_TOKEN, STREAM_START_TOKEN, STREAM_END_TOKEN], special_tokens=True) | |
| tokenizer.add_tokens([REGION_TOKEN], special_tokens=True) | |
| model.resize_token_embeddings(len(tokenizer)) | |
| model.config.image_token_index = tokenizer.convert_tokens_to_ids(DEFAULT_IMAGE_TOKEN) | |
| model.config.region_token_index = tokenizer.convert_tokens_to_ids(REGION_TOKEN) | |
| vlprocessor = Videollama3Processor(vision_encoder.image_processor, tokenizer) | |
| if training_args.bits in [4, 8]: | |
| from peft.tuners.lora import LoraLayer | |
| for name, module in model.named_modules(): | |
| if isinstance(module, LoraLayer): | |
| if training_args.bf16: | |
| module = module.to(torch.bfloat16) | |
| if 'norm' in name: | |
| module = module.to(torch.float32) | |
| if 'lm_head' in name or 'embed_tokens' in name: | |
| if hasattr(module, 'weight'): | |
| if training_args.bf16 and module.weight.dtype == torch.float32: | |
| module = module.to(torch.bfloat16) | |
| if local_rank == 0: | |
| print("Current model:", model) | |
| print("Model config:", model.config) | |
| if data_args.use_batch_flattening: | |
| rank0_print('Batch flattening enabled: the entire mini-batch is flattened into a single sequence') | |
| assert model.config._attn_implementation == 'flash_attention_2' | |
| assert version.parse(transformers.__version__) >= version.parse("4.44.0") | |
| data_module = make_flattening_supervised_data_module(vlprocessor=vlprocessor, data_args=data_args) | |
| else: | |
| data_module = make_supervised_data_module(vlprocessor=vlprocessor, data_args=data_args) | |
| # select a Trainer | |
| trainer = VideoLLaMA3Trainer(model=model, tokenizer=tokenizer, args=training_args, **data_module) | |
| if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")): | |
| trainer.train(resume_from_checkpoint=True) | |
| else: | |
| trainer.train() | |
| trainer.save_state() | |
| model.config.use_cache = True | |
| if training_args.lora_enable: | |
| state_dict = get_peft_state_maybe_zero_3(model.named_parameters(), training_args.lora_bias) | |
| non_lora_state_dict = get_peft_state_non_lora_maybe_zero_3(model.named_parameters()) | |
| if training_args.local_rank == 0 or training_args.local_rank == -1: | |
| model.config.save_pretrained(training_args.output_dir) | |
| model.save_pretrained(training_args.output_dir, state_dict=state_dict) | |
| torch.save(non_lora_state_dict, os.path.join(training_args.output_dir, 'non_lora_trainables.bin')) | |
| else: | |
| safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir) | |
| if __name__ == "__main__": | |
| train(attn_implementation="flash_attention_2") | |
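| # Illustrative launch command (a sketch; script name, paths, and hyperparameters are placeholders, not taken from the repo docs): | |
| # torchrun --nproc_per_node=8 <this_script>.py \ | |
| #     --model_type videollama3 --model_path lmsys/vicuna-7b-v1.5 \ | |
| #     --vision_encoder <vision-encoder-path> \ | |
| #     --data_path data/annotations.jsonl --data_folder data/ \ | |
| #     --bf16 True --output_dir work_dirs/demo \ | |
| #     --per_device_train_batch_size 1 --gradient_accumulation_steps 8 \ | |
| #     --model_max_length 4096 | |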