infgrad commited on
Commit
da9b77d
·
verified ·
1 Parent(s): c6991f5

Upload 4 files

Browse files
scripts/original_stella_jasper_training_codes/run_train_align_image_text_stage4.py ADDED
@@ -0,0 +1,466 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf8
2
+ from PIL import ImageFile
3
+
4
+ ImageFile.LOAD_TRUNCATED_IMAGES = True
5
+ import copy
6
+ import json
7
+ import os
8
+ import sys
9
+ import yaml
10
+ import torch
11
+ import shutil
12
+ import math
13
+ import random
14
+ import tarfile
15
+ import io
16
+ import accelerate
17
+ from loguru import logger
18
+ from torch.utils.data import DataLoader, Dataset
19
+ from transformers import get_scheduler, SiglipImageProcessor
20
+ from accelerate import Accelerator
21
+ from accelerate.utils import set_seed, ProjectConfiguration
22
+ from tqdm import tqdm
23
+ from typing import List, Union
24
+ from os.path import join
25
+ import torch.nn.functional as F
26
+ from jasper_model.modeling_jasper_vl import JasperVL
27
+ from jasper_model.tokenization_qwen import Qwen2TokenizerFast
28
+ from jasper_model.configuration_jasper_vl import JasperVLConfig
29
+ from PIL import Image
30
+
31
+
32
class JasperVLDataset_TAR(Dataset):
    """Image-text dataset that reads (caption-json, image-bytes) pairs directly from tar archives.

    ``tar_names_list[i]`` holds, for archive ``file_path_list[i]``, a list of dicts
    with the tar member names ``{"text_name": ..., "img_name": ...}``.
    Sample order is interleaved round-robin across windows of 11 archives so that
    neighbouring indices come from different source archives.
    NOTE(review): uses the module-level ``accelerator`` for rank-0 logging, so the
    dataset must be constructed after the Accelerator.
    """

    def __init__(self, file_path_list: Union[List[str], str], tar_names_list: List[str]):
        self.file_path_list = file_path_list
        # One persistent open tarfile handle per archive for the whole run.
        self.tar_fr_list = [tarfile.open(file_path) for file_path in file_path_list]
        self.tar_names_list = tar_names_list
        self.num_data_of_tar = [len(i) for i in self.tar_names_list]
        self.num_all_data = sum(self.num_data_of_tar)

        # Build a flat index of (archive_index, sample_index) pairs by popping one
        # sample at a time from each archive in a window of 11, until the window
        # is drained, then move to the next window.
        self.all_ids = []
        ids_list = [list(range(i)) for i in self.num_data_of_tar]
        for start in range(0, len(ids_list), 11):
            end = start + 11
            if end > len(ids_list):
                end = len(ids_list)
            while True:
                pre_num = len(self.all_ids)
                for file_idx in range(start, end):
                    if ids_list[file_idx]:
                        self.all_ids.append((file_idx, ids_list[file_idx].pop()))
                # No archive in this window yielded a sample -> window exhausted.
                if len(self.all_ids) == pre_num:
                    break
        assert len(self.all_ids) == self.num_all_data
        # Cumulative sample counts per archive (kept for compatibility; not used below).
        self.accumulation_numbers = [sum(self.num_data_of_tar[:idx + 1]) for idx in range(len(self.num_data_of_tar))]
        if accelerator.is_main_process:
            logger.info(f"file_path_list:{file_path_list}")
            logger.info(f"num_data_of_tar:{self.num_data_of_tar}")
            logger.info(f"number of data:{self.num_all_data}")

    def __len__(self):
        return self.num_all_data

    def __getitem__(self, item):

        file_idx, item = self.all_ids[item]
        file_path = self.file_path_list[file_idx]
        tar_fr = self.tar_fr_list[file_idx]
        text_item = json.loads(
            tar_fr.extractfile(self.tar_names_list[file_idx][item]["text_name"]).read()
        )
        img_bytes = tar_fr.extractfile(self.tar_names_list[file_idx][item]["img_name"]).read()
        # Choose the parsing scheme based on which corpus the file path belongs to.
        if "DocStruct4M_struct_aware_parse" in file_path:
            user_text = text_item["conversations"][0]["value"]
            assistant_text = text_item["conversations"][1]["value"]
            idx = user_text.find("<doc>")
            if idx == -1:
                user_text = ""
            else:
                user_text = user_text[idx + 5:]
            # [:-6] strips the trailing "</doc>" tag from the assistant text.
            return {"text": user_text + assistant_text[:-6], "img_bytes": img_bytes}
        else:
            return {"text": text_item["conversations"][1]["value"], "img_bytes": img_bytes}
86
+
87
+
88
+ # modelscope download --dataset 'BAAI/Infinity-MM' --include 'stage2/DocStruct4M/DocStruct4M_struct_aware_parse*' --local_dir infinity_mm
89
+ # modelscope download --dataset 'BAAI/Infinity-MM' --include 'stage2/llava-onevision-mid-stage/synthdog_en_100k--synthdog_en_processed_new/*.tar' --local_dir infinity_mm
90
+ # modelscope download --dataset 'BAAI/Infinity-MM' --include 'stage2/MMC-Alignment/MMC-Alignment-mmc_chart_text_alignment_arxiv_text/*.tar' --local_dir infinity_mm
91
def collate_fn(batch):
    """Collate (text, image-bytes) samples into teacher/student model inputs.

    Returns a dict with:
      * ``teacher_ipt`` – tokenized raw captions (the teacher sees the text).
      * ``student_ipt`` – image-placeholder token ids plus ``pixel_values``
        (the student sees only the image).

    If the image processor fails on the whole batch, broken images are dropped
    and the batch is refilled by repeating the healthy samples, so the batch
    size stays constant. Raises the original processor error only when every
    image in the batch is unusable.

    Depends on the module-level ``processor``, ``tokenizer``, ``model_conf``,
    ``padding`` and ``max_length`` created in the training script.
    """
    texts = [item["text"] for item in batch]
    images = [Image.open(io.BytesIO(item["img_bytes"])).convert("RGB") for item in batch]

    try:
        pixel_values = processor(
            images=images,
            return_tensors="pt"
        )["pixel_values"].bfloat16()
    except Exception as e:
        logger.error(f"转换成pixel_values失败:{e}, 会选取一些重复的数据进行替代")
        # Probe each image individually to find the ones the processor accepts.
        normal_ids = []
        for idx, img in enumerate(images):
            try:
                _ = processor(images=[img], return_tensors="pt")
                normal_ids.append(idx)
            # FIX: was a bare `except:` which also swallowed KeyboardInterrupt /
            # SystemExit; only processor failures should be skipped here.
            except Exception:
                continue
        if not normal_ids:
            # Every image in the batch is broken; re-raise the original error.
            raise
        # Refill the batch to its original size by cycling the healthy samples.
        normal_texts, norm_images = [], []
        while True:
            for idx in normal_ids:
                normal_texts.append(copy.deepcopy(texts[idx]))
                norm_images.append(copy.deepcopy(images[idx]))
                if len(normal_texts) == len(batch):
                    break
            if len(normal_texts) == len(batch):
                break
        # Rebind and re-run the processor on the sanitized batch.
        texts, images = normal_texts, norm_images
        pixel_values = processor(
            images=images,
            return_tensors="pt"
        )["pixel_values"].bfloat16()
    teacher_ipt = tokenizer(texts, padding=padding, truncation=True, max_length=max_length, return_tensors="pt")
    student_ipt = tokenizer(
        # +2 accounts for the image start token and end token.
        ["<|jasper_img_token|>" * (model_conf.num_img_tokens + 2)] * len(batch),
        padding="longest", return_tensors="pt"
    )
    student_ipt["pixel_values"] = pixel_values
    return {"teacher_ipt": teacher_ipt, "student_ipt": student_ipt}
142
+
143
+
144
def save_model():
    """Write a complete sentence-transformers style checkpoint to
    ``output_dir/step_{completed_steps}``.

    Saves the unwrapped model weights and the image processor, ships the custom
    modelling source files, patches ``config.json`` with ``auto_map`` entries,
    writes ``modules.json``, and copies every tokenizer sidecar file from
    ``model_dir``. All ranks synchronize first; only the main process writes.
    """
    checkpoint_dir = join(output_dir, f"step_{completed_steps}")
    # accelerator.save_state(checkpoint_dir, safe_serialization=True)
    accelerator.wait_for_everyone()
    if not accelerator.is_main_process:
        return
    logger.info(f"保存模型{checkpoint_dir}")
    accelerator.unwrap_model(model).save_pretrained(checkpoint_dir, max_shard_size="32GB", safe_serialization=True)
    # torch.save(accelerator.unwrap_model(optimizer).state_dict(), join(checkpoint_dir, "optimizer.bin"))
    processor.save_pretrained(checkpoint_dir)
    # tokenizer.save_pretrained(checkpoint_dir)
    # Ship the custom model source files next to the weights.
    for code_file in ("configuration_jasper_vl.py", "modeling_jasper_vl.py", "tokenization_qwen.py"):
        shutil.copy("./jasper_model/" + code_file, join(checkpoint_dir, code_file))
    # Patch config.json so AutoModel/AutoConfig resolve to the shipped code.
    config_path = join(checkpoint_dir, "config.json")
    with open(config_path, "r", encoding="utf8") as fr:
        config = json.load(fr)
    config.pop("_name_or_path", None)
    config["auto_map"] = {
        "AutoModel": "modeling_jasper_vl.JasperVL",
        "AutoConfig": "configuration_jasper_vl.JasperVLConfig",
    }
    with open(config_path, "w", encoding="utf8") as fw:
        json.dump(config, fw, ensure_ascii=False, indent=1)

    # modules.json for the sentence-transformers loader.
    modules = [
        {
            "idx": 0,
            "name": "0",
            "path": "",
            "type": "sentence_transformers.models.Transformer"
        }
    ]
    with open(os.path.join(checkpoint_dir, "modules.json"), "w", encoding="utf8") as fw:
        json.dump(modules, fw, ensure_ascii=False, indent=1)
    # Tokenizer / sentence-transformers sidecar files copied verbatim.
    for side_file in (
        "added_tokens.json",
        "config_sentence_transformers.json",
        "merges.txt",
        "sentence_bert_config.json",
        "special_tokens_map.json",
        "tokenizer_config.json",
        "tokenizer.json",
        "vocab.json",
    ):
        shutil.copy(join(model_dir, side_file), join(checkpoint_dir, side_file))
195
+
196
+
197
def get_score_diff(vectors):
    """Return all pairwise differences between the off-diagonal similarity scores.

    ``vectors`` is an (n, d) matrix. The n*(n-1)/2 upper-triangular entries of
    ``vectors @ vectors.T`` form a score vector ``s``; the result is the vector
    of ``s[j] - s[i]`` for every pair i < j.
    """
    sim = vectors @ vectors.T
    pair_mask = torch.triu(torch.ones_like(sim), diagonal=1).bool()
    pair_scores = sim[pair_mask]
    # Broadcast row-vector minus column-vector: entry [i, j] = s[j] - s[i].
    diff_matrix = pair_scores.unsqueeze(0) - pair_scores.unsqueeze(1)
    diff_mask = torch.triu(torch.ones_like(diff_matrix), diagonal=1).bool()
    return diff_matrix[diff_mask]
203
+
204
+
205
# Stage-4 entry point: align the image branch (student) of JasperVL with its
# frozen text branch (teacher) using cosine, similarity-MSE and rank losses.
if __name__ == "__main__":
    # Read the YAML configuration whose path is passed as argv[1].
    with open(sys.argv[1].strip(), "r", encoding="utf8") as fr:
        conf = yaml.safe_load(fr)
    model_dir = conf["model_path_or_name"]
    max_length = conf["max_length"]
    resume_model_dir = conf["resume_model_dir"]
    output_dir = conf["output_dir"]
    save_steps = conf["save_steps"]
    batch_size = conf["batch_size"]
    project_name = conf["project_name"]
    log_with = conf["log_with"]
    log_init_kwargs = conf["log_init_kwargs"]
    file_path_list_path = conf["file_path_list_path"]
    print_debug_info_prob = conf["print_debug_info_prob"]
    gradient_accumulation_steps = conf["gradient_accumulation_steps"]
    continue_train = conf["continue_train"]
    num_train_epochs = conf["num_train_epochs"]
    lr_scheduler_type = conf["lr_scheduler_type"]
    mse_loss_scale = conf["mse_loss_scale"]
    cosine_loss_scale = conf["cosine_loss_scale"]
    padding = conf["padding"]
    rank_margin = conf["rank_margin"]
    rank_loss_scale = conf["rank_loss_scale"]
    start_index, end_index = conf["start_index"], conf["end_index"]
    scheduler_kwargs = conf.get("scheduler_kwargs", {})

    seed = conf["seed"]
    # Initialize the accelerator. find_unused_parameters must be off when
    # gradient checkpointing is on (DDP conflicts with re-entrant checkpoints).
    accelerator = Accelerator(
        project_config=ProjectConfiguration(
            project_dir=output_dir,
            logging_dir=join(output_dir, "logs"),
        ),
        gradient_accumulation_steps=gradient_accumulation_steps,
        log_with=log_with,
        kwargs_handlers=[
            accelerate.DistributedDataParallelKwargs(find_unused_parameters=not conf["gradient_checkpointing"])]
    )

    # Create output_dir, attach a file sink to the logger and archive the config.
    with accelerator.main_process_first():
        if accelerator.is_main_process:
            os.makedirs(output_dir, exist_ok=True)
            os.makedirs(join(output_dir, "logs/wandb_logs"), exist_ok=True)
            logger.add(
                join(output_dir, "train_logs.txt"),
                level="DEBUG",
                compression="zip",
                rotation="500 MB",
                # format="{message}"
            )
            shutil.copy(sys.argv[1].strip(), join(output_dir, "train_config.yml"))

    accelerator.wait_for_everyone()
    if accelerator.is_main_process:
        logger.info(f"accelerator.state:{accelerator.state}")

    # Seed all processes identically.
    set_seed(seed=seed)
    # Load model and tokenizer.
    processor = SiglipImageProcessor.from_pretrained(model_dir)
    model_conf = JasperVLConfig.from_pretrained(model_dir)
    model = JasperVL.from_pretrained(model_dir, is_text_encoder=False)
    tokenizer = Qwen2TokenizerFast.from_pretrained(model_dir, padding_side="right")

    # Freeze the language backbone and the vector projection heads; only the
    # remaining (vision-side) parameters are trained in this stage.
    for k, v in model.named_parameters():
        if k.startswith("model.") or k.startswith("vector_linear_"):
            v.requires_grad = False
    # Training the last three special tokens:
    ## seemed to have no effect, so left disabled.
    # model.get_input_embeddings().weight.data[-1].requires_grad = True
    # model.get_input_embeddings().weight.data[-3].requires_grad = True

    if accelerator.is_main_process:
        logger.debug("参数冻结情况")
        for k, v in model.named_parameters():
            logger.debug(f"{k}:{v.shape, v.requires_grad}")
    if conf["gradient_checkpointing"]:
        model.gradient_checkpointing_enable()

    # Load the tar file list and the tar member-name index, sliced to this job's
    # [start_index, end_index) shard.
    with open(file_path_list_path, "r", encoding="utf8") as fr:
        file_path_list = json.load(fr)[start_index:end_index]
    with open(conf["tar_names_path"], "r", encoding="utf8") as fr:
        tar_names_list = json.load(fr)[start_index:end_index]
    train_dataset = JasperVLDataset_TAR(file_path_list=file_path_list, tar_names_list=tar_names_list)
    train_dataloader = DataLoader(
        dataset=train_dataset,
        shuffle=False,
        collate_fn=collate_fn,
        batch_size=batch_size,
        num_workers=1,  # >1 raises errors (not debugged further), keep at 1
        drop_last=True,
        # pin_memory=True,
        # pin_memory_device="cuda",
        # prefetch_factor=4,
    )
    # Restore previous training state happens further below (continue_train).
    accelerator.wait_for_everyone()
    # Initialize experiment tracking.
    if "wandb" in log_init_kwargs:
        log_init_kwargs["wandb"]["dir"] = join(output_dir, "logs/wandb_logs")
        log_init_kwargs["wandb"]["config"] = {k: json.dumps(v, ensure_ascii=False) for k, v in conf.items()}
    accelerator.init_trackers(
        project_name=project_name,
        init_kwargs=log_init_kwargs
    )
    # Optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=conf["learning_rate"])
    # if os.path.exists(join(model_path_or_name, "optimizer.bin")):
    #     logger.info("加载优化器权重")
    #     optimizer.load_state_dict(torch.load(join(model_path_or_name, "optimizer.bin"), weights_only=False, map_location="cpu"))
    # LR scheduler: warmup may be given as a fraction (float) or absolute steps.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)
    max_train_steps = num_update_steps_per_epoch * num_train_epochs
    logger.info(f"max_train_steps:{max_train_steps}")
    if isinstance(conf["num_warmup_steps"], float):
        num_warmup_steps = int(max_train_steps * conf["num_warmup_steps"])
    else:
        num_warmup_steps = conf["num_warmup_steps"]
    lr_scheduler = get_scheduler(
        name=lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=conf.get("max_train_steps") if conf.get("max_train_steps", -1) > 0 else max_train_steps,
        scheduler_specific_kwargs=scheduler_kwargs,
    )
    logger.debug(f"before prepare, len(train_dataloader): {len(train_dataloader)}")
    # Hand everything to accelerate (wraps model in DDP, shards the dataloader).
    model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, lr_scheduler
    )
    logger.debug(f"after prepare, len(train_dataloader): {len(train_dataloader)}")
    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
    ## PS: multi-node/multi-GPU detail — the earlier computation ignored the number
    ## of processes; after prepare, len(train_dataloader) shrinks, so the counts
    ## below are per-card.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / accelerator.gradient_accumulation_steps)
    max_train_steps = num_train_epochs * num_update_steps_per_epoch
    logger.debug(f"max_train_steps for each card:{max_train_steps}")
    starting_epoch, completed_steps = 0, 0

    progress_bar = tqdm(range(max_train_steps), disable=not accelerator.is_local_main_process)

    # Resume from a "step_N" checkpoint directory if requested.
    if continue_train:
        logger.info(f"Continue train from {model_dir}")
        accelerator.load_state(resume_model_dir)
        resume_step = int(os.path.basename(resume_model_dir).replace("step_", ""))
        completed_steps = resume_step
        starting_epoch = resume_step // num_update_steps_per_epoch
        # Steps already done inside the resumed epoch.
        resume_step -= starting_epoch * num_update_steps_per_epoch
        progress_bar.update(completed_steps)
    # Start training.
    for epoch in range(starting_epoch, num_train_epochs):
        model.train()
        # Use `skip_first_batches` to skip the batches when resuming from ckpt.
        if continue_train and epoch == starting_epoch:
            # We need to skip steps until we reach the resumed step.
            active_dataloader = accelerator.skip_first_batches(
                train_dataloader,
                resume_step * gradient_accumulation_steps
            )
        else:
            # After the first iteration though, we need to go back to the original dataloader.
            active_dataloader = train_dataloader
        logger.debug(f"len(active_dataloader): {len(active_dataloader)}")

        for batch in active_dataloader:
            # Teacher forward pass (text branch) without gradients; its vectors
            # are the distillation targets for this batch.
            with torch.no_grad():
                model.eval()
                all_teacher_vectors = model(**batch["teacher_ipt"])["all_vectors"]
                all_teacher_vectors = [F.normalize(vector.float(), p=2, dim=-1) for vector in all_teacher_vectors]
                # The highest-dimensional vector (index 0) serves as the label.
                target_sim_values = torch.matmul(all_teacher_vectors[0],
                                                 all_teacher_vectors[0].T)
                rank_label = torch.where(get_score_diff(all_teacher_vectors[0]) < 0, 1, -1)
            model.train()
            with accelerator.accumulate(model):
                # Student forward pass (image branch).
                all_student_vectors = model(**batch["student_ipt"])["all_vectors"]
                all_student_vectors = [F.normalize(vector.float(), p=2, dim=-1) for vector in all_student_vectors]
                cosine_loss_list, sim_value_loss_list, rank_loss_list = [], [], []
                for teacher_vectors, student_vectors in zip(all_teacher_vectors, all_student_vectors):
                    # Cosine loss: 1 - mean cosine similarity (vectors are unit-norm).
                    cosine_loss_list.append(
                        (1 - (student_vectors * teacher_vectors).sum(axis=1).mean()) * cosine_loss_scale
                    )
                    # Similarity-value (MSE) loss between student and teacher gram matrices.
                    sim_value_loss_list.append(
                        F.mse_loss(
                            input=torch.matmul(student_vectors, student_vectors.T),
                            target=target_sim_values,
                        ) * mse_loss_scale
                    )
                    # print(sim_value_loss_list)
                    # Ranking (margin) loss on pairwise score differences.
                    rank_loss_list.append(
                        F.relu(get_score_diff(student_vectors) * rank_label + rank_margin).mean() * rank_loss_scale
                    )
                cosine_loss = sum(cosine_loss_list) / len(cosine_loss_list)
                sim_value_loss = sum(sim_value_loss_list) / len(sim_value_loss_list)
                rank_loss = sum(rank_loss_list) / len(rank_loss_list)
                loss = cosine_loss + sim_value_loss + rank_loss

                ########################## debug info #######################################################
                # Dump one random sample's tokenization at step 10 and then with
                # probability print_debug_info_prob per step.
                if accelerator.is_main_process and (completed_steps == 10 or random.random() < print_debug_info_prob):
                    debug_index = random.randint(0, batch_size - 1)

                    teacher_input_ids = batch["teacher_ipt"]["input_ids"].cpu().numpy()
                    teacher_attention_mask = batch["teacher_ipt"]["attention_mask"].cpu().numpy()

                    for debug_k, debug_v in batch["teacher_ipt"].items():
                        logger.debug(f"teacher_ipt_{debug_k}.shape: {debug_v.shape}")
                    logger.debug(f"teacher input_ids: {teacher_input_ids[debug_index].tolist()}")
                    logger.debug(f"teacher input_tokens: {tokenizer.decode(teacher_input_ids[debug_index])}")
                    logger.debug(f"teacher attention_mask: {teacher_attention_mask[debug_index].tolist()}")

                    student_input_ids = batch["student_ipt"]["input_ids"].cpu().numpy()
                    student_attention_mask = batch["student_ipt"]["attention_mask"].cpu().numpy()

                    for debug_k, debug_v in batch["student_ipt"].items():
                        logger.debug(f"student_ipt_{debug_k}.shape: {debug_v.shape}")
                    logger.debug(f"student input_ids: {student_input_ids[debug_index].tolist()}")
                    logger.debug(f"student input_tokens: {tokenizer.decode(student_input_ids[debug_index])}")
                    logger.debug(f"student attention_mask: {student_attention_mask[debug_index].tolist()}")

                    logger.debug(f"teacher_vectors.shape: {teacher_vectors.shape}")
                    logger.debug(f"student_vectors.shape: {student_vectors.shape}")
                ###############################################################################################

                accelerator.backward(loss)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
            # Only count a step when gradients were actually synchronized
            # (i.e. once per gradient_accumulation_steps micro-batches).
            if accelerator.sync_gradients:
                progress_bar.update(1)
                completed_steps += 1
                # if completed_steps == 15:
                #     save_model()
                if completed_steps % save_steps == 0 and completed_steps > 0:
                    save_model()
                # Log losses and current learning rate.
                if accelerator.is_main_process:
                    curr_lr = float(lr_scheduler.get_last_lr()[-1])
                    logger.info(
                        f"epoch-{epoch},completed_steps-{completed_steps},lr:{curr_lr},cosine_loss:{cosine_loss.item()},sim_value_loss:{sim_value_loss.item()},rank_loss:{rank_loss.item()}"
                    )
                    accelerator.log(
                        {
                            "cosine_loss": cosine_loss.item(),
                            "sim_value_loss": sim_value_loss.item(),
                            "rank_loss": rank_loss.item(),
                            "lr": curr_lr
                        },
                        step=completed_steps
                    )
        # if accelerator.is_main_process:
        #     print("model.vs_token_emb[:,:,:4]", model.vs_token_emb[:, :, :4])
        #     print("model.ve_token_emb[:,:,:4]", model.ve_token_emb[:, :, :4])
    # Save the model once more after training finishes.
    save_model()
    accelerator.end_training()
scripts/original_stella_jasper_training_codes/run_train_distill_stage1.py ADDED
@@ -0,0 +1,405 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf8
2
+ import json
3
+ import os
4
+ import sys
5
+ import yaml
6
+ import torch
7
+ import shutil
8
+ import math
9
+ import random
10
+ import lmdb
11
+ import pickle
12
+ import accelerate
13
+ from loguru import logger
14
+ from torch.utils.data import DataLoader, Dataset
15
+ from transformers import get_scheduler
16
+ from accelerate import Accelerator
17
+ from accelerate.utils import set_seed, ProjectConfiguration
18
+ from tqdm import tqdm
19
+ from typing import List, Union
20
+ from os.path import join
21
+ import torch.nn.functional as F
22
+ from safetensors.torch import load_file, save_file
23
+
24
+
25
class JasperDataset_LMDB_RANDOM_ACCESS(Dataset):
    """Random-access distillation dataset backed by one or more LMDB databases.

    Accepts either a directory (scanned for ``*-lock`` files to discover the
    sibling LMDB database files) or an explicit list of LMDB file paths.
    Each record is a pickled dict keyed by its integer index; its ``extra``
    field is a JSON string whose ``prompt_student`` value is prepended to the
    record's text.

    NOTE(review): ``pickle.loads`` on the LMDB payloads assumes the databases
    are trusted local artifacts — never point this at untrusted data.
    NOTE(review): uses the module-level ``accelerator`` for rank-0 logging.
    """

    def __init__(self, file_path_list_or_dir: Union[List[str], str]):
        if isinstance(file_path_list_or_dir, str):
            # A directory: every "<name>-lock" file marks an LMDB db "<name>".
            file_path_list = []
            for name in os.listdir(file_path_list_or_dir):
                if not name.endswith('-lock'):
                    continue
                file_path_list.append(join(file_path_list_or_dir, name[:-5]))
        else:
            file_path_list = file_path_list_or_dir
        # Sort, then shuffle with a fixed seed so every rank sees the same order.
        file_path_list.sort()
        random.seed(42)
        random.shuffle(file_path_list)
        # file_path_list = file_path_list[:20]
        self.lmdb_env_list = [
            lmdb.open(file_path, readonly=True, readahead=False, subdir=False, lock=False)
            for file_path in file_path_list
        ]
        # One long-lived read transaction per environment.
        self.lmdb_txn_list = [lmdb_env.begin(write=False, buffers=True) for lmdb_env in self.lmdb_env_list]
        self.num_data_of_env = [lmdb_env.stat()["entries"] for lmdb_env in self.lmdb_env_list]
        self.num_all_data = sum(self.num_data_of_env)
        # Cumulative entry counts, used to map a global index to (env, local index).
        self.accumulation_numbers = [sum(self.num_data_of_env[:idx + 1]) for idx in range(len(self.num_data_of_env))]
        if accelerator.is_main_process:
            logger.info(f"file_path_list:{file_path_list}")
            logger.info(f"number of data:{self.num_all_data}")

    def __len__(self):
        return self.num_all_data

    def __getitem__(self, item):

        # print("accelerator.local_process_index,item", accelerator.local_process_index, item)
        # Locate which LMDB environment the global index falls into.
        for env_idx, accum_num in enumerate(self.accumulation_numbers):
            if item < accum_num:
                break
        txn = self.lmdb_txn_list[env_idx]
        # Convert the global index to an index local to the chosen environment.
        item -= self.accumulation_numbers[env_idx - 1] if env_idx > 0 else 0
        data_item = pickle.loads(bytes(txn.get(f"{item}".encode())))
        # FIX: the original `text, extra = data_item["text"], json.loads(...)`
        # bound an unused local `text`; only `extra` is needed here.
        extra = json.loads(data_item["extra"])
        # Prepend the student prompt to the raw text.
        data_item["text"] = extra["prompt_student"] + data_item["text"]
        return data_item
68
+
69
+
70
def collate_fn_jasper_text(batch):
    """Tokenize a batch of texts and attach the concatenated teacher vectors.

    The module-level ``teacher_vector_cols`` names the per-item vector fields to
    concatenate into one teacher vector per sample. When more than one teacher
    column is concatenated, the result is L2-normalized so the student distills
    against unit vectors.

    :param batch: List[data_set[i]]
    :return: tokenizer encoding dict with an extra ``teacher_vectors`` tensor
    """
    all_texts = [item["text"] for item in batch]
    teacher_vectors = torch.tensor(
        [
            [value for col in teacher_vector_cols for value in item[col]]
            for item in batch
        ]
    )
    if len(teacher_vector_cols) > 1:
        # BUG FIX: was `F.normalize(teacher_vector_cols, p=2, dim=-1)`, which
        # passed the list of column *names* to normalize (a TypeError) and
        # discarded the actual vectors. Normalize the concatenated vectors.
        teacher_vectors = F.normalize(teacher_vectors, p=2, dim=-1)
    ipt = tokenizer(all_texts, padding=padding, truncation=True, max_length=max_length, return_tensors="pt")
    ipt["teacher_vectors"] = teacher_vectors
    return ipt
88
+
89
+
90
def save_model():
    """Save the current checkpoint in sentence-transformers layout.

    Writes model weights, the pooling and modules configs, copies the tokenizer
    sidecar files from ``model_dir``, and splices the frozen stella dense
    projection weights into ``model.safetensors``. All ranks synchronize first;
    only the main process writes to disk.
    """
    checkpoint_dir = join(output_dir, f"step_{completed_steps}")
    # accelerator.save_state(checkpoint_dir, safe_serialization=True)
    accelerator.wait_for_everyone()
    if accelerator.is_main_process:
        logger.info(f"保存模型{checkpoint_dir}")
        accelerator.unwrap_model(model).save_pretrained(checkpoint_dir, max_shard_size="32GB", safe_serialization=True)
        # Ship the custom model source files next to the weights.
        shutil.copy("./jasper_model/modeling_qwen.py", join(checkpoint_dir, "modeling_qwen.py"))
        shutil.copy("./jasper_model/tokenization_qwen.py", join(checkpoint_dir, "tokenization_qwen.py"))
        # Patch config.json so AutoModel resolves to the shipped code.
        with open(join(checkpoint_dir, "config.json"), "r", encoding="utf8") as fr:
            config = json.load(fr)
        # ROBUSTNESS FIX: `_name_or_path` is not always present; the unguarded
        # pop raised KeyError. The stage-4 script already guards this key.
        config.pop("_name_or_path", None)
        config["auto_map"] = {"AutoModel": "modeling_qwen.JasperTextStella_1_5"}
        with open(join(checkpoint_dir, "config.json"), "w", encoding="utf8") as fw:
            json.dump(config, fw, ensure_ascii=False, indent=1)

        # Pooling config: CLS-token pooling, prompt excluded from pooling.
        os.makedirs(join(checkpoint_dir, "1_Pooling"), exist_ok=True)
        config = {
            "word_embedding_dimension": 4096,
            "pooling_mode_cls_token": True,
            "pooling_mode_mean_tokens": False,
            "pooling_mode_max_tokens": False,
            "pooling_mode_mean_sqrt_len_tokens": False,
            "pooling_mode_weightedmean_tokens": False,
            "pooling_mode_lasttoken": False,
            "include_prompt": False
        }
        with open(join(checkpoint_dir, "1_Pooling/config.json"), "w", encoding="utf8") as fw:
            json.dump(config, fw, ensure_ascii=False, indent=1)
        ## modules.json for the sentence-transformers loader
        with open(os.path.join(checkpoint_dir, "modules.json"), "w", encoding="utf8") as fw:
            json.dump(
                [
                    {
                        "idx": 0,
                        "name": "0",
                        "path": "",
                        "type": "sentence_transformers.models.Transformer"
                    },
                    {
                        "idx": 1,
                        "name": "1",
                        "path": "1_Pooling",
                        "type": "sentence_transformers.models.Pooling"
                    }
                ],
                fw,
                ensure_ascii=False,
                indent=1
            )
        ## Tokenizer / sentence-transformers sidecar files, copied verbatim.
        for side_file in (
            "added_tokens.json",
            "config_sentence_transformers.json",
            "merges.txt",
            "sentence_bert_config.json",
            "special_tokens_map.json",
            "tokenizer_config.json",
            "tokenizer.json",
            "vocab.json",
        ):
            shutil.copy(join(model_dir, side_file), join(checkpoint_dir, side_file))
        # Splice the frozen stella dense-projection weights into the checkpoint.
        # NOTE(review): hard-coded absolute path — should come from the config.
        ori_di = load_file(join(checkpoint_dir, "model.safetensors"))
        stella_dense_di = load_file(
            "/home/wcm/jasper/public_models/stella_en_1_5B_v5/2_Dense_8192/model.safetensors"
        )
        ori_di["stella_dense.weight"] = stella_dense_di["linear.weight"].clone().detach().bfloat16()
        ori_di["stella_dense.bias"] = stella_dense_di["linear.bias"].clone().detach().bfloat16()
        save_file(ori_di, join(checkpoint_dir, "model.safetensors"), metadata={"format": "pt"})
160
+
161
+
162
def get_score_diff(vectors):
    """Return all pairwise differences between off-diagonal similarity scores.

    Given an (n, d) matrix, the upper-triangular entries of the gram matrix
    ``vectors @ vectors.T`` form a score vector ``s``; the result contains
    ``s[j] - s[i]`` for every index pair i < j.
    """
    gram = vectors @ vectors.T
    upper = torch.triu(torch.ones_like(gram), diagonal=1).bool()
    flat_scores = gram[upper]
    # Row minus column broadcast: entry [i, j] = flat_scores[j] - flat_scores[i].
    deltas = flat_scores.unsqueeze(0) - flat_scores.unsqueeze(1)
    delta_upper = torch.triu(torch.ones_like(deltas), diagonal=1).bool()
    return deltas[delta_upper]
168
+
169
+
170
+
171
+
172
+ if __name__ == "__main__":
173
+ # read the configration
174
+ with open(sys.argv[1].strip(), "r", encoding="utf8") as fr:
175
+ conf = yaml.safe_load(fr)
176
+ model_name = conf["model_name"]
177
+ model_dir = conf["model_path_or_name"]
178
+ max_length = conf["max_length"]
179
+ resume_model_dir = conf["resume_model_dir"]
180
+ output_dir = conf["output_dir"]
181
+ save_steps = conf["save_steps"]
182
+ batch_size = conf["batch_size"]
183
+ project_name = conf["project_name"]
184
+ log_with = conf["log_with"]
185
+ log_init_kwargs = conf["log_init_kwargs"]
186
+ file_path_list_or_dir = conf["file_path_list"]
187
+ print_debug_info_prob = conf["print_debug_info_prob"]
188
+ gradient_accumulation_steps = conf["gradient_accumulation_steps"]
189
+ continue_train = conf["continue_train"]
190
+ num_train_epochs = conf["num_train_epochs"]
191
+ lr_scheduler_type = conf["lr_scheduler_type"]
192
+ mse_loss_scale = conf["mse_loss_scale"]
193
+ cosine_loss_scale = conf["cosine_loss_scale"]
194
+ padding = conf["padding"]
195
+ teacher_vector_cols = conf["teacher_vector_cols"]
196
+ rank_margin = conf["rank_margin"]
197
+ rank_loss_scale = conf["rank_loss_scale"]
198
+ used_loss = set(conf["used_loss"].split(";"))
199
+ scheduler_kwargs = conf.get("scheduler_kwargs", {})
200
+ os.environ["ADAPTER_TYPE"] = conf["adapter_type"]
201
+ os.environ["MERGE_VECS"] = "0"
202
+
203
+ seed = conf["seed"]
204
+ CL_LABELS = torch.LongTensor(range(max_length))
205
+ # initialize accelerator
206
+ accelerator = Accelerator(
207
+ project_config=ProjectConfiguration(
208
+ project_dir=output_dir,
209
+ logging_dir=join(output_dir, "logs"),
210
+ ),
211
+ gradient_accumulation_steps=gradient_accumulation_steps,
212
+ log_with=log_with,
213
+ kwargs_handlers=[
214
+ accelerate.DistributedDataParallelKwargs(find_unused_parameters=not conf["gradient_checkpointing"])]
215
+ )
216
+
217
+ # output_dir and sth
218
+ with accelerator.main_process_first():
219
+ if accelerator.is_main_process:
220
+ os.makedirs(output_dir, exist_ok=True)
221
+ os.makedirs(join(output_dir, "logs/wandb_logs"), exist_ok=True)
222
+ logger.add(
223
+ join(output_dir, "train_logs.txt"),
224
+ level="DEBUG",
225
+ compression="zip",
226
+ rotation="500 MB",
227
+ # format="{message}"
228
+ )
229
+ shutil.copy(sys.argv[1].strip(), join(output_dir, "train_config.yml"))
230
+
231
+ accelerator.wait_for_everyone()
232
+ if accelerator.is_main_process:
233
+ logger.info(f"accelerator.state:{accelerator.state}")
234
+
235
+ # seed
236
+ set_seed(seed=seed)
237
+ # 加载模型、tokenizer
238
+ model = MODEL_NAME_INFO[model_name][0].from_pretrained(
239
+ model_dir,
240
+ use_cache=False,
241
+ )
242
+ tokenizer = MODEL_NAME_INFO[model_name][1].from_pretrained(model_dir, padding_side="right")
243
+ model_conf = model.config
244
+ model.padding_side = "right"
245
+
246
+ for k, v in model.named_parameters():
247
+ if k.startswith("model."):
248
+ v.requires_grad = False
249
+
250
+ if accelerator.is_main_process:
251
+ logger.debug("参数冻结情况")
252
+ for k, v in model.named_parameters():
253
+ logger.debug(f"{k}:{v.shape, v.requires_grad}")
254
+ if conf["gradient_checkpointing"]:
255
+ model.gradient_checkpointing_enable()
256
+
257
+ # 加载数据和teacher vector
258
+ train_dataset = JasperDataset_LMDB_RANDOM_ACCESS(file_path_list_or_dir=file_path_list_or_dir)
259
+ train_dataloader = DataLoader(
260
+ dataset=train_dataset,
261
+ shuffle=False,
262
+ collate_fn=collate_fn_jasper_text,
263
+ batch_size=batch_size,
264
+ num_workers=6,
265
+ drop_last=True,
266
+ # pin_memory=True,
267
+ # pin_memory_device="cuda",
268
+ prefetch_factor=4,
269
+ )
270
+ # 加载上次的训练状态
271
+ accelerator.wait_for_everyone()
272
+ # init log
273
+ if "wandb" in log_init_kwargs:
274
+ log_init_kwargs["wandb"]["dir"] = join(output_dir, "logs/wandb_logs")
275
+ log_init_kwargs["wandb"]["config"] = {k: json.dumps(v, ensure_ascii=False) for k, v in conf.items()}
276
+ accelerator.init_trackers(
277
+ project_name=project_name,
278
+ init_kwargs=log_init_kwargs
279
+ )
280
+ # Optimizer
281
+ optimizer = torch.optim.AdamW(model.parameters(), lr=conf["learning_rate"])
282
+ # if os.path.exists(join(model_path_or_name, "optimizer.bin")):
283
+ # optimizer.load_state_dict(torch.load(join(model_path_or_name, "optimizer.bin"), weights_only=False, map_location="cpu"))
284
+ # scheduler
285
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)
286
+ max_train_steps = num_update_steps_per_epoch * num_train_epochs
287
+ if isinstance(conf["num_warmup_steps"], float):
288
+ num_warmup_steps = int(max_train_steps * conf["num_warmup_steps"])
289
+ else:
290
+ num_warmup_steps = conf["num_warmup_steps"]
291
+ lr_scheduler = get_scheduler(
292
+ name=lr_scheduler_type,
293
+ optimizer=optimizer,
294
+ num_warmup_steps=num_warmup_steps,
295
+ num_training_steps=max_train_steps,
296
+ scheduler_specific_kwargs=scheduler_kwargs,
297
+ )
298
+ logger.debug(f"before prepare, len(train_dataloader): {len(train_dataloader)}")
299
+ # prepare everything
300
+ model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
301
+ model, optimizer, train_dataloader, lr_scheduler
302
+ )
303
+ logger.debug(f"after prepare, len(train_dataloader): {len(train_dataloader)}")
304
+ # We need to recalculate our total training steps as the size of the training dataloader may have changed.
305
+ ## PS: 多机多卡的问题,之前的计算没有考虑num_process,多机读卡下len(train_dataloader)会变小, 接下来的相当于是每张卡的数量
306
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / accelerator.gradient_accumulation_steps)
307
+ max_train_steps = num_train_epochs * num_update_steps_per_epoch
308
+ logger.debug(f"max_train_steps for each card:{max_train_steps}")
309
+ starting_epoch, completed_steps = 0, 0
310
+
311
+ progress_bar = tqdm(range(max_train_steps), disable=not accelerator.is_local_main_process)
312
+
313
+ if continue_train:
314
+ logger.info(f"Continue train from {model_dir}")
315
+ accelerator.load_state(resume_model_dir)
316
+ resume_step = int(os.path.basename(resume_model_dir).replace("step_", ""))
317
+ completed_steps = resume_step
318
+ starting_epoch = resume_step // num_update_steps_per_epoch
319
+ resume_step -= starting_epoch * num_update_steps_per_epoch
320
+ progress_bar.update(completed_steps)
321
+ # 开始训练
322
+ CL_LABELS = CL_LABELS.to(accelerator.device)
323
+ for epoch in range(starting_epoch, num_train_epochs):
324
+ model.train()
325
+ # skip new `skip_first_batches` to skip the batches when resuming from ckpt
326
+ if continue_train and epoch == starting_epoch:
327
+ # We need to skip steps until we reach the resumed step
328
+ active_dataloader = accelerator.skip_first_batches(
329
+ train_dataloader,
330
+ resume_step * gradient_accumulation_steps
331
+ )
332
+ else:
333
+ # After the first iteration though, we need to go back to the original dataloader
334
+ active_dataloader = train_dataloader
335
+ logger.debug(f"len(active_dataloader): {len(active_dataloader)}")
336
+
337
+ for batch in active_dataloader:
338
+ teacher_vectors = batch.pop("teacher_vectors")
339
+ with accelerator.accumulate(model):
340
+ attention_mask = batch["attention_mask"]
341
+ model_output = model(**batch)
342
+ student_vectors = model_output["token_embeddings"].float()[:, 0]
343
+ student_vectors = F.normalize(student_vectors, p=2, dim=-1)
344
+ # 计算cosine loss
345
+ cosine_loss = (1 - (student_vectors * teacher_vectors).sum(axis=1).mean()) * cosine_loss_scale
346
+ # 计算老师和学生的相似度值损失
347
+ sim_value_loss = F.mse_loss(
348
+ input=torch.matmul(student_vectors, student_vectors.T),
349
+ target=torch.matmul(teacher_vectors, teacher_vectors.T),
350
+ ) * mse_loss_scale
351
+ # 计算 排序损失函数
352
+ ## 首先获取 rank_labellabel
353
+ rank_label = torch.where(get_score_diff(teacher_vectors) < 0, 1, -1)
354
+ rank_loss = F.relu(get_score_diff(student_vectors) * rank_label + rank_margin).mean() * rank_loss_scale
355
+
356
+ loss = cosine_loss
357
+ if "sim_value_loss" in used_loss:
358
+ loss = loss + sim_value_loss
359
+ if "rank_loss" in used_loss:
360
+ loss = loss + rank_loss
361
+ ########################## debug 信息 #######################################################
362
+ if accelerator.is_main_process and (completed_steps == 10 or random.random() < print_debug_info_prob):
363
+ input_ids = batch["input_ids"].cpu().numpy()
364
+ attention_mask = batch["attention_mask"].cpu().numpy()
365
+ debug_index = random.randint(0, len(input_ids) - 1)
366
+ for debug_k, debug_v in batch.items():
367
+ logger.debug(f"{debug_k}.shape: {debug_v.shape}")
368
+ logger.debug(f"debug_index: {debug_index}")
369
+ logger.debug(f"input_ids: {input_ids[debug_index].tolist()}")
370
+ logger.debug(f"input_tokens: {tokenizer.decode(input_ids[debug_index])}")
371
+ logger.debug(f"attention_mask: {attention_mask[debug_index].tolist()}")
372
+ logger.debug(f"teacher_vectors.shape: {teacher_vectors.shape}")
373
+ logger.debug(f"student_vectors.shape: {student_vectors.shape}")
374
+ ###############################################################################################
375
+
376
+ accelerator.backward(loss)
377
+ optimizer.step()
378
+ lr_scheduler.step()
379
+ optimizer.zero_grad()
380
+ if accelerator.sync_gradients:
381
+ progress_bar.update(1)
382
+ completed_steps += 1
383
+ if completed_steps == 15:
384
+ save_model()
385
+ if completed_steps % save_steps == 0 and completed_steps > 0:
386
+ save_model()
387
+ # log
388
+ if accelerator.is_main_process:
389
+ curr_lr = float(lr_scheduler.get_last_lr()[-1])
390
+ logger.info(
391
+ f"epoch-{epoch},completed_steps-{completed_steps},lr:{curr_lr},cosine_loss:{cosine_loss.item()},sim_value_loss:{sim_value_loss.item()},rank_loss:{rank_loss.item()}"
392
+ )
393
+ accelerator.log(
394
+ {
395
+ "cosine_loss": cosine_loss.item(),
396
+ "sim_value_loss": sim_value_loss.item(),
397
+ "rank_loss": rank_loss.item(),
398
+ "lr": curr_lr
399
+ },
400
+ step=completed_steps
401
+ )
402
+
403
+
404
+ save_model()
405
+ accelerator.end_training()
scripts/original_stella_jasper_training_codes/run_train_distill_stage2.py ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf8
2
+ import json
3
+ import os
4
+ import sys
5
+ import yaml
6
+ import torch
7
+ import shutil
8
+ import math
9
+ import random
10
+ import lmdb
11
+ import pickle
12
+ import accelerate
13
+ from loguru import logger
14
+ from torch.utils.data import DataLoader, Dataset
15
+ from transformers import get_scheduler
16
+ from accelerate import Accelerator
17
+ from accelerate.utils import set_seed, ProjectConfiguration
18
+ from tqdm import tqdm
19
+ from typing import List, Union
20
+ from os.path import join
21
+ import torch.nn.functional as F
22
+ from jasper_model.modeling_jasper_vl import JasperVL
23
+ from jasper_model.tokenization_qwen import Qwen2TokenizerFast
24
+
25
+
26
class JasperDataset_LMDB_RANDOM_ACCESS(Dataset):
    """Random-access dataset over one or more LMDB files holding distillation records.

    Each record is a pickled dict with at least a ``text`` field and an
    ``extra`` field (a JSON string containing ``prompt_student``).
    Relies on module-level globals ``seed`` and ``accelerator`` defined in
    ``__main__``.
    """

    def __init__(self, file_path_list_or_dir: Union[List[str], str]):
        # A directory argument is scanned for "<name>-lock" files; each lock
        # file marks a single-file LMDB database "<name>" (opened with
        # subdir=False below), so name[:-5] strips the "-lock" suffix.
        if isinstance(file_path_list_or_dir, str):
            file_path_list = []
            for name in os.listdir(file_path_list_or_dir):
                if not name.endswith('-lock'):
                    continue
                file_path_list.append(join(file_path_list_or_dir, name[:-5]))
        else:
            file_path_list = file_path_list_or_dir
        file_path_list.sort()
        # Seed before shuffling so every distributed rank produces the same
        # file order (uses the module-level `seed` from the YAML config).
        random.seed(seed)
        random.shuffle(file_path_list)
        # file_path_list = file_path_list[:20]
        self.lmdb_env_list = [
            lmdb.open(file_path, readonly=True, readahead=False, subdir=False, lock=False)
            for file_path in file_path_list
        ]
        self.lmdb_txn_list = [lmdb_env.begin(write=False, buffers=True) for lmdb_env in self.lmdb_env_list]
        # Number of entries per environment; used to route a global index to
        # the right database in __getitem__.
        self.num_data_of_env = [lmdb_env.stat()["entries"] for lmdb_env in self.lmdb_env_list]
        self.num_all_data = sum(self.num_data_of_env)
        # accumulation_numbers[i] = total entries in envs[0..i] (prefix sums).
        self.accumulation_numbers = [sum(self.num_data_of_env[:idx + 1]) for idx in range(len(self.num_data_of_env))]
        if accelerator.is_main_process:
            logger.info(f"file_path_list:{file_path_list}")
            logger.info(f"number of data:{self.num_all_data}")

    def __len__(self):
        # Total records across all LMDB environments.
        return self.num_all_data

    def __getitem__(self, item):

        # print("accelerator.local_process_index,item", accelerator.local_process_index, item)
        # Map the global index to (environment, local index) via prefix sums.
        for env_idx, accum_num in enumerate(self.accumulation_numbers):
            if item < accum_num:
                break
        txn = self.lmdb_txn_list[env_idx]
        item -= self.accumulation_numbers[env_idx - 1] if env_idx > 0 else 0
        # NOTE(review): pickle.loads on stored data — safe only because the
        # LMDB files are produced by this project's own pipeline.
        data_item = pickle.loads(bytes(txn.get(f"{item}".encode())))
        # `text` is unpacked but unused; only `extra` matters here.
        text, extra = data_item["text"], json.loads(data_item["extra"])
        # Prepend the student prompt to the raw text before tokenization.
        data_item["text"] = extra["prompt_student"] + data_item["text"]
        return data_item
69
+
70
+
71
def collate_fn_jasper_text(batch):
    """Collate dataset items into a tokenized batch with teacher vectors attached.

    Uses the module-level ``tokenizer``, ``teacher_vector_cols``, ``padding``
    and ``max_length`` defined in ``__main__``.

    :param batch: list of dataset items (dicts with "text" and teacher columns)
    :return: tokenizer output dict plus a "teacher_vectors" tensor
    """
    texts = []
    vector_rows = []
    for record in batch:
        texts.append(record["text"])
        concatenated = []
        for col in teacher_vector_cols:
            concatenated.extend(record[col])
        vector_rows.append(concatenated)
    teacher_vectors = torch.tensor(vector_rows)
    # When vectors from several teacher columns are concatenated, L2-normalize
    # so the concatenation is a unit vector again.
    if len(teacher_vector_cols) > 1:
        teacher_vectors = F.normalize(teacher_vectors, p=2, dim=-1)
    encoded = tokenizer(texts, padding=padding, truncation=True, max_length=max_length, return_tensors="pt")
    encoded["teacher_vectors"] = teacher_vectors
    return encoded
89
+
90
+
91
def save_model():
    """Export the current model to <output_dir>/step_<completed_steps>.

    Writes a sentence-transformers-compatible checkpoint layout: weights,
    custom modeling source files, patched config.json, pooling config,
    modules.json, and tokenizer assets copied from the base model directory.
    Reads module-level globals: accelerator, model, output_dir,
    completed_steps, model_dir.
    """
    checkpoint_dir = join(output_dir, f"step_{completed_steps}")
    # accelerator.save_state(checkpoint_dir, safe_serialization=True)
    accelerator.wait_for_everyone()
    if accelerator.is_main_process:
        logger.info(f"保存模型{checkpoint_dir}")
        # Save the unwrapped model so the checkpoint can be loaded directly.
        accelerator.unwrap_model(model).save_pretrained(checkpoint_dir, max_shard_size="32GB", safe_serialization=True)

        # Copy the custom modeling/tokenizer source files into the checkpoint.
        shutil.copy("./jasper_model/configuration_jasper_vl.py", join(checkpoint_dir, "configuration_jasper_vl.py"))
        shutil.copy("./jasper_model/modeling_jasper_vl.py", join(checkpoint_dir, "modeling_jasper_vl.py"))
        shutil.copy("./jasper_model/tokenization_qwen.py", join(checkpoint_dir, "tokenization_qwen.py"))
        # Patch config.json: drop the local path and register the auto_map so
        # AutoModel/AutoConfig resolve to the copied custom classes.
        with open(join(checkpoint_dir, "config.json"), "r", encoding="utf8") as fr:
            config = json.load(fr)
        if "_name_or_path" in config:
            config.pop("_name_or_path")
        config["auto_map"] = {
            "AutoModel": "modeling_jasper_vl.JasperVL",
            "AutoConfig": "configuration_jasper_vl.JasperVLConfig",
        }
        with open(join(checkpoint_dir, "config.json"), "w", encoding="utf8") as fw:
            json.dump(config, fw, ensure_ascii=False, indent=1)

        # Pooling config: CLS-token pooling over a 12288-dim embedding,
        # prompt tokens excluded from pooling.
        os.makedirs(join(checkpoint_dir, "1_Pooling"), exist_ok=True)
        config = {
            "word_embedding_dimension": 12288,
            "pooling_mode_cls_token": True,
            "pooling_mode_mean_tokens": False,
            "pooling_mode_max_tokens": False,
            "pooling_mode_mean_sqrt_len_tokens": False,
            "pooling_mode_weightedmean_tokens": False,
            "pooling_mode_lasttoken": False,
            "include_prompt": False
        }
        with open(join(checkpoint_dir, "1_Pooling/config.json"), "w", encoding="utf8") as fw:
            json.dump(config, fw, ensure_ascii=False, indent=1)
        ## modules.json: the sentence-transformers module pipeline
        ## (Transformer -> Pooling).
        with open(os.path.join(checkpoint_dir, "modules.json"), "w", encoding="utf8") as fw:
            json.dump(
                [
                    {
                        "idx": 0,
                        "name": "0",
                        "path": "",
                        "type": "sentence_transformers.models.Transformer"
                    },
                    {
                        "idx": 1,
                        "name": "1",
                        "path": "1_Pooling",
                        "type": "sentence_transformers.models.Pooling"
                    }
                ],
                fw,
                ensure_ascii=False,
                indent=1
            )
        ## Copy tokenizer and sentence-transformers assets from the base model.
        shutil.copy(join(model_dir, "added_tokens.json"), join(checkpoint_dir, "added_tokens.json"))
        shutil.copy(join(model_dir, "config_sentence_transformers.json"),
                    join(checkpoint_dir, "config_sentence_transformers.json"))
        shutil.copy(join(model_dir, "merges.txt"), join(checkpoint_dir, "merges.txt"))
        shutil.copy(join(model_dir, "sentence_bert_config.json"), join(checkpoint_dir, "sentence_bert_config.json"))
        shutil.copy(join(model_dir, "special_tokens_map.json"), join(checkpoint_dir, "special_tokens_map.json"))
        shutil.copy(join(model_dir, "tokenizer_config.json"), join(checkpoint_dir, "tokenizer_config.json"))
        shutil.copy(join(model_dir, "tokenizer.json"), join(checkpoint_dir, "tokenizer.json"))
        shutil.copy(join(model_dir, "vocab.json"), join(checkpoint_dir, "vocab.json"))
160
+
161
+
162
def get_score_diff(vectors):
    """Differences between all unordered pairs of pairwise similarity scores.

    Computes the Gram matrix of ``vectors``, keeps the strict upper triangle
    (each unordered pair's score once, row-major order), then returns
    ``scores[j] - scores[i]`` for every pair i < j of those scores.
    """
    sim = vectors @ vectors.T
    n = sim.shape[0]
    row_idx, col_idx = torch.triu_indices(n, n, offset=1)
    pair_scores = sim[row_idx, col_idx]
    m = pair_scores.shape[0]
    first, second = torch.triu_indices(m, m, offset=1)
    return pair_scores[second] - pair_scores[first]
168
+
169
+
170
if __name__ == "__main__":
    # Read the YAML configuration whose path is given as argv[1].
    with open(sys.argv[1].strip(), "r", encoding="utf8") as fr:
        conf = yaml.safe_load(fr)
    model_dir = conf["model_path_or_name"]
    max_length = conf["max_length"]
    resume_model_dir = conf["resume_model_dir"]
    output_dir = conf["output_dir"]
    save_steps = conf["save_steps"]
    batch_size = conf["batch_size"]
    project_name = conf["project_name"]
    log_with = conf["log_with"]
    log_init_kwargs = conf["log_init_kwargs"]
    file_path_list_or_dir = conf["file_path_list"]
    print_debug_info_prob = conf["print_debug_info_prob"]
    gradient_accumulation_steps = conf["gradient_accumulation_steps"]
    continue_train = conf["continue_train"]
    num_train_epochs = conf["num_train_epochs"]
    lr_scheduler_type = conf["lr_scheduler_type"]
    mse_loss_scale = conf["mse_loss_scale"]
    cosine_loss_scale = conf["cosine_loss_scale"]
    padding = conf["padding"]
    teacher_vector_cols = conf["teacher_vector_cols"]
    rank_margin = conf["rank_margin"]
    rank_loss_scale = conf["rank_loss_scale"]
    scheduler_kwargs = conf.get("scheduler_kwargs", {})

    seed = conf["seed"]
    # Initialize the accelerator. find_unused_parameters is only needed for
    # DDP when gradient checkpointing is off (frozen params are "unused").
    accelerator = Accelerator(
        project_config=ProjectConfiguration(
            project_dir=output_dir,
            logging_dir=join(output_dir, "logs"),
        ),
        gradient_accumulation_steps=gradient_accumulation_steps,
        log_with=log_with,
        kwargs_handlers=[
            accelerate.DistributedDataParallelKwargs(find_unused_parameters=not conf["gradient_checkpointing"])]
    )

    # Create output/log directories and archive the config (main process only).
    with accelerator.main_process_first():
        if accelerator.is_main_process:
            os.makedirs(output_dir, exist_ok=True)
            os.makedirs(join(output_dir, "logs/wandb_logs"), exist_ok=True)
            logger.add(
                join(output_dir, "train_logs.txt"),
                level="DEBUG",
                compression="zip",
                rotation="500 MB",
                # format="{message}"
            )
            shutil.copy(sys.argv[1].strip(), join(output_dir, "train_config.yml"))

    accelerator.wait_for_everyone()
    if accelerator.is_main_process:
        logger.info(f"accelerator.state:{accelerator.state}")

    # Seed everything for reproducibility.
    set_seed(seed=seed)
    # Load model and tokenizer.
    model = JasperVL.from_pretrained(model_dir)
    tokenizer = Qwen2TokenizerFast.from_pretrained(model_dir, padding_side="right")
    # Freeze the backbone; keep only the final norm and the last transformer
    # layer (layers.27) trainable.
    for k, v in model.named_parameters():
        if k.startswith("model."):
            v.requires_grad = False
        if "model.norm.weight" in k or "layers.27" in k:
            v.requires_grad = True

    if accelerator.is_main_process:
        logger.debug("被训练的参数如下:")
        for k, v in model.named_parameters():
            if v.requires_grad:
                logger.debug(f"{k}:{v.shape, v.requires_grad}")
    if conf["gradient_checkpointing"]:
        model.gradient_checkpointing_enable()

    # Load training data together with the precomputed teacher vectors.
    train_dataset = JasperDataset_LMDB_RANDOM_ACCESS(file_path_list_or_dir=file_path_list_or_dir)
    train_dataloader = DataLoader(
        dataset=train_dataset,
        shuffle=False,
        collate_fn=collate_fn_jasper_text,
        batch_size=batch_size,
        num_workers=6,
        drop_last=True,
        # pin_memory=True,
        # pin_memory_device="cuda",
        prefetch_factor=4,
    )
    # (Resuming of a previous training state happens further below.)
    accelerator.wait_for_everyone()
    # Initialize experiment trackers (wandb dir/config patched if present).
    if "wandb" in log_init_kwargs:
        log_init_kwargs["wandb"]["dir"] = join(output_dir, "logs/wandb_logs")
        log_init_kwargs["wandb"]["config"] = {k: json.dumps(v, ensure_ascii=False) for k, v in conf.items()}
    accelerator.init_trackers(
        project_name=project_name,
        init_kwargs=log_init_kwargs
    )
    # Optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=conf["learning_rate"])
    # if os.path.exists(join(model_path_or_name, "optimizer.bin")):
    # optimizer.load_state_dict(torch.load(join(model_path_or_name, "optimizer.bin"), weights_only=False, map_location="cpu"))
    # LR scheduler; a float num_warmup_steps is interpreted as a fraction of
    # the total number of training steps.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)
    max_train_steps = num_update_steps_per_epoch * num_train_epochs
    if isinstance(conf["num_warmup_steps"], float):
        num_warmup_steps = int(max_train_steps * conf["num_warmup_steps"])
    else:
        num_warmup_steps = conf["num_warmup_steps"]
    lr_scheduler = get_scheduler(
        name=lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=max_train_steps,
        scheduler_specific_kwargs=scheduler_kwargs,
    )
    logger.debug(f"before prepare, len(train_dataloader): {len(train_dataloader)}")
    # Prepare everything for distributed training.
    model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, lr_scheduler
    )
    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
    ## PS: the earlier computation ignored num_processes; with multiple
    ## machines/GPUs len(train_dataloader) shrinks, so the values below are
    ## per-card quantities.
    logger.debug(f"after prepare, len(train_dataloader): {len(train_dataloader)}")
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / accelerator.gradient_accumulation_steps)
    max_train_steps = num_train_epochs * num_update_steps_per_epoch
    logger.debug(f"max_train_steps for each card:{max_train_steps}")
    starting_epoch, completed_steps = 0, 0

    progress_bar = tqdm(range(max_train_steps), disable=not accelerator.is_local_main_process)

    # Resume from a checkpoint directory named "step_<N>" if requested.
    if continue_train:
        logger.info(f"Continue train from {model_dir}")
        accelerator.load_state(resume_model_dir)
        resume_step = int(os.path.basename(resume_model_dir).replace("step_", ""))
        completed_steps = resume_step
        starting_epoch = resume_step // num_update_steps_per_epoch
        resume_step -= starting_epoch * num_update_steps_per_epoch
        progress_bar.update(completed_steps)
    # Training loop.
    for epoch in range(starting_epoch, num_train_epochs):
        model.train()
        # skip new `skip_first_batches` to skip the batches when resuming from ckpt
        if continue_train and epoch == starting_epoch:
            # We need to skip steps until we reach the resumed step
            active_dataloader = accelerator.skip_first_batches(
                train_dataloader,
                resume_step * gradient_accumulation_steps
            )
        else:
            # After the first iteration though, we need to go back to the original dataloader
            active_dataloader = train_dataloader
        logger.debug(f"len(active_dataloader): {len(active_dataloader)}")

        for batch in active_dataloader:
            teacher_vectors = batch.pop("teacher_vectors")
            with accelerator.accumulate(model):
                # NOTE(review): this binding is unused before being re-bound
                # inside the debug branch below.
                attention_mask = batch["attention_mask"]
                model_output = model(**batch)
                # CLS-token embedding, L2-normalized, as the student vector.
                student_vectors = model_output["token_embeddings"].float()[:, 0]
                student_vectors = F.normalize(student_vectors, p=2, dim=-1)
                # Cosine loss: 1 - mean cosine similarity to the teacher.
                cosine_loss = (1 - (student_vectors * teacher_vectors).sum(axis=1).mean()) * cosine_loss_scale
                # MSE between student and teacher in-batch similarity matrices.
                sim_value_loss = F.mse_loss(
                    input=torch.matmul(student_vectors, student_vectors.T),
                    target=torch.matmul(teacher_vectors, teacher_vectors.T),
                ) * mse_loss_scale
                # Ranking loss: penalize pairs whose score ordering disagrees
                # with the teacher's ordering by more than rank_margin.
                ## First derive the rank labels from the teacher score diffs.
                rank_label = torch.where(get_score_diff(teacher_vectors) < 0, 1, -1)
                rank_loss = F.relu(get_score_diff(student_vectors) * rank_label + rank_margin).mean() * rank_loss_scale

                loss = cosine_loss + sim_value_loss + rank_loss
                ########################## debug info #######################################################
                if accelerator.is_main_process and (completed_steps == 10 or random.random() < print_debug_info_prob):
                    input_ids = batch["input_ids"].cpu().numpy()
                    attention_mask = batch["attention_mask"].cpu().numpy()
                    debug_index = random.randint(0, len(input_ids) - 1)
                    for debug_k, debug_v in batch.items():
                        logger.debug(f"{debug_k}.shape: {debug_v.shape}")
                    logger.debug(f"debug_index: {debug_index}")
                    logger.debug(f"input_ids: {input_ids[debug_index].tolist()}")
                    logger.debug(f"input_tokens: {tokenizer.decode(input_ids[debug_index])}")
                    logger.debug(f"attention_mask: {attention_mask[debug_index].tolist()}")
                    logger.debug(f"teacher_vectors.shape: {teacher_vectors.shape}")
                    logger.debug(f"student_vectors.shape: {student_vectors.shape}")
                ###############################################################################################

                accelerator.backward(loss)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                if accelerator.sync_gradients:
                    progress_bar.update(1)
                    completed_steps += 1
                    # Early sanity-check save at step 15, then periodic saves.
                    if completed_steps == 15:
                        save_model()
                    if completed_steps % save_steps == 0 and completed_steps > 0:
                        save_model()
                    # Log losses and learning rate (main process only).
                    if accelerator.is_main_process:
                        curr_lr = float(lr_scheduler.get_last_lr()[-1])
                        logger.info(
                            f"epoch-{epoch},completed_steps-{completed_steps},lr:{curr_lr},cosine_loss:{cosine_loss.item()},sim_value_loss:{sim_value_loss.item()},rank_loss:{rank_loss.item()}"
                        )
                        accelerator.log(
                            {
                                "cosine_loss": cosine_loss.item(),
                                "sim_value_loss": sim_value_loss.item(),
                                "rank_loss": rank_loss.item(),
                                "lr": curr_lr
                            },
                            step=completed_steps
                        )

    # Save the model one final time after training finishes.
    save_model()
    accelerator.end_training()
scripts/original_stella_jasper_training_codes/run_train_mrl_stage3.py ADDED
@@ -0,0 +1,415 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf8
2
+ import json
3
+ import os
4
+ import sys
5
+ import yaml
6
+ import torch
7
+ import shutil
8
+ import math
9
+ import random
10
+ import lmdb
11
+ import pickle
12
+ import accelerate
13
+ from loguru import logger
14
+ from torch.utils.data import DataLoader, Dataset
15
+ from transformers import get_scheduler
16
+ from accelerate import Accelerator
17
+ from accelerate.utils import set_seed, ProjectConfiguration
18
+ from tqdm import tqdm
19
+ from typing import List, Union
20
+ from os.path import join
21
+ import torch.nn.functional as F
22
+ from jasper_model.modeling_jasper_vl import JasperVL
23
+ from jasper_model.tokenization_qwen import Qwen2TokenizerFast
24
+ from jasper_model.configuration_jasper_vl import JasperVLConfig
25
+ from safetensors.torch import load_file
26
+
27
+
28
class JasperDataset_LMDB_RANDOM_ACCESS(Dataset):
    """Random-access dataset over one or more LMDB files for MRL (stage 3) training.

    Each record is a pickled dict with a ``text`` field and an ``extra`` field
    (a JSON string containing ``prompt_student``). Relies on module-level
    globals ``seed`` and ``accelerator`` defined in ``__main__``.
    """

    def __init__(self, file_path_list_or_dir: Union[List[str], str]):
        # A directory argument is scanned for "<name>-lock" files; each lock
        # file marks a single-file LMDB database "<name>" (subdir=False below).
        if isinstance(file_path_list_or_dir, str):
            file_path_list = []
            for name in os.listdir(file_path_list_or_dir):
                if not name.endswith('-lock'):
                    continue
                file_path_list.append(join(file_path_list_or_dir, name[:-5]))
        else:
            file_path_list = file_path_list_or_dir
        file_path_list.sort()
        # Seed before shuffling so every distributed rank sees the same order.
        random.seed(seed)
        random.shuffle(file_path_list)
        # TODO: earlier stages trained sequentially for speed and consumed only
        # a prefix of the files; for MRL training the list is reversed so the
        # previously unseen data is read first.
        file_path_list = file_path_list[::-1]
        self.lmdb_env_list = [
            lmdb.open(file_path, readonly=True, readahead=False, subdir=False, lock=False)
            for file_path in file_path_list
        ]
        self.lmdb_txn_list = [lmdb_env.begin(write=False, buffers=True) for lmdb_env in self.lmdb_env_list]
        # Number of entries per environment, with prefix sums used to route a
        # global index to (env, local index) in __getitem__.
        self.num_data_of_env = [lmdb_env.stat()["entries"] for lmdb_env in self.lmdb_env_list]
        self.num_all_data = sum(self.num_data_of_env)
        self.accumulation_numbers = [sum(self.num_data_of_env[:idx + 1]) for idx in range(len(self.num_data_of_env))]
        if accelerator.is_main_process:
            logger.info(f"file_path_list:{file_path_list}")
            logger.info(f"number of data:{self.num_all_data}")

    def __len__(self):
        # Total records across all LMDB environments.
        return self.num_all_data

    def __getitem__(self, item):

        # print("accelerator.local_process_index,item", accelerator.local_process_index, item)
        # Map the global index to the owning environment via prefix sums.
        for env_idx, accum_num in enumerate(self.accumulation_numbers):
            if item < accum_num:
                break
        txn = self.lmdb_txn_list[env_idx]
        item -= self.accumulation_numbers[env_idx - 1] if env_idx > 0 else 0
        # NOTE(review): pickle.loads on stored data — safe only because the
        # LMDB files are produced by this project's own pipeline.
        data_item = pickle.loads(bytes(txn.get(f"{item}".encode())))
        # `text` is unpacked but unused; only `extra` matters here.
        text, extra = data_item["text"], json.loads(data_item["extra"])
        # Prepend the student prompt to the raw text before tokenization.
        data_item["text"] = extra["prompt_student"] + data_item["text"]
        return data_item
72
+
73
+
74
def collate_fn_jasper_text(batch):
    """Build a tokenized batch and attach the concatenated teacher vectors.

    Depends on module-level ``tokenizer``, ``teacher_vector_cols``,
    ``padding`` and ``max_length`` set up in ``__main__``.

    :param batch: list of dataset items
    :return: tokenizer output dict with an added "teacher_vectors" tensor
    """
    all_texts = [example["text"] for example in batch]
    rows = []
    for example in batch:
        flat = []
        for col in teacher_vector_cols:
            flat.extend(example[col])
        rows.append(flat)
    teacher_vectors = torch.tensor(rows)
    # Concatenations of several teacher vectors are re-normalized to unit
    # L2 norm; a single-column vector is assumed already normalized.
    if len(teacher_vector_cols) > 1:
        teacher_vectors = F.normalize(teacher_vectors, p=2, dim=-1)
    batch_inputs = tokenizer(all_texts, padding=padding, truncation=True, max_length=max_length, return_tensors="pt")
    batch_inputs["teacher_vectors"] = teacher_vectors
    return batch_inputs
92
+
93
+
94
def save_model():
    """Export the current model to <output_dir>/step_<completed_steps>.

    Writes a sentence-transformers-compatible checkpoint: weights, custom
    modeling source files, patched config.json, CLS pooling config (4096-dim
    at this stage), modules.json, and tokenizer assets copied from the base
    model directory. Reads module-level globals: accelerator, model,
    output_dir, completed_steps, model_dir.
    """
    checkpoint_dir = join(output_dir, f"step_{completed_steps}")
    # accelerator.save_state(checkpoint_dir, safe_serialization=True)
    accelerator.wait_for_everyone()
    if accelerator.is_main_process:
        logger.info(f"保存模型{checkpoint_dir}")
        accelerator.unwrap_model(model).save_pretrained(checkpoint_dir, max_shard_size="32GB", safe_serialization=True)
        # Copy the custom modeling/tokenizer source files into the checkpoint.
        shutil.copy("./jasper_model/configuration_jasper_vl.py", join(checkpoint_dir, "configuration_jasper_vl.py"))
        shutil.copy("./jasper_model/modeling_jasper_vl.py", join(checkpoint_dir, "modeling_jasper_vl.py"))
        shutil.copy("./jasper_model/tokenization_qwen.py", join(checkpoint_dir, "tokenization_qwen.py"))
        # Patch config.json: drop the local path and register the auto_map so
        # AutoModel/AutoConfig resolve to the copied custom classes.
        with open(join(checkpoint_dir, "config.json"), "r", encoding="utf8") as fr:
            config = json.load(fr)
        if "_name_or_path" in config:
            config.pop("_name_or_path")
        config["auto_map"] = {
            "AutoModel": "modeling_jasper_vl.JasperVL",
            "AutoConfig": "configuration_jasper_vl.JasperVLConfig",
        }
        with open(join(checkpoint_dir, "config.json"), "w", encoding="utf8") as fw:
            json.dump(config, fw, ensure_ascii=False, indent=1)

        # Pooling config: CLS-token pooling over a 4096-dim embedding,
        # prompt tokens excluded from pooling.
        os.makedirs(join(checkpoint_dir, "1_Pooling"), exist_ok=True)
        config = {
            "word_embedding_dimension": 4096,
            "pooling_mode_cls_token": True,
            "pooling_mode_mean_tokens": False,
            "pooling_mode_max_tokens": False,
            "pooling_mode_mean_sqrt_len_tokens": False,
            "pooling_mode_weightedmean_tokens": False,
            "pooling_mode_lasttoken": False,
            "include_prompt": False
        }
        with open(join(checkpoint_dir, "1_Pooling/config.json"), "w", encoding="utf8") as fw:
            json.dump(config, fw, ensure_ascii=False, indent=1)
        ## modules.json: sentence-transformers pipeline (Transformer -> Pooling).
        with open(os.path.join(checkpoint_dir, "modules.json"), "w", encoding="utf8") as fw:
            json.dump(
                [
                    {
                        "idx": 0,
                        "name": "0",
                        "path": "",
                        "type": "sentence_transformers.models.Transformer"
                    },
                    {
                        "idx": 1,
                        "name": "1",
                        "path": "1_Pooling",
                        "type": "sentence_transformers.models.Pooling"
                    }
                ],
                fw,
                ensure_ascii=False,
                indent=1
            )
        ## Copy tokenizer and sentence-transformers assets from the base model.
        shutil.copy(join(model_dir, "added_tokens.json"), join(checkpoint_dir, "added_tokens.json"))
        shutil.copy(join(model_dir, "config_sentence_transformers.json"),
                    join(checkpoint_dir, "config_sentence_transformers.json"))
        shutil.copy(join(model_dir, "merges.txt"), join(checkpoint_dir, "merges.txt"))
        shutil.copy(join(model_dir, "sentence_bert_config.json"), join(checkpoint_dir, "sentence_bert_config.json"))
        shutil.copy(join(model_dir, "special_tokens_map.json"), join(checkpoint_dir, "special_tokens_map.json"))
        shutil.copy(join(model_dir, "tokenizer_config.json"), join(checkpoint_dir, "tokenizer_config.json"))
        shutil.copy(join(model_dir, "tokenizer.json"), join(checkpoint_dir, "tokenizer.json"))
        shutil.copy(join(model_dir, "vocab.json"), join(checkpoint_dir, "vocab.json"))
161
+
162
+
163
def get_score_diff(vectors):
    """Differences between all unordered pairs of pairwise similarity scores.

    Builds the Gram matrix of ``vectors``, extracts the strict upper triangle
    (row-major — one score per unordered pair), then returns
    ``scores[j] - scores[i]`` for every index pair i < j of those scores.
    """
    gram = torch.matmul(vectors, vectors.T)
    size = gram.shape[0]
    upper = torch.triu_indices(size, size, offset=1)
    flat_scores = gram[upper[0], upper[1]]
    count = flat_scores.shape[0]
    pairs = torch.triu_indices(count, count, offset=1)
    return flat_scores[pairs[1]] - flat_scores[pairs[0]]
169
+
170
+
171
if __name__ == "__main__":
    # Stage-4 distillation entry point: train the JasperVL student vector heads
    # against pre-computed teacher vectors, combining a cosine loss (12288-d head
    # only), a pairwise-similarity MSE loss and a pairwise ranking loss, all
    # orchestrated with HuggingFace Accelerate.
    #
    # Usage: python run_train_align_image_text_stage4.py <train_config.yml>

    # Read the YAML training configuration from the first CLI argument.
    with open(sys.argv[1].strip(), "r", encoding="utf8") as fr:
        conf = yaml.safe_load(fr)
    model_dir = conf["model_path_or_name"]
    max_length = conf["max_length"]  # NOTE(review): not used in this script — confirm whether still needed
    resume_model_dir = conf["resume_model_dir"]
    output_dir = conf["output_dir"]
    save_steps = conf["save_steps"]
    batch_size = conf["batch_size"]
    project_name = conf["project_name"]
    log_with = conf["log_with"]
    log_init_kwargs = conf["log_init_kwargs"]
    file_path_list_or_dir = conf["file_path_list"]
    print_debug_info_prob = conf["print_debug_info_prob"]  # per-step probability of dumping a debug sample
    gradient_accumulation_steps = conf["gradient_accumulation_steps"]
    continue_train = conf["continue_train"]
    num_train_epochs = conf["num_train_epochs"]
    lr_scheduler_type = conf["lr_scheduler_type"]
    mse_loss_scale = conf["mse_loss_scale"]
    cosine_loss_scale = conf["cosine_loss_scale"]
    padding = conf["padding"]  # NOTE(review): unused here — presumably consumed by the collate fn; verify
    teacher_vector_cols = conf["teacher_vector_cols"]  # NOTE(review): unused here — verify
    rank_margin = conf["rank_margin"]
    rank_loss_scale = conf["rank_loss_scale"]
    scheduler_kwargs = conf.get("scheduler_kwargs", {})

    seed = conf["seed"]
    # Initialize the Accelerator. find_unused_parameters is disabled when
    # gradient checkpointing is on (DDP unused-parameter detection conflicts
    # with checkpointed graphs), hence the negation below.
    accelerator = Accelerator(
        project_config=ProjectConfiguration(
            project_dir=output_dir,
            logging_dir=join(output_dir, "logs"),
        ),
        gradient_accumulation_steps=gradient_accumulation_steps,
        log_with=log_with,
        kwargs_handlers=[
            accelerate.DistributedDataParallelKwargs(find_unused_parameters=not conf["gradient_checkpointing"])]
    )

    # Create output dirs, attach the file logger, and snapshot the training
    # config — on the main process only.
    with accelerator.main_process_first():
        if accelerator.is_main_process:
            os.makedirs(output_dir, exist_ok=True)
            os.makedirs(join(output_dir, "logs/wandb_logs"), exist_ok=True)
            logger.add(
                join(output_dir, "train_logs.txt"),
                level="DEBUG",
                compression="zip",
                rotation="500 MB",
                # format="{message}"
            )
            shutil.copy(sys.argv[1].strip(), join(output_dir, "train_config.yml"))

    accelerator.wait_for_everyone()
    if accelerator.is_main_process:
        logger.info(f"accelerator.state:{accelerator.state}")

    # Seed everything for reproducibility.
    set_seed(seed=seed)
    # Load the model and tokenizer.

    model_conf = JasperVLConfig.from_pretrained(model_dir)
    model = JasperVL(model_conf)
    w_di = load_file(filename=join(model_dir, "model.safetensors"), device="cpu")
    w, b = w_di["vector_linear_12288.weight"].detach(), w_di["vector_linear_12288.bias"].detach()

    # Initialize the smaller (Matryoshka-style) projection heads by mean-pooling
    # groups of consecutive output rows of the 12288-d head:
    # (12288, 1536) -> (dim, 12288/dim, 1536).mean(dim=1) -> (dim, 1536).
    w_di["vector_linear_1024.weight"] = w.reshape(1024, -1, 1536).mean(dim=1, keepdim=False)
    w_di["vector_linear_1024.bias"] = b.reshape(1024, -1).mean(dim=1, keepdim=False)

    w_di["vector_linear_512.weight"] = w.reshape(512, -1, 1536).mean(dim=1, keepdim=False)
    w_di["vector_linear_512.bias"] = b.reshape(512, -1).mean(dim=1, keepdim=False)

    w_di["vector_linear_256.weight"] = w.reshape(256, -1, 1536).mean(dim=1, keepdim=False)
    w_di["vector_linear_256.bias"] = b.reshape(256, -1).mean(dim=1, keepdim=False)
    model.load_state_dict(state_dict=w_di, strict=True)
    tokenizer = Qwen2TokenizerFast.from_pretrained(model_dir, padding_side="right")

    # Freeze the language backbone ("model." prefix), then re-enable gradients
    # for the final norm and the top three decoder layers (25/26/27).
    # Note: setting requires_grad = True on an already-trainable parameter is a
    # no-op, so this matching-by-substring is safe for non-backbone params.
    for k, v in model.named_parameters():
        if k.startswith("model."):
            v.requires_grad = False
        if "model.norm.weight" in k or "layers.27" in k or "layers.26" in k or "layers.25" in k:
            v.requires_grad = True
    if accelerator.is_main_process:
        logger.debug("参数冻结情况")
        for k, v in model.named_parameters():
            logger.debug(f"{k}:{v.shape, v.requires_grad}")
    if conf["gradient_checkpointing"]:
        model.gradient_checkpointing_enable()

    # Load training data; batches carry pre-computed teacher vectors.
    train_dataset = JasperDataset_LMDB_RANDOM_ACCESS(file_path_list_or_dir=file_path_list_or_dir)
    train_dataloader = DataLoader(
        dataset=train_dataset,
        shuffle=False,
        collate_fn=collate_fn_jasper_text,
        batch_size=batch_size,
        num_workers=6,
        drop_last=True,
        # pin_memory=True,
        # pin_memory_device="cuda",
        prefetch_factor=4,
    )
    # Synchronize before (potentially) resuming a previous training state.
    accelerator.wait_for_everyone()
    # Initialize experiment trackers (wandb gets its dir/config injected).
    if "wandb" in log_init_kwargs:
        log_init_kwargs["wandb"]["dir"] = join(output_dir, "logs/wandb_logs")
        log_init_kwargs["wandb"]["config"] = {k: json.dumps(v, ensure_ascii=False) for k, v in conf.items()}
    accelerator.init_trackers(
        project_name=project_name,
        init_kwargs=log_init_kwargs
    )
    # Optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=conf["learning_rate"])
    # if os.path.exists(join(model_path_or_name, "optimizer.bin")):
    #     optimizer.load_state_dict(torch.load(join(model_path_or_name, "optimizer.bin"), weights_only=False, map_location="cpu"))
    # LR scheduler: a float num_warmup_steps is interpreted as a fraction of
    # the total training steps, an int as an absolute step count.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)
    max_train_steps = num_update_steps_per_epoch * num_train_epochs
    if isinstance(conf["num_warmup_steps"], float):
        num_warmup_steps = int(max_train_steps * conf["num_warmup_steps"])
    else:
        num_warmup_steps = conf["num_warmup_steps"]
    lr_scheduler = get_scheduler(
        name=lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=max_train_steps,
        scheduler_specific_kwargs=scheduler_kwargs,
    )
    logger.debug(f"before prepare, len(train_dataloader): {len(train_dataloader)}")
    # Let Accelerate wrap model/optimizer/dataloader/scheduler for the
    # current distributed setup.
    model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, lr_scheduler
    )
    logger.debug(f"after prepare, len(train_dataloader): {len(train_dataloader)}")
    # Recalculate total steps: after prepare() the dataloader is sharded across
    # processes, so len(train_dataloader) shrinks in multi-GPU/multi-node runs
    # and the counts below are per-card.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / accelerator.gradient_accumulation_steps)
    max_train_steps = num_train_epochs * num_update_steps_per_epoch
    logger.debug(f"max_train_steps for each card:{max_train_steps}")
    starting_epoch, completed_steps = 0, 0

    progress_bar = tqdm(range(max_train_steps), disable=not accelerator.is_local_main_process)

    # Resume from a checkpoint directory named like ".../step_<N>".
    if continue_train:
        logger.info(f"Continue train from {model_dir}")
        accelerator.load_state(resume_model_dir)
        resume_step = int(os.path.basename(resume_model_dir).replace("step_", ""))
        completed_steps = resume_step
        starting_epoch = resume_step // num_update_steps_per_epoch
        resume_step -= starting_epoch * num_update_steps_per_epoch
        progress_bar.update(completed_steps)
    # Training loop.
    for epoch in range(starting_epoch, num_train_epochs):
        model.train()
        # Use `skip_first_batches` to skip already-consumed batches when
        # resuming from a checkpoint mid-epoch.
        if continue_train and epoch == starting_epoch:
            # We need to skip steps until we reach the resumed step
            active_dataloader = accelerator.skip_first_batches(
                train_dataloader,
                resume_step * gradient_accumulation_steps
            )
        else:
            # After the first iteration though, we need to go back to the original dataloader
            active_dataloader = train_dataloader
        logger.debug(f"len(active_dataloader): {len(active_dataloader)}")

        for batch in active_dataloader:
            teacher_vectors = batch.pop("teacher_vectors")
            with accelerator.accumulate(model):
                attention_mask = batch["attention_mask"]  # NOTE(review): unused until rebound in the debug block below
                model_output = model(**batch)
                # Teacher pairwise similarities (MSE target) and teacher-derived
                # sign labels for the pairwise ranking loss.
                target_sim_values = torch.matmul(teacher_vectors, teacher_vectors.T)
                rank_label = torch.where(get_score_diff(teacher_vectors) < 0, 1, -1)
                sim_value_loss_list, rank_loss_list = [], []
                # One student vector per projection head (256/512/1024/12288 …).
                all_vectors = [v for k, v in model_output.items() if k.startswith("student_vectors_")]
                for student_vectors in all_vectors:
                    # Take the first-token embedding and L2-normalize.
                    # NOTE(review): assumes [:, 0] is the pooled (CLS) position — confirm against the model.
                    student_vectors = student_vectors.float()[:, 0]
                    student_vectors = F.normalize(student_vectors, p=2, dim=-1)
                    if student_vectors.shape[-1] == 12288:
                        # Cosine loss against the teacher: only the full-size
                        # 12288-d head matches the teacher dimensionality.
                        # NOTE(review): if no head emits 12288-d vectors,
                        # `cosine_loss` is never bound and the `loss = ...`
                        # line below raises NameError.
                        cosine_loss = (1 - (student_vectors * teacher_vectors).sum(axis=1).mean()) * cosine_loss_scale

                    # MSE between student and teacher pairwise similarity matrices.
                    sim_value_loss_list.append(
                        F.mse_loss(
                            input=torch.matmul(student_vectors, student_vectors.T),
                            target=target_sim_values,
                        ) * mse_loss_scale
                    )
                    # print(sim_value_loss_list)
                    # Margin-based pairwise ranking loss.
                    rank_loss_list.append(
                        F.relu(get_score_diff(student_vectors) * rank_label + rank_margin).mean() * rank_loss_scale
                    )
                # Average the per-head losses and combine.
                sim_value_loss = sum(sim_value_loss_list) / len(sim_value_loss_list)
                rank_loss = sum(rank_loss_list) / len(rank_loss_list)
                loss = cosine_loss + sim_value_loss + rank_loss
                ########################## debug info #######################################################
                # Dump one random sample (main process only): fixed at step 10,
                # otherwise with probability `print_debug_info_prob`.
                if accelerator.is_main_process and (completed_steps == 10 or random.random() < print_debug_info_prob):
                    input_ids = batch["input_ids"].cpu().numpy()
                    attention_mask = batch["attention_mask"].cpu().numpy()
                    debug_index = random.randint(0, len(input_ids) - 1)
                    for debug_k, debug_v in batch.items():
                        logger.debug(f"{debug_k}.shape: {debug_v.shape}")
                    logger.debug(f"debug_index: {debug_index}")
                    logger.debug(f"input_ids: {input_ids[debug_index].tolist()}")
                    logger.debug(f"input_tokens: {tokenizer.decode(input_ids[debug_index])}")
                    logger.debug(f"attention_mask: {attention_mask[debug_index].tolist()}")
                    logger.debug(f"teacher_vectors.shape: {teacher_vectors.shape}")
                    logger.debug(f"student_vectors.shape: {student_vectors.shape}")
                ###############################################################################################

                accelerator.backward(loss)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
            # Only count a step when gradients were actually synchronized
            # (i.e. once per accumulation cycle).
            if accelerator.sync_gradients:
                progress_bar.update(1)
                completed_steps += 1
                # Early smoke-test save at step 15, then periodic saves.
                if completed_steps == 15:
                    save_model()
                if completed_steps % save_steps == 0 and completed_steps > 0:
                    save_model()
                # Log losses and current LR (main process only).
                if accelerator.is_main_process:
                    curr_lr = float(lr_scheduler.get_last_lr()[-1])
                    logger.info(
                        f"epoch-{epoch},completed_steps-{completed_steps},lr:{curr_lr},cosine_loss:{cosine_loss.item()},sim_value_loss:{sim_value_loss.item()},rank_loss:{rank_loss.item()}"
                    )
                    accelerator.log(
                        {
                            "cosine_loss": cosine_loss.item(),
                            "sim_value_loss": sim_value_loss.item(),
                            "rank_loss": rank_loss.item(),
                            "lr": curr_lr
                        },
                        step=completed_steps
                    )

    # Save the model once more after training completes.
    save_model()
    accelerator.end_training()