Upload 15 files

Browse files

Files changed (15) hide show

README.md +71 -0
adapter_config.json +28 -0
adapter_model.safetensors +3 -0
all_results.json +12 -0
eval_results.json +7 -0
special_tokens_map.json +18 -0
tokenization_chatglm.py +328 -0
tokenizer.model +3 -0
tokenizer_config.json +65 -0
train_results.json +8 -0
trainer_log.jsonl +81 -0
trainer_state.json +590 -0
training_args.bin +3 -0
training_eval_loss.png +0 -0
training_loss.png +0 -0

README.md ADDED Viewed

	@@ -0,0 +1,71 @@

+---
+license: other
+library_name: peft
+tags:
+- llama-factory
+- lora
+- generated_from_trainer
+base_model: THUDM/chatglm3-6b
+model-index:
+- name: LangGPT
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# LangGPT
+This model is a fine-tuned version of [THUDM/chatglm3-6b](https://huggingface.co/THUDM/chatglm3-6b) on the LangGPT dataset.
+It achieves the following results on the evaluation set:
+- Loss: 0.8991
+## Model description
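+A LoRA adapter (rank 8, alpha 16, no dropout, applied to the `query_key_value` modules) for THUDM/chatglm3-6b, trained with LLaMA-Factory and PEFT for causal language modelling.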
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 5e-05
+- train_batch_size: 12
+- eval_batch_size: 4
+- seed: 42
+- gradient_accumulation_steps: 8
+- total_train_batch_size: 96
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_steps: 20
+- num_epochs: 9.0
+- mixed_precision_training: Native AMP
+### Training results
+| Training Loss | Epoch | Step | Validation Loss |
+|:-------------:|:-----:|:----:|:---------------:|
+| 1.0558        | 1.25  | 100  | 1.0500          |
+| 0.9566        | 2.5   | 200  | 0.9630          |
+| 0.9082        | 3.75  | 300  | 0.9288          |
+| 0.8992        | 5.0   | 400  | 0.9108          |
+| 0.8874        | 6.25  | 500  | 0.9028          |
+| 0.8835        | 7.5   | 600  | 0.8997          |
+| 0.8912        | 8.75  | 700  | 0.8991          |
+### Framework versions
+- PEFT 0.10.0
+- Transformers 4.40.1
+- Pytorch 2.2.0+cu121
+- Datasets 2.16.1
+- Tokenizers 0.19.1

adapter_config.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "THUDM/chatglm3-6b",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "query_key_value"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:404f0a92a53d60d0eb9cfabc139cc99740bb4ff315929bf64fcccd2fead20436
+size 7807744

all_results.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+    "epoch": 9.0,
+    "eval_loss": 0.8991448283195496,
+    "eval_runtime": 353.2745,
+    "eval_samples_per_second": 2.417,
+    "eval_steps_per_second": 0.606,
+    "total_flos": 2.5580424283828716e+18,
+    "train_loss": 0.9693152533637153,
+    "train_runtime": 60193.9387,
+    "train_samples_per_second": 1.148,
+    "train_steps_per_second": 0.012
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+    "epoch": 9.0,
+    "eval_loss": 0.8991448283195496,
+    "eval_runtime": 353.2745,
+    "eval_samples_per_second": 2.417,
+    "eval_steps_per_second": 0.606
+}

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,18 @@

+{
+  "additional_special_tokens": [
+    {
+      "content": "<|user|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    },
+    {
+      "content": "<|observation|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false
+    }
+  ]
+}

tokenization_chatglm.py ADDED Viewed

	@@ -0,0 +1,328 @@

+import json
+import os
+import re
+from typing import List, Optional, Union, Dict
+from sentencepiece import SentencePieceProcessor
+from transformers import PreTrainedTokenizer
+from transformers.utils import logging, PaddingStrategy
+from transformers.tokenization_utils_base import EncodedInput, BatchEncoding
+logger = logging.get_logger(__name__)
class SPTokenizer:
    """Wrapper around a SentencePiece model that appends ChatGLM control tokens.

    The control tokens (``[MASK]``, ``[gMASK]``, ``[sMASK]``, ``sop``, ``eop``
    and the four role markers) are not looked up in the SentencePiece model;
    they are assigned consecutive ids starting at the model's vocabulary size.
    """

    def __init__(self, model_path: str):
        """Load the SentencePiece model and register the control tokens.

        Args:
            model_path: Path to the SentencePiece model file.

        Raises:
            AssertionError: If ``model_path`` is not an existing file.
        """
        # reload tokenizer
        assert os.path.isfile(model_path), model_path
        self.sp_model = SentencePieceProcessor(model_file=model_path)

        # BOS / EOS token IDs
        self.n_words: int = self.sp_model.vocab_size()
        self.bos_id: int = self.sp_model.bos_id()
        self.eos_id: int = self.sp_model.eos_id()
        # The unknown-piece id is reused as the padding id.
        self.pad_id: int = self.sp_model.unk_id()
        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()

        role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"]
        special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens
        self.special_tokens = {}        # token string -> id
        self.index_special_tokens = {}  # id -> token string
        for token in special_tokens:
            # Control tokens are numbered right after the SentencePiece vocabulary;
            # n_words grows so it ends up as the total vocabulary size.
            self.special_tokens[token] = self.n_words
            self.index_special_tokens[self.n_words] = token
            self.n_words += 1
        # Alternation of all (escaped) control tokens, used by tokenize().
        self.role_special_token_expression = "|".join(re.escape(token) for token in special_tokens)  # for apply_chat_template

    def tokenize(self, s: str, encode_special_tokens=False):
        """Split ``s`` into pieces.

        Args:
            s: Text to tokenize.
            encode_special_tokens: When true, every occurrence of a control
                token in ``s`` is emitted as a single piece and only the text
                between occurrences goes through SentencePiece. When false,
                the whole string goes through SentencePiece.

        Returns:
            The list of pieces.
        """
        if not encode_special_tokens:
            return self.sp_model.EncodeAsPieces(s)

        pieces = []
        last_index = 0
        for match in re.finditer(self.role_special_token_expression, s):
            # Ordinary text preceding this control token, if any.
            if last_index < match.start():
                pieces.extend(self.sp_model.EncodeAsPieces(s[last_index:match.start()]))
            pieces.append(match.group())
            last_index = match.end()
        # Trailing text after the last control token.
        if last_index < len(s):
            pieces.extend(self.sp_model.EncodeAsPieces(s[last_index:]))
        return pieces

    def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]:
        """Encode ``s`` to SentencePiece ids, optionally adding BOS/EOS ids.

        Control tokens are not recognised here; the text is passed to the
        SentencePiece model as-is.

        Raises:
            AssertionError: If ``s`` is not a string.
        """
        assert isinstance(s, str)
        t = self.sp_model.encode(s)
        if bos:
            t = [self.bos_id] + t
        if eos:
            t = t + [self.eos_id]
        return t

    def decode(self, t: List[int]) -> str:
        """Decode ids to text, rendering control-token ids as their literal strings.

        Runs of ordinary ids are decoded together by SentencePiece; each
        control-token id flushes the current run and is appended verbatim.
        """
        parts, buffer = [], []
        for token in t:
            if token in self.index_special_tokens:
                if buffer:
                    parts.append(self.sp_model.decode(buffer))
                    buffer = []
                parts.append(self.index_special_tokens[token])
            else:
                buffer.append(token)
        if buffer:
            parts.append(self.sp_model.decode(buffer))
        return "".join(parts)

    def decode_tokens(self, tokens: List[str]) -> str:
        """Join SentencePiece pieces back into text."""
        return self.sp_model.DecodePieces(tokens)

    def convert_token_to_id(self, token):
        """ Converts a token (str) in an id using the vocab. """
        if token in self.special_tokens:
            return self.special_tokens[token]
        return self.sp_model.PieceToId(token)

    def convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab.

        Control-token ids map to their strings. BOS, EOS and pad ids, as well
        as ids outside the SentencePiece vocabulary, map to the empty string.
        """
        if index in self.index_special_tokens:
            return self.index_special_tokens[index]
        if index in (self.eos_id, self.bos_id, self.pad_id):
            return ""
        # Valid SentencePiece ids are 0 .. vocab_size() - 1, so vocab_size()
        # itself is already out of range.
        if index < 0 or index >= self.sp_model.vocab_size():
            return ""
        return self.sp_model.IdToPiece(index)
class ChatGLMTokenizer(PreTrainedTokenizer):
    """`PreTrainedTokenizer` for ChatGLM, backed by an `SPTokenizer`.

    Every encoded sequence is prefixed with ``[gMASK] sop``. Batches are
    padded on the left only, and ``position_ids`` are produced alongside
    ``input_ids`` and ``attention_mask``.
    """

    vocab_files_names = {"vocab_file": "tokenizer.model"}
    model_input_names = ["input_ids", "attention_mask", "position_ids"]

    def __init__(
        self,
        vocab_file,
        padding_side="left",
        clean_up_tokenization_spaces=False,
        encode_special_tokens=False,
        **kwargs
    ):
        """Create the tokenizer from a SentencePiece model file.

        Args:
            vocab_file: Path to the SentencePiece model file.
            padding_side: Forwarded to the base class; `_pad` only supports ``"left"``.
            clean_up_tokenization_spaces: Forwarded to the base class.
            encode_special_tokens: Whether `_tokenize` treats control tokens
                appearing in the text as single tokens.
            **kwargs: Forwarded to the base class.
        """
        self.name = "GLMTokenizer"
        self.vocab_file = vocab_file
        self.tokenizer = SPTokenizer(vocab_file)
        # Aliases for the ids taken from the SentencePiece model; "<unk>" and
        # "<pad>" share the same id.
        self.special_tokens = {
            "<bos>": self.tokenizer.bos_id,
            "<eos>": self.tokenizer.eos_id,
            "<unk>": self.tokenizer.pad_id,
            "<pad>": self.tokenizer.pad_id
        }
        self.encode_special_tokens = encode_special_tokens
        # NOTE(review): the attributes above are set before the base initializer
        # runs, presumably because it queries the vocabulary — keep this order.
        super().__init__(
            padding_side=padding_side,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs
        )

    def get_command(self, token):
        """Return the id of a control token or of a ``<bos>``/``<eos>``/``<unk>``/``<pad>`` alias.

        Raises:
            AssertionError: If ``token`` is neither an alias nor a control token.
        """
        if token in self.special_tokens:
            return self.special_tokens[token]
        assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}"
        return self.tokenizer.special_tokens[token]

    # The unk/pad/eos tokens are derived from the SentencePiece model and cannot
    # be reassigned; the setters below only log a warning.
    @property
    def unk_token(self) -> str:
        return self.tokenizer.sp_model.IdToPiece(self.get_command("<unk>"))

    @property
    def pad_token(self) -> str:
        return self.tokenizer.sp_model.IdToPiece(self.get_command("<pad>"))

    @property
    def eos_token(self) -> str:
        return self.tokenizer.sp_model.IdToPiece(self.get_command("<eos>"))

    @property
    def unk_token_id(self) -> int:
        return self.get_command("<unk>")

    @property
    def pad_token_id(self) -> int:
        return self.get_command("<pad>")

    @property
    def eos_token_id(self):
        return self.get_command("<eos>")

    @unk_token.setter
    def unk_token(self, value):
        logger.warning("Setting unk_token is not supported, use the default one.")

    @pad_token.setter
    def pad_token(self, value):
        logger.warning("Setting pad_token is not supported, use the default one.")

    @eos_token.setter
    def eos_token(self, value):
        logger.warning("Setting eos_token is not supported, use the default one.")

    @property
    def vocab_size(self):
        """Total number of ids: SentencePiece pieces plus control tokens."""
        return self.tokenizer.n_words

    def get_vocab(self):
        """ Returns vocab as a dict """
        # Ids whose token is rendered as "" all collapse onto a single "" key.
        vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text, **kwargs):
        """Split ``text`` into pieces; extra keyword arguments are ignored."""
        return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens)

    def _convert_token_to_id(self, token):
        """ Converts a token (str) in an id using the vocab. """
        return self.tokenizer.convert_token_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.tokenizer.convert_id_to_token(index)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """Join a list of pieces back into a string."""
        return self.tokenizer.decode_tokens(tokens)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """
        Copy the SentencePiece model file this tokenizer was loaded from.

        Args:
            save_directory (`str`):
                An existing directory, in which case the file is written there as
                `tokenizer.model`; any other value is used as the output file path itself.
            filename_prefix (`str`, *optional*):
                Accepted for API compatibility; currently not used.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(
                save_directory, self.vocab_files_names["vocab_file"]
            )
        else:
            vocab_file = save_directory

        # Byte-for-byte copy of the original model file.
        with open(self.vocab_file, 'rb') as fin:
            proto_str = fin.read()

        with open(vocab_file, "wb") as writer:
            writer.write(proto_str)

        return (vocab_file,)

    def get_prefix_tokens(self):
        """Return the ids of ``[gMASK]`` and ``sop``, which start every sequence."""
        prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")]
        return prefix_tokens

    def build_single_message(self, role, metadata, message):
        """Encode one chat turn as ``<|role|>`` + ``metadata`` + newline + ``message``.

        Raises:
            AssertionError: If ``role`` is not system, user, assistant or observation.
        """
        assert role in ["system", "user", "assistant", "observation"], role
        role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n")
        message_tokens = self.tokenizer.encode(message)
        tokens = role_tokens + message_tokens
        return tokens

    def build_chat_input(self, query, history=None, role="user"):
        """Build model inputs for a conversation ending with ``query``.

        Args:
            query: Text of the new turn.
            history: Optional list of dicts with ``"role"`` and ``"content"``,
                optionally ``"metadata"``; a system item may carry ``"tools"``,
                which is appended to its content as indented JSON.
            role: Role of the new turn.

        Returns:
            The result of `batch_encode_plus` on the single assembled id
            sequence, as PyTorch tensors. The sequence ends with the
            ``<|assistant|>`` id.
        """
        if history is None:
            history = []
        input_ids = []
        for item in history:
            content = item["content"]
            if item["role"] == "system" and "tools" in item:
                content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False)
            input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content))
        input_ids.extend(self.build_single_message(role, "", query))
        input_ids.extend([self.get_command("<|assistant|>")])
        return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences by adding the
        ChatGLM prefix tokens. The resulting formats are:

        - single sequence: `[gMASK] sop X`
        - pair of sequences: `[gMASK] sop A B <eos>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of input IDs with the special tokens added.
        """
        prefix_tokens = self.get_prefix_tokens()
        token_ids_0 = prefix_tokens + token_ids_0
        if token_ids_1 is not None:
            token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("<eos>")]
        return token_ids_0

    def _pad(
        self,
        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
        max_length: Optional[int] = None,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
        padding_side: Optional[str] = None,
    ) -> dict:
        """
        Left-pad one encoded example and make sure it carries `attention_mask`
        and `position_ids`.

        Args:
            encoded_inputs:
                Dictionary of tokenized inputs for a single example; it is
                modified in place and returned.
            max_length: Target length when padding. Nothing is truncated here.
            padding_strategy: PaddingStrategy to use for padding.

                - PaddingStrategy.LONGEST: the target length becomes the length of this example
                - PaddingStrategy.MAX_LENGTH: pad to `max_length`
                - PaddingStrategy.DO_NOT_PAD: do not pad (default)
            pad_to_multiple_of: (optional) If set, the target length is rounded up to a multiple
                of this value.
            return_attention_mask:
                (optional) Not read by this implementation; an attention mask is always present
                in the result.
            padding_side:
                (optional) Side to pad on; defaults to `self.padding_side`. Accepted so that
                callers passing this keyword do not fail; only `"left"` is supported.

        Raises:
            AssertionError: If the effective padding side is not `"left"`.
        """
        # Load from model defaults
        side = self.padding_side if padding_side is None else padding_side
        assert side == "left"

        required_input = encoded_inputs[self.model_input_names[0]]
        seq_length = len(required_input)

        if padding_strategy == PaddingStrategy.LONGEST:
            max_length = len(required_input)

        # Round the target length up to the next multiple when requested.
        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length

        # Initialize attention mask if not present.
        if "attention_mask" not in encoded_inputs:
            encoded_inputs["attention_mask"] = [1] * seq_length

        if "position_ids" not in encoded_inputs:
            encoded_inputs["position_ids"] = list(range(seq_length))

        if needs_to_be_padded:
            difference = max_length - len(required_input)
            # Both keys are guaranteed to exist at this point. Padding positions
            # get mask 0 and position id 0.
            encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
            encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"]
            encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input

        return encoded_inputs

tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e7dc4c393423b76e4373e5157ddc34803a0189ba96b21ddbb40269d31468a6f2
+size 1018370

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,65 @@

+{
+  "added_tokens_decoder": {
+    "64790": {
+      "content": "[gMASK]",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "64792": {
+      "content": "sop",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "64795": {
+      "content": "<|user|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "64796": {
+      "content": "<|assistant|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "64797": {
+      "content": "<|observation|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|user|>",
+    "<|observation|>"
+  ],
+  "auto_map": {
+    "AutoTokenizer": [
+      "tokenization_chatglm.ChatGLMTokenizer",
+      null
+    ]
+  },
+  "chat_template": "{% for message in messages %}{% if loop.first %}[gMASK]sop<|{{ message['role'] }}|>\n {{ message['content'] }}{% else %}<|{{ message['role'] }}|>\n {{ message['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "do_lower_case": false,
+  "eos_token": "</s>",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<unk>",
+  "padding_side": "right",
+  "remove_space": false,
+  "split_special_tokens": false,
+  "tokenizer_class": "ChatGLMTokenizer",
+  "unk_token": "<unk>"
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 9.0,
+    "total_flos": 2.5580424283828716e+18,
+    "train_loss": 0.9693152533637153,
+    "train_runtime": 60193.9387,
+    "train_samples_per_second": 1.148,
+    "train_steps_per_second": 0.012
+}

trainer_log.jsonl ADDED Viewed

	@@ -0,0 +1,81 @@

+{"current_steps": 10, "total_steps": 720, "loss": 1.6827, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 2.5e-05, "epoch": 0.125, "percentage": 1.39, "elapsed_time": "0:13:23", "remaining_time": "15:50:19"}
+{"current_steps": 20, "total_steps": 720, "loss": 1.6309, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 5e-05, "epoch": 0.25, "percentage": 2.78, "elapsed_time": "0:26:45", "remaining_time": "15:36:27"}
+{"current_steps": 30, "total_steps": 720, "loss": 1.5415, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.997482666353287e-05, "epoch": 0.375, "percentage": 4.17, "elapsed_time": "0:40:07", "remaining_time": "15:22:55"}
+{"current_steps": 40, "total_steps": 720, "loss": 1.393, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.989935734988098e-05, "epoch": 0.5, "percentage": 5.56, "elapsed_time": "0:53:29", "remaining_time": "15:09:26"}
+{"current_steps": 50, "total_steps": 720, "loss": 1.2563, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.977374404419837e-05, "epoch": 0.625, "percentage": 6.94, "elapsed_time": "1:06:51", "remaining_time": "14:55:59"}
+{"current_steps": 60, "total_steps": 720, "loss": 1.1963, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.959823971496574e-05, "epoch": 0.75, "percentage": 8.33, "elapsed_time": "1:20:14", "remaining_time": "14:42:34"}
+{"current_steps": 70, "total_steps": 720, "loss": 1.1385, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.937319780454559e-05, "epoch": 0.875, "percentage": 9.72, "elapsed_time": "1:33:36", "remaining_time": "14:29:08"}
+{"current_steps": 80, "total_steps": 720, "loss": 1.1085, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.909907151739633e-05, "epoch": 1.0, "percentage": 11.11, "elapsed_time": "1:46:56", "remaining_time": "14:15:30"}
+{"current_steps": 90, "total_steps": 720, "loss": 1.1025, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.877641290737884e-05, "epoch": 1.125, "percentage": 12.5, "elapsed_time": "2:00:18", "remaining_time": "14:02:06"}
+{"current_steps": 100, "total_steps": 720, "loss": 1.0558, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.8405871765993433e-05, "epoch": 1.25, "percentage": 13.89, "elapsed_time": "2:13:40", "remaining_time": "13:48:45"}
+{"current_steps": 100, "total_steps": 720, "loss": null, "eval_loss": 1.0500283241271973, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": null, "epoch": 1.25, "percentage": 13.89, "elapsed_time": "2:13:40", "remaining_time": "13:48:45"}
+{"current_steps": 110, "total_steps": 720, "loss": 1.0258, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.7988194313786275e-05, "epoch": 1.375, "percentage": 15.28, "elapsed_time": "2:32:55", "remaining_time": "14:07:59"}
+{"current_steps": 120, "total_steps": 720, "loss": 1.0261, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.752422169756048e-05, "epoch": 1.5, "percentage": 16.67, "elapsed_time": "2:46:16", "remaining_time": "13:51:24"}
+{"current_steps": 130, "total_steps": 720, "loss": 0.9923, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.701488829641845e-05, "epoch": 1.625, "percentage": 18.06, "elapsed_time": "2:59:38", "remaining_time": "13:35:18"}
+{"current_steps": 140, "total_steps": 720, "loss": 0.9835, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.6461219840046654e-05, "epoch": 1.75, "percentage": 19.44, "elapsed_time": "3:13:00", "remaining_time": "13:19:35"}
+{"current_steps": 150, "total_steps": 720, "loss": 1.0039, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.586433134303257e-05, "epoch": 1.875, "percentage": 20.83, "elapsed_time": "3:26:22", "remaining_time": "13:04:12"}
+{"current_steps": 160, "total_steps": 720, "loss": 0.9947, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.522542485937369e-05, "epoch": 2.0, "percentage": 22.22, "elapsed_time": "3:39:42", "remaining_time": "12:48:58"}
+{"current_steps": 170, "total_steps": 720, "loss": 0.9821, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.454578706170075e-05, "epoch": 2.125, "percentage": 23.61, "elapsed_time": "3:53:04", "remaining_time": "12:34:03"}
+{"current_steps": 180, "total_steps": 720, "loss": 0.9535, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.382678665009028e-05, "epoch": 2.25, "percentage": 25.0, "elapsed_time": "4:06:26", "remaining_time": "12:19:18"}
+{"current_steps": 190, "total_steps": 720, "loss": 0.9514, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.306987159568479e-05, "epoch": 2.375, "percentage": 26.39, "elapsed_time": "4:19:47", "remaining_time": "12:04:42"}
+{"current_steps": 200, "total_steps": 720, "loss": 0.9566, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.227656622467162e-05, "epoch": 2.5, "percentage": 27.78, "elapsed_time": "4:33:09", "remaining_time": "11:50:13"}
+{"current_steps": 200, "total_steps": 720, "loss": null, "eval_loss": 0.9630343914031982, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": null, "epoch": 2.5, "percentage": 27.78, "elapsed_time": "4:33:09", "remaining_time": "11:50:13"}
+{"current_steps": 210, "total_steps": 720, "loss": 0.9655, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.144846814849282e-05, "epoch": 2.625, "percentage": 29.17, "elapsed_time": "4:52:25", "remaining_time": "11:50:09"}
+{"current_steps": 220, "total_steps": 720, "loss": 0.9537, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.058724504646834e-05, "epoch": 2.75, "percentage": 30.56, "elapsed_time": "5:05:46", "remaining_time": "11:34:56"}
+{"current_steps": 230, "total_steps": 720, "loss": 0.951, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 3.969463130731183e-05, "epoch": 2.875, "percentage": 31.94, "elapsed_time": "5:19:08", "remaining_time": "11:19:54"}
+{"current_steps": 240, "total_steps": 720, "loss": 0.938, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 3.8772424536302564e-05, "epoch": 3.0, "percentage": 33.33, "elapsed_time": "5:32:28", "remaining_time": "11:04:57"}
+{"current_steps": 250, "total_steps": 720, "loss": 0.955, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 3.782248193514766e-05, "epoch": 3.125, "percentage": 34.72, "elapsed_time": "5:45:50", "remaining_time": "10:50:10"}
+{"current_steps": 260, "total_steps": 720, "loss": 0.9319, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 3.6846716561824965e-05, "epoch": 3.25, "percentage": 36.11, "elapsed_time": "5:59:11", "remaining_time": "10:35:30"}
+{"current_steps": 270, "total_steps": 720, "loss": 0.9385, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 3.5847093477938956e-05, "epoch": 3.375, "percentage": 37.5, "elapsed_time": "6:12:33", "remaining_time": "10:20:55"}
+{"current_steps": 280, "total_steps": 720, "loss": 0.911, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 3.4825625791348096e-05, "epoch": 3.5, "percentage": 38.89, "elapsed_time": "6:25:54", "remaining_time": "10:06:26"}
+{"current_steps": 290, "total_steps": 720, "loss": 0.9366, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 3.378437060203357e-05, "epoch": 3.625, "percentage": 40.28, "elapsed_time": "6:39:17", "remaining_time": "9:52:02"}
+{"current_steps": 300, "total_steps": 720, "loss": 0.9082, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 3.272542485937369e-05, "epoch": 3.75, "percentage": 41.67, "elapsed_time": "6:52:38", "remaining_time": "9:37:42"}
+{"current_steps": 300, "total_steps": 720, "loss": null, "eval_loss": 0.928753137588501, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": null, "epoch": 3.75, "percentage": 41.67, "elapsed_time": "6:52:38", "remaining_time": "9:37:42"}
+{"current_steps": 310, "total_steps": 720, "loss": 0.9158, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 3.165092113916688e-05, "epoch": 3.875, "percentage": 43.06, "elapsed_time": "7:11:54", "remaining_time": "9:31:13"}
+{"current_steps": 320, "total_steps": 720, "loss": 0.9027, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 3.056302334890786e-05, "epoch": 4.0, "percentage": 44.44, "elapsed_time": "7:25:14", "remaining_time": "9:16:32"}
+{"current_steps": 330, "total_steps": 720, "loss": 0.9336, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 2.9463922369965917e-05, "epoch": 4.125, "percentage": 45.83, "elapsed_time": "7:38:36", "remaining_time": "9:01:59"}
+{"current_steps": 340, "total_steps": 720, "loss": 0.9161, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 2.8355831645441388e-05, "epoch": 4.25, "percentage": 47.22, "elapsed_time": "7:51:58", "remaining_time": "8:47:30"}
+{"current_steps": 350, "total_steps": 720, "loss": 0.8966, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 2.724098272258584e-05, "epoch": 4.375, "percentage": 48.61, "elapsed_time": "8:05:20", "remaining_time": "8:33:04"}
+{"current_steps": 360, "total_steps": 720, "loss": 0.8954, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 2.6121620758762877e-05, "epoch": 4.5, "percentage": 50.0, "elapsed_time": "8:18:42", "remaining_time": "8:18:42"}
+{"current_steps": 370, "total_steps": 720, "loss": 0.8815, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 2.5e-05, "epoch": 4.625, "percentage": 51.39, "elapsed_time": "8:32:04", "remaining_time": "8:04:24"}
+{"current_steps": 380, "total_steps": 720, "loss": 0.89, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 2.3878379241237136e-05, "epoch": 4.75, "percentage": 52.78, "elapsed_time": "8:45:26", "remaining_time": "7:50:08"}
+{"current_steps": 390, "total_steps": 720, "loss": 0.9196, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 2.2759017277414166e-05, "epoch": 4.875, "percentage": 54.17, "elapsed_time": "8:58:48", "remaining_time": "7:35:55"}
+{"current_steps": 400, "total_steps": 720, "loss": 0.8992, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 2.164416835455862e-05, "epoch": 5.0, "percentage": 55.56, "elapsed_time": "9:12:08", "remaining_time": "7:21:43"}
+{"current_steps": 400, "total_steps": 720, "loss": null, "eval_loss": 0.9107962846755981, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": null, "epoch": 5.0, "percentage": 55.56, "elapsed_time": "9:12:08", "remaining_time": "7:21:43"}
+{"current_steps": 410, "total_steps": 720, "loss": 0.888, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 2.0536077630034086e-05, "epoch": 5.125, "percentage": 56.94, "elapsed_time": "9:31:24", "remaining_time": "7:12:02"}
+{"current_steps": 420, "total_steps": 720, "loss": 0.8901, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 1.9436976651092144e-05, "epoch": 5.25, "percentage": 58.33, "elapsed_time": "9:44:46", "remaining_time": "6:57:41"}
+{"current_steps": 430, "total_steps": 720, "loss": 0.9147, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 1.8349078860833123e-05, "epoch": 5.375, "percentage": 59.72, "elapsed_time": "9:58:08", "remaining_time": "6:43:23"}
+{"current_steps": 440, "total_steps": 720, "loss": 0.8925, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 1.7274575140626318e-05, "epoch": 5.5, "percentage": 61.11, "elapsed_time": "10:11:29", "remaining_time": "6:29:08"}
+{"current_steps": 450, "total_steps": 720, "loss": 0.9012, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 1.621562939796643e-05, "epoch": 5.625, "percentage": 62.5, "elapsed_time": "10:24:51", "remaining_time": "6:14:55"}
+{"current_steps": 460, "total_steps": 720, "loss": 0.8808, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 1.5174374208651912e-05, "epoch": 5.75, "percentage": 63.89, "elapsed_time": "10:38:13", "remaining_time": "6:00:44"}
+{"current_steps": 470, "total_steps": 720, "loss": 0.8816, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 1.4152906522061048e-05, "epoch": 5.875, "percentage": 65.28, "elapsed_time": "10:51:35", "remaining_time": "5:46:35"}
+{"current_steps": 480, "total_steps": 720, "loss": 0.8941, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 1.3153283438175034e-05, "epoch": 6.0, "percentage": 66.67, "elapsed_time": "11:04:55", "remaining_time": "5:32:27"}
+{"current_steps": 490, "total_steps": 720, "loss": 0.9048, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 1.217751806485235e-05, "epoch": 6.125, "percentage": 68.06, "elapsed_time": "11:18:17", "remaining_time": "5:18:22"}
+{"current_steps": 500, "total_steps": 720, "loss": 0.8874, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 1.122757546369744e-05, "epoch": 6.25, "percentage": 69.44, "elapsed_time": "11:31:39", "remaining_time": "5:04:19"}
+{"current_steps": 500, "total_steps": 720, "loss": null, "eval_loss": 0.9028034806251526, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": null, "epoch": 6.25, "percentage": 69.44, "elapsed_time": "11:31:39", "remaining_time": "5:04:19"}
+{"current_steps": 510, "total_steps": 720, "loss": 0.8738, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 1.0305368692688174e-05, "epoch": 6.375, "percentage": 70.83, "elapsed_time": "11:50:54", "remaining_time": "4:52:43"}
+{"current_steps": 520, "total_steps": 720, "loss": 0.8951, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 9.412754953531663e-06, "epoch": 6.5, "percentage": 72.22, "elapsed_time": "12:04:16", "remaining_time": "4:38:33"}
+{"current_steps": 530, "total_steps": 720, "loss": 0.8914, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 8.551531851507186e-06, "epoch": 6.625, "percentage": 73.61, "elapsed_time": "12:17:37", "remaining_time": "4:24:26"}
+{"current_steps": 540, "total_steps": 720, "loss": 0.8818, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 7.723433775328384e-06, "epoch": 6.75, "percentage": 75.0, "elapsed_time": "12:30:59", "remaining_time": "4:10:19"}
+{"current_steps": 550, "total_steps": 720, "loss": 0.8794, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 6.930128404315214e-06, "epoch": 6.875, "percentage": 76.39, "elapsed_time": "12:44:21", "remaining_time": "3:56:15"}
+{"current_steps": 560, "total_steps": 720, "loss": 0.8814, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 6.173213349909729e-06, "epoch": 7.0, "percentage": 77.78, "elapsed_time": "12:57:41", "remaining_time": "3:42:11"}
+{"current_steps": 570, "total_steps": 720, "loss": 0.8909, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 5.454212938299255e-06, "epoch": 7.125, "percentage": 79.17, "elapsed_time": "13:11:03", "remaining_time": "3:28:10"}
+{"current_steps": 580, "total_steps": 720, "loss": 0.8737, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.7745751406263165e-06, "epoch": 7.25, "percentage": 80.56, "elapsed_time": "13:24:25", "remaining_time": "3:14:10"}
+{"current_steps": 590, "total_steps": 720, "loss": 0.8937, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.135668656967434e-06, "epoch": 7.375, "percentage": 81.94, "elapsed_time": "13:37:47", "remaining_time": "3:00:11"}
+{"current_steps": 600, "total_steps": 720, "loss": 0.8835, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 3.5387801599533475e-06, "epoch": 7.5, "percentage": 83.33, "elapsed_time": "13:51:09", "remaining_time": "2:46:13"}
+{"current_steps": 600, "total_steps": 720, "loss": null, "eval_loss": 0.899681031703949, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": null, "epoch": 7.5, "percentage": 83.33, "elapsed_time": "13:51:09", "remaining_time": "2:46:13"}
+{"current_steps": 610, "total_steps": 720, "loss": 0.8841, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 2.98511170358155e-06, "epoch": 7.625, "percentage": 84.72, "elapsed_time": "14:10:24", "remaining_time": "2:33:21"}
+{"current_steps": 620, "total_steps": 720, "loss": 0.8979, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 2.475778302439524e-06, "epoch": 7.75, "percentage": 86.11, "elapsed_time": "14:23:46", "remaining_time": "2:19:19"}
+{"current_steps": 630, "total_steps": 720, "loss": 0.8696, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 2.0118056862137357e-06, "epoch": 7.875, "percentage": 87.5, "elapsed_time": "14:37:08", "remaining_time": "2:05:18"}
+{"current_steps": 640, "total_steps": 720, "loss": 0.8782, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 1.59412823400657e-06, "epoch": 8.0, "percentage": 88.89, "elapsed_time": "14:50:28", "remaining_time": "1:51:18"}
+{"current_steps": 650, "total_steps": 720, "loss": 0.8873, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 1.2235870926211619e-06, "epoch": 8.125, "percentage": 90.28, "elapsed_time": "15:03:50", "remaining_time": "1:37:20"}
+{"current_steps": 660, "total_steps": 720, "loss": 0.8737, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 9.009284826036691e-07, "epoch": 8.25, "percentage": 91.67, "elapsed_time": "15:17:11", "remaining_time": "1:23:22"}
+{"current_steps": 670, "total_steps": 720, "loss": 0.8883, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 6.268021954544096e-07, "epoch": 8.375, "percentage": 93.06, "elapsed_time": "15:30:33", "remaining_time": "1:09:26"}
+{"current_steps": 680, "total_steps": 720, "loss": 0.87, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 4.0176028503425835e-07, "epoch": 8.5, "percentage": 94.44, "elapsed_time": "15:43:54", "remaining_time": "0:55:31"}
+{"current_steps": 690, "total_steps": 720, "loss": 0.8746, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 2.262559558016325e-07, "epoch": 8.625, "percentage": 95.83, "elapsed_time": "15:57:16", "remaining_time": "0:41:37"}
+{"current_steps": 700, "total_steps": 720, "loss": 0.8912, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 1.006426501190233e-07, "epoch": 8.75, "percentage": 97.22, "elapsed_time": "16:10:38", "remaining_time": "0:27:43"}
+{"current_steps": 700, "total_steps": 720, "loss": null, "eval_loss": 0.8991448283195496, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": null, "epoch": 8.75, "percentage": 97.22, "elapsed_time": "16:10:38", "remaining_time": "0:27:43"}
+{"current_steps": 710, "total_steps": 720, "loss": 0.9007, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 2.5173336467135267e-08, "epoch": 8.875, "percentage": 98.61, "elapsed_time": "16:29:53", "remaining_time": "0:13:56"}
+{"current_steps": 720, "total_steps": 720, "loss": 0.8796, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": 0.0, "epoch": 9.0, "percentage": 100.0, "elapsed_time": "16:43:13", "remaining_time": "0:00:00"}
+{"current_steps": 720, "total_steps": 720, "loss": null, "eval_loss": null, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": null, "epoch": 9.0, "percentage": 100.0, "elapsed_time": "16:43:13", "remaining_time": "0:00:00"}
+{"current_steps": 214, "total_steps": 214, "loss": null, "eval_loss": 0.8991448283195496, "predict_loss": null, "reward": null, "accuracy": null, "learning_rate": null, "epoch": 9.0, "percentage": 100.0, "elapsed_time": "16:49:07", "remaining_time": "0:00:00"}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,590 @@

+{
+  "best_metric": 0.8991448283195496,
+  "best_model_checkpoint": "../../output/chatglm3-6b/LangGPT/checkpoint-700",
+  "epoch": 9.0,
+  "eval_steps": 100,
+  "global_step": 720,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.125,
+      "grad_norm": 0.40453147888183594,
+      "learning_rate": 2.5e-05,
+      "loss": 1.6827,
+      "step": 10
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.5551838874816895,
+      "learning_rate": 5e-05,
+      "loss": 1.6309,
+      "step": 20
+    },
+    {
+      "epoch": 0.375,
+      "grad_norm": 0.7859359383583069,
+      "learning_rate": 4.997482666353287e-05,
+      "loss": 1.5415,
+      "step": 30
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.597720742225647,
+      "learning_rate": 4.989935734988098e-05,
+      "loss": 1.393,
+      "step": 40
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 0.4020984172821045,
+      "learning_rate": 4.977374404419837e-05,
+      "loss": 1.2563,
+      "step": 50
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.35916563868522644,
+      "learning_rate": 4.959823971496574e-05,
+      "loss": 1.1963,
+      "step": 60
+    },
+    {
+      "epoch": 0.875,
+      "grad_norm": 0.3013848066329956,
+      "learning_rate": 4.937319780454559e-05,
+      "loss": 1.1385,
+      "step": 70
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.23849129676818848,
+      "learning_rate": 4.909907151739633e-05,
+      "loss": 1.1085,
+      "step": 80
+    },
+    {
+      "epoch": 1.125,
+      "grad_norm": 0.22885890305042267,
+      "learning_rate": 4.877641290737884e-05,
+      "loss": 1.1025,
+      "step": 90
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 0.20683708786964417,
+      "learning_rate": 4.8405871765993433e-05,
+      "loss": 1.0558,
+      "step": 100
+    },
+    {
+      "epoch": 1.25,
+      "eval_loss": 1.0500283241271973,
+      "eval_runtime": 353.0769,
+      "eval_samples_per_second": 2.419,
+      "eval_steps_per_second": 0.606,
+      "step": 100
+    },
+    {
+      "epoch": 1.375,
+      "grad_norm": 0.20663395524024963,
+      "learning_rate": 4.7988194313786275e-05,
+      "loss": 1.0258,
+      "step": 110
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 0.18335361778736115,
+      "learning_rate": 4.752422169756048e-05,
+      "loss": 1.0261,
+      "step": 120
+    },
+    {
+      "epoch": 1.625,
+      "grad_norm": 0.18184833228588104,
+      "learning_rate": 4.701488829641845e-05,
+      "loss": 0.9923,
+      "step": 130
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 0.19089923799037933,
+      "learning_rate": 4.6461219840046654e-05,
+      "loss": 0.9835,
+      "step": 140
+    },
+    {
+      "epoch": 1.875,
+      "grad_norm": 0.17791251838207245,
+      "learning_rate": 4.586433134303257e-05,
+      "loss": 1.0039,
+      "step": 150
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.18376672267913818,
+      "learning_rate": 4.522542485937369e-05,
+      "loss": 0.9947,
+      "step": 160
+    },
+    {
+      "epoch": 2.125,
+      "grad_norm": 0.20052292943000793,
+      "learning_rate": 4.454578706170075e-05,
+      "loss": 0.9821,
+      "step": 170
+    },
+    {
+      "epoch": 2.25,
+      "grad_norm": 0.19209513068199158,
+      "learning_rate": 4.382678665009028e-05,
+      "loss": 0.9535,
+      "step": 180
+    },
+    {
+      "epoch": 2.375,
+      "grad_norm": 0.19733993709087372,
+      "learning_rate": 4.306987159568479e-05,
+      "loss": 0.9514,
+      "step": 190
+    },
+    {
+      "epoch": 2.5,
+      "grad_norm": 0.18989509344100952,
+      "learning_rate": 4.227656622467162e-05,
+      "loss": 0.9566,
+      "step": 200
+    },
+    {
+      "epoch": 2.5,
+      "eval_loss": 0.9630343914031982,
+      "eval_runtime": 353.288,
+      "eval_samples_per_second": 2.417,
+      "eval_steps_per_second": 0.606,
+      "step": 200
+    },
+    {
+      "epoch": 2.625,
+      "grad_norm": 0.19188831746578217,
+      "learning_rate": 4.144846814849282e-05,
+      "loss": 0.9655,
+      "step": 210
+    },
+    {
+      "epoch": 2.75,
+      "grad_norm": 0.2034657597541809,
+      "learning_rate": 4.058724504646834e-05,
+      "loss": 0.9537,
+      "step": 220
+    },
+    {
+      "epoch": 2.875,
+      "grad_norm": 0.20900140702724457,
+      "learning_rate": 3.969463130731183e-05,
+      "loss": 0.951,
+      "step": 230
+    },
+    {
+      "epoch": 3.0,
+      "grad_norm": 0.231728196144104,
+      "learning_rate": 3.8772424536302564e-05,
+      "loss": 0.938,
+      "step": 240
+    },
+    {
+      "epoch": 3.125,
+      "grad_norm": 0.21837086975574493,
+      "learning_rate": 3.782248193514766e-05,
+      "loss": 0.955,
+      "step": 250
+    },
+    {
+      "epoch": 3.25,
+      "grad_norm": 0.2057914286851883,
+      "learning_rate": 3.6846716561824965e-05,
+      "loss": 0.9319,
+      "step": 260
+    },
+    {
+      "epoch": 3.375,
+      "grad_norm": 0.22230790555477142,
+      "learning_rate": 3.5847093477938956e-05,
+      "loss": 0.9385,
+      "step": 270
+    },
+    {
+      "epoch": 3.5,
+      "grad_norm": 0.24387766420841217,
+      "learning_rate": 3.4825625791348096e-05,
+      "loss": 0.911,
+      "step": 280
+    },
+    {
+      "epoch": 3.625,
+      "grad_norm": 0.2634485065937042,
+      "learning_rate": 3.378437060203357e-05,
+      "loss": 0.9366,
+      "step": 290
+    },
+    {
+      "epoch": 3.75,
+      "grad_norm": 0.22965680062770844,
+      "learning_rate": 3.272542485937369e-05,
+      "loss": 0.9082,
+      "step": 300
+    },
+    {
+      "epoch": 3.75,
+      "eval_loss": 0.928753137588501,
+      "eval_runtime": 353.3512,
+      "eval_samples_per_second": 2.417,
+      "eval_steps_per_second": 0.606,
+      "step": 300
+    },
+    {
+      "epoch": 3.875,
+      "grad_norm": 0.21778391301631927,
+      "learning_rate": 3.165092113916688e-05,
+      "loss": 0.9158,
+      "step": 310
+    },
+    {
+      "epoch": 4.0,
+      "grad_norm": 0.24541890621185303,
+      "learning_rate": 3.056302334890786e-05,
+      "loss": 0.9027,
+      "step": 320
+    },
+    {
+      "epoch": 4.125,
+      "grad_norm": 0.25015348196029663,
+      "learning_rate": 2.9463922369965917e-05,
+      "loss": 0.9336,
+      "step": 330
+    },
+    {
+      "epoch": 4.25,
+      "grad_norm": 0.22015893459320068,
+      "learning_rate": 2.8355831645441388e-05,
+      "loss": 0.9161,
+      "step": 340
+    },
+    {
+      "epoch": 4.375,
+      "grad_norm": 0.2516670823097229,
+      "learning_rate": 2.724098272258584e-05,
+      "loss": 0.8966,
+      "step": 350
+    },
+    {
+      "epoch": 4.5,
+      "grad_norm": 0.2541712820529938,
+      "learning_rate": 2.6121620758762877e-05,
+      "loss": 0.8954,
+      "step": 360
+    },
+    {
+      "epoch": 4.625,
+      "grad_norm": 0.25608915090560913,
+      "learning_rate": 2.5e-05,
+      "loss": 0.8815,
+      "step": 370
+    },
+    {
+      "epoch": 4.75,
+      "grad_norm": 0.24169643223285675,
+      "learning_rate": 2.3878379241237136e-05,
+      "loss": 0.89,
+      "step": 380
+    },
+    {
+      "epoch": 4.875,
+      "grad_norm": 0.2623349130153656,
+      "learning_rate": 2.2759017277414166e-05,
+      "loss": 0.9196,
+      "step": 390
+    },
+    {
+      "epoch": 5.0,
+      "grad_norm": 0.29517388343811035,
+      "learning_rate": 2.164416835455862e-05,
+      "loss": 0.8992,
+      "step": 400
+    },
+    {
+      "epoch": 5.0,
+      "eval_loss": 0.9107962846755981,
+      "eval_runtime": 353.3297,
+      "eval_samples_per_second": 2.417,
+      "eval_steps_per_second": 0.606,
+      "step": 400
+    },
+    {
+      "epoch": 5.125,
+      "grad_norm": 0.2589443027973175,
+      "learning_rate": 2.0536077630034086e-05,
+      "loss": 0.888,
+      "step": 410
+    },
+    {
+      "epoch": 5.25,
+      "grad_norm": 0.24191297590732574,
+      "learning_rate": 1.9436976651092144e-05,
+      "loss": 0.8901,
+      "step": 420
+    },
+    {
+      "epoch": 5.375,
+      "grad_norm": 0.27726104855537415,
+      "learning_rate": 1.8349078860833123e-05,
+      "loss": 0.9147,
+      "step": 430
+    },
+    {
+      "epoch": 5.5,
+      "grad_norm": 0.23908096551895142,
+      "learning_rate": 1.7274575140626318e-05,
+      "loss": 0.8925,
+      "step": 440
+    },
+    {
+      "epoch": 5.625,
+      "grad_norm": 0.30176234245300293,
+      "learning_rate": 1.621562939796643e-05,
+      "loss": 0.9012,
+      "step": 450
+    },
+    {
+      "epoch": 5.75,
+      "grad_norm": 0.23645330965518951,
+      "learning_rate": 1.5174374208651912e-05,
+      "loss": 0.8808,
+      "step": 460
+    },
+    {
+      "epoch": 5.875,
+      "grad_norm": 0.2720588147640228,
+      "learning_rate": 1.4152906522061048e-05,
+      "loss": 0.8816,
+      "step": 470
+    },
+    {
+      "epoch": 6.0,
+      "grad_norm": 0.2631034553050995,
+      "learning_rate": 1.3153283438175034e-05,
+      "loss": 0.8941,
+      "step": 480
+    },
+    {
+      "epoch": 6.125,
+      "grad_norm": 0.2486189901828766,
+      "learning_rate": 1.217751806485235e-05,
+      "loss": 0.9048,
+      "step": 490
+    },
+    {
+      "epoch": 6.25,
+      "grad_norm": 0.2926970422267914,
+      "learning_rate": 1.122757546369744e-05,
+      "loss": 0.8874,
+      "step": 500
+    },
+    {
+      "epoch": 6.25,
+      "eval_loss": 0.9028034806251526,
+      "eval_runtime": 353.2345,
+      "eval_samples_per_second": 2.418,
+      "eval_steps_per_second": 0.606,
+      "step": 500
+    },
+    {
+      "epoch": 6.375,
+      "grad_norm": 0.25221139192581177,
+      "learning_rate": 1.0305368692688174e-05,
+      "loss": 0.8738,
+      "step": 510
+    },
+    {
+      "epoch": 6.5,
+      "grad_norm": 0.2523793578147888,
+      "learning_rate": 9.412754953531663e-06,
+      "loss": 0.8951,
+      "step": 520
+    },
+    {
+      "epoch": 6.625,
+      "grad_norm": 0.2493809163570404,
+      "learning_rate": 8.551531851507186e-06,
+      "loss": 0.8914,
+      "step": 530
+    },
+    {
+      "epoch": 6.75,
+      "grad_norm": 0.2688143253326416,
+      "learning_rate": 7.723433775328384e-06,
+      "loss": 0.8818,
+      "step": 540
+    },
+    {
+      "epoch": 6.875,
+      "grad_norm": 0.2695543169975281,
+      "learning_rate": 6.930128404315214e-06,
+      "loss": 0.8794,
+      "step": 550
+    },
+    {
+      "epoch": 7.0,
+      "grad_norm": 0.27596864104270935,
+      "learning_rate": 6.173213349909729e-06,
+      "loss": 0.8814,
+      "step": 560
+    },
+    {
+      "epoch": 7.125,
+      "grad_norm": 0.27881208062171936,
+      "learning_rate": 5.454212938299255e-06,
+      "loss": 0.8909,
+      "step": 570
+    },
+    {
+      "epoch": 7.25,
+      "grad_norm": 0.2895490825176239,
+      "learning_rate": 4.7745751406263165e-06,
+      "loss": 0.8737,
+      "step": 580
+    },
+    {
+      "epoch": 7.375,
+      "grad_norm": 0.25476014614105225,
+      "learning_rate": 4.135668656967434e-06,
+      "loss": 0.8937,
+      "step": 590
+    },
+    {
+      "epoch": 7.5,
+      "grad_norm": 0.2785739600658417,
+      "learning_rate": 3.5387801599533475e-06,
+      "loss": 0.8835,
+      "step": 600
+    },
+    {
+      "epoch": 7.5,
+      "eval_loss": 0.899681031703949,
+      "eval_runtime": 353.2175,
+      "eval_samples_per_second": 2.418,
+      "eval_steps_per_second": 0.606,
+      "step": 600
+    },
+    {
+      "epoch": 7.625,
+      "grad_norm": 0.27525651454925537,
+      "learning_rate": 2.98511170358155e-06,
+      "loss": 0.8841,
+      "step": 610
+    },
+    {
+      "epoch": 7.75,
+      "grad_norm": 0.25052082538604736,
+      "learning_rate": 2.475778302439524e-06,
+      "loss": 0.8979,
+      "step": 620
+    },
+    {
+      "epoch": 7.875,
+      "grad_norm": 0.2501230537891388,
+      "learning_rate": 2.0118056862137357e-06,
+      "loss": 0.8696,
+      "step": 630
+    },
+    {
+      "epoch": 8.0,
+      "grad_norm": 0.2521611452102661,
+      "learning_rate": 1.59412823400657e-06,
+      "loss": 0.8782,
+      "step": 640
+    },
+    {
+      "epoch": 8.125,
+      "grad_norm": 0.249056875705719,
+      "learning_rate": 1.2235870926211619e-06,
+      "loss": 0.8873,
+      "step": 650
+    },
+    {
+      "epoch": 8.25,
+      "grad_norm": 0.27458131313323975,
+      "learning_rate": 9.009284826036691e-07,
+      "loss": 0.8737,
+      "step": 660
+    },
+    {
+      "epoch": 8.375,
+      "grad_norm": 0.24417945742607117,
+      "learning_rate": 6.268021954544096e-07,
+      "loss": 0.8883,
+      "step": 670
+    },
+    {
+      "epoch": 8.5,
+      "grad_norm": 0.25331562757492065,
+      "learning_rate": 4.0176028503425835e-07,
+      "loss": 0.87,
+      "step": 680
+    },
+    {
+      "epoch": 8.625,
+      "grad_norm": 0.25556355714797974,
+      "learning_rate": 2.262559558016325e-07,
+      "loss": 0.8746,
+      "step": 690
+    },
+    {
+      "epoch": 8.75,
+      "grad_norm": 0.27511876821517944,
+      "learning_rate": 1.006426501190233e-07,
+      "loss": 0.8912,
+      "step": 700
+    },
+    {
+      "epoch": 8.75,
+      "eval_loss": 0.8991448283195496,
+      "eval_runtime": 353.267,
+      "eval_samples_per_second": 2.417,
+      "eval_steps_per_second": 0.606,
+      "step": 700
+    },
+    {
+      "epoch": 8.875,
+      "grad_norm": 0.2639774680137634,
+      "learning_rate": 2.5173336467135267e-08,
+      "loss": 0.9007,
+      "step": 710
+    },
+    {
+      "epoch": 9.0,
+      "grad_norm": 0.27557557821273804,
+      "learning_rate": 0.0,
+      "loss": 0.8796,
+      "step": 720
+    },
+    {
+      "epoch": 9.0,
+      "step": 720,
+      "total_flos": 2.5580424283828716e+18,
+      "train_loss": 0.9693152533637153,
+      "train_runtime": 60193.9387,
+      "train_samples_per_second": 1.148,
+      "train_steps_per_second": 0.012
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 720,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 9,
+  "save_steps": 100,
+  "total_flos": 2.5580424283828716e+18,
+  "train_batch_size": 12,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b5badf6ea1b208056e83e64d2e96f6325b3d65de256294dc09063561959f7369
+size 5176

training_eval_loss.png ADDED Viewed

training_loss.png ADDED Viewed