Upload 16 files

Browse files

Files changed (16) hide show

README.md +202 -0
adapter_config.json +32 -0
adapter_model.safetensors +3 -0
added_tokens.json +4 -0
byteorder +1 -0
data.pkl +3 -0
optimizer.pt +3 -0
rng_state.pth +3 -0
scaler.pt +3 -0
scheduler.pt +3 -0
special_tokens_map.json +27 -0
tokenizer.json +0 -0
tokenizer.model +3 -0
tokenizer_config.json +64 -0
trainer_state.json +664 -0
version +1 -0

README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: openchat/openchat-3.5-0106
+library_name: peft
+---
+# Model Card for a LoRA adapter of openchat/openchat-3.5-0106
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT) for causal language modeling (rank 8, alpha 16, dropout 0.1; targets `q_proj` and `v_proj`)
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** openchat/openchat-3.5-0106
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed before further recommendations can be made.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+[More Information Needed]
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.14.0

adapter_config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "openchat/openchat-3.5-0106",
+  "bias": "none",
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_bias": false,
+  "lora_dropout": 0.1,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ed608d82ebeea6f69a121908462bb103c744faf8c82781a2634ee63a67e4b4f6
+size 13648432

added_tokens.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "<|end_of_turn|>": 32000,
+  "<|pad_0|>": 32001
+}

byteorder ADDED Viewed

	@@ -0,0 +1 @@


1	+ little

data.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b1af4bd1ba258cfd54c6e2ff4560b913ee6294a7624c8eef6296d331ca9d10f1
+size 4781

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0e11fd45c5b46188d01abb9be2bee8d28d57fc54a33f32382c08323e33140caa
+size 27370618

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9f7dd1e9ba8560a772eb89a3646a6bdc96fa7e1c0dfb62ab9bf07a2514155eb9
+size 14244

scaler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:baba31a5e5063037a5c811de9cb04bc62c6c5f0f5fe6720b7d681afe6500d4c1
+size 988

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4155e7e2b742523e09122dece3598631178cb3544d7363337cd06e707eee56e9
+size 1064

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,27 @@

+{
+  "additional_special_tokens": [
+    "<|end_of_turn|>",
+    "<|pad_0|>"
+  ],
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|end_of_turn|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
+size 493443

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,64 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": null,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32000": {
+      "content": "<|end_of_turn|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32001": {
+      "content": "<|pad_0|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<|end_of_turn|>",
+    "<|pad_0|>"
+  ],
+  "bos_token": "<s>",
+  "chat_template": "{{ bos_token }}{% for message in messages %}{{ 'GPT4 Correct ' + message['role'].title() + ': ' + message['content'] + '<|end_of_turn|>'}}{% endfor %}{% if add_generation_prompt %}{{ 'GPT4 Correct Assistant:' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|end_of_turn|>",
+  "extra_special_tokens": {},
+  "legacy": true,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": null,
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": true
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,664 @@

+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9826285313212845,
+  "eval_steps": 500,
+  "global_step": 1400,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.01403755044744692,
+      "grad_norm": 2.182603597640991,
+      "learning_rate": 4.936666666666667e-05,
+      "loss": 1.0775,
+      "mean_token_accuracy": 0.7353726878762246,
+      "num_tokens": 21541.0,
+      "step": 20
+    },
+    {
+      "epoch": 0.02807510089489384,
+      "grad_norm": 1.6199620962142944,
+      "learning_rate": 4.87e-05,
+      "loss": 0.831,
+      "mean_token_accuracy": 0.7886073857545852,
+      "num_tokens": 41741.0,
+      "step": 40
+    },
+    {
+      "epoch": 0.04211265134234076,
+      "grad_norm": 1.5445266962051392,
+      "learning_rate": 4.803333333333333e-05,
+      "loss": 0.8208,
+      "mean_token_accuracy": 0.7912300959229469,
+      "num_tokens": 64250.0,
+      "step": 60
+    },
+    {
+      "epoch": 0.05615020178978768,
+      "grad_norm": 3.0431602001190186,
+      "learning_rate": 4.736666666666667e-05,
+      "loss": 0.7495,
+      "mean_token_accuracy": 0.8050705902278423,
+      "num_tokens": 82180.0,
+      "step": 80
+    },
+    {
+      "epoch": 0.0701877522372346,
+      "grad_norm": 2.468738555908203,
+      "learning_rate": 4.6700000000000003e-05,
+      "loss": 0.6958,
+      "mean_token_accuracy": 0.8200091950595378,
+      "num_tokens": 103406.0,
+      "step": 100
+    },
+    {
+      "epoch": 0.08422530268468152,
+      "grad_norm": 2.01033878326416,
+      "learning_rate": 4.603333333333333e-05,
+      "loss": 0.6687,
+      "mean_token_accuracy": 0.8226511150598526,
+      "num_tokens": 130084.0,
+      "step": 120
+    },
+    {
+      "epoch": 0.09826285313212844,
+      "grad_norm": 3.93017578125,
+      "learning_rate": 4.536666666666667e-05,
+      "loss": 0.6991,
+      "mean_token_accuracy": 0.8165564998984337,
+      "num_tokens": 151551.0,
+      "step": 140
+    },
+    {
+      "epoch": 0.11230040357957537,
+      "grad_norm": 2.8924667835235596,
+      "learning_rate": 4.47e-05,
+      "loss": 0.7028,
+      "mean_token_accuracy": 0.8169861853122711,
+      "num_tokens": 178720.0,
+      "step": 160
+    },
+    {
+      "epoch": 0.12633795402702228,
+      "grad_norm": 2.084078311920166,
+      "learning_rate": 4.403333333333334e-05,
+      "loss": 0.7081,
+      "mean_token_accuracy": 0.8121843561530113,
+      "num_tokens": 203486.0,
+      "step": 180
+    },
+    {
+      "epoch": 0.1403755044744692,
+      "grad_norm": 3.8189709186553955,
+      "learning_rate": 4.3366666666666666e-05,
+      "loss": 0.7169,
+      "mean_token_accuracy": 0.8171404838562012,
+      "num_tokens": 225494.0,
+      "step": 200
+    },
+    {
+      "epoch": 0.15441305492191612,
+      "grad_norm": 1.5748624801635742,
+      "learning_rate": 4.27e-05,
+      "loss": 0.726,
+      "mean_token_accuracy": 0.8170952200889587,
+      "num_tokens": 252779.0,
+      "step": 220
+    },
+    {
+      "epoch": 0.16845060536936304,
+      "grad_norm": 3.446660041809082,
+      "learning_rate": 4.2033333333333336e-05,
+      "loss": 0.7028,
+      "mean_token_accuracy": 0.8219958022236824,
+      "num_tokens": 277891.0,
+      "step": 240
+    },
+    {
+      "epoch": 0.18248815581680997,
+      "grad_norm": 3.063415288925171,
+      "learning_rate": 4.136666666666667e-05,
+      "loss": 0.6552,
+      "mean_token_accuracy": 0.825613521784544,
+      "num_tokens": 300954.0,
+      "step": 260
+    },
+    {
+      "epoch": 0.1965257062642569,
+      "grad_norm": 2.5160129070281982,
+      "learning_rate": 4.07e-05,
+      "loss": 0.6667,
+      "mean_token_accuracy": 0.8290905028581619,
+      "num_tokens": 323928.0,
+      "step": 280
+    },
+    {
+      "epoch": 0.2105632567117038,
+      "grad_norm": 2.0370359420776367,
+      "learning_rate": 4.0033333333333335e-05,
+      "loss": 0.6895,
+      "mean_token_accuracy": 0.8264866299927235,
+      "num_tokens": 348589.0,
+      "step": 300
+    },
+    {
+      "epoch": 0.22460080715915073,
+      "grad_norm": 2.5861940383911133,
+      "learning_rate": 3.936666666666667e-05,
+      "loss": 0.6844,
+      "mean_token_accuracy": 0.8244311735033989,
+      "num_tokens": 373224.0,
+      "step": 320
+    },
+    {
+      "epoch": 0.23863835760659766,
+      "grad_norm": 3.3938305377960205,
+      "learning_rate": 3.8700000000000006e-05,
+      "loss": 0.6405,
+      "mean_token_accuracy": 0.8296338513493537,
+      "num_tokens": 396631.0,
+      "step": 340
+    },
+    {
+      "epoch": 0.25267590805404455,
+      "grad_norm": 5.519856929779053,
+      "learning_rate": 3.803333333333334e-05,
+      "loss": 0.6919,
+      "mean_token_accuracy": 0.8259296268224716,
+      "num_tokens": 421242.0,
+      "step": 360
+    },
+    {
+      "epoch": 0.2667134585014915,
+      "grad_norm": 4.32072114944458,
+      "learning_rate": 3.736666666666667e-05,
+      "loss": 0.5901,
+      "mean_token_accuracy": 0.8394061036407947,
+      "num_tokens": 447204.0,
+      "step": 380
+    },
+    {
+      "epoch": 0.2807510089489384,
+      "grad_norm": 3.1764180660247803,
+      "learning_rate": 3.6700000000000004e-05,
+      "loss": 0.7203,
+      "mean_token_accuracy": 0.8243181221187115,
+      "num_tokens": 472483.0,
+      "step": 400
+    },
+    {
+      "epoch": 0.2947885593963853,
+      "grad_norm": 2.287874937057495,
+      "learning_rate": 3.603333333333333e-05,
+      "loss": 0.6231,
+      "mean_token_accuracy": 0.839271092414856,
+      "num_tokens": 499268.0,
+      "step": 420
+    },
+    {
+      "epoch": 0.30882610984383224,
+      "grad_norm": 2.6682229042053223,
+      "learning_rate": 3.536666666666667e-05,
+      "loss": 0.6893,
+      "mean_token_accuracy": 0.8259402737021446,
+      "num_tokens": 526355.0,
+      "step": 440
+    },
+    {
+      "epoch": 0.32286366029127916,
+      "grad_norm": 2.3246352672576904,
+      "learning_rate": 3.4699999999999996e-05,
+      "loss": 0.6985,
+      "mean_token_accuracy": 0.8276082828640938,
+      "num_tokens": 554181.0,
+      "step": 460
+    },
+    {
+      "epoch": 0.3369012107387261,
+      "grad_norm": 3.031585931777954,
+      "learning_rate": 3.403333333333333e-05,
+      "loss": 0.6293,
+      "mean_token_accuracy": 0.8332549884915352,
+      "num_tokens": 576342.0,
+      "step": 480
+    },
+    {
+      "epoch": 0.350938761186173,
+      "grad_norm": 3.3900413513183594,
+      "learning_rate": 3.336666666666667e-05,
+      "loss": 0.6686,
+      "mean_token_accuracy": 0.8274131864309311,
+      "num_tokens": 602872.0,
+      "step": 500
+    },
+    {
+      "epoch": 0.36497631163361993,
+      "grad_norm": 3.3233566284179688,
+      "learning_rate": 3.27e-05,
+      "loss": 0.6676,
+      "mean_token_accuracy": 0.8297357447445393,
+      "num_tokens": 630050.0,
+      "step": 520
+    },
+    {
+      "epoch": 0.37901386208106685,
+      "grad_norm": 2.4295690059661865,
+      "learning_rate": 3.203333333333334e-05,
+      "loss": 0.5948,
+      "mean_token_accuracy": 0.8400285199284554,
+      "num_tokens": 654849.0,
+      "step": 540
+    },
+    {
+      "epoch": 0.3930514125285138,
+      "grad_norm": 1.6221336126327515,
+      "learning_rate": 3.1366666666666666e-05,
+      "loss": 0.642,
+      "mean_token_accuracy": 0.8356543615460396,
+      "num_tokens": 685449.0,
+      "step": 560
+    },
+    {
+      "epoch": 0.4070889629759607,
+      "grad_norm": 3.886709213256836,
+      "learning_rate": 3.07e-05,
+      "loss": 0.6546,
+      "mean_token_accuracy": 0.831772755086422,
+      "num_tokens": 710500.0,
+      "step": 580
+    },
+    {
+      "epoch": 0.4211265134234076,
+      "grad_norm": 4.270613670349121,
+      "learning_rate": 3.0033333333333336e-05,
+      "loss": 0.6791,
+      "mean_token_accuracy": 0.8233974911272526,
+      "num_tokens": 736595.0,
+      "step": 600
+    },
+    {
+      "epoch": 0.43516406387085454,
+      "grad_norm": 3.993839740753174,
+      "learning_rate": 2.936666666666667e-05,
+      "loss": 0.6385,
+      "mean_token_accuracy": 0.841860581934452,
+      "num_tokens": 758072.0,
+      "step": 620
+    },
+    {
+      "epoch": 0.44920161431830147,
+      "grad_norm": 2.41766357421875,
+      "learning_rate": 2.87e-05,
+      "loss": 0.6533,
+      "mean_token_accuracy": 0.8278725482523441,
+      "num_tokens": 779437.0,
+      "step": 640
+    },
+    {
+      "epoch": 0.4632391647657484,
+      "grad_norm": 2.7951812744140625,
+      "learning_rate": 2.8033333333333335e-05,
+      "loss": 0.6393,
+      "mean_token_accuracy": 0.8370219074189663,
+      "num_tokens": 804462.0,
+      "step": 660
+    },
+    {
+      "epoch": 0.4772767152131953,
+      "grad_norm": 3.4887661933898926,
+      "learning_rate": 2.7366666666666667e-05,
+      "loss": 0.667,
+      "mean_token_accuracy": 0.8294253669679165,
+      "num_tokens": 823375.0,
+      "step": 680
+    },
+    {
+      "epoch": 0.49131426566064224,
+      "grad_norm": 3.319476366043091,
+      "learning_rate": 2.6700000000000002e-05,
+      "loss": 0.6249,
+      "mean_token_accuracy": 0.8350663833320141,
+      "num_tokens": 845739.0,
+      "step": 700
+    },
+    {
+      "epoch": 0.5053518161080891,
+      "grad_norm": 6.183107376098633,
+      "learning_rate": 2.6033333333333337e-05,
+      "loss": 0.6231,
+      "mean_token_accuracy": 0.8355060666799545,
+      "num_tokens": 868279.0,
+      "step": 720
+    },
+    {
+      "epoch": 0.519389366555536,
+      "grad_norm": 4.573716640472412,
+      "learning_rate": 2.5366666666666665e-05,
+      "loss": 0.597,
+      "mean_token_accuracy": 0.840047723799944,
+      "num_tokens": 891162.0,
+      "step": 740
+    },
+    {
+      "epoch": 0.533426917002983,
+      "grad_norm": 2.598928451538086,
+      "learning_rate": 2.47e-05,
+      "loss": 0.6272,
+      "mean_token_accuracy": 0.8387389734387398,
+      "num_tokens": 918926.0,
+      "step": 760
+    },
+    {
+      "epoch": 0.5474644674504299,
+      "grad_norm": 3.8199005126953125,
+      "learning_rate": 2.4033333333333336e-05,
+      "loss": 0.6036,
+      "mean_token_accuracy": 0.8386851519346237,
+      "num_tokens": 940324.0,
+      "step": 780
+    },
+    {
+      "epoch": 0.5615020178978768,
+      "grad_norm": 1.9673742055892944,
+      "learning_rate": 2.3366666666666668e-05,
+      "loss": 0.6067,
+      "mean_token_accuracy": 0.8391704387962818,
+      "num_tokens": 968666.0,
+      "step": 800
+    },
+    {
+      "epoch": 0.5755395683453237,
+      "grad_norm": 2.7032785415649414,
+      "learning_rate": 2.2700000000000003e-05,
+      "loss": 0.5798,
+      "mean_token_accuracy": 0.8483829110860824,
+      "num_tokens": 994073.0,
+      "step": 820
+    },
+    {
+      "epoch": 0.5895771187927706,
+      "grad_norm": 4.700735092163086,
+      "learning_rate": 2.2033333333333335e-05,
+      "loss": 0.6122,
+      "mean_token_accuracy": 0.8438993617892265,
+      "num_tokens": 1018185.0,
+      "step": 840
+    },
+    {
+      "epoch": 0.6036146692402176,
+      "grad_norm": 4.28444242477417,
+      "learning_rate": 2.1366666666666667e-05,
+      "loss": 0.6391,
+      "mean_token_accuracy": 0.8350753806531429,
+      "num_tokens": 1040600.0,
+      "step": 860
+    },
+    {
+      "epoch": 0.6176522196876645,
+      "grad_norm": 4.160486221313477,
+      "learning_rate": 2.07e-05,
+      "loss": 0.598,
+      "mean_token_accuracy": 0.8390520095825196,
+      "num_tokens": 1064127.0,
+      "step": 880
+    },
+    {
+      "epoch": 0.6316897701351114,
+      "grad_norm": 2.5011801719665527,
+      "learning_rate": 2.0033333333333334e-05,
+      "loss": 0.659,
+      "mean_token_accuracy": 0.8372919000685215,
+      "num_tokens": 1090322.0,
+      "step": 900
+    },
+    {
+      "epoch": 0.6457273205825583,
+      "grad_norm": 3.6319570541381836,
+      "learning_rate": 1.9366666666666665e-05,
+      "loss": 0.6205,
+      "mean_token_accuracy": 0.8420100875198842,
+      "num_tokens": 1113579.0,
+      "step": 920
+    },
+    {
+      "epoch": 0.6597648710300053,
+      "grad_norm": 3.8766114711761475,
+      "learning_rate": 1.87e-05,
+      "loss": 0.6125,
+      "mean_token_accuracy": 0.8427253067493439,
+      "num_tokens": 1132044.0,
+      "step": 940
+    },
+    {
+      "epoch": 0.6738024214774522,
+      "grad_norm": 3.2334253787994385,
+      "learning_rate": 1.8033333333333336e-05,
+      "loss": 0.6321,
+      "mean_token_accuracy": 0.8387841299176216,
+      "num_tokens": 1153969.0,
+      "step": 960
+    },
+    {
+      "epoch": 0.6878399719248991,
+      "grad_norm": 3.2974843978881836,
+      "learning_rate": 1.7366666666666668e-05,
+      "loss": 0.611,
+      "mean_token_accuracy": 0.8420861139893532,
+      "num_tokens": 1174179.0,
+      "step": 980
+    },
+    {
+      "epoch": 0.701877522372346,
+      "grad_norm": 2.2505669593811035,
+      "learning_rate": 1.6700000000000003e-05,
+      "loss": 0.6781,
+      "mean_token_accuracy": 0.8258809894323349,
+      "num_tokens": 1194991.0,
+      "step": 1000
+    },
+    {
+      "epoch": 0.7159150728197929,
+      "grad_norm": 2.249610662460327,
+      "learning_rate": 1.6033333333333335e-05,
+      "loss": 0.5948,
+      "mean_token_accuracy": 0.8416233405470848,
+      "num_tokens": 1219197.0,
+      "step": 1020
+    },
+    {
+      "epoch": 0.7299526232672399,
+      "grad_norm": 2.6272480487823486,
+      "learning_rate": 1.536666666666667e-05,
+      "loss": 0.6496,
+      "mean_token_accuracy": 0.8354081869125366,
+      "num_tokens": 1242371.0,
+      "step": 1040
+    },
+    {
+      "epoch": 0.7439901737146868,
+      "grad_norm": 2.5832204818725586,
+      "learning_rate": 1.47e-05,
+      "loss": 0.6281,
+      "mean_token_accuracy": 0.8409083731472492,
+      "num_tokens": 1264243.0,
+      "step": 1060
+    },
+    {
+      "epoch": 0.7580277241621337,
+      "grad_norm": 2.5525519847869873,
+      "learning_rate": 1.4033333333333335e-05,
+      "loss": 0.5765,
+      "mean_token_accuracy": 0.8413878485560418,
+      "num_tokens": 1289190.0,
+      "step": 1080
+    },
+    {
+      "epoch": 0.7720652746095806,
+      "grad_norm": 2.7448742389678955,
+      "learning_rate": 1.3366666666666667e-05,
+      "loss": 0.5842,
+      "mean_token_accuracy": 0.8508293248713017,
+      "num_tokens": 1312081.0,
+      "step": 1100
+    },
+    {
+      "epoch": 0.7861028250570276,
+      "grad_norm": 3.093715190887451,
+      "learning_rate": 1.27e-05,
+      "loss": 0.619,
+      "mean_token_accuracy": 0.8391894645988941,
+      "num_tokens": 1336299.0,
+      "step": 1120
+    },
+    {
+      "epoch": 0.8001403755044745,
+      "grad_norm": 3.5069360733032227,
+      "learning_rate": 1.2033333333333334e-05,
+      "loss": 0.662,
+      "mean_token_accuracy": 0.8379314132034779,
+      "num_tokens": 1356089.0,
+      "step": 1140
+    },
+    {
+      "epoch": 0.8141779259519214,
+      "grad_norm": 3.438401222229004,
+      "learning_rate": 1.1366666666666667e-05,
+      "loss": 0.6105,
+      "mean_token_accuracy": 0.8439491540193558,
+      "num_tokens": 1378743.0,
+      "step": 1160
+    },
+    {
+      "epoch": 0.8282154763993683,
+      "grad_norm": 3.4282867908477783,
+      "learning_rate": 1.0700000000000001e-05,
+      "loss": 0.5596,
+      "mean_token_accuracy": 0.8491624429821968,
+      "num_tokens": 1403734.0,
+      "step": 1180
+    },
+    {
+      "epoch": 0.8422530268468152,
+      "grad_norm": 2.1889116764068604,
+      "learning_rate": 1.0033333333333333e-05,
+      "loss": 0.5936,
+      "mean_token_accuracy": 0.8432723931968212,
+      "num_tokens": 1425665.0,
+      "step": 1200
+    },
+    {
+      "epoch": 0.8562905772942622,
+      "grad_norm": 4.925279140472412,
+      "learning_rate": 9.366666666666666e-06,
+      "loss": 0.5934,
+      "mean_token_accuracy": 0.8465121522545814,
+      "num_tokens": 1448683.0,
+      "step": 1220
+    },
+    {
+      "epoch": 0.8703281277417091,
+      "grad_norm": 2.0163872241973877,
+      "learning_rate": 8.7e-06,
+      "loss": 0.6448,
+      "mean_token_accuracy": 0.8441565148532391,
+      "num_tokens": 1472899.0,
+      "step": 1240
+    },
+    {
+      "epoch": 0.884365678189156,
+      "grad_norm": 2.7730228900909424,
+      "learning_rate": 8.033333333333335e-06,
+      "loss": 0.5927,
+      "mean_token_accuracy": 0.8459630504250526,
+      "num_tokens": 1495100.0,
+      "step": 1260
+    },
+    {
+      "epoch": 0.8984032286366029,
+      "grad_norm": 4.159714698791504,
+      "learning_rate": 7.3666666666666676e-06,
+      "loss": 0.5662,
+      "mean_token_accuracy": 0.8462217047810554,
+      "num_tokens": 1520586.0,
+      "step": 1280
+    },
+    {
+      "epoch": 0.9124407790840499,
+      "grad_norm": 3.4387991428375244,
+      "learning_rate": 6.700000000000001e-06,
+      "loss": 0.5935,
+      "mean_token_accuracy": 0.8469379253685474,
+      "num_tokens": 1543417.0,
+      "step": 1300
+    },
+    {
+      "epoch": 0.9264783295314968,
+      "grad_norm": 3.399153470993042,
+      "learning_rate": 6.033333333333334e-06,
+      "loss": 0.6468,
+      "mean_token_accuracy": 0.8329462945461273,
+      "num_tokens": 1566253.0,
+      "step": 1320
+    },
+    {
+      "epoch": 0.9405158799789437,
+      "grad_norm": 5.575164318084717,
+      "learning_rate": 5.366666666666667e-06,
+      "loss": 0.6146,
+      "mean_token_accuracy": 0.8445694409310818,
+      "num_tokens": 1587912.0,
+      "step": 1340
+    },
+    {
+      "epoch": 0.9545534304263906,
+      "grad_norm": 4.406199932098389,
+      "learning_rate": 4.7e-06,
+      "loss": 0.6114,
+      "mean_token_accuracy": 0.8456776596605777,
+      "num_tokens": 1613084.0,
+      "step": 1360
+    },
+    {
+      "epoch": 0.9685909808738375,
+      "grad_norm": 2.3013195991516113,
+      "learning_rate": 4.033333333333333e-06,
+      "loss": 0.6377,
+      "mean_token_accuracy": 0.8406077317893506,
+      "num_tokens": 1636349.0,
+      "step": 1380
+    },
+    {
+      "epoch": 0.9826285313212845,
+      "grad_norm": 2.496525526046753,
+      "learning_rate": 3.3666666666666665e-06,
+      "loss": 0.6049,
+      "mean_token_accuracy": 0.8412599414587021,
+      "num_tokens": 1662890.0,
+      "step": 1400
+    }
+  ],
+  "logging_steps": 20,
+  "max_steps": 1500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 7.09795566342144e+16,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

version ADDED Viewed

	@@ -0,0 +1 @@


1	+ 3