Upload folder using huggingface_hub

Browse files

Files changed (15) hide show

checkpoint-224/README.md +202 -0
checkpoint-224/adapter_config.json +34 -0
checkpoint-224/adapter_model.safetensors +3 -0
checkpoint-224/optimizer.pt +3 -0
checkpoint-224/rng_state_0.pth +3 -0
checkpoint-224/rng_state_1.pth +3 -0
checkpoint-224/rng_state_2.pth +3 -0
checkpoint-224/rng_state_3.pth +3 -0
checkpoint-224/scheduler.pt +3 -0
checkpoint-224/special_tokens_map.json +30 -0
checkpoint-224/tokenizer.json +0 -0
checkpoint-224/tokenizer.model +3 -0
checkpoint-224/tokenizer_config.json +43 -0
checkpoint-224/trainer_state.json +1601 -0
checkpoint-224/training_args.bin +3 -0

checkpoint-224/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+library_name: peft
+base_model: unsloth/mistral-7b-v0.2
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT) for causal language modeling
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** unsloth/mistral-7b-v0.2
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.11.1

checkpoint-224/adapter_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "unsloth/mistral-7b-v0.2",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "up_proj",
+    "down_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-224/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bb8d29b68e78e426826dd4ecef6895ee47bff01b6e36d115212b15cbe08d247f
+size 335604696

checkpoint-224/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:515c391f358192f8b61e9341f86176473fb62c2b37f5e64b8c66a84b3de84df0
+size 168624724

checkpoint-224/rng_state_0.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b7ac0a1bbe24162373c8e919937a41818acd96309f08c6a7138dd00b8d025dfe
+size 15024

checkpoint-224/rng_state_1.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a8724f1ecd38ddc82ef460fd84a40074046be858aedd2f4c747dbae337858945
+size 15024

checkpoint-224/rng_state_2.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4568e6f11b460085133819c02b1d2571723cd419eb172d9ef1030ead7c00108a
+size 15024

checkpoint-224/rng_state_3.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bfa7486fb008c8d681e68011699e9ab716dbaa9394c7ef7c51f58882cbcefa83
+size 15024

checkpoint-224/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:697c01ed8395edf0f21a9beb5e0e6c7d80aa5b425903961f9aff4ba1cb0a88fb
+size 1064

checkpoint-224/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

checkpoint-224/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoint-224/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
+size 493443

checkpoint-224/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": true,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": true,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "<unk>",
+  "padding_side": "left",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

checkpoint-224/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1601 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.971111111111111,
+  "eval_steps": 500,
+  "global_step": 224,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.008888888888888889,
+      "grad_norm": 2.119051456451416,
+      "learning_rate": 4e-05,
+      "loss": 1.8256,
+      "step": 1
+    },
+    {
+      "epoch": 0.017777777777777778,
+      "grad_norm": 2.163247585296631,
+      "learning_rate": 8e-05,
+      "loss": 1.8713,
+      "step": 2
+    },
+    {
+      "epoch": 0.02666666666666667,
+      "grad_norm": 2.0141727924346924,
+      "learning_rate": 0.00012,
+      "loss": 1.7768,
+      "step": 3
+    },
+    {
+      "epoch": 0.035555555555555556,
+      "grad_norm": 1.873190999031067,
+      "learning_rate": 0.00016,
+      "loss": 1.4671,
+      "step": 4
+    },
+    {
+      "epoch": 0.044444444444444446,
+      "grad_norm": 1.9785406589508057,
+      "learning_rate": 0.0002,
+      "loss": 1.0704,
+      "step": 5
+    },
+    {
+      "epoch": 0.05333333333333334,
+      "grad_norm": 1.1257535219192505,
+      "learning_rate": 0.0001999897109785537,
+      "loss": 0.6978,
+      "step": 6
+    },
+    {
+      "epoch": 0.06222222222222222,
+      "grad_norm": 0.9121673107147217,
+      "learning_rate": 0.00019995884603149402,
+      "loss": 0.5012,
+      "step": 7
+    },
+    {
+      "epoch": 0.07111111111111111,
+      "grad_norm": 0.7675965428352356,
+      "learning_rate": 0.00019990741151022301,
+      "loss": 0.4309,
+      "step": 8
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.33606216311454773,
+      "learning_rate": 0.0001998354179989585,
+      "loss": 0.3882,
+      "step": 9
+    },
+    {
+      "epoch": 0.08888888888888889,
+      "grad_norm": 0.603860080242157,
+      "learning_rate": 0.00019974288031255618,
+      "loss": 0.3879,
+      "step": 10
+    },
+    {
+      "epoch": 0.09777777777777778,
+      "grad_norm": 0.37005615234375,
+      "learning_rate": 0.00019962981749346078,
+      "loss": 0.3739,
+      "step": 11
+    },
+    {
+      "epoch": 0.10666666666666667,
+      "grad_norm": 0.28186964988708496,
+      "learning_rate": 0.00019949625280778777,
+      "loss": 0.3521,
+      "step": 12
+    },
+    {
+      "epoch": 0.11555555555555555,
+      "grad_norm": 0.308602511882782,
+      "learning_rate": 0.0001993422137405354,
+      "loss": 0.3517,
+      "step": 13
+    },
+    {
+      "epoch": 0.12444444444444444,
+      "grad_norm": 0.2449037730693817,
+      "learning_rate": 0.000199167731989929,
+      "loss": 0.3249,
+      "step": 14
+    },
+    {
+      "epoch": 0.13333333333333333,
+      "grad_norm": 0.20047444105148315,
+      "learning_rate": 0.0001989728434608981,
+      "loss": 0.3428,
+      "step": 15
+    },
+    {
+      "epoch": 0.14222222222222222,
+      "grad_norm": 0.2844916880130768,
+      "learning_rate": 0.0001987575882576878,
+      "loss": 0.3054,
+      "step": 16
+    },
+    {
+      "epoch": 0.1511111111111111,
+      "grad_norm": 0.2698603868484497,
+      "learning_rate": 0.00019852201067560606,
+      "loss": 0.3236,
+      "step": 17
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.1720426231622696,
+      "learning_rate": 0.00019826615919190887,
+      "loss": 0.3217,
+      "step": 18
+    },
+    {
+      "epoch": 0.1688888888888889,
+      "grad_norm": 0.13907530903816223,
+      "learning_rate": 0.0001979900864558242,
+      "loss": 0.295,
+      "step": 19
+    },
+    {
+      "epoch": 0.17777777777777778,
+      "grad_norm": 0.19792726635932922,
+      "learning_rate": 0.0001976938492777182,
+      "loss": 0.297,
+      "step": 20
+    },
+    {
+      "epoch": 0.18666666666666668,
+      "grad_norm": 0.21798674762248993,
+      "learning_rate": 0.00019737750861740431,
+      "loss": 0.3055,
+      "step": 21
+    },
+    {
+      "epoch": 0.19555555555555557,
+      "grad_norm": 0.1863073855638504,
+      "learning_rate": 0.0001970411295715994,
+      "loss": 0.3077,
+      "step": 22
+    },
+    {
+      "epoch": 0.20444444444444446,
+      "grad_norm": 0.11970901489257812,
+      "learning_rate": 0.00019668478136052774,
+      "loss": 0.2825,
+      "step": 23
+    },
+    {
+      "epoch": 0.21333333333333335,
+      "grad_norm": 0.2247242033481598,
+      "learning_rate": 0.00019630853731367713,
+      "loss": 0.2988,
+      "step": 24
+    },
+    {
+      "epoch": 0.2222222222222222,
+      "grad_norm": 0.21440450847148895,
+      "learning_rate": 0.0001959124748547088,
+      "loss": 0.3017,
+      "step": 25
+    },
+    {
+      "epoch": 0.2311111111111111,
+      "grad_norm": 0.14320749044418335,
+      "learning_rate": 0.00019549667548552556,
+      "loss": 0.2991,
+      "step": 26
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.19487203657627106,
+      "learning_rate": 0.00019506122476949981,
+      "loss": 0.282,
+      "step": 27
+    },
+    {
+      "epoch": 0.24888888888888888,
+      "grad_norm": 0.19697605073451996,
+      "learning_rate": 0.00019460621231386676,
+      "loss": 0.285,
+      "step": 28
+    },
+    {
+      "epoch": 0.2577777777777778,
+      "grad_norm": 0.12722846865653992,
+      "learning_rate": 0.00019413173175128473,
+      "loss": 0.2943,
+      "step": 29
+    },
+    {
+      "epoch": 0.26666666666666666,
+      "grad_norm": 0.22678963840007782,
+      "learning_rate": 0.0001936378807205673,
+      "loss": 0.2803,
+      "step": 30
+    },
+    {
+      "epoch": 0.27555555555555555,
+      "grad_norm": 0.1479775756597519,
+      "learning_rate": 0.0001931247608465915,
+      "loss": 0.2757,
+      "step": 31
+    },
+    {
+      "epoch": 0.28444444444444444,
+      "grad_norm": 0.18370389938354492,
+      "learning_rate": 0.000192592477719385,
+      "loss": 0.2706,
+      "step": 32
+    },
+    {
+      "epoch": 0.29333333333333333,
+      "grad_norm": 0.19348439574241638,
+      "learning_rate": 0.00019204114087239806,
+      "loss": 0.2676,
+      "step": 33
+    },
+    {
+      "epoch": 0.3022222222222222,
+      "grad_norm": 0.1605139970779419,
+      "learning_rate": 0.0001914708637599636,
+      "loss": 0.2775,
+      "step": 34
+    },
+    {
+      "epoch": 0.3111111111111111,
+      "grad_norm": 0.20057319104671478,
+      "learning_rate": 0.0001908817637339503,
+      "loss": 0.2617,
+      "step": 35
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.1943630576133728,
+      "learning_rate": 0.0001902739620196143,
+      "loss": 0.255,
+      "step": 36
+    },
+    {
+      "epoch": 0.3288888888888889,
+      "grad_norm": 0.15888817608356476,
+      "learning_rate": 0.000189647583690653,
+      "loss": 0.2597,
+      "step": 37
+    },
+    {
+      "epoch": 0.3377777777777778,
+      "grad_norm": 0.24778060615062714,
+      "learning_rate": 0.00018900275764346768,
+      "loss": 0.2503,
+      "step": 38
+    },
+    {
+      "epoch": 0.3466666666666667,
+      "grad_norm": 0.164043590426445,
+      "learning_rate": 0.00018833961657063885,
+      "loss": 0.2568,
+      "step": 39
+    },
+    {
+      "epoch": 0.35555555555555557,
+      "grad_norm": 0.12275097519159317,
+      "learning_rate": 0.00018765829693362095,
+      "loss": 0.2391,
+      "step": 40
+    },
+    {
+      "epoch": 0.36444444444444446,
+      "grad_norm": 0.14079827070236206,
+      "learning_rate": 0.0001869589389346611,
+      "loss": 0.2591,
+      "step": 41
+    },
+    {
+      "epoch": 0.37333333333333335,
+      "grad_norm": 0.13059890270233154,
+      "learning_rate": 0.00018624168648794832,
+      "loss": 0.2595,
+      "step": 42
+    },
+    {
+      "epoch": 0.38222222222222224,
+      "grad_norm": 0.1584450751543045,
+      "learning_rate": 0.00018550668718999872,
+      "loss": 0.2547,
+      "step": 43
+    },
+    {
+      "epoch": 0.39111111111111113,
+      "grad_norm": 0.14390461146831512,
+      "learning_rate": 0.00018475409228928312,
+      "loss": 0.2609,
+      "step": 44
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.14565418660640717,
+      "learning_rate": 0.000183984056655103,
+      "loss": 0.2457,
+      "step": 45
+    },
+    {
+      "epoch": 0.4088888888888889,
+      "grad_norm": 0.23821446299552917,
+      "learning_rate": 0.0001831967387457214,
+      "loss": 0.2502,
+      "step": 46
+    },
+    {
+      "epoch": 0.4177777777777778,
+      "grad_norm": 0.12499424815177917,
+      "learning_rate": 0.00018239230057575542,
+      "loss": 0.2504,
+      "step": 47
+    },
+    {
+      "epoch": 0.4266666666666667,
+      "grad_norm": 0.2016025185585022,
+      "learning_rate": 0.00018157090768283678,
+      "loss": 0.2508,
+      "step": 48
+    },
+    {
+      "epoch": 0.43555555555555553,
+      "grad_norm": 0.1100303903222084,
+      "learning_rate": 0.00018073272909354727,
+      "loss": 0.2357,
+      "step": 49
+    },
+    {
+      "epoch": 0.4444444444444444,
+      "grad_norm": 0.24978284537792206,
+      "learning_rate": 0.00017987793728863651,
+      "loss": 0.2657,
+      "step": 50
+    },
+    {
+      "epoch": 0.4533333333333333,
+      "grad_norm": 0.14468884468078613,
+      "learning_rate": 0.00017900670816752874,
+      "loss": 0.2465,
+      "step": 51
+    },
+    {
+      "epoch": 0.4622222222222222,
+      "grad_norm": 0.21499471366405487,
+      "learning_rate": 0.0001781192210121262,
+      "loss": 0.2515,
+      "step": 52
+    },
+    {
+      "epoch": 0.4711111111111111,
+      "grad_norm": 0.11484992504119873,
+      "learning_rate": 0.00017721565844991643,
+      "loss": 0.2552,
+      "step": 53
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.23135177791118622,
+      "learning_rate": 0.00017629620641639103,
+      "loss": 0.244,
+      "step": 54
+    },
+    {
+      "epoch": 0.4888888888888889,
+      "grad_norm": 0.11799556761980057,
+      "learning_rate": 0.0001753610541167838,
+      "loss": 0.2371,
+      "step": 55
+    },
+    {
+      "epoch": 0.49777777777777776,
+      "grad_norm": 0.25612759590148926,
+      "learning_rate": 0.00017441039398713608,
+      "loss": 0.2533,
+      "step": 56
+    },
+    {
+      "epoch": 0.5066666666666667,
+      "grad_norm": 0.11615631729364395,
+      "learning_rate": 0.00017344442165469714,
+      "loss": 0.2633,
+      "step": 57
+    },
+    {
+      "epoch": 0.5155555555555555,
+      "grad_norm": 0.20532435178756714,
+      "learning_rate": 0.00017246333589766787,
+      "loss": 0.2531,
+      "step": 58
+    },
+    {
+      "epoch": 0.5244444444444445,
+      "grad_norm": 0.13294675946235657,
+      "learning_rate": 0.00017146733860429612,
+      "loss": 0.2491,
+      "step": 59
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 0.14878574013710022,
+      "learning_rate": 0.00017045663473133215,
+      "loss": 0.253,
+      "step": 60
+    },
+    {
+      "epoch": 0.5422222222222223,
+      "grad_norm": 0.19352824985980988,
+      "learning_rate": 0.00016943143226185253,
+      "loss": 0.2422,
+      "step": 61
+    },
+    {
+      "epoch": 0.5511111111111111,
+      "grad_norm": 0.12467966228723526,
+      "learning_rate": 0.00016839194216246108,
+      "loss": 0.2452,
+      "step": 62
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.18069876730442047,
+      "learning_rate": 0.00016733837833987633,
+      "loss": 0.2454,
+      "step": 63
+    },
+    {
+      "epoch": 0.5688888888888889,
+      "grad_norm": 0.11620178818702698,
+      "learning_rate": 0.00016627095759691362,
+      "loss": 0.2446,
+      "step": 64
+    },
+    {
+      "epoch": 0.5777777777777777,
+      "grad_norm": 0.22151941061019897,
+      "learning_rate": 0.00016518989958787126,
+      "loss": 0.2529,
+      "step": 65
+    },
+    {
+      "epoch": 0.5866666666666667,
+      "grad_norm": 0.12719431519508362,
+      "learning_rate": 0.00016409542677333006,
+      "loss": 0.2375,
+      "step": 66
+    },
+    {
+      "epoch": 0.5955555555555555,
+      "grad_norm": 0.1565994918346405,
+      "learning_rate": 0.00016298776437437523,
+      "loss": 0.2491,
+      "step": 67
+    },
+    {
+      "epoch": 0.6044444444444445,
+      "grad_norm": 0.1190439760684967,
+      "learning_rate": 0.00016186714032625035,
+      "loss": 0.2441,
+      "step": 68
+    },
+    {
+      "epoch": 0.6133333333333333,
+      "grad_norm": 0.12428659945726395,
+      "learning_rate": 0.0001607337852314527,
+      "loss": 0.2537,
+      "step": 69
+    },
+    {
+      "epoch": 0.6222222222222222,
+      "grad_norm": 0.1403275430202484,
+      "learning_rate": 0.0001595879323122798,
+      "loss": 0.2347,
+      "step": 70
+    },
+    {
+      "epoch": 0.6311111111111111,
+      "grad_norm": 0.1558828353881836,
+      "learning_rate": 0.00015842981736283686,
+      "loss": 0.2393,
+      "step": 71
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.10860105603933334,
+      "learning_rate": 0.0001572596787005149,
+      "loss": 0.2364,
+      "step": 72
+    },
+    {
+      "epoch": 0.6488888888888888,
+      "grad_norm": 0.1323016732931137,
+      "learning_rate": 0.00015607775711694977,
+      "loss": 0.2655,
+      "step": 73
+    },
+    {
+      "epoch": 0.6577777777777778,
+      "grad_norm": 0.12287864089012146,
+      "learning_rate": 0.00015488429582847192,
+      "loss": 0.2354,
+      "step": 74
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 0.10385692864656448,
+      "learning_rate": 0.0001536795404260572,
+      "loss": 0.2219,
+      "step": 75
+    },
+    {
+      "epoch": 0.6755555555555556,
+      "grad_norm": 0.1461033821105957,
+      "learning_rate": 0.00015246373882478898,
+      "loss": 0.2422,
+      "step": 76
+    },
+    {
+      "epoch": 0.6844444444444444,
+      "grad_norm": 0.12456011027097702,
+      "learning_rate": 0.0001512371412128424,
+      "loss": 0.2489,
+      "step": 77
+    },
+    {
+      "epoch": 0.6933333333333334,
+      "grad_norm": 0.13823498785495758,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.2413,
+      "step": 78
+    },
+    {
+      "epoch": 0.7022222222222222,
+      "grad_norm": 0.2109290212392807,
+      "learning_rate": 0.00014875256976571135,
+      "loss": 0.2292,
+      "step": 79
+    },
+    {
+      "epoch": 0.7111111111111111,
+      "grad_norm": 0.15889447927474976,
+      "learning_rate": 0.00014749510720670506,
+      "loss": 0.2402,
+      "step": 80
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.14581087231636047,
+      "learning_rate": 0.00014622787108416584,
+      "loss": 0.2375,
+      "step": 81
+    },
+    {
+      "epoch": 0.7288888888888889,
+      "grad_norm": 0.12129058688879013,
+      "learning_rate": 0.00014495112217048658,
+      "loss": 0.2431,
+      "step": 82
+    },
+    {
+      "epoch": 0.7377777777777778,
+      "grad_norm": 0.17288292944431305,
+      "learning_rate": 0.0001436651231956064,
+      "loss": 0.2316,
+      "step": 83
+    },
+    {
+      "epoch": 0.7466666666666667,
+      "grad_norm": 0.18346691131591797,
+      "learning_rate": 0.0001423701387929459,
+      "loss": 0.2284,
+      "step": 84
+    },
+    {
+      "epoch": 0.7555555555555555,
+      "grad_norm": 0.14579100906848907,
+      "learning_rate": 0.0001410664354449509,
+      "loss": 0.2328,
+      "step": 85
+    },
+    {
+      "epoch": 0.7644444444444445,
+      "grad_norm": 0.16875790059566498,
+      "learning_rate": 0.0001397542814282556,
+      "loss": 0.2305,
+      "step": 86
+    },
+    {
+      "epoch": 0.7733333333333333,
+      "grad_norm": 0.13410557806491852,
+      "learning_rate": 0.00013843394675847634,
+      "loss": 0.2224,
+      "step": 87
+    },
+    {
+      "epoch": 0.7822222222222223,
+      "grad_norm": 0.17506512999534607,
+      "learning_rate": 0.00013710570313464778,
+      "loss": 0.2511,
+      "step": 88
+    },
+    {
+      "epoch": 0.7911111111111111,
+      "grad_norm": 0.15146586298942566,
+      "learning_rate": 0.0001357698238833126,
+      "loss": 0.2243,
+      "step": 89
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.1319180130958557,
+      "learning_rate": 0.00013442658390227602,
+      "loss": 0.2292,
+      "step": 90
+    },
+    {
+      "epoch": 0.8088888888888889,
+      "grad_norm": 0.11848906427621841,
+      "learning_rate": 0.00013307625960403763,
+      "loss": 0.2268,
+      "step": 91
+    },
+    {
+      "epoch": 0.8177777777777778,
+      "grad_norm": 0.11634498089551926,
+      "learning_rate": 0.00013171912885891063,
+      "loss": 0.234,
+      "step": 92
+    },
+    {
+      "epoch": 0.8266666666666667,
+      "grad_norm": 0.14567352831363678,
+      "learning_rate": 0.00013035547093784186,
+      "loss": 0.2452,
+      "step": 93
+    },
+    {
+      "epoch": 0.8355555555555556,
+      "grad_norm": 0.18894989788532257,
+      "learning_rate": 0.00012898556645494325,
+      "loss": 0.2362,
+      "step": 94
+    },
+    {
+      "epoch": 0.8444444444444444,
+      "grad_norm": 0.14165499806404114,
+      "learning_rate": 0.00012760969730974694,
+      "loss": 0.2249,
+      "step": 95
+    },
+    {
+      "epoch": 0.8533333333333334,
+      "grad_norm": 0.14605680108070374,
+      "learning_rate": 0.00012622814662919561,
+      "loss": 0.2394,
+      "step": 96
+    },
+    {
+      "epoch": 0.8622222222222222,
+      "grad_norm": 0.11261335760354996,
+      "learning_rate": 0.00012484119870938103,
+      "loss": 0.232,
+      "step": 97
+    },
+    {
+      "epoch": 0.8711111111111111,
+      "grad_norm": 0.1433236300945282,
+      "learning_rate": 0.00012344913895704097,
+      "loss": 0.2283,
+      "step": 98
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.12494556605815887,
+      "learning_rate": 0.00012205225383082843,
+      "loss": 0.2116,
+      "step": 99
+    },
+    {
+      "epoch": 0.8888888888888888,
+      "grad_norm": 0.1096857562661171,
+      "learning_rate": 0.00012065083078236374,
+      "loss": 0.2395,
+      "step": 100
+    },
+    {
+      "epoch": 0.8977777777777778,
+      "grad_norm": 0.1713569015264511,
+      "learning_rate": 0.000119245158197083,
+      "loss": 0.2432,
+      "step": 101
+    },
+    {
+      "epoch": 0.9066666666666666,
+      "grad_norm": 0.13948315382003784,
+      "learning_rate": 0.00011783552533489372,
+      "loss": 0.2409,
+      "step": 102
+    },
+    {
+      "epoch": 0.9155555555555556,
+      "grad_norm": 0.14001551270484924,
+      "learning_rate": 0.00011642222227065089,
+      "loss": 0.243,
+      "step": 103
+    },
+    {
+      "epoch": 0.9244444444444444,
+      "grad_norm": 0.16406795382499695,
+      "learning_rate": 0.00011500553983446527,
+      "loss": 0.2173,
+      "step": 104
+    },
+    {
+      "epoch": 0.9333333333333333,
+      "grad_norm": 0.12843827903270721,
+      "learning_rate": 0.0001135857695518563,
+      "loss": 0.2392,
+      "step": 105
+    },
+    {
+      "epoch": 0.9422222222222222,
+      "grad_norm": 0.1474241018295288,
+      "learning_rate": 0.00011216320358376157,
+      "loss": 0.2295,
+      "step": 106
+    },
+    {
+      "epoch": 0.9511111111111111,
+      "grad_norm": 0.1349494457244873,
+      "learning_rate": 0.00011073813466641632,
+      "loss": 0.2372,
+      "step": 107
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.12883637845516205,
+      "learning_rate": 0.00010931085605111354,
+      "loss": 0.2065,
+      "step": 108
+    },
+    {
+      "epoch": 0.9688888888888889,
+      "grad_norm": 0.21473082900047302,
+      "learning_rate": 0.00010788166144385888,
+      "loss": 0.2419,
+      "step": 109
+    },
+    {
+      "epoch": 0.9777777777777777,
+      "grad_norm": 0.10445938259363174,
+      "learning_rate": 0.00010645084494493165,
+      "loss": 0.2217,
+      "step": 110
+    },
+    {
+      "epoch": 0.9866666666666667,
+      "grad_norm": 0.19851183891296387,
+      "learning_rate": 0.00010501870098836473,
+      "loss": 0.2276,
+      "step": 111
+    },
+    {
+      "epoch": 0.9955555555555555,
+      "grad_norm": 0.1827424168586731,
+      "learning_rate": 0.00010358552428135575,
+      "loss": 0.237,
+      "step": 112
+    },
+    {
+      "epoch": 1.0044444444444445,
+      "grad_norm": 0.11062519997358322,
+      "learning_rate": 0.00010215160974362223,
+      "loss": 0.2199,
+      "step": 113
+    },
+    {
+      "epoch": 1.0133333333333334,
+      "grad_norm": 0.18650983273983002,
+      "learning_rate": 0.00010071725244671282,
+      "loss": 0.2351,
+      "step": 114
+    },
+    {
+      "epoch": 1.0022222222222221,
+      "grad_norm": 0.158047154545784,
+      "learning_rate": 9.928274755328723e-05,
+      "loss": 0.2257,
+      "step": 115
+    },
+    {
+      "epoch": 1.011111111111111,
+      "grad_norm": 0.11520474404096603,
+      "learning_rate": 9.784839025637778e-05,
+      "loss": 0.2087,
+      "step": 116
+    },
+    {
+      "epoch": 1.02,
+      "grad_norm": 0.1513085812330246,
+      "learning_rate": 9.641447571864429e-05,
+      "loss": 0.2207,
+      "step": 117
+    },
+    {
+      "epoch": 1.028888888888889,
+      "grad_norm": 0.17206072807312012,
+      "learning_rate": 9.49812990116353e-05,
+      "loss": 0.1939,
+      "step": 118
+    },
+    {
+      "epoch": 1.0377777777777777,
+      "grad_norm": 0.10064394772052765,
+      "learning_rate": 9.354915505506839e-05,
+      "loss": 0.1994,
+      "step": 119
+    },
+    {
+      "epoch": 1.0466666666666666,
+      "grad_norm": 0.206328347325325,
+      "learning_rate": 9.211833855614114e-05,
+      "loss": 0.1926,
+      "step": 120
+    },
+    {
+      "epoch": 1.0555555555555556,
+      "grad_norm": 0.16261310875415802,
+      "learning_rate": 9.06891439488865e-05,
+      "loss": 0.2189,
+      "step": 121
+    },
+    {
+      "epoch": 1.0644444444444445,
+      "grad_norm": 0.10285034775733948,
+      "learning_rate": 8.92618653335837e-05,
+      "loss": 0.2028,
+      "step": 122
+    },
+    {
+      "epoch": 1.0733333333333333,
+      "grad_norm": 0.15619589388370514,
+      "learning_rate": 8.783679641623845e-05,
+      "loss": 0.2189,
+      "step": 123
+    },
+    {
+      "epoch": 1.0822222222222222,
+      "grad_norm": 0.17892366647720337,
+      "learning_rate": 8.641423044814374e-05,
+      "loss": 0.2135,
+      "step": 124
+    },
+    {
+      "epoch": 1.0911111111111111,
+      "grad_norm": 0.11443614959716797,
+      "learning_rate": 8.499446016553474e-05,
+      "loss": 0.1986,
+      "step": 125
+    },
+    {
+      "epoch": 1.1,
+      "grad_norm": 0.12756159901618958,
+      "learning_rate": 8.357777772934913e-05,
+      "loss": 0.1972,
+      "step": 126
+    },
+    {
+      "epoch": 1.1088888888888888,
+      "grad_norm": 0.16858510673046112,
+      "learning_rate": 8.216447466510631e-05,
+      "loss": 0.2143,
+      "step": 127
+    },
+    {
+      "epoch": 1.1177777777777778,
+      "grad_norm": 0.1412675678730011,
+      "learning_rate": 8.075484180291701e-05,
+      "loss": 0.1957,
+      "step": 128
+    },
+    {
+      "epoch": 1.1266666666666667,
+      "grad_norm": 0.12204045802354813,
+      "learning_rate": 7.934916921763628e-05,
+      "loss": 0.2244,
+      "step": 129
+    },
+    {
+      "epoch": 1.1355555555555557,
+      "grad_norm": 0.14355255663394928,
+      "learning_rate": 7.79477461691716e-05,
+      "loss": 0.2058,
+      "step": 130
+    },
+    {
+      "epoch": 1.1444444444444444,
+      "grad_norm": 0.16251309216022491,
+      "learning_rate": 7.655086104295904e-05,
+      "loss": 0.2131,
+      "step": 131
+    },
+    {
+      "epoch": 1.1533333333333333,
+      "grad_norm": 0.1018742248415947,
+      "learning_rate": 7.5158801290619e-05,
+      "loss": 0.195,
+      "step": 132
+    },
+    {
+      "epoch": 1.1622222222222223,
+      "grad_norm": 0.1632150560617447,
+      "learning_rate": 7.377185337080442e-05,
+      "loss": 0.2027,
+      "step": 133
+    },
+    {
+      "epoch": 1.1711111111111112,
+      "grad_norm": 0.14629168808460236,
+      "learning_rate": 7.239030269025311e-05,
+      "loss": 0.2068,
+      "step": 134
+    },
+    {
+      "epoch": 1.18,
+      "grad_norm": 0.10014279186725616,
+      "learning_rate": 7.101443354505678e-05,
+      "loss": 0.1948,
+      "step": 135
+    },
+    {
+      "epoch": 1.1888888888888889,
+      "grad_norm": 0.12599867582321167,
+      "learning_rate": 6.964452906215815e-05,
+      "loss": 0.2444,
+      "step": 136
+    },
+    {
+      "epoch": 1.1977777777777778,
+      "grad_norm": 0.16535311937332153,
+      "learning_rate": 6.82808711410894e-05,
+      "loss": 0.2269,
+      "step": 137
+    },
+    {
+      "epoch": 1.2066666666666666,
+      "grad_norm": 0.11025357991456985,
+      "learning_rate": 6.69237403959624e-05,
+      "loss": 0.2161,
+      "step": 138
+    },
+    {
+      "epoch": 1.2155555555555555,
+      "grad_norm": 0.11338219046592712,
+      "learning_rate": 6.5573416097724e-05,
+      "loss": 0.2201,
+      "step": 139
+    },
+    {
+      "epoch": 1.2244444444444444,
+      "grad_norm": 0.11197050660848618,
+      "learning_rate": 6.423017611668745e-05,
+      "loss": 0.2021,
+      "step": 140
+    },
+    {
+      "epoch": 1.2333333333333334,
+      "grad_norm": 0.12104946374893188,
+      "learning_rate": 6.289429686535226e-05,
+      "loss": 0.2015,
+      "step": 141
+    },
+    {
+      "epoch": 1.2422222222222223,
+      "grad_norm": 0.11741239577531815,
+      "learning_rate": 6.15660532415237e-05,
+      "loss": 0.204,
+      "step": 142
+    },
+    {
+      "epoch": 1.251111111111111,
+      "grad_norm": 0.10673556476831436,
+      "learning_rate": 6.024571857174443e-05,
+      "loss": 0.1953,
+      "step": 143
+    },
+    {
+      "epoch": 1.26,
+      "grad_norm": 0.1320074498653412,
+      "learning_rate": 5.8933564555049105e-05,
+      "loss": 0.212,
+      "step": 144
+    },
+    {
+      "epoch": 1.268888888888889,
+      "grad_norm": 0.10862302780151367,
+      "learning_rate": 5.7629861207054136e-05,
+      "loss": 0.2004,
+      "step": 145
+    },
+    {
+      "epoch": 1.2777777777777777,
+      "grad_norm": 0.12227830290794373,
+      "learning_rate": 5.633487680439361e-05,
+      "loss": 0.2049,
+      "step": 146
+    },
+    {
+      "epoch": 1.2866666666666666,
+      "grad_norm": 0.12364481389522552,
+      "learning_rate": 5.5048877829513424e-05,
+      "loss": 0.2164,
+      "step": 147
+    },
+    {
+      "epoch": 1.2955555555555556,
+      "grad_norm": 0.11840461939573288,
+      "learning_rate": 5.3772128915834184e-05,
+      "loss": 0.1963,
+      "step": 148
+    },
+    {
+      "epoch": 1.3044444444444445,
+      "grad_norm": 0.12821979820728302,
+      "learning_rate": 5.2504892793295e-05,
+      "loss": 0.212,
+      "step": 149
+    },
+    {
+      "epoch": 1.3133333333333335,
+      "grad_norm": 0.1159982681274414,
+      "learning_rate": 5.124743023428867e-05,
+      "loss": 0.196,
+      "step": 150
+    },
+    {
+      "epoch": 1.3222222222222222,
+      "grad_norm": 0.11656828969717026,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.2115,
+      "step": 151
+    },
+    {
+      "epoch": 1.3311111111111111,
+      "grad_norm": 0.11024783551692963,
+      "learning_rate": 4.876285878715764e-05,
+      "loss": 0.1926,
+      "step": 152
+    },
+    {
+      "epoch": 1.34,
+      "grad_norm": 0.12047923356294632,
+      "learning_rate": 4.753626117521103e-05,
+      "loss": 0.2023,
+      "step": 153
+    },
+    {
+      "epoch": 1.3488888888888888,
+      "grad_norm": 0.12601517140865326,
+      "learning_rate": 4.6320459573942856e-05,
+      "loss": 0.1968,
+      "step": 154
+    },
+    {
+      "epoch": 1.3577777777777778,
+      "grad_norm": 0.11270546168088913,
+      "learning_rate": 4.5115704171528105e-05,
+      "loss": 0.2133,
+      "step": 155
+    },
+    {
+      "epoch": 1.3666666666666667,
+      "grad_norm": 0.12202366441488266,
+      "learning_rate": 4.3922242883050224e-05,
+      "loss": 0.2299,
+      "step": 156
+    },
+    {
+      "epoch": 1.3755555555555556,
+      "grad_norm": 0.1306028515100479,
+      "learning_rate": 4.274032129948512e-05,
+      "loss": 0.205,
+      "step": 157
+    },
+    {
+      "epoch": 1.3844444444444444,
+      "grad_norm": 0.1278497874736786,
+      "learning_rate": 4.1570182637163155e-05,
+      "loss": 0.2095,
+      "step": 158
+    },
+    {
+      "epoch": 1.3933333333333333,
+      "grad_norm": 0.10907793790102005,
+      "learning_rate": 4.041206768772022e-05,
+      "loss": 0.2096,
+      "step": 159
+    },
+    {
+      "epoch": 1.4022222222222223,
+      "grad_norm": 0.11490105092525482,
+      "learning_rate": 3.926621476854734e-05,
+      "loss": 0.2082,
+      "step": 160
+    },
+    {
+      "epoch": 1.411111111111111,
+      "grad_norm": 0.13591797649860382,
+      "learning_rate": 3.813285967374969e-05,
+      "loss": 0.1986,
+      "step": 161
+    },
+    {
+      "epoch": 1.42,
+      "grad_norm": 0.1358928382396698,
+      "learning_rate": 3.701223562562478e-05,
+      "loss": 0.2269,
+      "step": 162
+    },
+    {
+      "epoch": 1.4288888888888889,
+      "grad_norm": 0.12895609438419342,
+      "learning_rate": 3.590457322666997e-05,
+      "loss": 0.209,
+      "step": 163
+    },
+    {
+      "epoch": 1.4377777777777778,
+      "grad_norm": 0.11933830380439758,
+      "learning_rate": 3.4810100412128747e-05,
+      "loss": 0.2098,
+      "step": 164
+    },
+    {
+      "epoch": 1.4466666666666668,
+      "grad_norm": 0.11205963045358658,
+      "learning_rate": 3.3729042403086395e-05,
+      "loss": 0.1958,
+      "step": 165
+    },
+    {
+      "epoch": 1.4555555555555555,
+      "grad_norm": 0.1345898061990738,
+      "learning_rate": 3.2661621660123665e-05,
+      "loss": 0.215,
+      "step": 166
+    },
+    {
+      "epoch": 1.4644444444444444,
+      "grad_norm": 0.11800192296504974,
+      "learning_rate": 3.160805783753897e-05,
+      "loss": 0.2057,
+      "step": 167
+    },
+    {
+      "epoch": 1.4733333333333334,
+      "grad_norm": 0.11855156719684601,
+      "learning_rate": 3.05685677381475e-05,
+      "loss": 0.2083,
+      "step": 168
+    },
+    {
+      "epoch": 1.482222222222222,
+      "grad_norm": 0.1298232227563858,
+      "learning_rate": 2.9543365268667867e-05,
+      "loss": 0.2155,
+      "step": 169
+    },
+    {
+      "epoch": 1.491111111111111,
+      "grad_norm": 0.12133259326219559,
+      "learning_rate": 2.853266139570391e-05,
+      "loss": 0.2007,
+      "step": 170
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 0.12358111888170242,
+      "learning_rate": 2.7536664102332176e-05,
+      "loss": 0.2073,
+      "step": 171
+    },
+    {
+      "epoch": 1.508888888888889,
+      "grad_norm": 0.11087555438280106,
+      "learning_rate": 2.6555578345302878e-05,
+      "loss": 0.2048,
+      "step": 172
+    },
+    {
+      "epoch": 1.517777777777778,
+      "grad_norm": 0.11245834082365036,
+      "learning_rate": 2.5589606012863963e-05,
+      "loss": 0.1937,
+      "step": 173
+    },
+    {
+      "epoch": 1.5266666666666666,
+      "grad_norm": 0.11430654674768448,
+      "learning_rate": 2.4638945883216235e-05,
+      "loss": 0.203,
+      "step": 174
+    },
+    {
+      "epoch": 1.5355555555555556,
+      "grad_norm": 0.11313813179731369,
+      "learning_rate": 2.3703793583609013e-05,
+      "loss": 0.206,
+      "step": 175
+    },
+    {
+      "epoch": 1.5444444444444443,
+      "grad_norm": 0.1203705370426178,
+      "learning_rate": 2.2784341550083576e-05,
+      "loss": 0.1997,
+      "step": 176
+    },
+    {
+      "epoch": 1.5533333333333332,
+      "grad_norm": 0.11311240494251251,
+      "learning_rate": 2.1880778987873807e-05,
+      "loss": 0.2036,
+      "step": 177
+    },
+    {
+      "epoch": 1.5622222222222222,
+      "grad_norm": 0.11635250598192215,
+      "learning_rate": 2.099329183247126e-05,
+      "loss": 0.1948,
+      "step": 178
+    },
+    {
+      "epoch": 1.5711111111111111,
+      "grad_norm": 0.11245696246623993,
+      "learning_rate": 2.0122062711363532e-05,
+      "loss": 0.1944,
+      "step": 179
+    },
+    {
+      "epoch": 1.58,
+      "grad_norm": 0.11117862164974213,
+      "learning_rate": 1.926727090645275e-05,
+      "loss": 0.1988,
+      "step": 180
+    },
+    {
+      "epoch": 1.588888888888889,
+      "grad_norm": 0.12594608962535858,
+      "learning_rate": 1.8429092317163245e-05,
+      "loss": 0.2225,
+      "step": 181
+    },
+    {
+      "epoch": 1.5977777777777777,
+      "grad_norm": 0.12260469049215317,
+      "learning_rate": 1.7607699424244585e-05,
+      "loss": 0.1929,
+      "step": 182
+    },
+    {
+      "epoch": 1.6066666666666667,
+      "grad_norm": 0.1126975491642952,
+      "learning_rate": 1.6803261254278636e-05,
+      "loss": 0.2059,
+      "step": 183
+    },
+    {
+      "epoch": 1.6155555555555554,
+      "grad_norm": 0.14772062003612518,
+      "learning_rate": 1.601594334489702e-05,
+      "loss": 0.21,
+      "step": 184
+    },
+    {
+      "epoch": 1.6244444444444444,
+      "grad_norm": 0.11341425776481628,
+      "learning_rate": 1.5245907710716911e-05,
+      "loss": 0.1996,
+      "step": 185
+    },
+    {
+      "epoch": 1.6333333333333333,
+      "grad_norm": 0.13061125576496124,
+      "learning_rate": 1.4493312810001292e-05,
+      "loss": 0.1974,
+      "step": 186
+    },
+    {
+      "epoch": 1.6422222222222222,
+      "grad_norm": 0.11971107870340347,
+      "learning_rate": 1.3758313512051702e-05,
+      "loss": 0.2143,
+      "step": 187
+    },
+    {
+      "epoch": 1.6511111111111112,
+      "grad_norm": 0.1235496923327446,
+      "learning_rate": 1.30410610653389e-05,
+      "loss": 0.2075,
+      "step": 188
+    },
+    {
+      "epoch": 1.6600000000000001,
+      "grad_norm": 0.11503645032644272,
+      "learning_rate": 1.2341703066379074e-05,
+      "loss": 0.203,
+      "step": 189
+    },
+    {
+      "epoch": 1.6688888888888889,
+      "grad_norm": 0.122038334608078,
+      "learning_rate": 1.1660383429361155e-05,
+      "loss": 0.2178,
+      "step": 190
+    },
+    {
+      "epoch": 1.6777777777777778,
+      "grad_norm": 0.11667265743017197,
+      "learning_rate": 1.0997242356532334e-05,
+      "loss": 0.1912,
+      "step": 191
+    },
+    {
+      "epoch": 1.6866666666666665,
+      "grad_norm": 0.11608020961284637,
+      "learning_rate": 1.0352416309347001e-05,
+      "loss": 0.2031,
+      "step": 192
+    },
+    {
+      "epoch": 1.6955555555555555,
+      "grad_norm": 0.1358698457479477,
+      "learning_rate": 9.726037980385738e-06,
+      "loss": 0.214,
+      "step": 193
+    },
+    {
+      "epoch": 1.7044444444444444,
+      "grad_norm": 0.12363021075725555,
+      "learning_rate": 9.118236266049707e-06,
+      "loss": 0.2006,
+      "step": 194
+    },
+    {
+      "epoch": 1.7133333333333334,
+      "grad_norm": 0.12124864757061005,
+      "learning_rate": 8.52913624003644e-06,
+      "loss": 0.2018,
+      "step": 195
+    },
+    {
+      "epoch": 1.7222222222222223,
+      "grad_norm": 0.12149032205343246,
+      "learning_rate": 7.958859127601936e-06,
+      "loss": 0.1931,
+      "step": 196
+    },
+    {
+      "epoch": 1.7311111111111113,
+      "grad_norm": 0.11497901380062103,
+      "learning_rate": 7.40752228061502e-06,
+      "loss": 0.2104,
+      "step": 197
+    },
+    {
+      "epoch": 1.74,
+      "grad_norm": 0.12098975479602814,
+      "learning_rate": 6.875239153408542e-06,
+      "loss": 0.1959,
+      "step": 198
+    },
+    {
+      "epoch": 1.748888888888889,
+      "grad_norm": 0.13070844113826752,
+      "learning_rate": 6.36211927943271e-06,
+      "loss": 0.2199,
+      "step": 199
+    },
+    {
+      "epoch": 1.7577777777777777,
+      "grad_norm": 0.11926998198032379,
+      "learning_rate": 5.868268248715292e-06,
+      "loss": 0.1991,
+      "step": 200
+    },
+    {
+      "epoch": 1.7666666666666666,
+      "grad_norm": 0.11677994579076767,
+      "learning_rate": 5.3937876861332336e-06,
+      "loss": 0.1999,
+      "step": 201
+    },
+    {
+      "epoch": 1.7755555555555556,
+      "grad_norm": 0.11903119087219238,
+      "learning_rate": 4.938775230500192e-06,
+      "loss": 0.2033,
+      "step": 202
+    },
+    {
+      "epoch": 1.7844444444444445,
+      "grad_norm": 0.12705688178539276,
+      "learning_rate": 4.503324514474483e-06,
+      "loss": 0.1893,
+      "step": 203
+    },
+    {
+      "epoch": 1.7933333333333334,
+      "grad_norm": 0.12431243807077408,
+      "learning_rate": 4.087525145291204e-06,
+      "loss": 0.1944,
+      "step": 204
+    },
+    {
+      "epoch": 1.8022222222222222,
+      "grad_norm": 0.11600019037723541,
+      "learning_rate": 3.6914626863228997e-06,
+      "loss": 0.2043,
+      "step": 205
+    },
+    {
+      "epoch": 1.8111111111111111,
+      "grad_norm": 0.11950667202472687,
+      "learning_rate": 3.3152186394722505e-06,
+      "loss": 0.2026,
+      "step": 206
+    },
+    {
+      "epoch": 1.8199999999999998,
+      "grad_norm": 0.11922794580459595,
+      "learning_rate": 2.9588704284006174e-06,
+      "loss": 0.1948,
+      "step": 207
+    },
+    {
+      "epoch": 1.8288888888888888,
+      "grad_norm": 0.10943859070539474,
+      "learning_rate": 2.622491382595693e-06,
+      "loss": 0.2043,
+      "step": 208
+    },
+    {
+      "epoch": 1.8377777777777777,
+      "grad_norm": 0.12232745438814163,
+      "learning_rate": 2.30615072228183e-06,
+      "loss": 0.2175,
+      "step": 209
+    },
+    {
+      "epoch": 1.8466666666666667,
+      "grad_norm": 0.12107669562101364,
+      "learning_rate": 2.00991354417579e-06,
+      "loss": 0.2159,
+      "step": 210
+    },
+    {
+      "epoch": 1.8555555555555556,
+      "grad_norm": 0.11113448441028595,
+      "learning_rate": 1.7338408080911471e-06,
+      "loss": 0.1897,
+      "step": 211
+    },
+    {
+      "epoch": 1.8644444444444446,
+      "grad_norm": 0.12423297762870789,
+      "learning_rate": 1.4779893243939359e-06,
+      "loss": 0.2229,
+      "step": 212
+    },
+    {
+      "epoch": 1.8733333333333333,
+      "grad_norm": 0.11375775188207626,
+      "learning_rate": 1.2424117423122328e-06,
+      "loss": 0.2134,
+      "step": 213
+    },
+    {
+      "epoch": 1.8822222222222222,
+      "grad_norm": 0.12908221781253815,
+      "learning_rate": 1.0271565391018922e-06,
+      "loss": 0.2089,
+      "step": 214
+    },
+    {
+      "epoch": 1.891111111111111,
+      "grad_norm": 0.11395999044179916,
+      "learning_rate": 8.322680100710023e-07,
+      "loss": 0.1978,
+      "step": 215
+    },
+    {
+      "epoch": 1.9,
+      "grad_norm": 0.12362853437662125,
+      "learning_rate": 6.577862594646322e-07,
+      "loss": 0.2052,
+      "step": 216
+    },
+    {
+      "epoch": 1.9088888888888889,
+      "grad_norm": 0.1114320233464241,
+      "learning_rate": 5.037471922122561e-07,
+      "loss": 0.1939,
+      "step": 217
+    },
+    {
+      "epoch": 1.9177777777777778,
+      "grad_norm": 0.11809442937374115,
+      "learning_rate": 3.701825065392184e-07,
+      "loss": 0.2037,
+      "step": 218
+    },
+    {
+      "epoch": 1.9266666666666667,
+      "grad_norm": 0.12118525058031082,
+      "learning_rate": 2.5711968744382974e-07,
+      "loss": 0.2032,
+      "step": 219
+    },
+    {
+      "epoch": 1.9355555555555557,
+      "grad_norm": 0.11821627616882324,
+      "learning_rate": 1.6458200104149113e-07,
+      "loss": 0.1872,
+      "step": 220
+    },
+    {
+      "epoch": 1.9444444444444444,
+      "grad_norm": 0.11238008737564087,
+      "learning_rate": 9.258848977700129e-08,
+      "loss": 0.1988,
+      "step": 221
+    },
+    {
+      "epoch": 1.9533333333333334,
+      "grad_norm": 0.1149674504995346,
+      "learning_rate": 4.1153968505991406e-08,
+      "loss": 0.1886,
+      "step": 222
+    },
+    {
+      "epoch": 1.962222222222222,
+      "grad_norm": 0.11997341364622116,
+      "learning_rate": 1.0289021446308056e-08,
+      "loss": 0.1981,
+      "step": 223
+    },
+    {
+      "epoch": 1.971111111111111,
+      "grad_norm": 0.11600257456302643,
+      "learning_rate": 0.0,
+      "loss": 0.2017,
+      "step": 224
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 224,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.5347935595512463e+18,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-224/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9071cbc82dde54983a3561a6ccf22d115ba0a4ffef5538257f9749930ee8e1d4
+size 6200