Upload 16 files

Browse files

Files changed (16) hide show

README.md +144 -3
adapter_config.json +33 -0
adapter_model.bin +3 -0
checkpoint-527/README.md +202 -0
checkpoint-527/adapter_config.json +33 -0
checkpoint-527/adapter_model.safetensors +3 -0
checkpoint-527/optimizer.pt +3 -0
checkpoint-527/rng_state.pth +3 -0
checkpoint-527/scheduler.pt +3 -0
checkpoint-527/trainer_state.json +3742 -0
checkpoint-527/training_args.bin +3 -0
config.json +43 -0
runs/Mar09_05-23-07_9b5078085e9b/events.out.tfevents.1709961787.9b5078085e9b.1472.0 +3 -0
special_tokens_map.json +24 -0
tokenizer.model +3 -0
tokenizer_config.json +43 -0

README.md CHANGED Viewed

@@ -1,3 +1,144 @@
----
-license: apache-2.0
----

+---
+license: apache-2.0
+library_name: peft
+tags:
+- generated_from_trainer
+base_model: openlm-research/open_llama_3b_v2
+model-index:
+- name: lora-out
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+[<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)
+<details><summary>See axolotl config</summary>
+axolotl version: `0.4.0`
+```yaml
+base_model: openlm-research/open_llama_3b_v2
+model_type: LlamaForCausalLM
+tokenizer_type: LlamaTokenizer
+load_in_8bit: true
+load_in_4bit: false
+strict: false
+push_dataset_to_hub:
+datasets:
+  - path: AabirDey/job-queries-and-customer-service
+    type: alpaca
+dataset_prepared_path:
+val_set_size: 0.02
+adapter: lora
+lora_model_dir:
+sequence_len: 1024
+sample_packing: true
+lora_r: 8
+lora_alpha: 16
+lora_dropout: 0.0
+lora_target_modules:
+  - gate_proj
+  - down_proj
+  - up_proj
+  - q_proj
+  - v_proj
+  - k_proj
+  - o_proj
+lora_fan_in_fan_out:
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+output_dir: ./lora-out
+gradient_accumulation_steps: 8
+micro_batch_size: 1
+num_epochs: 1
+optimizer: adamw_bnb_8bit
+torchdistx_path:
+lr_scheduler: cosine
+learning_rate: 0.0002
+train_on_inputs: false
+group_by_length: false
+bf16: false
+fp16: true
+tf32: false
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+gptq_groupsize:
+s2_attention:
+gptq_model_v1:
+warmup_steps: 20
+evals_per_epoch: 4
+saves_per_epoch: 1
+debug:
+deepspeed:
+weight_decay: 0.1
+fsdp:
+fsdp_config:
+special_tokens:
+  bos_token: "<s>"
+  eos_token: "</s>"
+  unk_token: "<unk>"
+```
+</details><br>
+# lora-out
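+This model is a fine-tuned version of [openlm-research/open_llama_3b_v2](https://huggingface.co/openlm-research/open_llama_3b_v2) on the [AabirDey/job-queries-and-customer-service](https://huggingface.co/datasets/AabirDey/job-queries-and-customer-service) dataset.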
+It achieves the following results on the evaluation set:
+- Loss: 0.6555
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
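+The model was trained on the [AabirDey/job-queries-and-customer-service](https://huggingface.co/datasets/AabirDey/job-queries-and-customer-service) dataset, loaded with axolotl's `alpaca` prompt format. A fraction of the data (`val_set_size: 0.02`) was held out as the evaluation set. More information needed on the dataset's contents and provenance.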
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.0002
+- train_batch_size: 1
+- eval_batch_size: 1
+- seed: 42
+- gradient_accumulation_steps: 8
+- total_train_batch_size: 8
+- optimizer: adamw_bnb_8bit (8-bit AdamW) with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_steps: 20
+- num_epochs: 1
+- mixed_precision_training: Native AMP (fp16)
+### Training results
+| Training Loss | Epoch | Step | Validation Loss |
+|:-------------:|:-----:|:----:|:---------------:|
+| 2.0783        | 0.0   | 1    | 2.0402          |
+| 0.8144        | 0.25  | 132  | 0.8365          |
+| 0.7159        | 0.5   | 264  | 0.7109          |
+| 0.6664        | 0.75  | 396  | 0.6555          |
+### Framework versions
+- PEFT 0.9.0
+- Transformers 4.38.2
+- Pytorch 2.2.0+cu118
+- Datasets 2.18.0
+- Tokenizers 0.15.0

adapter_config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "openlm-research/open_llama_3b_v2",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "k_proj",
+    "o_proj",
+    "gate_proj",
+    "q_proj",
+    "down_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

adapter_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:70c52922453b15fec25d5828641908edf859b7a77b86b4a20a0a77c2be5a7f59
+size 50982842

checkpoint-527/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+library_name: peft
+base_model: openlm-research/open_llama_3b_v2
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [openlm-research/open_llama_3b_v2](https://huggingface.co/openlm-research/open_llama_3b_v2)
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.9.0

checkpoint-527/adapter_config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "openlm-research/open_llama_3b_v2",
+  "bias": "none",
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 16,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 8,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "k_proj",
+    "o_proj",
+    "gate_proj",
+    "q_proj",
+    "down_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

checkpoint-527/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8295bcd57cd2e395157ae145d8ed4e37b19438151a56f46ab611c050802dacb7
+size 50899792

checkpoint-527/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6bbf9f0256d3259a97ee0a6a827ac3ad2f9d5eda4141c61b208d5a73e75882d4
+size 25871876

checkpoint-527/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6b3ee827a7a00012c0a116546df467feee35e70376d81a7a85b1a70eb90414d3
+size 14244

checkpoint-527/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c657142c52d00cceef4d96f4405b39cd8ef2c1725c2a0d85b5a5d6841cabbb43
+size 1064

checkpoint-527/trainer_state.json ADDED Viewed

	@@ -0,0 +1,3742 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9997628645956841,
+  "eval_steps": 132,
+  "global_step": 527,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0,
+      "grad_norm": 0.5150582194328308,
+      "learning_rate": 1e-05,
+      "loss": 2.0783,
+      "step": 1
+    },
+    {
+      "epoch": 0.0,
+      "eval_loss": 2.0402109622955322,
+      "eval_runtime": 14.6647,
+      "eval_samples_per_second": 32.186,
+      "eval_steps_per_second": 32.186,
+      "step": 1
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 0.3772180378437042,
+      "learning_rate": 2e-05,
+      "loss": 1.9188,
+      "step": 2
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.43772926926612854,
+      "learning_rate": 3e-05,
+      "loss": 2.015,
+      "step": 3
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.40814539790153503,
+      "learning_rate": 4e-05,
+      "loss": 1.971,
+      "step": 4
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.3195774555206299,
+      "learning_rate": 5e-05,
+      "loss": 1.8184,
+      "step": 5
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.4951046109199524,
+      "learning_rate": 6e-05,
+      "loss": 2.0699,
+      "step": 6
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.5869188904762268,
+      "learning_rate": 7e-05,
+      "loss": 2.1714,
+      "step": 7
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.3898499608039856,
+      "learning_rate": 8e-05,
+      "loss": 1.8471,
+      "step": 8
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.4388677179813385,
+      "learning_rate": 9e-05,
+      "loss": 1.7862,
+      "step": 9
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.6679179072380066,
+      "learning_rate": 0.0001,
+      "loss": 1.664,
+      "step": 10
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.5442260503768921,
+      "learning_rate": 0.00011000000000000002,
+      "loss": 1.8824,
+      "step": 11
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.7661851644515991,
+      "learning_rate": 0.00012,
+      "loss": 2.0104,
+      "step": 12
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.47241243720054626,
+      "learning_rate": 0.00013000000000000002,
+      "loss": 1.5412,
+      "step": 13
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.852836012840271,
+      "learning_rate": 0.00014,
+      "loss": 1.6664,
+      "step": 14
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.833349883556366,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 1.5698,
+      "step": 15
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.7689324021339417,
+      "learning_rate": 0.00016,
+      "loss": 1.6071,
+      "step": 16
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.8255270719528198,
+      "learning_rate": 0.00017,
+      "loss": 1.5949,
+      "step": 17
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.6706920266151428,
+      "learning_rate": 0.00018,
+      "loss": 1.477,
+      "step": 18
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.6380877494812012,
+      "learning_rate": 0.00019,
+      "loss": 1.3816,
+      "step": 19
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 1.2489427328109741,
+      "learning_rate": 0.0002,
+      "loss": 1.3171,
+      "step": 20
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.7026221752166748,
+      "learning_rate": 0.0001999980802156745,
+      "loss": 1.5759,
+      "step": 21
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.5911536812782288,
+      "learning_rate": 0.00019999232093640933,
+      "loss": 1.4688,
+      "step": 22
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.576220691204071,
+      "learning_rate": 0.00019998272238333606,
+      "loss": 1.2945,
+      "step": 23
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.5969688892364502,
+      "learning_rate": 0.0001999692849249977,
+      "loss": 1.3766,
+      "step": 24
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.5913142561912537,
+      "learning_rate": 0.00019995200907733468,
+      "loss": 1.3836,
+      "step": 25
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.7268506288528442,
+      "learning_rate": 0.000199930895503665,
+      "loss": 1.3697,
+      "step": 26
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.5529304146766663,
+      "learning_rate": 0.00019990594501465884,
+      "loss": 1.221,
+      "step": 27
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.5636203289031982,
+      "learning_rate": 0.0001998771585683074,
+      "loss": 1.341,
+      "step": 28
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.5537460446357727,
+      "learning_rate": 0.000199844537269886,
+      "loss": 1.212,
+      "step": 29
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.5263828039169312,
+      "learning_rate": 0.00019980808237191178,
+      "loss": 1.1306,
+      "step": 30
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.5496954917907715,
+      "learning_rate": 0.0001997677952740956,
+      "loss": 1.3379,
+      "step": 31
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.6726649403572083,
+      "learning_rate": 0.00019972367752328824,
+      "loss": 1.2297,
+      "step": 32
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.5256686806678772,
+      "learning_rate": 0.00019967573081342103,
+      "loss": 1.1823,
+      "step": 33
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.5452485084533691,
+      "learning_rate": 0.00019962395698544077,
+      "loss": 1.1396,
+      "step": 34
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.5839616060256958,
+      "learning_rate": 0.00019956835802723916,
+      "loss": 1.1203,
+      "step": 35
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.4983978867530823,
+      "learning_rate": 0.00019950893607357636,
+      "loss": 1.1478,
+      "step": 36
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.6289787292480469,
+      "learning_rate": 0.00019944569340599912,
+      "loss": 1.1202,
+      "step": 37
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.7068483233451843,
+      "learning_rate": 0.00019937863245275304,
+      "loss": 1.2123,
+      "step": 38
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.655548632144928,
+      "learning_rate": 0.00019930775578868947,
+      "loss": 1.1072,
+      "step": 39
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.6124732494354248,
+      "learning_rate": 0.0001992330661351665,
+      "loss": 1.2471,
+      "step": 40
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.6181141138076782,
+      "learning_rate": 0.0001991545663599448,
+      "loss": 1.0698,
+      "step": 41
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.5633079409599304,
+      "learning_rate": 0.00019907225947707704,
+      "loss": 1.0341,
+      "step": 42
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.5386618375778198,
+      "learning_rate": 0.0001989861486467925,
+      "loss": 1.0025,
+      "step": 43
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.7523472309112549,
+      "learning_rate": 0.00019889623717537564,
+      "loss": 1.1307,
+      "step": 44
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.6467829942703247,
+      "learning_rate": 0.00019880252851503915,
+      "loss": 0.9503,
+      "step": 45
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.6346300840377808,
+      "learning_rate": 0.00019870502626379127,
+      "loss": 1.1203,
+      "step": 46
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.724151611328125,
+      "learning_rate": 0.00019860373416529802,
+      "loss": 1.1548,
+      "step": 47
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.5642977356910706,
+      "learning_rate": 0.00019849865610873898,
+      "loss": 1.0182,
+      "step": 48
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.6579070091247559,
+      "learning_rate": 0.0001983897961286583,
+      "loss": 1.0998,
+      "step": 49
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.6359585523605347,
+      "learning_rate": 0.0001982771584048096,
+      "loss": 1.0401,
+      "step": 50
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.6426641941070557,
+      "learning_rate": 0.00019816074726199565,
+      "loss": 1.0209,
+      "step": 51
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.7192591428756714,
+      "learning_rate": 0.0001980405671699022,
+      "loss": 1.061,
+      "step": 52
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.6428646445274353,
+      "learning_rate": 0.00019791662274292637,
+      "loss": 0.9462,
+      "step": 53
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.7455520033836365,
+      "learning_rate": 0.00019778891873999954,
+      "loss": 1.0747,
+      "step": 54
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.8657999038696289,
+      "learning_rate": 0.00019765746006440455,
+      "loss": 1.38,
+      "step": 55
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.8126336336135864,
+      "learning_rate": 0.00019752225176358757,
+      "loss": 1.0305,
+      "step": 56
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.5417783260345459,
+      "learning_rate": 0.00019738329902896403,
+      "loss": 1.0167,
+      "step": 57
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.8542747497558594,
+      "learning_rate": 0.00019724060719571962,
+      "loss": 1.2036,
+      "step": 58
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.7181089520454407,
+      "learning_rate": 0.0001970941817426052,
+      "loss": 1.06,
+      "step": 59
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.6730545163154602,
+      "learning_rate": 0.00019694402829172663,
+      "loss": 0.9376,
+      "step": 60
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.7448925971984863,
+      "learning_rate": 0.00019679015260832872,
+      "loss": 1.016,
+      "step": 61
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.8230142593383789,
+      "learning_rate": 0.00019663256060057393,
+      "loss": 1.0177,
+      "step": 62
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.6832039952278137,
+      "learning_rate": 0.0001964712583193156,
+      "loss": 0.9556,
+      "step": 63
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.6915066242218018,
+      "learning_rate": 0.00019630625195786558,
+      "loss": 0.9383,
+      "step": 64
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.9440682530403137,
+      "learning_rate": 0.0001961375478517564,
+      "loss": 1.1566,
+      "step": 65
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.695676863193512,
+      "learning_rate": 0.000195965152478498,
+      "loss": 0.983,
+      "step": 66
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.6894365549087524,
+      "learning_rate": 0.0001957890724573291,
+      "loss": 0.9134,
+      "step": 67
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.6659775376319885,
+      "learning_rate": 0.00019560931454896298,
+      "loss": 0.8653,
+      "step": 68
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.618377685546875,
+      "learning_rate": 0.00019542588565532799,
+      "loss": 0.9735,
+      "step": 69
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.6974190473556519,
+      "learning_rate": 0.00019523879281930235,
+      "loss": 1.0011,
+      "step": 70
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.6658157110214233,
+      "learning_rate": 0.000195048043224444,
+      "loss": 0.974,
+      "step": 71
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.6543557643890381,
+      "learning_rate": 0.00019485364419471454,
+      "loss": 0.9153,
+      "step": 72
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.8609347939491272,
+      "learning_rate": 0.00019465560319419824,
+      "loss": 0.9097,
+      "step": 73
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.5851992964744568,
+      "learning_rate": 0.00019445392782681522,
+      "loss": 0.8875,
+      "step": 74
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.6669757962226868,
+      "learning_rate": 0.00019424862583602965,
+      "loss": 0.9028,
+      "step": 75
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.9252307415008545,
+      "learning_rate": 0.00019403970510455248,
+      "loss": 0.8723,
+      "step": 76
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.6840870380401611,
+      "learning_rate": 0.00019382717365403854,
+      "loss": 0.9596,
+      "step": 77
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.8419337272644043,
+      "learning_rate": 0.00019361103964477883,
+      "loss": 0.9653,
+      "step": 78
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.7405250072479248,
+      "learning_rate": 0.00019339131137538696,
+      "loss": 0.7997,
+      "step": 79
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.759087085723877,
+      "learning_rate": 0.00019316799728248075,
+      "loss": 1.0548,
+      "step": 80
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.6998041868209839,
+      "learning_rate": 0.00019294110594035804,
+      "loss": 0.8764,
+      "step": 81
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.7717428803443909,
+      "learning_rate": 0.0001927106460606677,
+      "loss": 0.9785,
+      "step": 82
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 1.446915626525879,
+      "learning_rate": 0.0001924766264920751,
+      "loss": 0.8841,
+      "step": 83
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.8436324596405029,
+      "learning_rate": 0.00019223905621992206,
+      "loss": 1.0682,
+      "step": 84
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.767516553401947,
+      "learning_rate": 0.00019199794436588243,
+      "loss": 0.9833,
+      "step": 85
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.8140639066696167,
+      "learning_rate": 0.0001917533001876113,
+      "loss": 0.9773,
+      "step": 86
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.6604956984519958,
+      "learning_rate": 0.00019150513307838988,
+      "loss": 1.0004,
+      "step": 87
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.9023972153663635,
+      "learning_rate": 0.00019125345256676467,
+      "loss": 0.9804,
+      "step": 88
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.6216576099395752,
+      "learning_rate": 0.0001909982683161817,
+      "loss": 0.8837,
+      "step": 89
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.7384454607963562,
+      "learning_rate": 0.00019073959012461545,
+      "loss": 0.8443,
+      "step": 90
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.8240922093391418,
+      "learning_rate": 0.00019047742792419268,
+      "loss": 0.9069,
+      "step": 91
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.629971981048584,
+      "learning_rate": 0.00019021179178081105,
+      "loss": 0.9381,
+      "step": 92
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 1.0065981149673462,
+      "learning_rate": 0.00018994269189375268,
+      "loss": 0.989,
+      "step": 93
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.7697680592536926,
+      "learning_rate": 0.00018967013859529246,
+      "loss": 0.9805,
+      "step": 94
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.6588563919067383,
+      "learning_rate": 0.00018939414235030134,
+      "loss": 0.9206,
+      "step": 95
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.7448359727859497,
+      "learning_rate": 0.00018911471375584468,
+      "loss": 0.9082,
+      "step": 96
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.787029504776001,
+      "learning_rate": 0.0001888318635407752,
+      "loss": 0.9051,
+      "step": 97
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.6427372694015503,
+      "learning_rate": 0.000188545602565321,
+      "loss": 0.8041,
+      "step": 98
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.633730411529541,
+      "learning_rate": 0.00018825594182066886,
+      "loss": 0.8297,
+      "step": 99
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 1.0125346183776855,
+      "learning_rate": 0.0001879628924285419,
+      "loss": 0.9292,
+      "step": 100
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.851612389087677,
+      "learning_rate": 0.00018766646564077265,
+      "loss": 0.8787,
+      "step": 101
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.9300311803817749,
+      "learning_rate": 0.00018736667283887116,
+      "loss": 0.933,
+      "step": 102
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.9016942977905273,
+      "learning_rate": 0.00018706352553358793,
+      "loss": 0.9506,
+      "step": 103
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.8613467216491699,
+      "learning_rate": 0.00018675703536447178,
+      "loss": 0.8779,
+      "step": 104
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.7087985277175903,
+      "learning_rate": 0.00018644721409942323,
+      "loss": 0.9122,
+      "step": 105
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 1.0127161741256714,
+      "learning_rate": 0.00018613407363424238,
+      "loss": 0.9366,
+      "step": 106
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.786028265953064,
+      "learning_rate": 0.00018581762599217242,
+      "loss": 0.8335,
+      "step": 107
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 1.2019354104995728,
+      "learning_rate": 0.00018549788332343777,
+      "loss": 0.9577,
+      "step": 108
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.8300893902778625,
+      "learning_rate": 0.0001851748579047777,
+      "loss": 0.9094,
+      "step": 109
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.7978622913360596,
+      "learning_rate": 0.00018484856213897498,
+      "loss": 0.8753,
+      "step": 110
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.77290278673172,
+      "learning_rate": 0.0001845190085543795,
+      "loss": 0.8746,
+      "step": 111
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.6986845135688782,
+      "learning_rate": 0.00018418620980442736,
+      "loss": 0.8153,
+      "step": 112
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.8883097767829895,
+      "learning_rate": 0.00018385017866715507,
+      "loss": 0.9012,
+      "step": 113
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 1.0097709894180298,
+      "learning_rate": 0.00018351092804470885,
+      "loss": 0.9251,
+      "step": 114
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.7271776795387268,
+      "learning_rate": 0.00018316847096284917,
+      "loss": 0.8207,
+      "step": 115
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.6595386862754822,
+      "learning_rate": 0.00018282282057045088,
+      "loss": 0.8365,
+      "step": 116
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.9244414567947388,
+      "learning_rate": 0.00018247399013899805,
+      "loss": 0.901,
+      "step": 117
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.8177185654640198,
+      "learning_rate": 0.00018212199306207456,
+      "loss": 0.9121,
+      "step": 118
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.6657998561859131,
+      "learning_rate": 0.00018176684285484983,
+      "loss": 0.8596,
+      "step": 119
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.7159414291381836,
+      "learning_rate": 0.0001814085531535599,
+      "loss": 0.7837,
+      "step": 120
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.7221715450286865,
+      "learning_rate": 0.00018104713771498383,
+      "loss": 0.8141,
+      "step": 121
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.7190296053886414,
+      "learning_rate": 0.00018068261041591548,
+      "loss": 0.7683,
+      "step": 122
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.9818452596664429,
+      "learning_rate": 0.00018031498525263072,
+      "loss": 1.0075,
+      "step": 123
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.7468921542167664,
+      "learning_rate": 0.00017994427634035015,
+      "loss": 0.8308,
+      "step": 124
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.8131096959114075,
+      "learning_rate": 0.00017957049791269685,
+      "loss": 0.9214,
+      "step": 125
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.7757013440132141,
+      "learning_rate": 0.00017919366432115024,
+      "loss": 0.8861,
+      "step": 126
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.6586887240409851,
+      "learning_rate": 0.00017881379003449472,
+      "loss": 0.8654,
+      "step": 127
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.6687777042388916,
+      "learning_rate": 0.00017843088963826435,
+      "loss": 0.7934,
+      "step": 128
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.7160292267799377,
+      "learning_rate": 0.00017804497783418266,
+      "loss": 0.8842,
+      "step": 129
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.7248404622077942,
+      "learning_rate": 0.00017765606943959833,
+      "loss": 0.8668,
+      "step": 130
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.953946053981781,
+      "learning_rate": 0.00017726417938691619,
+      "loss": 0.8918,
+      "step": 131
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.8473754525184631,
+      "learning_rate": 0.0001768693227230238,
+      "loss": 0.8144,
+      "step": 132
+    },
+    {
+      "epoch": 0.25,
+      "eval_loss": 0.8364565372467041,
+      "eval_runtime": 14.7047,
+      "eval_samples_per_second": 32.099,
+      "eval_steps_per_second": 32.099,
+      "step": 132
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.7996974587440491,
+      "learning_rate": 0.00017647151460871386,
+      "loss": 0.9233,
+      "step": 133
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.728558361530304,
+      "learning_rate": 0.00017607077031810202,
+      "loss": 0.766,
+      "step": 134
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.8249170184135437,
+      "learning_rate": 0.00017566710523804043,
+      "loss": 0.8565,
+      "step": 135
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.6535347104072571,
+      "learning_rate": 0.00017526053486752695,
+      "loss": 0.7229,
+      "step": 136
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.7984380722045898,
+      "learning_rate": 0.00017485107481711012,
+      "loss": 0.7271,
+      "step": 137
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.7088860869407654,
+      "learning_rate": 0.00017443874080828962,
+      "loss": 0.7622,
+      "step": 138
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.8559077978134155,
+      "learning_rate": 0.0001740235486729128,
+      "loss": 0.9194,
+      "step": 139
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.9430374503135681,
+      "learning_rate": 0.00017360551435256674,
+      "loss": 0.8032,
+      "step": 140
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.8738976716995239,
+      "learning_rate": 0.00017318465389796613,
+      "loss": 0.8433,
+      "step": 141
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 1.5377166271209717,
+      "learning_rate": 0.00017276098346833712,
+      "loss": 0.7793,
+      "step": 142
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.8574996590614319,
+      "learning_rate": 0.00017233451933079664,
+      "loss": 0.8422,
+      "step": 143
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.7897395491600037,
+      "learning_rate": 0.00017190527785972807,
+      "loss": 0.9424,
+      "step": 144
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.8188284039497375,
+      "learning_rate": 0.0001714732755361523,
+      "loss": 0.6568,
+      "step": 145
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.6334033012390137,
+      "learning_rate": 0.00017103852894709517,
+      "loss": 0.7111,
+      "step": 146
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.9169749021530151,
+      "learning_rate": 0.00017060105478495044,
+      "loss": 0.8087,
+      "step": 147
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.8317080140113831,
+      "learning_rate": 0.00017016086984683888,
+      "loss": 0.7404,
+      "step": 148
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.6163865327835083,
+      "learning_rate": 0.00016971799103396334,
+      "loss": 0.6276,
+      "step": 149
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.8176054358482361,
+      "learning_rate": 0.00016927243535095997,
+      "loss": 0.9162,
+      "step": 150
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.6680843830108643,
+      "learning_rate": 0.00016882421990524498,
+      "loss": 0.7782,
+      "step": 151
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.8415772914886475,
+      "learning_rate": 0.00016837336190635824,
+      "loss": 0.759,
+      "step": 152
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.8282654285430908,
+      "learning_rate": 0.00016791987866530202,
+      "loss": 0.7986,
+      "step": 153
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.7016776204109192,
+      "learning_rate": 0.00016746378759387675,
+      "loss": 0.6493,
+      "step": 154
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.5957922339439392,
+      "learning_rate": 0.00016700510620401224,
+      "loss": 0.7463,
+      "step": 155
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.7017227411270142,
+      "learning_rate": 0.00016654385210709531,
+      "loss": 0.7806,
+      "step": 156
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.77492356300354,
+      "learning_rate": 0.00016608004301329365,
+      "loss": 0.7959,
+      "step": 157
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.6599026918411255,
+      "learning_rate": 0.00016561369673087588,
+      "loss": 0.764,
+      "step": 158
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.7865548133850098,
+      "learning_rate": 0.00016514483116552762,
+      "loss": 0.7801,
+      "step": 159
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.6737894415855408,
+      "learning_rate": 0.00016467346431966413,
+      "loss": 0.7365,
+      "step": 160
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.7904422879219055,
+      "learning_rate": 0.0001641996142917391,
+      "loss": 0.8118,
+      "step": 161
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.760714590549469,
+      "learning_rate": 0.0001637232992755496,
+      "loss": 0.6932,
+      "step": 162
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.6482570767402649,
+      "learning_rate": 0.00016324453755953773,
+      "loss": 0.5669,
+      "step": 163
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.8352275490760803,
+      "learning_rate": 0.00016276334752608822,
+      "loss": 0.7958,
+      "step": 164
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.6860224604606628,
+      "learning_rate": 0.00016227974765082274,
+      "loss": 0.8383,
+      "step": 165
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.8616229891777039,
+      "learning_rate": 0.00016179375650189048,
+      "loss": 0.7639,
+      "step": 166
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.6623161435127258,
+      "learning_rate": 0.0001613053927392553,
+      "loss": 0.661,
+      "step": 167
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.8096372485160828,
+      "learning_rate": 0.0001608146751139791,
+      "loss": 0.7507,
+      "step": 168
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.7616962790489197,
+      "learning_rate": 0.00016032162246750197,
+      "loss": 0.7798,
+      "step": 169
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.7718486189842224,
+      "learning_rate": 0.00015982625373091875,
+      "loss": 0.8187,
+      "step": 170
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.6922149658203125,
+      "learning_rate": 0.0001593285879242522,
+      "loss": 0.7562,
+      "step": 171
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.7328513860702515,
+      "learning_rate": 0.0001588286441557226,
+      "loss": 0.7315,
+      "step": 172
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.6606435775756836,
+      "learning_rate": 0.00015832644162101417,
+      "loss": 0.5997,
+      "step": 173
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 1.2101995944976807,
+      "learning_rate": 0.000157821999602538,
+      "loss": 0.8754,
+      "step": 174
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.744756817817688,
+      "learning_rate": 0.00015731533746869163,
+      "loss": 0.8369,
+      "step": 175
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.6933184266090393,
+      "learning_rate": 0.00015680647467311557,
+      "loss": 0.7291,
+      "step": 176
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.8668520450592041,
+      "learning_rate": 0.00015629543075394625,
+      "loss": 0.6565,
+      "step": 177
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 1.06217360496521,
+      "learning_rate": 0.0001557822253330657,
+      "loss": 0.7949,
+      "step": 178
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.7296217679977417,
+      "learning_rate": 0.00015526687811534838,
+      "loss": 0.7218,
+      "step": 179
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.679236114025116,
+      "learning_rate": 0.00015474940888790455,
+      "loss": 0.7323,
+      "step": 180
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.7786648273468018,
+      "learning_rate": 0.0001542298375193204,
+      "loss": 0.7032,
+      "step": 181
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.0958847999572754,
+      "learning_rate": 0.00015370818395889536,
+      "loss": 0.856,
+      "step": 182
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.6277094483375549,
+      "learning_rate": 0.00015318446823587596,
+      "loss": 0.7215,
+      "step": 183
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.7913120985031128,
+      "learning_rate": 0.000152658710458687,
+      "loss": 0.7673,
+      "step": 184
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.0577659606933594,
+      "learning_rate": 0.0001521309308141592,
+      "loss": 0.7168,
+      "step": 185
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.7179178595542908,
+      "learning_rate": 0.00015160114956675434,
+      "loss": 0.7872,
+      "step": 186
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 1.1850095987319946,
+      "learning_rate": 0.00015106938705778712,
+      "loss": 0.7967,
+      "step": 187
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.7346929311752319,
+      "learning_rate": 0.00015053566370464415,
+      "loss": 0.7788,
+      "step": 188
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.8880440592765808,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.7466,
+      "step": 189
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.6921265125274658,
+      "learning_rate": 0.00014946241651103034,
+      "loss": 0.8424,
+      "step": 190
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.8322516679763794,
+      "learning_rate": 0.00014892293387862223,
+      "loss": 0.7648,
+      "step": 191
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.8475639820098877,
+      "learning_rate": 0.00014838157281658177,
+      "loss": 0.674,
+      "step": 192
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.8423508405685425,
+      "learning_rate": 0.00014783835411083854,
+      "loss": 0.8193,
+      "step": 193
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.9078856110572815,
+      "learning_rate": 0.0001472932986186477,
+      "loss": 0.8027,
+      "step": 194
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.6911562085151672,
+      "learning_rate": 0.00014674642726778906,
+      "loss": 0.7821,
+      "step": 195
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.7079100608825684,
+      "learning_rate": 0.0001461977610557635,
+      "loss": 0.6792,
+      "step": 196
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.7966701984405518,
+      "learning_rate": 0.000145647321048987,
+      "loss": 0.8002,
+      "step": 197
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.9254695177078247,
+      "learning_rate": 0.00014509512838198148,
+      "loss": 0.8354,
+      "step": 198
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.6664696335792542,
+      "learning_rate": 0.00014454120425656342,
+      "loss": 0.7546,
+      "step": 199
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 1.0904691219329834,
+      "learning_rate": 0.00014398556994102996,
+      "loss": 0.7356,
+      "step": 200
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.8222538232803345,
+      "learning_rate": 0.00014342824676934202,
+      "loss": 0.6181,
+      "step": 201
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.848257839679718,
+      "learning_rate": 0.00014286925614030542,
+      "loss": 0.8353,
+      "step": 202
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.6590856313705444,
+      "learning_rate": 0.00014230861951674913,
+      "loss": 0.6996,
+      "step": 203
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.1949280500411987,
+      "learning_rate": 0.00014174635842470118,
+      "loss": 0.7901,
+      "step": 204
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.7490679621696472,
+      "learning_rate": 0.00014118249445256223,
+      "loss": 0.6008,
+      "step": 205
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 1.1541824340820312,
+      "learning_rate": 0.00014061704925027652,
+      "loss": 0.6705,
+      "step": 206
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.9292888045310974,
+      "learning_rate": 0.00014005004452850083,
+      "loss": 0.8084,
+      "step": 207
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.9962819218635559,
+      "learning_rate": 0.0001394815020577707,
+      "loss": 0.7381,
+      "step": 208
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.6327970027923584,
+      "learning_rate": 0.00013891144366766456,
+      "loss": 0.7034,
+      "step": 209
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 1.0281709432601929,
+      "learning_rate": 0.00013833989124596572,
+      "loss": 0.759,
+      "step": 210
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.6905353665351868,
+      "learning_rate": 0.00013776686673782175,
+      "loss": 0.7074,
+      "step": 211
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.7964909672737122,
+      "learning_rate": 0.00013719239214490204,
+      "loss": 0.8094,
+      "step": 212
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.8168321251869202,
+      "learning_rate": 0.00013661648952455291,
+      "loss": 0.7632,
+      "step": 213
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.6741600036621094,
+      "learning_rate": 0.00013603918098895092,
+      "loss": 0.7473,
+      "step": 214
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.1006839275360107,
+      "learning_rate": 0.00013546048870425356,
+      "loss": 0.6841,
+      "step": 215
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 1.0017565488815308,
+      "learning_rate": 0.00013488043488974844,
+      "loss": 0.8087,
+      "step": 216
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.7672898769378662,
+      "learning_rate": 0.0001342990418169999,
+      "loss": 0.787,
+      "step": 217
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.7265097498893738,
+      "learning_rate": 0.00013371633180899416,
+      "loss": 0.7716,
+      "step": 218
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.8181336522102356,
+      "learning_rate": 0.000133132327239282,
+      "loss": 0.7527,
+      "step": 219
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.7809070944786072,
+      "learning_rate": 0.0001325470505311198,
+      "loss": 0.6298,
+      "step": 220
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.8382383584976196,
+      "learning_rate": 0.00013196052415660856,
+      "loss": 0.723,
+      "step": 221
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.6496840119361877,
+      "learning_rate": 0.0001313727706358311,
+      "loss": 0.7228,
+      "step": 222
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.657471776008606,
+      "learning_rate": 0.00013078381253598732,
+      "loss": 0.6453,
+      "step": 223
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.8327584862709045,
+      "learning_rate": 0.0001301936724705278,
+      "loss": 0.7631,
+      "step": 224
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.7766168713569641,
+      "learning_rate": 0.0001296023730982855,
+      "loss": 0.705,
+      "step": 225
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.8591150045394897,
+      "learning_rate": 0.0001290099371226058,
+      "loss": 0.6703,
+      "step": 226
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.7799991369247437,
+      "learning_rate": 0.00012841638729047463,
+      "loss": 0.7068,
+      "step": 227
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.7640935182571411,
+      "learning_rate": 0.0001278217463916453,
+      "loss": 0.7237,
+      "step": 228
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.6747919321060181,
+      "learning_rate": 0.00012722603725776329,
+      "loss": 0.6774,
+      "step": 229
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.9026936292648315,
+      "learning_rate": 0.00012662928276148985,
+      "loss": 0.7422,
+      "step": 230
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.816842794418335,
+      "learning_rate": 0.0001260315058156235,
+      "loss": 0.7338,
+      "step": 231
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 1.1454386711120605,
+      "learning_rate": 0.00012543272937222044,
+      "loss": 0.8056,
+      "step": 232
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.7503964900970459,
+      "learning_rate": 0.00012483297642171333,
+      "loss": 0.6559,
+      "step": 233
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.6204442381858826,
+      "learning_rate": 0.00012423226999202838,
+      "loss": 0.5569,
+      "step": 234
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.7909918427467346,
+      "learning_rate": 0.00012363063314770135,
+      "loss": 0.7276,
+      "step": 235
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.5152371525764465,
+      "learning_rate": 0.00012302808898899197,
+      "loss": 0.499,
+      "step": 236
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.9286866188049316,
+      "learning_rate": 0.00012242466065099685,
+      "loss": 0.7141,
+      "step": 237
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.66729736328125,
+      "learning_rate": 0.00012182037130276125,
+      "loss": 0.6765,
+      "step": 238
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.6844175457954407,
+      "learning_rate": 0.00012121524414638959,
+      "loss": 0.7087,
+      "step": 239
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.7366169095039368,
+      "learning_rate": 0.0001206093024161544,
+      "loss": 0.6645,
+      "step": 240
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.691806435585022,
+      "learning_rate": 0.00012000256937760445,
+      "loss": 0.6785,
+      "step": 241
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.6430802941322327,
+      "learning_rate": 0.00011939506832667128,
+      "loss": 0.6785,
+      "step": 242
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.7784428000450134,
+      "learning_rate": 0.00011878682258877478,
+      "loss": 0.7017,
+      "step": 243
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.6317816972732544,
+      "learning_rate": 0.00011817785551792766,
+      "loss": 0.6515,
+      "step": 244
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.8226771950721741,
+      "learning_rate": 0.00011756819049583861,
+      "loss": 0.6753,
+      "step": 245
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.7601309418678284,
+      "learning_rate": 0.00011695785093101475,
+      "loss": 0.6568,
+      "step": 246
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.6727234721183777,
+      "learning_rate": 0.00011634686025786264,
+      "loss": 0.5909,
+      "step": 247
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.7579910755157471,
+      "learning_rate": 0.00011573524193578863,
+      "loss": 0.7882,
+      "step": 248
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.680234432220459,
+      "learning_rate": 0.00011512301944829809,
+      "loss": 0.6865,
+      "step": 249
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.8539860248565674,
+      "learning_rate": 0.00011451021630209371,
+      "loss": 0.7303,
+      "step": 250
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.7847228646278381,
+      "learning_rate": 0.00011389685602617301,
+      "loss": 0.5941,
+      "step": 251
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.7232929468154907,
+      "learning_rate": 0.00011328296217092485,
+      "loss": 0.7054,
+      "step": 252
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.8790189623832703,
+      "learning_rate": 0.00011266855830722523,
+      "loss": 0.741,
+      "step": 253
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.6331459879875183,
+      "learning_rate": 0.0001120536680255323,
+      "loss": 0.6574,
+      "step": 254
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.7007669806480408,
+      "learning_rate": 0.0001114383149349806,
+      "loss": 0.7269,
+      "step": 255
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.648810088634491,
+      "learning_rate": 0.00011082252266247442,
+      "loss": 0.6414,
+      "step": 256
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.7953597903251648,
+      "learning_rate": 0.00011020631485178083,
+      "loss": 0.7068,
+      "step": 257
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.7083666920661926,
+      "learning_rate": 0.00010958971516262177,
+      "loss": 0.6948,
+      "step": 258
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 1.0010278224945068,
+      "learning_rate": 0.00010897274726976561,
+      "loss": 0.7326,
+      "step": 259
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.682121217250824,
+      "learning_rate": 0.00010835543486211815,
+      "loss": 0.5291,
+      "step": 260
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.7658283710479736,
+      "learning_rate": 0.00010773780164181305,
+      "loss": 0.6827,
+      "step": 261
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.7295763492584229,
+      "learning_rate": 0.00010711987132330181,
+      "loss": 0.6928,
+      "step": 262
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.7689676284790039,
+      "learning_rate": 0.0001065016676324433,
+      "loss": 0.669,
+      "step": 263
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.7863064408302307,
+      "learning_rate": 0.00010588321430559252,
+      "loss": 0.7159,
+      "step": 264
+    },
+    {
+      "epoch": 0.5,
+      "eval_loss": 0.7109076380729675,
+      "eval_runtime": 14.6602,
+      "eval_samples_per_second": 32.196,
+      "eval_steps_per_second": 32.196,
+      "step": 264
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.6780825257301331,
+      "learning_rate": 0.00010526453508868961,
+      "loss": 0.7589,
+      "step": 265
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.7262443900108337,
+      "learning_rate": 0.00010464565373634782,
+      "loss": 0.6978,
+      "step": 266
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.149004578590393,
+      "learning_rate": 0.00010402659401094152,
+      "loss": 0.7782,
+      "step": 267
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.7568680047988892,
+      "learning_rate": 0.00010340737968169389,
+      "loss": 0.6883,
+      "step": 268
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.7424808144569397,
+      "learning_rate": 0.00010278803452376416,
+      "loss": 0.6757,
+      "step": 269
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 1.1873571872711182,
+      "learning_rate": 0.00010216858231733488,
+      "loss": 0.6569,
+      "step": 270
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.7609845399856567,
+      "learning_rate": 0.00010154904684669877,
+      "loss": 0.7123,
+      "step": 271
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.9140599966049194,
+      "learning_rate": 0.00010092945189934558,
+      "loss": 0.6424,
+      "step": 272
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.7356762290000916,
+      "learning_rate": 0.0001003098212650486,
+      "loss": 0.7502,
+      "step": 273
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.7270318269729614,
+      "learning_rate": 9.969017873495143e-05,
+      "loss": 0.7053,
+      "step": 274
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.7932801842689514,
+      "learning_rate": 9.907054810065446e-05,
+      "loss": 0.6494,
+      "step": 275
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.9142821431159973,
+      "learning_rate": 9.845095315330123e-05,
+      "loss": 0.6471,
+      "step": 276
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.4982346296310425,
+      "learning_rate": 9.783141768266511e-05,
+      "loss": 0.592,
+      "step": 277
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.7968485951423645,
+      "learning_rate": 9.721196547623584e-05,
+      "loss": 0.6657,
+      "step": 278
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.2386119365692139,
+      "learning_rate": 9.659262031830612e-05,
+      "loss": 0.7317,
+      "step": 279
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.7291132807731628,
+      "learning_rate": 9.597340598905852e-05,
+      "loss": 0.6723,
+      "step": 280
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 1.0461961030960083,
+      "learning_rate": 9.53543462636522e-05,
+      "loss": 0.7715,
+      "step": 281
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.74568110704422,
+      "learning_rate": 9.473546491131041e-05,
+      "loss": 0.7389,
+      "step": 282
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.7399306893348694,
+      "learning_rate": 9.411678569440752e-05,
+      "loss": 0.6715,
+      "step": 283
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.7953692674636841,
+      "learning_rate": 9.349833236755674e-05,
+      "loss": 0.6911,
+      "step": 284
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.7449805736541748,
+      "learning_rate": 9.28801286766982e-05,
+      "loss": 0.7159,
+      "step": 285
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.7682590484619141,
+      "learning_rate": 9.226219835818699e-05,
+      "loss": 0.7643,
+      "step": 286
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.8503318428993225,
+      "learning_rate": 9.164456513788186e-05,
+      "loss": 0.6482,
+      "step": 287
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 1.5629699230194092,
+      "learning_rate": 9.10272527302344e-05,
+      "loss": 0.564,
+      "step": 288
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.5936041474342346,
+      "learning_rate": 9.041028483737825e-05,
+      "loss": 0.6121,
+      "step": 289
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.8084244728088379,
+      "learning_rate": 8.979368514821916e-05,
+      "loss": 0.7165,
+      "step": 290
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.9212177395820618,
+      "learning_rate": 8.917747733752564e-05,
+      "loss": 0.7041,
+      "step": 291
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.6928449273109436,
+      "learning_rate": 8.856168506501944e-05,
+      "loss": 0.5556,
+      "step": 292
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.7461227774620056,
+      "learning_rate": 8.79463319744677e-05,
+      "loss": 0.7686,
+      "step": 293
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.7020424604415894,
+      "learning_rate": 8.733144169277481e-05,
+      "loss": 0.6741,
+      "step": 294
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 1.0011357069015503,
+      "learning_rate": 8.671703782907518e-05,
+      "loss": 0.7413,
+      "step": 295
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.6851476430892944,
+      "learning_rate": 8.610314397382701e-05,
+      "loss": 0.6345,
+      "step": 296
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.7206119894981384,
+      "learning_rate": 8.548978369790631e-05,
+      "loss": 0.693,
+      "step": 297
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.7812450528144836,
+      "learning_rate": 8.487698055170192e-05,
+      "loss": 0.6601,
+      "step": 298
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.9316333532333374,
+      "learning_rate": 8.426475806421138e-05,
+      "loss": 0.67,
+      "step": 299
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.7167903780937195,
+      "learning_rate": 8.365313974213737e-05,
+      "loss": 0.7628,
+      "step": 300
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 1.1033985614776611,
+      "learning_rate": 8.304214906898526e-05,
+      "loss": 0.4936,
+      "step": 301
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.8084871172904968,
+      "learning_rate": 8.243180950416141e-05,
+      "loss": 0.7081,
+      "step": 302
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.7424312233924866,
+      "learning_rate": 8.182214448207239e-05,
+      "loss": 0.6034,
+      "step": 303
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.878241240978241,
+      "learning_rate": 8.121317741122525e-05,
+      "loss": 0.7169,
+      "step": 304
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.8637294173240662,
+      "learning_rate": 8.060493167332874e-05,
+      "loss": 0.7805,
+      "step": 305
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.6890109777450562,
+      "learning_rate": 7.999743062239557e-05,
+      "loss": 0.6538,
+      "step": 306
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.7422961592674255,
+      "learning_rate": 7.939069758384562e-05,
+      "loss": 0.6648,
+      "step": 307
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.7701572775840759,
+      "learning_rate": 7.878475585361045e-05,
+      "loss": 0.7211,
+      "step": 308
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.887014627456665,
+      "learning_rate": 7.817962869723876e-05,
+      "loss": 0.7689,
+      "step": 309
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.6782895922660828,
+      "learning_rate": 7.757533934900316e-05,
+      "loss": 0.7301,
+      "step": 310
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.7926852107048035,
+      "learning_rate": 7.697191101100801e-05,
+      "loss": 0.7256,
+      "step": 311
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.8395385146141052,
+      "learning_rate": 7.636936685229863e-05,
+      "loss": 0.7125,
+      "step": 312
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.874583899974823,
+      "learning_rate": 7.576773000797166e-05,
+      "loss": 0.6143,
+      "step": 313
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.6823002099990845,
+      "learning_rate": 7.516702357828672e-05,
+      "loss": 0.6415,
+      "step": 314
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.6727049946784973,
+      "learning_rate": 7.456727062777958e-05,
+      "loss": 0.6053,
+      "step": 315
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.7634817361831665,
+      "learning_rate": 7.396849418437652e-05,
+      "loss": 0.5891,
+      "step": 316
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.766338050365448,
+      "learning_rate": 7.337071723851017e-05,
+      "loss": 0.6727,
+      "step": 317
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.8024521470069885,
+      "learning_rate": 7.277396274223671e-05,
+      "loss": 0.6873,
+      "step": 318
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.701219916343689,
+      "learning_rate": 7.217825360835473e-05,
+      "loss": 0.5562,
+      "step": 319
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.8103075623512268,
+      "learning_rate": 7.15836127095254e-05,
+      "loss": 0.5965,
+      "step": 320
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.8132802248001099,
+      "learning_rate": 7.09900628773942e-05,
+      "loss": 0.7483,
+      "step": 321
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.7442184686660767,
+      "learning_rate": 7.039762690171447e-05,
+      "loss": 0.7469,
+      "step": 322
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.7334067821502686,
+      "learning_rate": 6.98063275294722e-05,
+      "loss": 0.5311,
+      "step": 323
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.6543470621109009,
+      "learning_rate": 6.921618746401273e-05,
+      "loss": 0.6516,
+      "step": 324
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.6865355372428894,
+      "learning_rate": 6.862722936416897e-05,
+      "loss": 0.6957,
+      "step": 325
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.8854186534881592,
+      "learning_rate": 6.803947584339148e-05,
+      "loss": 0.6739,
+      "step": 326
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.7788788676261902,
+      "learning_rate": 6.745294946888023e-05,
+      "loss": 0.6183,
+      "step": 327
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.7810282707214355,
+      "learning_rate": 6.686767276071803e-05,
+      "loss": 0.6436,
+      "step": 328
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.7473210096359253,
+      "learning_rate": 6.628366819100585e-05,
+      "loss": 0.6759,
+      "step": 329
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.8050934076309204,
+      "learning_rate": 6.570095818300012e-05,
+      "loss": 0.6867,
+      "step": 330
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.7596076726913452,
+      "learning_rate": 6.511956511025157e-05,
+      "loss": 0.6552,
+      "step": 331
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.8490655422210693,
+      "learning_rate": 6.453951129574644e-05,
+      "loss": 0.7111,
+      "step": 332
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.7241820096969604,
+      "learning_rate": 6.396081901104909e-05,
+      "loss": 0.6154,
+      "step": 333
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.6183046102523804,
+      "learning_rate": 6.338351047544707e-05,
+      "loss": 0.5064,
+      "step": 334
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.8574573993682861,
+      "learning_rate": 6.280760785509801e-05,
+      "loss": 0.7799,
+      "step": 335
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 1.0641992092132568,
+      "learning_rate": 6.223313326217828e-05,
+      "loss": 0.5082,
+      "step": 336
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.9125005602836609,
+      "learning_rate": 6.166010875403429e-05,
+      "loss": 0.7545,
+      "step": 337
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.8739343285560608,
+      "learning_rate": 6.108855633233546e-05,
+      "loss": 0.5929,
+      "step": 338
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.6968967914581299,
+      "learning_rate": 6.0518497942229325e-05,
+      "loss": 0.6475,
+      "step": 339
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.8221940994262695,
+      "learning_rate": 5.9949955471499186e-05,
+      "loss": 0.579,
+      "step": 340
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.96427983045578,
+      "learning_rate": 5.93829507497235e-05,
+      "loss": 0.7766,
+      "step": 341
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.8502172231674194,
+      "learning_rate": 5.881750554743779e-05,
+      "loss": 0.7176,
+      "step": 342
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 1.1693928241729736,
+      "learning_rate": 5.82536415752988e-05,
+      "loss": 0.6447,
+      "step": 343
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.7575929760932922,
+      "learning_rate": 5.769138048325087e-05,
+      "loss": 0.7019,
+      "step": 344
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.9739773869514465,
+      "learning_rate": 5.713074385969457e-05,
+      "loss": 0.7178,
+      "step": 345
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.7964323163032532,
+      "learning_rate": 5.657175323065802e-05,
+      "loss": 0.6736,
+      "step": 346
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.8762109875679016,
+      "learning_rate": 5.6014430058970114e-05,
+      "loss": 0.6314,
+      "step": 347
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.8361276984214783,
+      "learning_rate": 5.545879574343661e-05,
+      "loss": 0.7155,
+      "step": 348
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.8512857556343079,
+      "learning_rate": 5.490487161801854e-05,
+      "loss": 0.6984,
+      "step": 349
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.8726841807365417,
+      "learning_rate": 5.435267895101302e-05,
+      "loss": 0.5749,
+      "step": 350
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.7646642923355103,
+      "learning_rate": 5.3802238944236505e-05,
+      "loss": 0.7181,
+      "step": 351
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.762646496295929,
+      "learning_rate": 5.325357273221099e-05,
+      "loss": 0.6197,
+      "step": 352
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.0159621238708496,
+      "learning_rate": 5.270670138135234e-05,
+      "loss": 0.5808,
+      "step": 353
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 1.1280730962753296,
+      "learning_rate": 5.216164588916148e-05,
+      "loss": 0.6669,
+      "step": 354
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.7981805205345154,
+      "learning_rate": 5.161842718341825e-05,
+      "loss": 0.6309,
+      "step": 355
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.5816403031349182,
+      "learning_rate": 5.107706612137776e-05,
+      "loss": 0.5003,
+      "step": 356
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.9182299971580505,
+      "learning_rate": 5.0537583488969734e-05,
+      "loss": 0.6022,
+      "step": 357
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.9387081265449524,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.623,
+      "step": 358
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.8042228817939758,
+      "learning_rate": 4.9464336295355854e-05,
+      "loss": 0.6668,
+      "step": 359
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.6955826878547668,
+      "learning_rate": 4.8930612942212916e-05,
+      "loss": 0.6922,
+      "step": 360
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.7190336585044861,
+      "learning_rate": 4.83988504332457e-05,
+      "loss": 0.5405,
+      "step": 361
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.6942302584648132,
+      "learning_rate": 4.786906918584083e-05,
+      "loss": 0.7027,
+      "step": 362
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.7399347424507141,
+      "learning_rate": 4.734128954131304e-05,
+      "loss": 0.6277,
+      "step": 363
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.8382598161697388,
+      "learning_rate": 4.6815531764124045e-05,
+      "loss": 0.6038,
+      "step": 364
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.7830659747123718,
+      "learning_rate": 4.629181604110464e-05,
+      "loss": 0.6611,
+      "step": 365
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.7205358743667603,
+      "learning_rate": 4.5770162480679624e-05,
+      "loss": 0.5469,
+      "step": 366
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.6976193189620972,
+      "learning_rate": 4.525059111209548e-05,
+      "loss": 0.6208,
+      "step": 367
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.7750869393348694,
+      "learning_rate": 4.4733121884651664e-05,
+      "loss": 0.6032,
+      "step": 368
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.7206342816352844,
+      "learning_rate": 4.421777466693434e-05,
+      "loss": 0.6763,
+      "step": 369
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.8639499545097351,
+      "learning_rate": 4.3704569246053805e-05,
+      "loss": 0.6836,
+      "step": 370
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.9039255976676941,
+      "learning_rate": 4.3193525326884435e-05,
+      "loss": 0.5905,
+      "step": 371
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.9770961403846741,
+      "learning_rate": 4.2684662531308386e-05,
+      "loss": 0.7598,
+      "step": 372
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.9292394518852234,
+      "learning_rate": 4.217800039746206e-05,
+      "loss": 0.741,
+      "step": 373
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.6980279088020325,
+      "learning_rate": 4.167355837898584e-05,
+      "loss": 0.6229,
+      "step": 374
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.8407570123672485,
+      "learning_rate": 4.1171355844277394e-05,
+      "loss": 0.6891,
+      "step": 375
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.7971334457397461,
+      "learning_rate": 4.0671412075747816e-05,
+      "loss": 0.6462,
+      "step": 376
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.7770645022392273,
+      "learning_rate": 4.017374626908125e-05,
+      "loss": 0.6123,
+      "step": 377
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.793257474899292,
+      "learning_rate": 3.967837753249804e-05,
+      "loss": 0.612,
+      "step": 378
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.7542336583137512,
+      "learning_rate": 3.918532488602094e-05,
+      "loss": 0.643,
+      "step": 379
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 1.2672817707061768,
+      "learning_rate": 3.869460726074474e-05,
+      "loss": 0.5942,
+      "step": 380
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.7728064060211182,
+      "learning_rate": 3.820624349810954e-05,
+      "loss": 0.5941,
+      "step": 381
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.6640836000442505,
+      "learning_rate": 3.772025234917728e-05,
+      "loss": 0.587,
+      "step": 382
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.7824575304985046,
+      "learning_rate": 3.7236652473911814e-05,
+      "loss": 0.7506,
+      "step": 383
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.7651994824409485,
+      "learning_rate": 3.675546244046228e-05,
+      "loss": 0.6337,
+      "step": 384
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.7056587934494019,
+      "learning_rate": 3.6276700724450384e-05,
+      "loss": 0.6715,
+      "step": 385
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.8649887442588806,
+      "learning_rate": 3.580038570826093e-05,
+      "loss": 0.7218,
+      "step": 386
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.8019347786903381,
+      "learning_rate": 3.532653568033587e-05,
+      "loss": 0.6234,
+      "step": 387
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.9476731419563293,
+      "learning_rate": 3.485516883447239e-05,
+      "loss": 0.5199,
+      "step": 388
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.8756290078163147,
+      "learning_rate": 3.438630326912414e-05,
+      "loss": 0.6379,
+      "step": 389
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.8843635320663452,
+      "learning_rate": 3.391995698670638e-05,
+      "loss": 0.6983,
+      "step": 390
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.8399759531021118,
+      "learning_rate": 3.345614789290472e-05,
+      "loss": 0.677,
+      "step": 391
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.7677200436592102,
+      "learning_rate": 3.2994893795987766e-05,
+      "loss": 0.5998,
+      "step": 392
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.0266742706298828,
+      "learning_rate": 3.253621240612326e-05,
+      "loss": 0.6444,
+      "step": 393
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.7664735913276672,
+      "learning_rate": 3.208012133469799e-05,
+      "loss": 0.643,
+      "step": 394
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.6599723100662231,
+      "learning_rate": 3.162663809364178e-05,
+      "loss": 0.5806,
+      "step": 395
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.7938780188560486,
+      "learning_rate": 3.117578009475503e-05,
+      "loss": 0.6664,
+      "step": 396
+    },
+    {
+      "epoch": 0.75,
+      "eval_loss": 0.6555132269859314,
+      "eval_runtime": 14.6967,
+      "eval_samples_per_second": 32.116,
+      "eval_steps_per_second": 32.116,
+      "step": 396
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.6883028149604797,
+      "learning_rate": 3.072756464904006e-05,
+      "loss": 0.6681,
+      "step": 397
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.7418582439422607,
+      "learning_rate": 3.0282008966036646e-05,
+      "loss": 0.619,
+      "step": 398
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.7900329232215881,
+      "learning_rate": 2.9839130153161154e-05,
+      "loss": 0.6181,
+      "step": 399
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.5973490476608276,
+      "learning_rate": 2.9398945215049567e-05,
+      "loss": 0.5325,
+      "step": 400
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.7216522693634033,
+      "learning_rate": 2.8961471052904852e-05,
+      "loss": 0.6496,
+      "step": 401
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.6355336308479309,
+      "learning_rate": 2.8526724463847722e-05,
+      "loss": 0.669,
+      "step": 402
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.849144697189331,
+      "learning_rate": 2.809472214027199e-05,
+      "loss": 0.6917,
+      "step": 403
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 1.0107152462005615,
+      "learning_rate": 2.766548066920338e-05,
+      "loss": 0.7632,
+      "step": 404
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.9305949807167053,
+      "learning_rate": 2.7239016531662887e-05,
+      "loss": 0.6732,
+      "step": 405
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.6983330249786377,
+      "learning_rate": 2.6815346102033877e-05,
+      "loss": 0.5301,
+      "step": 406
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.7181352376937866,
+      "learning_rate": 2.6394485647433277e-05,
+      "loss": 0.6498,
+      "step": 407
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.7354316711425781,
+      "learning_rate": 2.5976451327087204e-05,
+      "loss": 0.5822,
+      "step": 408
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.7557486891746521,
+      "learning_rate": 2.5561259191710407e-05,
+      "loss": 0.5257,
+      "step": 409
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.7786766290664673,
+      "learning_rate": 2.514892518288988e-05,
+      "loss": 0.596,
+      "step": 410
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.8994741439819336,
+      "learning_rate": 2.4739465132473016e-05,
+      "loss": 0.6946,
+      "step": 411
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.8272367119789124,
+      "learning_rate": 2.4332894761959603e-05,
+      "loss": 0.5883,
+      "step": 412
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.7980152368545532,
+      "learning_rate": 2.3929229681898003e-05,
+      "loss": 0.6563,
+      "step": 413
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.7788495421409607,
+      "learning_rate": 2.3528485391286147e-05,
+      "loss": 0.6374,
+      "step": 414
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.7941696643829346,
+      "learning_rate": 2.3130677276976232e-05,
+      "loss": 0.7056,
+      "step": 415
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.8505434989929199,
+      "learning_rate": 2.2735820613083834e-05,
+      "loss": 0.6537,
+      "step": 416
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 1.4972716569900513,
+      "learning_rate": 2.234393056040166e-05,
+      "loss": 0.6924,
+      "step": 417
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.7153500914573669,
+      "learning_rate": 2.195502216581734e-05,
+      "loss": 0.5352,
+      "step": 418
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.7720330953598022,
+      "learning_rate": 2.1569110361735677e-05,
+      "loss": 0.5763,
+      "step": 419
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.8194432258605957,
+      "learning_rate": 2.118620996550529e-05,
+      "loss": 0.6945,
+      "step": 420
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.9006208181381226,
+      "learning_rate": 2.0806335678849765e-05,
+      "loss": 0.675,
+      "step": 421
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.8461016416549683,
+      "learning_rate": 2.0429502087303164e-05,
+      "loss": 0.6308,
+      "step": 422
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 4.719590187072754,
+      "learning_rate": 2.0055723659649904e-05,
+      "loss": 0.7509,
+      "step": 423
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.8160830736160278,
+      "learning_rate": 1.968501474736929e-05,
+      "loss": 0.6519,
+      "step": 424
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.9067161083221436,
+      "learning_rate": 1.9317389584084568e-05,
+      "loss": 0.7138,
+      "step": 425
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.8717969655990601,
+      "learning_rate": 1.8952862285016194e-05,
+      "loss": 0.7402,
+      "step": 426
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 1.1476932764053345,
+      "learning_rate": 1.8591446846440097e-05,
+      "loss": 0.7367,
+      "step": 427
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.7396849393844604,
+      "learning_rate": 1.823315714515018e-05,
+      "loss": 0.6647,
+      "step": 428
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.7775653004646301,
+      "learning_rate": 1.787800693792545e-05,
+      "loss": 0.6635,
+      "step": 429
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.7602683901786804,
+      "learning_rate": 1.7526009861001956e-05,
+      "loss": 0.6498,
+      "step": 430
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.7206698656082153,
+      "learning_rate": 1.717717942954914e-05,
+      "loss": 0.5803,
+      "step": 431
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.97299724817276,
+      "learning_rate": 1.6831529037150827e-05,
+      "loss": 0.721,
+      "step": 432
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 1.1216304302215576,
+      "learning_rate": 1.648907195529117e-05,
+      "loss": 0.6673,
+      "step": 433
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.8716909885406494,
+      "learning_rate": 1.614982133284495e-05,
+      "loss": 0.6804,
+      "step": 434
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.775269627571106,
+      "learning_rate": 1.5813790195572674e-05,
+      "loss": 0.6865,
+      "step": 435
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.807775616645813,
+      "learning_rate": 1.5480991445620542e-05,
+      "loss": 0.6331,
+      "step": 436
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.7059893012046814,
+      "learning_rate": 1.515143786102503e-05,
+      "loss": 0.5684,
+      "step": 437
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.5673685669898987,
+      "learning_rate": 1.482514209522231e-05,
+      "loss": 0.5452,
+      "step": 438
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.849505603313446,
+      "learning_rate": 1.4502116676562261e-05,
+      "loss": 0.6149,
+      "step": 439
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.7047016024589539,
+      "learning_rate": 1.4182374007827603e-05,
+      "loss": 0.6779,
+      "step": 440
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.9966226816177368,
+      "learning_rate": 1.3865926365757643e-05,
+      "loss": 0.706,
+      "step": 441
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.8300743103027344,
+      "learning_rate": 1.3552785900576792e-05,
+      "loss": 0.6306,
+      "step": 442
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.6334244012832642,
+      "learning_rate": 1.324296463552821e-05,
+      "loss": 0.539,
+      "step": 443
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.8060217499732971,
+      "learning_rate": 1.2936474466412085e-05,
+      "loss": 0.6693,
+      "step": 444
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.8065308332443237,
+      "learning_rate": 1.263332716112885e-05,
+      "loss": 0.678,
+      "step": 445
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.7400084137916565,
+      "learning_rate": 1.2333534359227384e-05,
+      "loss": 0.5707,
+      "step": 446
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.7766801118850708,
+      "learning_rate": 1.203710757145815e-05,
+      "loss": 0.6258,
+      "step": 447
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.7389542460441589,
+      "learning_rate": 1.1744058179331175e-05,
+      "loss": 0.6995,
+      "step": 448
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.8142557144165039,
+      "learning_rate": 1.1454397434679021e-05,
+      "loss": 0.6823,
+      "step": 449
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 1.016276240348816,
+      "learning_rate": 1.1168136459224842e-05,
+      "loss": 0.7201,
+      "step": 450
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.7469731569290161,
+      "learning_rate": 1.0885286244155345e-05,
+      "loss": 0.6593,
+      "step": 451
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.8244340419769287,
+      "learning_rate": 1.0605857649698669e-05,
+      "loss": 0.6278,
+      "step": 452
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.8314517736434937,
+      "learning_rate": 1.0329861404707564e-05,
+      "loss": 0.6895,
+      "step": 453
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.7985360026359558,
+      "learning_rate": 1.0057308106247332e-05,
+      "loss": 0.6304,
+      "step": 454
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.7921756505966187,
+      "learning_rate": 9.788208219188932e-06,
+      "loss": 0.6864,
+      "step": 455
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.8282172679901123,
+      "learning_rate": 9.522572075807334e-06,
+      "loss": 0.6364,
+      "step": 456
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.9883142113685608,
+      "learning_rate": 9.260409875384568e-06,
+      "loss": 0.5466,
+      "step": 457
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.8472033143043518,
+      "learning_rate": 9.001731683818337e-06,
+      "loss": 0.5954,
+      "step": 458
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.804810643196106,
+      "learning_rate": 8.746547433235364e-06,
+      "loss": 0.6904,
+      "step": 459
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.716760516166687,
+      "learning_rate": 8.494866921610133e-06,
+      "loss": 0.5326,
+      "step": 460
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 1.048392653465271,
+      "learning_rate": 8.246699812388714e-06,
+      "loss": 0.6277,
+      "step": 461
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.7103719711303711,
+      "learning_rate": 8.002055634117578e-06,
+      "loss": 0.6877,
+      "step": 462
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.9400436878204346,
+      "learning_rate": 7.760943780077933e-06,
+      "loss": 0.6194,
+      "step": 463
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.7876155376434326,
+      "learning_rate": 7.523373507924947e-06,
+      "loss": 0.7067,
+      "step": 464
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.7430306077003479,
+      "learning_rate": 7.289353939332288e-06,
+      "loss": 0.7236,
+      "step": 465
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.7089613676071167,
+      "learning_rate": 7.058894059641963e-06,
+      "loss": 0.6644,
+      "step": 466
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 2.6532530784606934,
+      "learning_rate": 6.8320027175192706e-06,
+      "loss": 0.8418,
+      "step": 467
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.7472643852233887,
+      "learning_rate": 6.608688624613057e-06,
+      "loss": 0.6375,
+      "step": 468
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.8089672327041626,
+      "learning_rate": 6.388960355221207e-06,
+      "loss": 0.6339,
+      "step": 469
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.9210778474807739,
+      "learning_rate": 6.1728263459614796e-06,
+      "loss": 0.6573,
+      "step": 470
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.7312812209129333,
+      "learning_rate": 5.960294895447549e-06,
+      "loss": 0.6352,
+      "step": 471
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.7618116140365601,
+      "learning_rate": 5.751374163970347e-06,
+      "loss": 0.4996,
+      "step": 472
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.7650373578071594,
+      "learning_rate": 5.546072173184791e-06,
+      "loss": 0.6693,
+      "step": 473
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.8264601826667786,
+      "learning_rate": 5.344396805801766e-06,
+      "loss": 0.5748,
+      "step": 474
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.6858938932418823,
+      "learning_rate": 5.146355805285452e-06,
+      "loss": 0.7102,
+      "step": 475
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 1.2532672882080078,
+      "learning_rate": 4.951956775555999e-06,
+      "loss": 0.5724,
+      "step": 476
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.8648524284362793,
+      "learning_rate": 4.7612071806976575e-06,
+      "loss": 0.6986,
+      "step": 477
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.6834442615509033,
+      "learning_rate": 4.574114344672042e-06,
+      "loss": 0.6115,
+      "step": 478
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 1.053343415260315,
+      "learning_rate": 4.390685451037025e-06,
+      "loss": 0.6634,
+      "step": 479
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.6925364136695862,
+      "learning_rate": 4.210927542670917e-06,
+      "loss": 0.5087,
+      "step": 480
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.9825167059898376,
+      "learning_rate": 4.034847521502028e-06,
+      "loss": 0.5799,
+      "step": 481
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.7158229947090149,
+      "learning_rate": 3.862452148243622e-06,
+      "loss": 0.6089,
+      "step": 482
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 1.0506715774536133,
+      "learning_rate": 3.693748042134415e-06,
+      "loss": 0.6954,
+      "step": 483
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.75848788022995,
+      "learning_rate": 3.528741680684411e-06,
+      "loss": 0.6844,
+      "step": 484
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.8962730169296265,
+      "learning_rate": 3.367439399426087e-06,
+      "loss": 0.6411,
+      "step": 485
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.7715455293655396,
+      "learning_rate": 3.2098473916712946e-06,
+      "loss": 0.5918,
+      "step": 486
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.6626971364021301,
+      "learning_rate": 3.055971708273375e-06,
+      "loss": 0.4995,
+      "step": 487
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.9594528675079346,
+      "learning_rate": 2.905818257394799e-06,
+      "loss": 0.6345,
+      "step": 488
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.9263612627983093,
+      "learning_rate": 2.759392804280414e-06,
+      "loss": 0.6999,
+      "step": 489
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.7584457993507385,
+      "learning_rate": 2.616700971036001e-06,
+      "loss": 0.6901,
+      "step": 490
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.9710139036178589,
+      "learning_rate": 2.4777482364124695e-06,
+      "loss": 0.7086,
+      "step": 491
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.7744004726409912,
+      "learning_rate": 2.342539935595445e-06,
+      "loss": 0.6366,
+      "step": 492
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.7367916703224182,
+      "learning_rate": 2.2110812600004694e-06,
+      "loss": 0.5976,
+      "step": 493
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.9458748698234558,
+      "learning_rate": 2.0833772570736375e-06,
+      "loss": 0.6082,
+      "step": 494
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.8267062902450562,
+      "learning_rate": 1.959432830097807e-06,
+      "loss": 0.4377,
+      "step": 495
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.8359901905059814,
+      "learning_rate": 1.83925273800436e-06,
+      "loss": 0.6793,
+      "step": 496
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.6290463805198669,
+      "learning_rate": 1.7228415951904165e-06,
+      "loss": 0.579,
+      "step": 497
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 1.215668797492981,
+      "learning_rate": 1.61020387134172e-06,
+      "loss": 0.617,
+      "step": 498
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.8642238974571228,
+      "learning_rate": 1.50134389126102e-06,
+      "loss": 0.6548,
+      "step": 499
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.8044393062591553,
+      "learning_rate": 1.396265834701982e-06,
+      "loss": 0.6726,
+      "step": 500
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.786636233329773,
+      "learning_rate": 1.2949737362087156e-06,
+      "loss": 0.5501,
+      "step": 501
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.8221418261528015,
+      "learning_rate": 1.1974714849608882e-06,
+      "loss": 0.6528,
+      "step": 502
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.8847975134849548,
+      "learning_rate": 1.103762824624377e-06,
+      "loss": 0.6465,
+      "step": 503
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.9078425168991089,
+      "learning_rate": 1.0138513532075067e-06,
+      "loss": 0.5795,
+      "step": 504
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.8821399211883545,
+      "learning_rate": 9.277405229229708e-07,
+      "loss": 0.6663,
+      "step": 505
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.8557557463645935,
+      "learning_rate": 8.454336400552154e-07,
+      "loss": 0.6063,
+      "step": 506
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.8125676512718201,
+      "learning_rate": 7.669338648334856e-07,
+      "loss": 0.6687,
+      "step": 507
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.698814868927002,
+      "learning_rate": 6.922442113105665e-07,
+      "loss": 0.6499,
+      "step": 508
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.7289395928382874,
+      "learning_rate": 6.21367547246976e-07,
+      "loss": 0.6431,
+      "step": 509
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.6379066705703735,
+      "learning_rate": 5.543065940008862e-07,
+      "loss": 0.633,
+      "step": 510
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.9233940839767456,
+      "learning_rate": 4.910639264236294e-07,
+      "loss": 0.6173,
+      "step": 511
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.8638243079185486,
+      "learning_rate": 4.316419727608434e-07,
+      "loss": 0.7366,
+      "step": 512
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.7939671277999878,
+      "learning_rate": 3.760430145592575e-07,
+      "loss": 0.645,
+      "step": 513
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.8512327671051025,
+      "learning_rate": 3.2426918657900704e-07,
+      "loss": 0.6733,
+      "step": 514
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.7165814638137817,
+      "learning_rate": 2.7632247671177667e-07,
+      "loss": 0.5626,
+      "step": 515
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.7893707752227783,
+      "learning_rate": 2.3220472590440579e-07,
+      "loss": 0.7017,
+      "step": 516
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.6842373013496399,
+      "learning_rate": 1.919176280882229e-07,
+      "loss": 0.6092,
+      "step": 517
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.6570567488670349,
+      "learning_rate": 1.554627301140199e-07,
+      "loss": 0.518,
+      "step": 518
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.9338929653167725,
+      "learning_rate": 1.2284143169261075e-07,
+      "loss": 0.6498,
+      "step": 519
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.7087960243225098,
+      "learning_rate": 9.405498534115209e-08,
+      "loss": 0.6872,
+      "step": 520
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.9282191395759583,
+      "learning_rate": 6.910449633501514e-08,
+      "loss": 0.7439,
+      "step": 521
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.8347114324569702,
+      "learning_rate": 4.799092266535299e-08,
+      "loss": 0.6346,
+      "step": 522
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 1.1854585409164429,
+      "learning_rate": 3.071507500231885e-08,
+      "loss": 0.7587,
+      "step": 523
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.7577679753303528,
+      "learning_rate": 1.727761666394656e-08,
+      "loss": 0.7224,
+      "step": 524
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.9748638868331909,
+      "learning_rate": 7.679063590670942e-09,
+      "loss": 0.59,
+      "step": 525
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.2704914808273315,
+      "learning_rate": 1.919784325521423e-09,
+      "loss": 0.6941,
+      "step": 526
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.7523401379585266,
+      "learning_rate": 0.0,
+      "loss": 0.6226,
+      "step": 527
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 527,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "total_flos": 7.975101399588864e+16,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

checkpoint-527/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fc063ac87d7d75d2424e6f7bc5d3a41cf0a4d792bb021c6147681f06a6f84ac3
+size 5624

config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "_name_or_path": "openlm-research/open_llama_3b_v2",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 3200,
+  "initializer_range": 0.02,
+  "intermediate_size": 8640,
+  "max_position_embeddings": 2048,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 26,
+  "num_key_value_heads": 32,
+  "pad_token_id": 0,
+  "pretraining_tp": 1,
+  "quantization_config": {
+    "_load_in_4bit": false,
+    "_load_in_8bit": true,
+    "bnb_4bit_compute_dtype": "float32",
+    "bnb_4bit_quant_type": "fp4",
+    "bnb_4bit_use_double_quant": false,
+    "llm_int8_enable_fp32_cpu_offload": false,
+    "llm_int8_has_fp16_weight": false,
+    "llm_int8_skip_modules": null,
+    "llm_int8_threshold": 6.0,
+    "load_in_4bit": false,
+    "load_in_8bit": true,
+    "quant_method": "bitsandbytes"
+  },
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float16",
+  "transformers_version": "4.38.2",
+  "use_cache": false,
+  "vocab_size": 32000
+}

runs/Mar09_05-23-07_9b5078085e9b/events.out.tfevents.1709961787.9b5078085e9b.1472.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8c97ce298bd3fe8ca2eeab1fa207f792f1031b0b4e5d068582a0c35e09553c17
+size 117940

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:91b289e85fa20fd375d8b33dc12f77616f18abc6359804471d1fafcb425fecb8
+size 511574

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": true,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": true,
+  "model_max_length": 2048,
+  "pad_token": "</s>",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false,
+  "use_fast": true
+}