MilaWang committed on Mar 28

Commit

c32acad

verified ·

1 Parent(s): 4cdbe50

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-1023/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-1023/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-1023/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-1023/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-1023/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-1023/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-1023/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-1023/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-1023/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-1023/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-1023/trainer_state.json +755 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-1023/training_args.bin +3 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-2047/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-2047/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-2047/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-2047/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-2047/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-2047/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-2047/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-2047/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-2047/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-2047/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-2047/trainer_state.json +1477 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-2047/training_args.bin +3 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-3070/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-3070/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-3070/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-3070/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-3070/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-3070/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-3070/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-3070/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-3070/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-3070/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-3070/trainer_state.json +2206 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-3070/training_args.bin +3 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-4094/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-4094/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-4094/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-4094/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-4094/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-4094/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-4094/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-4094/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-4094/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-4094/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-4094/trainer_state.json +2928 -0

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and BibTeX information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/adapter_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1783b161c00789a4b631199a90697e7cf1fb64473a5daae89f6d8dd4277e4c66
+size 109069176

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-1023/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and BibTeX information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-1023/adapter_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-1023/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6e987798fb953d47a59902f5b3c86517cedce305908221860c33520110d05629
+size 109069176

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-1023/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fe91ed29e79ba606fe5e333859069e02bbb701c59bd6af895bdd53261d43a469
+size 55532666

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-1023/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8abb126f443bfdd69eb1a4c09bb8a151edb1083eb2bacfb18dda619ac766a826
+size 14244

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-1023/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b5232d89f08bdd76d944153588195abb7719a0929a020b5e1bb06ede6cd420bd
+size 1064

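Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-1023/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@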

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-1023/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-1023/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-1023/tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-1023/trainer_state.json ADDED Viewed

	@@ -0,0 +1,755 @@

+{
+  "best_metric": 0.433101624250412,
+  "best_model_checkpoint": "outputs-001/Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-1023",
+  "epoch": 0.9995114802149487,
+  "eval_steps": 10,
+  "global_step": 1023,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.009770395701025891,
+      "grad_norm": 1.1888047456741333,
+      "learning_rate": 0.0002,
+      "loss": 1.7474,
+      "step": 10
+    },
+    {
+      "epoch": 0.019540791402051783,
+      "grad_norm": 1.3118009567260742,
+      "learning_rate": 0.0002,
+      "loss": 1.157,
+      "step": 20
+    },
+    {
+      "epoch": 0.029311187103077674,
+      "grad_norm": 1.1254922151565552,
+      "learning_rate": 0.0002,
+      "loss": 0.9979,
+      "step": 30
+    },
+    {
+      "epoch": 0.039081582804103565,
+      "grad_norm": 0.9634686708450317,
+      "learning_rate": 0.0002,
+      "loss": 0.8859,
+      "step": 40
+    },
+    {
+      "epoch": 0.048851978505129456,
+      "grad_norm": 0.9101817607879639,
+      "learning_rate": 0.0002,
+      "loss": 0.7826,
+      "step": 50
+    },
+    {
+      "epoch": 0.05862237420615535,
+      "grad_norm": 1.0019943714141846,
+      "learning_rate": 0.0002,
+      "loss": 0.7358,
+      "step": 60
+    },
+    {
+      "epoch": 0.06839276990718124,
+      "grad_norm": 0.9201828837394714,
+      "learning_rate": 0.0002,
+      "loss": 0.6664,
+      "step": 70
+    },
+    {
+      "epoch": 0.07816316560820713,
+      "grad_norm": 0.9210318922996521,
+      "learning_rate": 0.0002,
+      "loss": 0.6785,
+      "step": 80
+    },
+    {
+      "epoch": 0.08793356130923302,
+      "grad_norm": 0.8079697489738464,
+      "learning_rate": 0.0002,
+      "loss": 0.652,
+      "step": 90
+    },
+    {
+      "epoch": 0.09770395701025891,
+      "grad_norm": 0.7530406713485718,
+      "learning_rate": 0.0002,
+      "loss": 0.6469,
+      "step": 100
+    },
+    {
+      "epoch": 0.1074743527112848,
+      "grad_norm": 0.8732273578643799,
+      "learning_rate": 0.0002,
+      "loss": 0.6604,
+      "step": 110
+    },
+    {
+      "epoch": 0.1172447484123107,
+      "grad_norm": 0.9163013696670532,
+      "learning_rate": 0.0002,
+      "loss": 0.6429,
+      "step": 120
+    },
+    {
+      "epoch": 0.1270151441133366,
+      "grad_norm": 0.5931605696678162,
+      "learning_rate": 0.0002,
+      "loss": 0.6269,
+      "step": 130
+    },
+    {
+      "epoch": 0.13678553981436248,
+      "grad_norm": 0.8782339692115784,
+      "learning_rate": 0.0002,
+      "loss": 0.6349,
+      "step": 140
+    },
+    {
+      "epoch": 0.14655593551538837,
+      "grad_norm": 0.6683491468429565,
+      "learning_rate": 0.0002,
+      "loss": 0.657,
+      "step": 150
+    },
+    {
+      "epoch": 0.15632633121641426,
+      "grad_norm": 0.7998592257499695,
+      "learning_rate": 0.0002,
+      "loss": 0.6315,
+      "step": 160
+    },
+    {
+      "epoch": 0.16609672691744015,
+      "grad_norm": 0.6159262657165527,
+      "learning_rate": 0.0002,
+      "loss": 0.6347,
+      "step": 170
+    },
+    {
+      "epoch": 0.17586712261846604,
+      "grad_norm": 0.671146035194397,
+      "learning_rate": 0.0002,
+      "loss": 0.6023,
+      "step": 180
+    },
+    {
+      "epoch": 0.18563751831949193,
+      "grad_norm": 0.5839019417762756,
+      "learning_rate": 0.0002,
+      "loss": 0.6101,
+      "step": 190
+    },
+    {
+      "epoch": 0.19540791402051783,
+      "grad_norm": 0.5090241432189941,
+      "learning_rate": 0.0002,
+      "loss": 0.6121,
+      "step": 200
+    },
+    {
+      "epoch": 0.20517830972154372,
+      "grad_norm": 0.652291476726532,
+      "learning_rate": 0.0002,
+      "loss": 0.6296,
+      "step": 210
+    },
+    {
+      "epoch": 0.2149487054225696,
+      "grad_norm": 0.6500856876373291,
+      "learning_rate": 0.0002,
+      "loss": 0.577,
+      "step": 220
+    },
+    {
+      "epoch": 0.2247191011235955,
+      "grad_norm": 0.6135480999946594,
+      "learning_rate": 0.0002,
+      "loss": 0.6186,
+      "step": 230
+    },
+    {
+      "epoch": 0.2344894968246214,
+      "grad_norm": 0.6102302074432373,
+      "learning_rate": 0.0002,
+      "loss": 0.6132,
+      "step": 240
+    },
+    {
+      "epoch": 0.24425989252564728,
+      "grad_norm": 0.6909783482551575,
+      "learning_rate": 0.0002,
+      "loss": 0.592,
+      "step": 250
+    },
+    {
+      "epoch": 0.2540302882266732,
+      "grad_norm": 0.5834446549415588,
+      "learning_rate": 0.0002,
+      "loss": 0.5832,
+      "step": 260
+    },
+    {
+      "epoch": 0.26380068392769906,
+      "grad_norm": 0.5275322198867798,
+      "learning_rate": 0.0002,
+      "loss": 0.6038,
+      "step": 270
+    },
+    {
+      "epoch": 0.27357107962872496,
+      "grad_norm": 0.5611422657966614,
+      "learning_rate": 0.0002,
+      "loss": 0.5469,
+      "step": 280
+    },
+    {
+      "epoch": 0.28334147532975085,
+      "grad_norm": 0.6549052596092224,
+      "learning_rate": 0.0002,
+      "loss": 0.552,
+      "step": 290
+    },
+    {
+      "epoch": 0.29311187103077674,
+      "grad_norm": 0.563291072845459,
+      "learning_rate": 0.0002,
+      "loss": 0.5609,
+      "step": 300
+    },
+    {
+      "epoch": 0.30288226673180263,
+      "grad_norm": 0.5598369240760803,
+      "learning_rate": 0.0002,
+      "loss": 0.5632,
+      "step": 310
+    },
+    {
+      "epoch": 0.3126526624328285,
+      "grad_norm": 0.6525678634643555,
+      "learning_rate": 0.0002,
+      "loss": 0.5627,
+      "step": 320
+    },
+    {
+      "epoch": 0.3224230581338544,
+      "grad_norm": 0.5190592408180237,
+      "learning_rate": 0.0002,
+      "loss": 0.5526,
+      "step": 330
+    },
+    {
+      "epoch": 0.3321934538348803,
+      "grad_norm": 0.45483070611953735,
+      "learning_rate": 0.0002,
+      "loss": 0.5698,
+      "step": 340
+    },
+    {
+      "epoch": 0.3419638495359062,
+      "grad_norm": 0.8094475865364075,
+      "learning_rate": 0.0002,
+      "loss": 0.5768,
+      "step": 350
+    },
+    {
+      "epoch": 0.3517342452369321,
+      "grad_norm": 0.5545358061790466,
+      "learning_rate": 0.0002,
+      "loss": 0.5555,
+      "step": 360
+    },
+    {
+      "epoch": 0.361504640937958,
+      "grad_norm": 0.6899498701095581,
+      "learning_rate": 0.0002,
+      "loss": 0.5529,
+      "step": 370
+    },
+    {
+      "epoch": 0.37127503663898387,
+      "grad_norm": 0.4584816098213196,
+      "learning_rate": 0.0002,
+      "loss": 0.556,
+      "step": 380
+    },
+    {
+      "epoch": 0.38104543234000976,
+      "grad_norm": 0.5436979532241821,
+      "learning_rate": 0.0002,
+      "loss": 0.5451,
+      "step": 390
+    },
+    {
+      "epoch": 0.39081582804103565,
+      "grad_norm": 0.7512422800064087,
+      "learning_rate": 0.0002,
+      "loss": 0.5377,
+      "step": 400
+    },
+    {
+      "epoch": 0.40058622374206154,
+      "grad_norm": 0.6394727826118469,
+      "learning_rate": 0.0002,
+      "loss": 0.5438,
+      "step": 410
+    },
+    {
+      "epoch": 0.41035661944308743,
+      "grad_norm": 0.5314047336578369,
+      "learning_rate": 0.0002,
+      "loss": 0.5535,
+      "step": 420
+    },
+    {
+      "epoch": 0.4201270151441133,
+      "grad_norm": 0.5658334493637085,
+      "learning_rate": 0.0002,
+      "loss": 0.5587,
+      "step": 430
+    },
+    {
+      "epoch": 0.4298974108451392,
+      "grad_norm": 0.5295330882072449,
+      "learning_rate": 0.0002,
+      "loss": 0.5219,
+      "step": 440
+    },
+    {
+      "epoch": 0.4396678065461651,
+      "grad_norm": 0.6460115313529968,
+      "learning_rate": 0.0002,
+      "loss": 0.522,
+      "step": 450
+    },
+    {
+      "epoch": 0.449438202247191,
+      "grad_norm": 0.512022852897644,
+      "learning_rate": 0.0002,
+      "loss": 0.5416,
+      "step": 460
+    },
+    {
+      "epoch": 0.4592085979482169,
+      "grad_norm": 0.7365363836288452,
+      "learning_rate": 0.0002,
+      "loss": 0.5256,
+      "step": 470
+    },
+    {
+      "epoch": 0.4689789936492428,
+      "grad_norm": 0.6292932629585266,
+      "learning_rate": 0.0002,
+      "loss": 0.5354,
+      "step": 480
+    },
+    {
+      "epoch": 0.4787493893502687,
+      "grad_norm": 0.6255582571029663,
+      "learning_rate": 0.0002,
+      "loss": 0.5436,
+      "step": 490
+    },
+    {
+      "epoch": 0.48851978505129456,
+      "grad_norm": 0.5599279403686523,
+      "learning_rate": 0.0002,
+      "loss": 0.5394,
+      "step": 500
+    },
+    {
+      "epoch": 0.49829018075232046,
+      "grad_norm": 0.573657751083374,
+      "learning_rate": 0.0002,
+      "loss": 0.5297,
+      "step": 510
+    },
+    {
+      "epoch": 0.5080605764533463,
+      "grad_norm": 0.6362313628196716,
+      "learning_rate": 0.0002,
+      "loss": 0.5299,
+      "step": 520
+    },
+    {
+      "epoch": 0.5178309721543722,
+      "grad_norm": 0.6360035538673401,
+      "learning_rate": 0.0002,
+      "loss": 0.5458,
+      "step": 530
+    },
+    {
+      "epoch": 0.5276013678553981,
+      "grad_norm": 0.7129001021385193,
+      "learning_rate": 0.0002,
+      "loss": 0.5228,
+      "step": 540
+    },
+    {
+      "epoch": 0.537371763556424,
+      "grad_norm": 0.5596054196357727,
+      "learning_rate": 0.0002,
+      "loss": 0.5091,
+      "step": 550
+    },
+    {
+      "epoch": 0.5471421592574499,
+      "grad_norm": 0.7081596851348877,
+      "learning_rate": 0.0002,
+      "loss": 0.5153,
+      "step": 560
+    },
+    {
+      "epoch": 0.5569125549584758,
+      "grad_norm": 0.6816760301589966,
+      "learning_rate": 0.0002,
+      "loss": 0.4999,
+      "step": 570
+    },
+    {
+      "epoch": 0.5666829506595017,
+      "grad_norm": 0.47695112228393555,
+      "learning_rate": 0.0002,
+      "loss": 0.4974,
+      "step": 580
+    },
+    {
+      "epoch": 0.5764533463605276,
+      "grad_norm": 0.7528041005134583,
+      "learning_rate": 0.0002,
+      "loss": 0.5247,
+      "step": 590
+    },
+    {
+      "epoch": 0.5862237420615535,
+      "grad_norm": 0.5452813506126404,
+      "learning_rate": 0.0002,
+      "loss": 0.5265,
+      "step": 600
+    },
+    {
+      "epoch": 0.5959941377625794,
+      "grad_norm": 0.6085044741630554,
+      "learning_rate": 0.0002,
+      "loss": 0.4965,
+      "step": 610
+    },
+    {
+      "epoch": 0.6057645334636053,
+      "grad_norm": 0.6745641231536865,
+      "learning_rate": 0.0002,
+      "loss": 0.4916,
+      "step": 620
+    },
+    {
+      "epoch": 0.6155349291646312,
+      "grad_norm": 0.647544264793396,
+      "learning_rate": 0.0002,
+      "loss": 0.5107,
+      "step": 630
+    },
+    {
+      "epoch": 0.625305324865657,
+      "grad_norm": 0.6123825311660767,
+      "learning_rate": 0.0002,
+      "loss": 0.4864,
+      "step": 640
+    },
+    {
+      "epoch": 0.6350757205666829,
+      "grad_norm": 0.5815364122390747,
+      "learning_rate": 0.0002,
+      "loss": 0.484,
+      "step": 650
+    },
+    {
+      "epoch": 0.6448461162677088,
+      "grad_norm": 0.6184095740318298,
+      "learning_rate": 0.0002,
+      "loss": 0.4966,
+      "step": 660
+    },
+    {
+      "epoch": 0.6546165119687347,
+      "grad_norm": 0.5856700539588928,
+      "learning_rate": 0.0002,
+      "loss": 0.4861,
+      "step": 670
+    },
+    {
+      "epoch": 0.6643869076697606,
+      "grad_norm": 0.6424922943115234,
+      "learning_rate": 0.0002,
+      "loss": 0.4964,
+      "step": 680
+    },
+    {
+      "epoch": 0.6741573033707865,
+      "grad_norm": 0.7051425576210022,
+      "learning_rate": 0.0002,
+      "loss": 0.5019,
+      "step": 690
+    },
+    {
+      "epoch": 0.6839276990718124,
+      "grad_norm": 0.6133471131324768,
+      "learning_rate": 0.0002,
+      "loss": 0.4649,
+      "step": 700
+    },
+    {
+      "epoch": 0.6936980947728383,
+      "grad_norm": 0.6933842897415161,
+      "learning_rate": 0.0002,
+      "loss": 0.4847,
+      "step": 710
+    },
+    {
+      "epoch": 0.7034684904738642,
+      "grad_norm": 0.6440989375114441,
+      "learning_rate": 0.0002,
+      "loss": 0.4945,
+      "step": 720
+    },
+    {
+      "epoch": 0.7132388861748901,
+      "grad_norm": 0.87819904088974,
+      "learning_rate": 0.0002,
+      "loss": 0.4777,
+      "step": 730
+    },
+    {
+      "epoch": 0.723009281875916,
+      "grad_norm": 0.6810497641563416,
+      "learning_rate": 0.0002,
+      "loss": 0.4914,
+      "step": 740
+    },
+    {
+      "epoch": 0.7327796775769418,
+      "grad_norm": 0.7822733521461487,
+      "learning_rate": 0.0002,
+      "loss": 0.4789,
+      "step": 750
+    },
+    {
+      "epoch": 0.7425500732779677,
+      "grad_norm": 0.6669152975082397,
+      "learning_rate": 0.0002,
+      "loss": 0.4615,
+      "step": 760
+    },
+    {
+      "epoch": 0.7523204689789936,
+      "grad_norm": 0.7351736426353455,
+      "learning_rate": 0.0002,
+      "loss": 0.4689,
+      "step": 770
+    },
+    {
+      "epoch": 0.7620908646800195,
+      "grad_norm": 1.0013558864593506,
+      "learning_rate": 0.0002,
+      "loss": 0.4629,
+      "step": 780
+    },
+    {
+      "epoch": 0.7718612603810454,
+      "grad_norm": 0.7465775609016418,
+      "learning_rate": 0.0002,
+      "loss": 0.4739,
+      "step": 790
+    },
+    {
+      "epoch": 0.7816316560820713,
+      "grad_norm": 1.0959300994873047,
+      "learning_rate": 0.0002,
+      "loss": 0.4635,
+      "step": 800
+    },
+    {
+      "epoch": 0.7914020517830972,
+      "grad_norm": 0.5292418599128723,
+      "learning_rate": 0.0002,
+      "loss": 0.4549,
+      "step": 810
+    },
+    {
+      "epoch": 0.8011724474841231,
+      "grad_norm": 0.6555328965187073,
+      "learning_rate": 0.0002,
+      "loss": 0.458,
+      "step": 820
+    },
+    {
+      "epoch": 0.810942843185149,
+      "grad_norm": 0.6462382674217224,
+      "learning_rate": 0.0002,
+      "loss": 0.488,
+      "step": 830
+    },
+    {
+      "epoch": 0.8207132388861749,
+      "grad_norm": 0.6840918064117432,
+      "learning_rate": 0.0002,
+      "loss": 0.4541,
+      "step": 840
+    },
+    {
+      "epoch": 0.8304836345872008,
+      "grad_norm": 0.5715351700782776,
+      "learning_rate": 0.0002,
+      "loss": 0.4509,
+      "step": 850
+    },
+    {
+      "epoch": 0.8402540302882266,
+      "grad_norm": 0.5583404898643494,
+      "learning_rate": 0.0002,
+      "loss": 0.4535,
+      "step": 860
+    },
+    {
+      "epoch": 0.8500244259892525,
+      "grad_norm": 0.8243112564086914,
+      "learning_rate": 0.0002,
+      "loss": 0.4533,
+      "step": 870
+    },
+    {
+      "epoch": 0.8597948216902784,
+      "grad_norm": 0.6543600559234619,
+      "learning_rate": 0.0002,
+      "loss": 0.4545,
+      "step": 880
+    },
+    {
+      "epoch": 0.8695652173913043,
+      "grad_norm": 0.6494827270507812,
+      "learning_rate": 0.0002,
+      "loss": 0.4814,
+      "step": 890
+    },
+    {
+      "epoch": 0.8793356130923302,
+      "grad_norm": 0.8458304405212402,
+      "learning_rate": 0.0002,
+      "loss": 0.4593,
+      "step": 900
+    },
+    {
+      "epoch": 0.8891060087933561,
+      "grad_norm": 0.6854186654090881,
+      "learning_rate": 0.0002,
+      "loss": 0.4382,
+      "step": 910
+    },
+    {
+      "epoch": 0.898876404494382,
+      "grad_norm": 0.6300225853919983,
+      "learning_rate": 0.0002,
+      "loss": 0.4488,
+      "step": 920
+    },
+    {
+      "epoch": 0.9086468001954079,
+      "grad_norm": 0.9791533350944519,
+      "learning_rate": 0.0002,
+      "loss": 0.4638,
+      "step": 930
+    },
+    {
+      "epoch": 0.9184171958964338,
+      "grad_norm": 0.6965218186378479,
+      "learning_rate": 0.0002,
+      "loss": 0.446,
+      "step": 940
+    },
+    {
+      "epoch": 0.9281875915974597,
+      "grad_norm": 0.6066922545433044,
+      "learning_rate": 0.0002,
+      "loss": 0.4453,
+      "step": 950
+    },
+    {
+      "epoch": 0.9379579872984856,
+      "grad_norm": 0.8081962466239929,
+      "learning_rate": 0.0002,
+      "loss": 0.4471,
+      "step": 960
+    },
+    {
+      "epoch": 0.9477283829995115,
+      "grad_norm": 0.7755117416381836,
+      "learning_rate": 0.0002,
+      "loss": 0.4348,
+      "step": 970
+    },
+    {
+      "epoch": 0.9574987787005373,
+      "grad_norm": 0.7127223610877991,
+      "learning_rate": 0.0002,
+      "loss": 0.4423,
+      "step": 980
+    },
+    {
+      "epoch": 0.9672691744015632,
+      "grad_norm": 0.6947609186172485,
+      "learning_rate": 0.0002,
+      "loss": 0.4272,
+      "step": 990
+    },
+    {
+      "epoch": 0.9770395701025891,
+      "grad_norm": 1.0100330114364624,
+      "learning_rate": 0.0002,
+      "loss": 0.4262,
+      "step": 1000
+    },
+    {
+      "epoch": 0.986809965803615,
+      "grad_norm": 0.6727001667022705,
+      "learning_rate": 0.0002,
+      "loss": 0.4169,
+      "step": 1010
+    },
+    {
+      "epoch": 0.9965803615046409,
+      "grad_norm": 0.7834463119506836,
+      "learning_rate": 0.0002,
+      "loss": 0.4507,
+      "step": 1020
+    },
+    {
+      "epoch": 0.9995114802149487,
+      "eval_loss": 0.433101624250412,
+      "eval_runtime": 26.5222,
+      "eval_samples_per_second": 13.762,
+      "eval_steps_per_second": 1.734,
+      "step": 1023
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 8184,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.49057827848192e+16,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-1023/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:469618ae8560edd4a517eb99451fb8bc5c5f148706842d569488535fb05e84cb
+size 5560

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-2047/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-2047/adapter_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:adc979ea9bd43c1e8245e72e8f60f6e33ea07dae85becbf2fa3a957cda657347
+size 109069176

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-2047/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0f390acf8a313926b1a26a81c35026b933d766da265ce88c1b44b7088cfdfc24
+size 55532666

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-2047/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:246ee26d5ccdcb5674a202819fba3e19700de529e218e03698311d1a8886d49b
+size 14244

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-2047/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1c51ecee2be7313ce1ceb257e8cd0083bd8b7ccfae83d12d0412d787bf15c77b
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-2047/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-2047/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-2047/tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-2047/trainer_state.json ADDED Viewed

	@@ -0,0 +1,1477 @@

+{
+  "best_metric": 0.3446754515171051,
+  "best_model_checkpoint": "outputs-001/Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-2047",
+  "epoch": 2.0,
+  "eval_steps": 10,
+  "global_step": 2047,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.009770395701025891,
+      "grad_norm": 1.1888047456741333,
+      "learning_rate": 0.0002,
+      "loss": 1.7474,
+      "step": 10
+    },
+    {
+      "epoch": 0.019540791402051783,
+      "grad_norm": 1.3118009567260742,
+      "learning_rate": 0.0002,
+      "loss": 1.157,
+      "step": 20
+    },
+    {
+      "epoch": 0.029311187103077674,
+      "grad_norm": 1.1254922151565552,
+      "learning_rate": 0.0002,
+      "loss": 0.9979,
+      "step": 30
+    },
+    {
+      "epoch": 0.039081582804103565,
+      "grad_norm": 0.9634686708450317,
+      "learning_rate": 0.0002,
+      "loss": 0.8859,
+      "step": 40
+    },
+    {
+      "epoch": 0.048851978505129456,
+      "grad_norm": 0.9101817607879639,
+      "learning_rate": 0.0002,
+      "loss": 0.7826,
+      "step": 50
+    },
+    {
+      "epoch": 0.05862237420615535,
+      "grad_norm": 1.0019943714141846,
+      "learning_rate": 0.0002,
+      "loss": 0.7358,
+      "step": 60
+    },
+    {
+      "epoch": 0.06839276990718124,
+      "grad_norm": 0.9201828837394714,
+      "learning_rate": 0.0002,
+      "loss": 0.6664,
+      "step": 70
+    },
+    {
+      "epoch": 0.07816316560820713,
+      "grad_norm": 0.9210318922996521,
+      "learning_rate": 0.0002,
+      "loss": 0.6785,
+      "step": 80
+    },
+    {
+      "epoch": 0.08793356130923302,
+      "grad_norm": 0.8079697489738464,
+      "learning_rate": 0.0002,
+      "loss": 0.652,
+      "step": 90
+    },
+    {
+      "epoch": 0.09770395701025891,
+      "grad_norm": 0.7530406713485718,
+      "learning_rate": 0.0002,
+      "loss": 0.6469,
+      "step": 100
+    },
+    {
+      "epoch": 0.1074743527112848,
+      "grad_norm": 0.8732273578643799,
+      "learning_rate": 0.0002,
+      "loss": 0.6604,
+      "step": 110
+    },
+    {
+      "epoch": 0.1172447484123107,
+      "grad_norm": 0.9163013696670532,
+      "learning_rate": 0.0002,
+      "loss": 0.6429,
+      "step": 120
+    },
+    {
+      "epoch": 0.1270151441133366,
+      "grad_norm": 0.5931605696678162,
+      "learning_rate": 0.0002,
+      "loss": 0.6269,
+      "step": 130
+    },
+    {
+      "epoch": 0.13678553981436248,
+      "grad_norm": 0.8782339692115784,
+      "learning_rate": 0.0002,
+      "loss": 0.6349,
+      "step": 140
+    },
+    {
+      "epoch": 0.14655593551538837,
+      "grad_norm": 0.6683491468429565,
+      "learning_rate": 0.0002,
+      "loss": 0.657,
+      "step": 150
+    },
+    {
+      "epoch": 0.15632633121641426,
+      "grad_norm": 0.7998592257499695,
+      "learning_rate": 0.0002,
+      "loss": 0.6315,
+      "step": 160
+    },
+    {
+      "epoch": 0.16609672691744015,
+      "grad_norm": 0.6159262657165527,
+      "learning_rate": 0.0002,
+      "loss": 0.6347,
+      "step": 170
+    },
+    {
+      "epoch": 0.17586712261846604,
+      "grad_norm": 0.671146035194397,
+      "learning_rate": 0.0002,
+      "loss": 0.6023,
+      "step": 180
+    },
+    {
+      "epoch": 0.18563751831949193,
+      "grad_norm": 0.5839019417762756,
+      "learning_rate": 0.0002,
+      "loss": 0.6101,
+      "step": 190
+    },
+    {
+      "epoch": 0.19540791402051783,
+      "grad_norm": 0.5090241432189941,
+      "learning_rate": 0.0002,
+      "loss": 0.6121,
+      "step": 200
+    },
+    {
+      "epoch": 0.20517830972154372,
+      "grad_norm": 0.652291476726532,
+      "learning_rate": 0.0002,
+      "loss": 0.6296,
+      "step": 210
+    },
+    {
+      "epoch": 0.2149487054225696,
+      "grad_norm": 0.6500856876373291,
+      "learning_rate": 0.0002,
+      "loss": 0.577,
+      "step": 220
+    },
+    {
+      "epoch": 0.2247191011235955,
+      "grad_norm": 0.6135480999946594,
+      "learning_rate": 0.0002,
+      "loss": 0.6186,
+      "step": 230
+    },
+    {
+      "epoch": 0.2344894968246214,
+      "grad_norm": 0.6102302074432373,
+      "learning_rate": 0.0002,
+      "loss": 0.6132,
+      "step": 240
+    },
+    {
+      "epoch": 0.24425989252564728,
+      "grad_norm": 0.6909783482551575,
+      "learning_rate": 0.0002,
+      "loss": 0.592,
+      "step": 250
+    },
+    {
+      "epoch": 0.2540302882266732,
+      "grad_norm": 0.5834446549415588,
+      "learning_rate": 0.0002,
+      "loss": 0.5832,
+      "step": 260
+    },
+    {
+      "epoch": 0.26380068392769906,
+      "grad_norm": 0.5275322198867798,
+      "learning_rate": 0.0002,
+      "loss": 0.6038,
+      "step": 270
+    },
+    {
+      "epoch": 0.27357107962872496,
+      "grad_norm": 0.5611422657966614,
+      "learning_rate": 0.0002,
+      "loss": 0.5469,
+      "step": 280
+    },
+    {
+      "epoch": 0.28334147532975085,
+      "grad_norm": 0.6549052596092224,
+      "learning_rate": 0.0002,
+      "loss": 0.552,
+      "step": 290
+    },
+    {
+      "epoch": 0.29311187103077674,
+      "grad_norm": 0.563291072845459,
+      "learning_rate": 0.0002,
+      "loss": 0.5609,
+      "step": 300
+    },
+    {
+      "epoch": 0.30288226673180263,
+      "grad_norm": 0.5598369240760803,
+      "learning_rate": 0.0002,
+      "loss": 0.5632,
+      "step": 310
+    },
+    {
+      "epoch": 0.3126526624328285,
+      "grad_norm": 0.6525678634643555,
+      "learning_rate": 0.0002,
+      "loss": 0.5627,
+      "step": 320
+    },
+    {
+      "epoch": 0.3224230581338544,
+      "grad_norm": 0.5190592408180237,
+      "learning_rate": 0.0002,
+      "loss": 0.5526,
+      "step": 330
+    },
+    {
+      "epoch": 0.3321934538348803,
+      "grad_norm": 0.45483070611953735,
+      "learning_rate": 0.0002,
+      "loss": 0.5698,
+      "step": 340
+    },
+    {
+      "epoch": 0.3419638495359062,
+      "grad_norm": 0.8094475865364075,
+      "learning_rate": 0.0002,
+      "loss": 0.5768,
+      "step": 350
+    },
+    {
+      "epoch": 0.3517342452369321,
+      "grad_norm": 0.5545358061790466,
+      "learning_rate": 0.0002,
+      "loss": 0.5555,
+      "step": 360
+    },
+    {
+      "epoch": 0.361504640937958,
+      "grad_norm": 0.6899498701095581,
+      "learning_rate": 0.0002,
+      "loss": 0.5529,
+      "step": 370
+    },
+    {
+      "epoch": 0.37127503663898387,
+      "grad_norm": 0.4584816098213196,
+      "learning_rate": 0.0002,
+      "loss": 0.556,
+      "step": 380
+    },
+    {
+      "epoch": 0.38104543234000976,
+      "grad_norm": 0.5436979532241821,
+      "learning_rate": 0.0002,
+      "loss": 0.5451,
+      "step": 390
+    },
+    {
+      "epoch": 0.39081582804103565,
+      "grad_norm": 0.7512422800064087,
+      "learning_rate": 0.0002,
+      "loss": 0.5377,
+      "step": 400
+    },
+    {
+      "epoch": 0.40058622374206154,
+      "grad_norm": 0.6394727826118469,
+      "learning_rate": 0.0002,
+      "loss": 0.5438,
+      "step": 410
+    },
+    {
+      "epoch": 0.41035661944308743,
+      "grad_norm": 0.5314047336578369,
+      "learning_rate": 0.0002,
+      "loss": 0.5535,
+      "step": 420
+    },
+    {
+      "epoch": 0.4201270151441133,
+      "grad_norm": 0.5658334493637085,
+      "learning_rate": 0.0002,
+      "loss": 0.5587,
+      "step": 430
+    },
+    {
+      "epoch": 0.4298974108451392,
+      "grad_norm": 0.5295330882072449,
+      "learning_rate": 0.0002,
+      "loss": 0.5219,
+      "step": 440
+    },
+    {
+      "epoch": 0.4396678065461651,
+      "grad_norm": 0.6460115313529968,
+      "learning_rate": 0.0002,
+      "loss": 0.522,
+      "step": 450
+    },
+    {
+      "epoch": 0.449438202247191,
+      "grad_norm": 0.512022852897644,
+      "learning_rate": 0.0002,
+      "loss": 0.5416,
+      "step": 460
+    },
+    {
+      "epoch": 0.4592085979482169,
+      "grad_norm": 0.7365363836288452,
+      "learning_rate": 0.0002,
+      "loss": 0.5256,
+      "step": 470
+    },
+    {
+      "epoch": 0.4689789936492428,
+      "grad_norm": 0.6292932629585266,
+      "learning_rate": 0.0002,
+      "loss": 0.5354,
+      "step": 480
+    },
+    {
+      "epoch": 0.4787493893502687,
+      "grad_norm": 0.6255582571029663,
+      "learning_rate": 0.0002,
+      "loss": 0.5436,
+      "step": 490
+    },
+    {
+      "epoch": 0.48851978505129456,
+      "grad_norm": 0.5599279403686523,
+      "learning_rate": 0.0002,
+      "loss": 0.5394,
+      "step": 500
+    },
+    {
+      "epoch": 0.49829018075232046,
+      "grad_norm": 0.573657751083374,
+      "learning_rate": 0.0002,
+      "loss": 0.5297,
+      "step": 510
+    },
+    {
+      "epoch": 0.5080605764533463,
+      "grad_norm": 0.6362313628196716,
+      "learning_rate": 0.0002,
+      "loss": 0.5299,
+      "step": 520
+    },
+    {
+      "epoch": 0.5178309721543722,
+      "grad_norm": 0.6360035538673401,
+      "learning_rate": 0.0002,
+      "loss": 0.5458,
+      "step": 530
+    },
+    {
+      "epoch": 0.5276013678553981,
+      "grad_norm": 0.7129001021385193,
+      "learning_rate": 0.0002,
+      "loss": 0.5228,
+      "step": 540
+    },
+    {
+      "epoch": 0.537371763556424,
+      "grad_norm": 0.5596054196357727,
+      "learning_rate": 0.0002,
+      "loss": 0.5091,
+      "step": 550
+    },
+    {
+      "epoch": 0.5471421592574499,
+      "grad_norm": 0.7081596851348877,
+      "learning_rate": 0.0002,
+      "loss": 0.5153,
+      "step": 560
+    },
+    {
+      "epoch": 0.5569125549584758,
+      "grad_norm": 0.6816760301589966,
+      "learning_rate": 0.0002,
+      "loss": 0.4999,
+      "step": 570
+    },
+    {
+      "epoch": 0.5666829506595017,
+      "grad_norm": 0.47695112228393555,
+      "learning_rate": 0.0002,
+      "loss": 0.4974,
+      "step": 580
+    },
+    {
+      "epoch": 0.5764533463605276,
+      "grad_norm": 0.7528041005134583,
+      "learning_rate": 0.0002,
+      "loss": 0.5247,
+      "step": 590
+    },
+    {
+      "epoch": 0.5862237420615535,
+      "grad_norm": 0.5452813506126404,
+      "learning_rate": 0.0002,
+      "loss": 0.5265,
+      "step": 600
+    },
+    {
+      "epoch": 0.5959941377625794,
+      "grad_norm": 0.6085044741630554,
+      "learning_rate": 0.0002,
+      "loss": 0.4965,
+      "step": 610
+    },
+    {
+      "epoch": 0.6057645334636053,
+      "grad_norm": 0.6745641231536865,
+      "learning_rate": 0.0002,
+      "loss": 0.4916,
+      "step": 620
+    },
+    {
+      "epoch": 0.6155349291646312,
+      "grad_norm": 0.647544264793396,
+      "learning_rate": 0.0002,
+      "loss": 0.5107,
+      "step": 630
+    },
+    {
+      "epoch": 0.625305324865657,
+      "grad_norm": 0.6123825311660767,
+      "learning_rate": 0.0002,
+      "loss": 0.4864,
+      "step": 640
+    },
+    {
+      "epoch": 0.6350757205666829,
+      "grad_norm": 0.5815364122390747,
+      "learning_rate": 0.0002,
+      "loss": 0.484,
+      "step": 650
+    },
+    {
+      "epoch": 0.6448461162677088,
+      "grad_norm": 0.6184095740318298,
+      "learning_rate": 0.0002,
+      "loss": 0.4966,
+      "step": 660
+    },
+    {
+      "epoch": 0.6546165119687347,
+      "grad_norm": 0.5856700539588928,
+      "learning_rate": 0.0002,
+      "loss": 0.4861,
+      "step": 670
+    },
+    {
+      "epoch": 0.6643869076697606,
+      "grad_norm": 0.6424922943115234,
+      "learning_rate": 0.0002,
+      "loss": 0.4964,
+      "step": 680
+    },
+    {
+      "epoch": 0.6741573033707865,
+      "grad_norm": 0.7051425576210022,
+      "learning_rate": 0.0002,
+      "loss": 0.5019,
+      "step": 690
+    },
+    {
+      "epoch": 0.6839276990718124,
+      "grad_norm": 0.6133471131324768,
+      "learning_rate": 0.0002,
+      "loss": 0.4649,
+      "step": 700
+    },
+    {
+      "epoch": 0.6936980947728383,
+      "grad_norm": 0.6933842897415161,
+      "learning_rate": 0.0002,
+      "loss": 0.4847,
+      "step": 710
+    },
+    {
+      "epoch": 0.7034684904738642,
+      "grad_norm": 0.6440989375114441,
+      "learning_rate": 0.0002,
+      "loss": 0.4945,
+      "step": 720
+    },
+    {
+      "epoch": 0.7132388861748901,
+      "grad_norm": 0.87819904088974,
+      "learning_rate": 0.0002,
+      "loss": 0.4777,
+      "step": 730
+    },
+    {
+      "epoch": 0.723009281875916,
+      "grad_norm": 0.6810497641563416,
+      "learning_rate": 0.0002,
+      "loss": 0.4914,
+      "step": 740
+    },
+    {
+      "epoch": 0.7327796775769418,
+      "grad_norm": 0.7822733521461487,
+      "learning_rate": 0.0002,
+      "loss": 0.4789,
+      "step": 750
+    },
+    {
+      "epoch": 0.7425500732779677,
+      "grad_norm": 0.6669152975082397,
+      "learning_rate": 0.0002,
+      "loss": 0.4615,
+      "step": 760
+    },
+    {
+      "epoch": 0.7523204689789936,
+      "grad_norm": 0.7351736426353455,
+      "learning_rate": 0.0002,
+      "loss": 0.4689,
+      "step": 770
+    },
+    {
+      "epoch": 0.7620908646800195,
+      "grad_norm": 1.0013558864593506,
+      "learning_rate": 0.0002,
+      "loss": 0.4629,
+      "step": 780
+    },
+    {
+      "epoch": 0.7718612603810454,
+      "grad_norm": 0.7465775609016418,
+      "learning_rate": 0.0002,
+      "loss": 0.4739,
+      "step": 790
+    },
+    {
+      "epoch": 0.7816316560820713,
+      "grad_norm": 1.0959300994873047,
+      "learning_rate": 0.0002,
+      "loss": 0.4635,
+      "step": 800
+    },
+    {
+      "epoch": 0.7914020517830972,
+      "grad_norm": 0.5292418599128723,
+      "learning_rate": 0.0002,
+      "loss": 0.4549,
+      "step": 810
+    },
+    {
+      "epoch": 0.8011724474841231,
+      "grad_norm": 0.6555328965187073,
+      "learning_rate": 0.0002,
+      "loss": 0.458,
+      "step": 820
+    },
+    {
+      "epoch": 0.810942843185149,
+      "grad_norm": 0.6462382674217224,
+      "learning_rate": 0.0002,
+      "loss": 0.488,
+      "step": 830
+    },
+    {
+      "epoch": 0.8207132388861749,
+      "grad_norm": 0.6840918064117432,
+      "learning_rate": 0.0002,
+      "loss": 0.4541,
+      "step": 840
+    },
+    {
+      "epoch": 0.8304836345872008,
+      "grad_norm": 0.5715351700782776,
+      "learning_rate": 0.0002,
+      "loss": 0.4509,
+      "step": 850
+    },
+    {
+      "epoch": 0.8402540302882266,
+      "grad_norm": 0.5583404898643494,
+      "learning_rate": 0.0002,
+      "loss": 0.4535,
+      "step": 860
+    },
+    {
+      "epoch": 0.8500244259892525,
+      "grad_norm": 0.8243112564086914,
+      "learning_rate": 0.0002,
+      "loss": 0.4533,
+      "step": 870
+    },
+    {
+      "epoch": 0.8597948216902784,
+      "grad_norm": 0.6543600559234619,
+      "learning_rate": 0.0002,
+      "loss": 0.4545,
+      "step": 880
+    },
+    {
+      "epoch": 0.8695652173913043,
+      "grad_norm": 0.6494827270507812,
+      "learning_rate": 0.0002,
+      "loss": 0.4814,
+      "step": 890
+    },
+    {
+      "epoch": 0.8793356130923302,
+      "grad_norm": 0.8458304405212402,
+      "learning_rate": 0.0002,
+      "loss": 0.4593,
+      "step": 900
+    },
+    {
+      "epoch": 0.8891060087933561,
+      "grad_norm": 0.6854186654090881,
+      "learning_rate": 0.0002,
+      "loss": 0.4382,
+      "step": 910
+    },
+    {
+      "epoch": 0.898876404494382,
+      "grad_norm": 0.6300225853919983,
+      "learning_rate": 0.0002,
+      "loss": 0.4488,
+      "step": 920
+    },
+    {
+      "epoch": 0.9086468001954079,
+      "grad_norm": 0.9791533350944519,
+      "learning_rate": 0.0002,
+      "loss": 0.4638,
+      "step": 930
+    },
+    {
+      "epoch": 0.9184171958964338,
+      "grad_norm": 0.6965218186378479,
+      "learning_rate": 0.0002,
+      "loss": 0.446,
+      "step": 940
+    },
+    {
+      "epoch": 0.9281875915974597,
+      "grad_norm": 0.6066922545433044,
+      "learning_rate": 0.0002,
+      "loss": 0.4453,
+      "step": 950
+    },
+    {
+      "epoch": 0.9379579872984856,
+      "grad_norm": 0.8081962466239929,
+      "learning_rate": 0.0002,
+      "loss": 0.4471,
+      "step": 960
+    },
+    {
+      "epoch": 0.9477283829995115,
+      "grad_norm": 0.7755117416381836,
+      "learning_rate": 0.0002,
+      "loss": 0.4348,
+      "step": 970
+    },
+    {
+      "epoch": 0.9574987787005373,
+      "grad_norm": 0.7127223610877991,
+      "learning_rate": 0.0002,
+      "loss": 0.4423,
+      "step": 980
+    },
+    {
+      "epoch": 0.9672691744015632,
+      "grad_norm": 0.6947609186172485,
+      "learning_rate": 0.0002,
+      "loss": 0.4272,
+      "step": 990
+    },
+    {
+      "epoch": 0.9770395701025891,
+      "grad_norm": 1.0100330114364624,
+      "learning_rate": 0.0002,
+      "loss": 0.4262,
+      "step": 1000
+    },
+    {
+      "epoch": 0.986809965803615,
+      "grad_norm": 0.6727001667022705,
+      "learning_rate": 0.0002,
+      "loss": 0.4169,
+      "step": 1010
+    },
+    {
+      "epoch": 0.9965803615046409,
+      "grad_norm": 0.7834463119506836,
+      "learning_rate": 0.0002,
+      "loss": 0.4507,
+      "step": 1020
+    },
+    {
+      "epoch": 0.9995114802149487,
+      "eval_loss": 0.433101624250412,
+      "eval_runtime": 26.5222,
+      "eval_samples_per_second": 13.762,
+      "eval_steps_per_second": 1.734,
+      "step": 1023
+    },
+    {
+      "epoch": 1.006350757205667,
+      "grad_norm": 0.8070526123046875,
+      "learning_rate": 0.0002,
+      "loss": 0.4154,
+      "step": 1030
+    },
+    {
+      "epoch": 1.0161211529066927,
+      "grad_norm": 0.7301508784294128,
+      "learning_rate": 0.0002,
+      "loss": 0.3951,
+      "step": 1040
+    },
+    {
+      "epoch": 1.0258915486077187,
+      "grad_norm": 0.8212476372718811,
+      "learning_rate": 0.0002,
+      "loss": 0.4001,
+      "step": 1050
+    },
+    {
+      "epoch": 1.0356619443087445,
+      "grad_norm": 0.6269228458404541,
+      "learning_rate": 0.0002,
+      "loss": 0.3953,
+      "step": 1060
+    },
+    {
+      "epoch": 1.0454323400097705,
+      "grad_norm": 0.700432300567627,
+      "learning_rate": 0.0002,
+      "loss": 0.4147,
+      "step": 1070
+    },
+    {
+      "epoch": 1.0552027357107963,
+      "grad_norm": 0.9251010417938232,
+      "learning_rate": 0.0002,
+      "loss": 0.3857,
+      "step": 1080
+    },
+    {
+      "epoch": 1.0649731314118223,
+      "grad_norm": 0.6018561720848083,
+      "learning_rate": 0.0002,
+      "loss": 0.3955,
+      "step": 1090
+    },
+    {
+      "epoch": 1.074743527112848,
+      "grad_norm": 0.7045873403549194,
+      "learning_rate": 0.0002,
+      "loss": 0.4079,
+      "step": 1100
+    },
+    {
+      "epoch": 1.084513922813874,
+      "grad_norm": 0.7800339460372925,
+      "learning_rate": 0.0002,
+      "loss": 0.4005,
+      "step": 1110
+    },
+    {
+      "epoch": 1.0942843185148998,
+      "grad_norm": 0.7404900789260864,
+      "learning_rate": 0.0002,
+      "loss": 0.419,
+      "step": 1120
+    },
+    {
+      "epoch": 1.1040547142159258,
+      "grad_norm": 1.1851727962493896,
+      "learning_rate": 0.0002,
+      "loss": 0.4057,
+      "step": 1130
+    },
+    {
+      "epoch": 1.1138251099169516,
+      "grad_norm": 0.875406801700592,
+      "learning_rate": 0.0002,
+      "loss": 0.3966,
+      "step": 1140
+    },
+    {
+      "epoch": 1.1235955056179776,
+      "grad_norm": 0.9795705676078796,
+      "learning_rate": 0.0002,
+      "loss": 0.3863,
+      "step": 1150
+    },
+    {
+      "epoch": 1.1333659013190034,
+      "grad_norm": 0.7387922406196594,
+      "learning_rate": 0.0002,
+      "loss": 0.3991,
+      "step": 1160
+    },
+    {
+      "epoch": 1.1431362970200294,
+      "grad_norm": 0.6640482544898987,
+      "learning_rate": 0.0002,
+      "loss": 0.3914,
+      "step": 1170
+    },
+    {
+      "epoch": 1.1529066927210552,
+      "grad_norm": 0.6067684888839722,
+      "learning_rate": 0.0002,
+      "loss": 0.3809,
+      "step": 1180
+    },
+    {
+      "epoch": 1.1626770884220812,
+      "grad_norm": 0.7623337507247925,
+      "learning_rate": 0.0002,
+      "loss": 0.3915,
+      "step": 1190
+    },
+    {
+      "epoch": 1.172447484123107,
+      "grad_norm": 1.0410432815551758,
+      "learning_rate": 0.0002,
+      "loss": 0.3832,
+      "step": 1200
+    },
+    {
+      "epoch": 1.182217879824133,
+      "grad_norm": 0.7790178656578064,
+      "learning_rate": 0.0002,
+      "loss": 0.3875,
+      "step": 1210
+    },
+    {
+      "epoch": 1.1919882755251587,
+      "grad_norm": 0.7643477916717529,
+      "learning_rate": 0.0002,
+      "loss": 0.3869,
+      "step": 1220
+    },
+    {
+      "epoch": 1.2017586712261847,
+      "grad_norm": 1.2028473615646362,
+      "learning_rate": 0.0002,
+      "loss": 0.3719,
+      "step": 1230
+    },
+    {
+      "epoch": 1.2115290669272105,
+      "grad_norm": 0.787656307220459,
+      "learning_rate": 0.0002,
+      "loss": 0.3863,
+      "step": 1240
+    },
+    {
+      "epoch": 1.2212994626282365,
+      "grad_norm": 0.8074171543121338,
+      "learning_rate": 0.0002,
+      "loss": 0.3875,
+      "step": 1250
+    },
+    {
+      "epoch": 1.2310698583292623,
+      "grad_norm": 0.8488901853561401,
+      "learning_rate": 0.0002,
+      "loss": 0.3923,
+      "step": 1260
+    },
+    {
+      "epoch": 1.2408402540302883,
+      "grad_norm": 0.7454975247383118,
+      "learning_rate": 0.0002,
+      "loss": 0.3829,
+      "step": 1270
+    },
+    {
+      "epoch": 1.250610649731314,
+      "grad_norm": 0.6724955439567566,
+      "learning_rate": 0.0002,
+      "loss": 0.3981,
+      "step": 1280
+    },
+    {
+      "epoch": 1.26038104543234,
+      "grad_norm": 1.1912977695465088,
+      "learning_rate": 0.0002,
+      "loss": 0.383,
+      "step": 1290
+    },
+    {
+      "epoch": 1.2701514411333659,
+      "grad_norm": 0.7795814871788025,
+      "learning_rate": 0.0002,
+      "loss": 0.3837,
+      "step": 1300
+    },
+    {
+      "epoch": 1.2799218368343919,
+      "grad_norm": 0.672956645488739,
+      "learning_rate": 0.0002,
+      "loss": 0.3898,
+      "step": 1310
+    },
+    {
+      "epoch": 1.2896922325354176,
+      "grad_norm": 1.245808482170105,
+      "learning_rate": 0.0002,
+      "loss": 0.3849,
+      "step": 1320
+    },
+    {
+      "epoch": 1.2994626282364437,
+      "grad_norm": 0.9562020301818848,
+      "learning_rate": 0.0002,
+      "loss": 0.3877,
+      "step": 1330
+    },
+    {
+      "epoch": 1.3092330239374694,
+      "grad_norm": 1.2005938291549683,
+      "learning_rate": 0.0002,
+      "loss": 0.3711,
+      "step": 1340
+    },
+    {
+      "epoch": 1.3190034196384954,
+      "grad_norm": 0.7105128169059753,
+      "learning_rate": 0.0002,
+      "loss": 0.3761,
+      "step": 1350
+    },
+    {
+      "epoch": 1.3287738153395212,
+      "grad_norm": 0.9829772710800171,
+      "learning_rate": 0.0002,
+      "loss": 0.371,
+      "step": 1360
+    },
+    {
+      "epoch": 1.3385442110405472,
+      "grad_norm": 0.6548563241958618,
+      "learning_rate": 0.0002,
+      "loss": 0.3845,
+      "step": 1370
+    },
+    {
+      "epoch": 1.348314606741573,
+      "grad_norm": 0.877531111240387,
+      "learning_rate": 0.0002,
+      "loss": 0.3797,
+      "step": 1380
+    },
+    {
+      "epoch": 1.358085002442599,
+      "grad_norm": 0.6915368437767029,
+      "learning_rate": 0.0002,
+      "loss": 0.3757,
+      "step": 1390
+    },
+    {
+      "epoch": 1.3678553981436248,
+      "grad_norm": 0.6052316427230835,
+      "learning_rate": 0.0002,
+      "loss": 0.368,
+      "step": 1400
+    },
+    {
+      "epoch": 1.3776257938446508,
+      "grad_norm": 0.6086260080337524,
+      "learning_rate": 0.0002,
+      "loss": 0.3758,
+      "step": 1410
+    },
+    {
+      "epoch": 1.3873961895456766,
+      "grad_norm": 1.0432673692703247,
+      "learning_rate": 0.0002,
+      "loss": 0.3794,
+      "step": 1420
+    },
+    {
+      "epoch": 1.3971665852467026,
+      "grad_norm": 0.7252581715583801,
+      "learning_rate": 0.0002,
+      "loss": 0.3715,
+      "step": 1430
+    },
+    {
+      "epoch": 1.4069369809477283,
+      "grad_norm": 0.7926928997039795,
+      "learning_rate": 0.0002,
+      "loss": 0.3919,
+      "step": 1440
+    },
+    {
+      "epoch": 1.4167073766487543,
+      "grad_norm": 0.6464225649833679,
+      "learning_rate": 0.0002,
+      "loss": 0.3701,
+      "step": 1450
+    },
+    {
+      "epoch": 1.4264777723497801,
+      "grad_norm": 1.0563385486602783,
+      "learning_rate": 0.0002,
+      "loss": 0.3738,
+      "step": 1460
+    },
+    {
+      "epoch": 1.4362481680508061,
+      "grad_norm": 0.5497196316719055,
+      "learning_rate": 0.0002,
+      "loss": 0.3782,
+      "step": 1470
+    },
+    {
+      "epoch": 1.446018563751832,
+      "grad_norm": 0.7382678389549255,
+      "learning_rate": 0.0002,
+      "loss": 0.3668,
+      "step": 1480
+    },
+    {
+      "epoch": 1.455788959452858,
+      "grad_norm": 0.6264833807945251,
+      "learning_rate": 0.0002,
+      "loss": 0.3592,
+      "step": 1490
+    },
+    {
+      "epoch": 1.4655593551538837,
+      "grad_norm": 0.6722145080566406,
+      "learning_rate": 0.0002,
+      "loss": 0.3809,
+      "step": 1500
+    },
+    {
+      "epoch": 1.4753297508549097,
+      "grad_norm": 0.8594183921813965,
+      "learning_rate": 0.0002,
+      "loss": 0.3715,
+      "step": 1510
+    },
+    {
+      "epoch": 1.4851001465559355,
+      "grad_norm": 0.8588142395019531,
+      "learning_rate": 0.0002,
+      "loss": 0.354,
+      "step": 1520
+    },
+    {
+      "epoch": 1.4948705422569615,
+      "grad_norm": 0.8683834075927734,
+      "learning_rate": 0.0002,
+      "loss": 0.3654,
+      "step": 1530
+    },
+    {
+      "epoch": 1.5046409379579873,
+      "grad_norm": 0.7628163695335388,
+      "learning_rate": 0.0002,
+      "loss": 0.3647,
+      "step": 1540
+    },
+    {
+      "epoch": 1.5144113336590133,
+      "grad_norm": 0.7967382669448853,
+      "learning_rate": 0.0002,
+      "loss": 0.3666,
+      "step": 1550
+    },
+    {
+      "epoch": 1.524181729360039,
+      "grad_norm": 0.7065442800521851,
+      "learning_rate": 0.0002,
+      "loss": 0.361,
+      "step": 1560
+    },
+    {
+      "epoch": 1.5339521250610648,
+      "grad_norm": 0.6472197771072388,
+      "learning_rate": 0.0002,
+      "loss": 0.3623,
+      "step": 1570
+    },
+    {
+      "epoch": 1.5437225207620908,
+      "grad_norm": 1.105960488319397,
+      "learning_rate": 0.0002,
+      "loss": 0.3626,
+      "step": 1580
+    },
+    {
+      "epoch": 1.5534929164631168,
+      "grad_norm": 0.9730587601661682,
+      "learning_rate": 0.0002,
+      "loss": 0.3528,
+      "step": 1590
+    },
+    {
+      "epoch": 1.5632633121641426,
+      "grad_norm": 0.987910807132721,
+      "learning_rate": 0.0002,
+      "loss": 0.3739,
+      "step": 1600
+    },
+    {
+      "epoch": 1.5730337078651684,
+      "grad_norm": 0.9708227515220642,
+      "learning_rate": 0.0002,
+      "loss": 0.3546,
+      "step": 1610
+    },
+    {
+      "epoch": 1.5828041035661944,
+      "grad_norm": 0.6303295493125916,
+      "learning_rate": 0.0002,
+      "loss": 0.3653,
+      "step": 1620
+    },
+    {
+      "epoch": 1.5925744992672204,
+      "grad_norm": 1.0985002517700195,
+      "learning_rate": 0.0002,
+      "loss": 0.3639,
+      "step": 1630
+    },
+    {
+      "epoch": 1.6023448949682462,
+      "grad_norm": 0.839419960975647,
+      "learning_rate": 0.0002,
+      "loss": 0.3533,
+      "step": 1640
+    },
+    {
+      "epoch": 1.612115290669272,
+      "grad_norm": 0.7963409423828125,
+      "learning_rate": 0.0002,
+      "loss": 0.3544,
+      "step": 1650
+    },
+    {
+      "epoch": 1.621885686370298,
+      "grad_norm": 0.8074514269828796,
+      "learning_rate": 0.0002,
+      "loss": 0.3721,
+      "step": 1660
+    },
+    {
+      "epoch": 1.631656082071324,
+      "grad_norm": 0.8368266820907593,
+      "learning_rate": 0.0002,
+      "loss": 0.3573,
+      "step": 1670
+    },
+    {
+      "epoch": 1.6414264777723497,
+      "grad_norm": 0.6562672257423401,
+      "learning_rate": 0.0002,
+      "loss": 0.3556,
+      "step": 1680
+    },
+    {
+      "epoch": 1.6511968734733755,
+      "grad_norm": 0.5512149930000305,
+      "learning_rate": 0.0002,
+      "loss": 0.3593,
+      "step": 1690
+    },
+    {
+      "epoch": 1.6609672691744015,
+      "grad_norm": 0.5829663276672363,
+      "learning_rate": 0.0002,
+      "loss": 0.3626,
+      "step": 1700
+    },
+    {
+      "epoch": 1.6707376648754275,
+      "grad_norm": 0.8412625193595886,
+      "learning_rate": 0.0002,
+      "loss": 0.3526,
+      "step": 1710
+    },
+    {
+      "epoch": 1.6805080605764533,
+      "grad_norm": 0.8657066226005554,
+      "learning_rate": 0.0002,
+      "loss": 0.3593,
+      "step": 1720
+    },
+    {
+      "epoch": 1.690278456277479,
+      "grad_norm": 0.9691681861877441,
+      "learning_rate": 0.0002,
+      "loss": 0.3545,
+      "step": 1730
+    },
+    {
+      "epoch": 1.700048851978505,
+      "grad_norm": 0.641669511795044,
+      "learning_rate": 0.0002,
+      "loss": 0.3694,
+      "step": 1740
+    },
+    {
+      "epoch": 1.709819247679531,
+      "grad_norm": 0.7599552273750305,
+      "learning_rate": 0.0002,
+      "loss": 0.3594,
+      "step": 1750
+    },
+    {
+      "epoch": 1.7195896433805569,
+      "grad_norm": 0.7562308311462402,
+      "learning_rate": 0.0002,
+      "loss": 0.3563,
+      "step": 1760
+    },
+    {
+      "epoch": 1.7293600390815826,
+      "grad_norm": 0.6949060559272766,
+      "learning_rate": 0.0002,
+      "loss": 0.3741,
+      "step": 1770
+    },
+    {
+      "epoch": 1.7391304347826086,
+      "grad_norm": 1.1047314405441284,
+      "learning_rate": 0.0002,
+      "loss": 0.3444,
+      "step": 1780
+    },
+    {
+      "epoch": 1.7489008304836346,
+      "grad_norm": 0.9239255785942078,
+      "learning_rate": 0.0002,
+      "loss": 0.3602,
+      "step": 1790
+    },
+    {
+      "epoch": 1.7586712261846604,
+      "grad_norm": 0.6171822547912598,
+      "learning_rate": 0.0002,
+      "loss": 0.3464,
+      "step": 1800
+    },
+    {
+      "epoch": 1.7684416218856862,
+      "grad_norm": 0.8883067965507507,
+      "learning_rate": 0.0002,
+      "loss": 0.3504,
+      "step": 1810
+    },
+    {
+      "epoch": 1.7782120175867122,
+      "grad_norm": 0.8204503059387207,
+      "learning_rate": 0.0002,
+      "loss": 0.341,
+      "step": 1820
+    },
+    {
+      "epoch": 1.7879824132877382,
+      "grad_norm": 0.807534396648407,
+      "learning_rate": 0.0002,
+      "loss": 0.3455,
+      "step": 1830
+    },
+    {
+      "epoch": 1.797752808988764,
+      "grad_norm": 0.8063831329345703,
+      "learning_rate": 0.0002,
+      "loss": 0.3287,
+      "step": 1840
+    },
+    {
+      "epoch": 1.8075232046897898,
+      "grad_norm": 0.7789983749389648,
+      "learning_rate": 0.0002,
+      "loss": 0.3424,
+      "step": 1850
+    },
+    {
+      "epoch": 1.8172936003908158,
+      "grad_norm": 0.6771978735923767,
+      "learning_rate": 0.0002,
+      "loss": 0.3495,
+      "step": 1860
+    },
+    {
+      "epoch": 1.8270639960918418,
+      "grad_norm": 0.9140942096710205,
+      "learning_rate": 0.0002,
+      "loss": 0.3437,
+      "step": 1870
+    },
+    {
+      "epoch": 1.8368343917928676,
+      "grad_norm": 0.6635336875915527,
+      "learning_rate": 0.0002,
+      "loss": 0.3458,
+      "step": 1880
+    },
+    {
+      "epoch": 1.8466047874938933,
+      "grad_norm": 1.1987066268920898,
+      "learning_rate": 0.0002,
+      "loss": 0.3396,
+      "step": 1890
+    },
+    {
+      "epoch": 1.8563751831949193,
+      "grad_norm": 0.7020497918128967,
+      "learning_rate": 0.0002,
+      "loss": 0.3413,
+      "step": 1900
+    },
+    {
+      "epoch": 1.8661455788959453,
+      "grad_norm": 1.0113945007324219,
+      "learning_rate": 0.0002,
+      "loss": 0.3442,
+      "step": 1910
+    },
+    {
+      "epoch": 1.8759159745969711,
+      "grad_norm": 0.8227802515029907,
+      "learning_rate": 0.0002,
+      "loss": 0.3503,
+      "step": 1920
+    },
+    {
+      "epoch": 1.885686370297997,
+      "grad_norm": 0.8185329437255859,
+      "learning_rate": 0.0002,
+      "loss": 0.3565,
+      "step": 1930
+    },
+    {
+      "epoch": 1.895456765999023,
+      "grad_norm": 0.7708970904350281,
+      "learning_rate": 0.0002,
+      "loss": 0.335,
+      "step": 1940
+    },
+    {
+      "epoch": 1.905227161700049,
+      "grad_norm": 0.8888451457023621,
+      "learning_rate": 0.0002,
+      "loss": 0.3365,
+      "step": 1950
+    },
+    {
+      "epoch": 1.9149975574010747,
+      "grad_norm": 0.720267653465271,
+      "learning_rate": 0.0002,
+      "loss": 0.3342,
+      "step": 1960
+    },
+    {
+      "epoch": 1.9247679531021005,
+      "grad_norm": 0.888666570186615,
+      "learning_rate": 0.0002,
+      "loss": 0.3512,
+      "step": 1970
+    },
+    {
+      "epoch": 1.9345383488031265,
+      "grad_norm": 0.7471952438354492,
+      "learning_rate": 0.0002,
+      "loss": 0.3284,
+      "step": 1980
+    },
+    {
+      "epoch": 1.9443087445041525,
+      "grad_norm": 0.7166922092437744,
+      "learning_rate": 0.0002,
+      "loss": 0.3383,
+      "step": 1990
+    },
+    {
+      "epoch": 1.9540791402051783,
+      "grad_norm": 0.7097923159599304,
+      "learning_rate": 0.0002,
+      "loss": 0.3355,
+      "step": 2000
+    },
+    {
+      "epoch": 1.963849535906204,
+      "grad_norm": 0.8592363595962524,
+      "learning_rate": 0.0002,
+      "loss": 0.3282,
+      "step": 2010
+    },
+    {
+      "epoch": 1.97361993160723,
+      "grad_norm": 0.5352440476417542,
+      "learning_rate": 0.0002,
+      "loss": 0.3273,
+      "step": 2020
+    },
+    {
+      "epoch": 1.983390327308256,
+      "grad_norm": 1.0193064212799072,
+      "learning_rate": 0.0002,
+      "loss": 0.3387,
+      "step": 2030
+    },
+    {
+      "epoch": 1.9931607230092818,
+      "grad_norm": 0.7331683039665222,
+      "learning_rate": 0.0002,
+      "loss": 0.3277,
+      "step": 2040
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 0.3446754515171051,
+      "eval_runtime": 26.5209,
+      "eval_samples_per_second": 13.763,
+      "eval_steps_per_second": 1.734,
+      "step": 2047
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 8184,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 8.98115655696384e+16,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-2047/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:469618ae8560edd4a517eb99451fb8bc5c5f148706842d569488535fb05e84cb
+size 5560

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-3070/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-3070/adapter_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:507409201b19276fc2df370d6adf627c24b08000d9eba3080c15bb8a5f2d2b61
+size 109069176

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-3070/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:56edad678f35ccce751f24c508cc9fccf04cf486ecfac4dc0e01e4a553383809
+size 55532666

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-3070/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0747557aab36063756040bfe2374c7e81d60a6da97d0d0956cdd9fbc8c45a5e3
+size 14244

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-3070/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:46b4b8ee0a85d37d1674bfdd49bfe7292e5c24f26cb4aad2f9ad59834d4dbc0d
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-3070/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-3070/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-3070/tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-3070/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2206 @@

+{
+  "best_metric": 0.30571895837783813,
+  "best_model_checkpoint": "outputs-001/Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-3070",
+  "epoch": 2.9995114802149487,
+  "eval_steps": 10,
+  "global_step": 3070,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.009770395701025891,
+      "grad_norm": 1.1888047456741333,
+      "learning_rate": 0.0002,
+      "loss": 1.7474,
+      "step": 10
+    },
+    {
+      "epoch": 0.019540791402051783,
+      "grad_norm": 1.3118009567260742,
+      "learning_rate": 0.0002,
+      "loss": 1.157,
+      "step": 20
+    },
+    {
+      "epoch": 0.029311187103077674,
+      "grad_norm": 1.1254922151565552,
+      "learning_rate": 0.0002,
+      "loss": 0.9979,
+      "step": 30
+    },
+    {
+      "epoch": 0.039081582804103565,
+      "grad_norm": 0.9634686708450317,
+      "learning_rate": 0.0002,
+      "loss": 0.8859,
+      "step": 40
+    },
+    {
+      "epoch": 0.048851978505129456,
+      "grad_norm": 0.9101817607879639,
+      "learning_rate": 0.0002,
+      "loss": 0.7826,
+      "step": 50
+    },
+    {
+      "epoch": 0.05862237420615535,
+      "grad_norm": 1.0019943714141846,
+      "learning_rate": 0.0002,
+      "loss": 0.7358,
+      "step": 60
+    },
+    {
+      "epoch": 0.06839276990718124,
+      "grad_norm": 0.9201828837394714,
+      "learning_rate": 0.0002,
+      "loss": 0.6664,
+      "step": 70
+    },
+    {
+      "epoch": 0.07816316560820713,
+      "grad_norm": 0.9210318922996521,
+      "learning_rate": 0.0002,
+      "loss": 0.6785,
+      "step": 80
+    },
+    {
+      "epoch": 0.08793356130923302,
+      "grad_norm": 0.8079697489738464,
+      "learning_rate": 0.0002,
+      "loss": 0.652,
+      "step": 90
+    },
+    {
+      "epoch": 0.09770395701025891,
+      "grad_norm": 0.7530406713485718,
+      "learning_rate": 0.0002,
+      "loss": 0.6469,
+      "step": 100
+    },
+    {
+      "epoch": 0.1074743527112848,
+      "grad_norm": 0.8732273578643799,
+      "learning_rate": 0.0002,
+      "loss": 0.6604,
+      "step": 110
+    },
+    {
+      "epoch": 0.1172447484123107,
+      "grad_norm": 0.9163013696670532,
+      "learning_rate": 0.0002,
+      "loss": 0.6429,
+      "step": 120
+    },
+    {
+      "epoch": 0.1270151441133366,
+      "grad_norm": 0.5931605696678162,
+      "learning_rate": 0.0002,
+      "loss": 0.6269,
+      "step": 130
+    },
+    {
+      "epoch": 0.13678553981436248,
+      "grad_norm": 0.8782339692115784,
+      "learning_rate": 0.0002,
+      "loss": 0.6349,
+      "step": 140
+    },
+    {
+      "epoch": 0.14655593551538837,
+      "grad_norm": 0.6683491468429565,
+      "learning_rate": 0.0002,
+      "loss": 0.657,
+      "step": 150
+    },
+    {
+      "epoch": 0.15632633121641426,
+      "grad_norm": 0.7998592257499695,
+      "learning_rate": 0.0002,
+      "loss": 0.6315,
+      "step": 160
+    },
+    {
+      "epoch": 0.16609672691744015,
+      "grad_norm": 0.6159262657165527,
+      "learning_rate": 0.0002,
+      "loss": 0.6347,
+      "step": 170
+    },
+    {
+      "epoch": 0.17586712261846604,
+      "grad_norm": 0.671146035194397,
+      "learning_rate": 0.0002,
+      "loss": 0.6023,
+      "step": 180
+    },
+    {
+      "epoch": 0.18563751831949193,
+      "grad_norm": 0.5839019417762756,
+      "learning_rate": 0.0002,
+      "loss": 0.6101,
+      "step": 190
+    },
+    {
+      "epoch": 0.19540791402051783,
+      "grad_norm": 0.5090241432189941,
+      "learning_rate": 0.0002,
+      "loss": 0.6121,
+      "step": 200
+    },
+    {
+      "epoch": 0.20517830972154372,
+      "grad_norm": 0.652291476726532,
+      "learning_rate": 0.0002,
+      "loss": 0.6296,
+      "step": 210
+    },
+    {
+      "epoch": 0.2149487054225696,
+      "grad_norm": 0.6500856876373291,
+      "learning_rate": 0.0002,
+      "loss": 0.577,
+      "step": 220
+    },
+    {
+      "epoch": 0.2247191011235955,
+      "grad_norm": 0.6135480999946594,
+      "learning_rate": 0.0002,
+      "loss": 0.6186,
+      "step": 230
+    },
+    {
+      "epoch": 0.2344894968246214,
+      "grad_norm": 0.6102302074432373,
+      "learning_rate": 0.0002,
+      "loss": 0.6132,
+      "step": 240
+    },
+    {
+      "epoch": 0.24425989252564728,
+      "grad_norm": 0.6909783482551575,
+      "learning_rate": 0.0002,
+      "loss": 0.592,
+      "step": 250
+    },
+    {
+      "epoch": 0.2540302882266732,
+      "grad_norm": 0.5834446549415588,
+      "learning_rate": 0.0002,
+      "loss": 0.5832,
+      "step": 260
+    },
+    {
+      "epoch": 0.26380068392769906,
+      "grad_norm": 0.5275322198867798,
+      "learning_rate": 0.0002,
+      "loss": 0.6038,
+      "step": 270
+    },
+    {
+      "epoch": 0.27357107962872496,
+      "grad_norm": 0.5611422657966614,
+      "learning_rate": 0.0002,
+      "loss": 0.5469,
+      "step": 280
+    },
+    {
+      "epoch": 0.28334147532975085,
+      "grad_norm": 0.6549052596092224,
+      "learning_rate": 0.0002,
+      "loss": 0.552,
+      "step": 290
+    },
+    {
+      "epoch": 0.29311187103077674,
+      "grad_norm": 0.563291072845459,
+      "learning_rate": 0.0002,
+      "loss": 0.5609,
+      "step": 300
+    },
+    {
+      "epoch": 0.30288226673180263,
+      "grad_norm": 0.5598369240760803,
+      "learning_rate": 0.0002,
+      "loss": 0.5632,
+      "step": 310
+    },
+    {
+      "epoch": 0.3126526624328285,
+      "grad_norm": 0.6525678634643555,
+      "learning_rate": 0.0002,
+      "loss": 0.5627,
+      "step": 320
+    },
+    {
+      "epoch": 0.3224230581338544,
+      "grad_norm": 0.5190592408180237,
+      "learning_rate": 0.0002,
+      "loss": 0.5526,
+      "step": 330
+    },
+    {
+      "epoch": 0.3321934538348803,
+      "grad_norm": 0.45483070611953735,
+      "learning_rate": 0.0002,
+      "loss": 0.5698,
+      "step": 340
+    },
+    {
+      "epoch": 0.3419638495359062,
+      "grad_norm": 0.8094475865364075,
+      "learning_rate": 0.0002,
+      "loss": 0.5768,
+      "step": 350
+    },
+    {
+      "epoch": 0.3517342452369321,
+      "grad_norm": 0.5545358061790466,
+      "learning_rate": 0.0002,
+      "loss": 0.5555,
+      "step": 360
+    },
+    {
+      "epoch": 0.361504640937958,
+      "grad_norm": 0.6899498701095581,
+      "learning_rate": 0.0002,
+      "loss": 0.5529,
+      "step": 370
+    },
+    {
+      "epoch": 0.37127503663898387,
+      "grad_norm": 0.4584816098213196,
+      "learning_rate": 0.0002,
+      "loss": 0.556,
+      "step": 380
+    },
+    {
+      "epoch": 0.38104543234000976,
+      "grad_norm": 0.5436979532241821,
+      "learning_rate": 0.0002,
+      "loss": 0.5451,
+      "step": 390
+    },
+    {
+      "epoch": 0.39081582804103565,
+      "grad_norm": 0.7512422800064087,
+      "learning_rate": 0.0002,
+      "loss": 0.5377,
+      "step": 400
+    },
+    {
+      "epoch": 0.40058622374206154,
+      "grad_norm": 0.6394727826118469,
+      "learning_rate": 0.0002,
+      "loss": 0.5438,
+      "step": 410
+    },
+    {
+      "epoch": 0.41035661944308743,
+      "grad_norm": 0.5314047336578369,
+      "learning_rate": 0.0002,
+      "loss": 0.5535,
+      "step": 420
+    },
+    {
+      "epoch": 0.4201270151441133,
+      "grad_norm": 0.5658334493637085,
+      "learning_rate": 0.0002,
+      "loss": 0.5587,
+      "step": 430
+    },
+    {
+      "epoch": 0.4298974108451392,
+      "grad_norm": 0.5295330882072449,
+      "learning_rate": 0.0002,
+      "loss": 0.5219,
+      "step": 440
+    },
+    {
+      "epoch": 0.4396678065461651,
+      "grad_norm": 0.6460115313529968,
+      "learning_rate": 0.0002,
+      "loss": 0.522,
+      "step": 450
+    },
+    {
+      "epoch": 0.449438202247191,
+      "grad_norm": 0.512022852897644,
+      "learning_rate": 0.0002,
+      "loss": 0.5416,
+      "step": 460
+    },
+    {
+      "epoch": 0.4592085979482169,
+      "grad_norm": 0.7365363836288452,
+      "learning_rate": 0.0002,
+      "loss": 0.5256,
+      "step": 470
+    },
+    {
+      "epoch": 0.4689789936492428,
+      "grad_norm": 0.6292932629585266,
+      "learning_rate": 0.0002,
+      "loss": 0.5354,
+      "step": 480
+    },
+    {
+      "epoch": 0.4787493893502687,
+      "grad_norm": 0.6255582571029663,
+      "learning_rate": 0.0002,
+      "loss": 0.5436,
+      "step": 490
+    },
+    {
+      "epoch": 0.48851978505129456,
+      "grad_norm": 0.5599279403686523,
+      "learning_rate": 0.0002,
+      "loss": 0.5394,
+      "step": 500
+    },
+    {
+      "epoch": 0.49829018075232046,
+      "grad_norm": 0.573657751083374,
+      "learning_rate": 0.0002,
+      "loss": 0.5297,
+      "step": 510
+    },
+    {
+      "epoch": 0.5080605764533463,
+      "grad_norm": 0.6362313628196716,
+      "learning_rate": 0.0002,
+      "loss": 0.5299,
+      "step": 520
+    },
+    {
+      "epoch": 0.5178309721543722,
+      "grad_norm": 0.6360035538673401,
+      "learning_rate": 0.0002,
+      "loss": 0.5458,
+      "step": 530
+    },
+    {
+      "epoch": 0.5276013678553981,
+      "grad_norm": 0.7129001021385193,
+      "learning_rate": 0.0002,
+      "loss": 0.5228,
+      "step": 540
+    },
+    {
+      "epoch": 0.537371763556424,
+      "grad_norm": 0.5596054196357727,
+      "learning_rate": 0.0002,
+      "loss": 0.5091,
+      "step": 550
+    },
+    {
+      "epoch": 0.5471421592574499,
+      "grad_norm": 0.7081596851348877,
+      "learning_rate": 0.0002,
+      "loss": 0.5153,
+      "step": 560
+    },
+    {
+      "epoch": 0.5569125549584758,
+      "grad_norm": 0.6816760301589966,
+      "learning_rate": 0.0002,
+      "loss": 0.4999,
+      "step": 570
+    },
+    {
+      "epoch": 0.5666829506595017,
+      "grad_norm": 0.47695112228393555,
+      "learning_rate": 0.0002,
+      "loss": 0.4974,
+      "step": 580
+    },
+    {
+      "epoch": 0.5764533463605276,
+      "grad_norm": 0.7528041005134583,
+      "learning_rate": 0.0002,
+      "loss": 0.5247,
+      "step": 590
+    },
+    {
+      "epoch": 0.5862237420615535,
+      "grad_norm": 0.5452813506126404,
+      "learning_rate": 0.0002,
+      "loss": 0.5265,
+      "step": 600
+    },
+    {
+      "epoch": 0.5959941377625794,
+      "grad_norm": 0.6085044741630554,
+      "learning_rate": 0.0002,
+      "loss": 0.4965,
+      "step": 610
+    },
+    {
+      "epoch": 0.6057645334636053,
+      "grad_norm": 0.6745641231536865,
+      "learning_rate": 0.0002,
+      "loss": 0.4916,
+      "step": 620
+    },
+    {
+      "epoch": 0.6155349291646312,
+      "grad_norm": 0.647544264793396,
+      "learning_rate": 0.0002,
+      "loss": 0.5107,
+      "step": 630
+    },
+    {
+      "epoch": 0.625305324865657,
+      "grad_norm": 0.6123825311660767,
+      "learning_rate": 0.0002,
+      "loss": 0.4864,
+      "step": 640
+    },
+    {
+      "epoch": 0.6350757205666829,
+      "grad_norm": 0.5815364122390747,
+      "learning_rate": 0.0002,
+      "loss": 0.484,
+      "step": 650
+    },
+    {
+      "epoch": 0.6448461162677088,
+      "grad_norm": 0.6184095740318298,
+      "learning_rate": 0.0002,
+      "loss": 0.4966,
+      "step": 660
+    },
+    {
+      "epoch": 0.6546165119687347,
+      "grad_norm": 0.5856700539588928,
+      "learning_rate": 0.0002,
+      "loss": 0.4861,
+      "step": 670
+    },
+    {
+      "epoch": 0.6643869076697606,
+      "grad_norm": 0.6424922943115234,
+      "learning_rate": 0.0002,
+      "loss": 0.4964,
+      "step": 680
+    },
+    {
+      "epoch": 0.6741573033707865,
+      "grad_norm": 0.7051425576210022,
+      "learning_rate": 0.0002,
+      "loss": 0.5019,
+      "step": 690
+    },
+    {
+      "epoch": 0.6839276990718124,
+      "grad_norm": 0.6133471131324768,
+      "learning_rate": 0.0002,
+      "loss": 0.4649,
+      "step": 700
+    },
+    {
+      "epoch": 0.6936980947728383,
+      "grad_norm": 0.6933842897415161,
+      "learning_rate": 0.0002,
+      "loss": 0.4847,
+      "step": 710
+    },
+    {
+      "epoch": 0.7034684904738642,
+      "grad_norm": 0.6440989375114441,
+      "learning_rate": 0.0002,
+      "loss": 0.4945,
+      "step": 720
+    },
+    {
+      "epoch": 0.7132388861748901,
+      "grad_norm": 0.87819904088974,
+      "learning_rate": 0.0002,
+      "loss": 0.4777,
+      "step": 730
+    },
+    {
+      "epoch": 0.723009281875916,
+      "grad_norm": 0.6810497641563416,
+      "learning_rate": 0.0002,
+      "loss": 0.4914,
+      "step": 740
+    },
+    {
+      "epoch": 0.7327796775769418,
+      "grad_norm": 0.7822733521461487,
+      "learning_rate": 0.0002,
+      "loss": 0.4789,
+      "step": 750
+    },
+    {
+      "epoch": 0.7425500732779677,
+      "grad_norm": 0.6669152975082397,
+      "learning_rate": 0.0002,
+      "loss": 0.4615,
+      "step": 760
+    },
+    {
+      "epoch": 0.7523204689789936,
+      "grad_norm": 0.7351736426353455,
+      "learning_rate": 0.0002,
+      "loss": 0.4689,
+      "step": 770
+    },
+    {
+      "epoch": 0.7620908646800195,
+      "grad_norm": 1.0013558864593506,
+      "learning_rate": 0.0002,
+      "loss": 0.4629,
+      "step": 780
+    },
+    {
+      "epoch": 0.7718612603810454,
+      "grad_norm": 0.7465775609016418,
+      "learning_rate": 0.0002,
+      "loss": 0.4739,
+      "step": 790
+    },
+    {
+      "epoch": 0.7816316560820713,
+      "grad_norm": 1.0959300994873047,
+      "learning_rate": 0.0002,
+      "loss": 0.4635,
+      "step": 800
+    },
+    {
+      "epoch": 0.7914020517830972,
+      "grad_norm": 0.5292418599128723,
+      "learning_rate": 0.0002,
+      "loss": 0.4549,
+      "step": 810
+    },
+    {
+      "epoch": 0.8011724474841231,
+      "grad_norm": 0.6555328965187073,
+      "learning_rate": 0.0002,
+      "loss": 0.458,
+      "step": 820
+    },
+    {
+      "epoch": 0.810942843185149,
+      "grad_norm": 0.6462382674217224,
+      "learning_rate": 0.0002,
+      "loss": 0.488,
+      "step": 830
+    },
+    {
+      "epoch": 0.8207132388861749,
+      "grad_norm": 0.6840918064117432,
+      "learning_rate": 0.0002,
+      "loss": 0.4541,
+      "step": 840
+    },
+    {
+      "epoch": 0.8304836345872008,
+      "grad_norm": 0.5715351700782776,
+      "learning_rate": 0.0002,
+      "loss": 0.4509,
+      "step": 850
+    },
+    {
+      "epoch": 0.8402540302882266,
+      "grad_norm": 0.5583404898643494,
+      "learning_rate": 0.0002,
+      "loss": 0.4535,
+      "step": 860
+    },
+    {
+      "epoch": 0.8500244259892525,
+      "grad_norm": 0.8243112564086914,
+      "learning_rate": 0.0002,
+      "loss": 0.4533,
+      "step": 870
+    },
+    {
+      "epoch": 0.8597948216902784,
+      "grad_norm": 0.6543600559234619,
+      "learning_rate": 0.0002,
+      "loss": 0.4545,
+      "step": 880
+    },
+    {
+      "epoch": 0.8695652173913043,
+      "grad_norm": 0.6494827270507812,
+      "learning_rate": 0.0002,
+      "loss": 0.4814,
+      "step": 890
+    },
+    {
+      "epoch": 0.8793356130923302,
+      "grad_norm": 0.8458304405212402,
+      "learning_rate": 0.0002,
+      "loss": 0.4593,
+      "step": 900
+    },
+    {
+      "epoch": 0.8891060087933561,
+      "grad_norm": 0.6854186654090881,
+      "learning_rate": 0.0002,
+      "loss": 0.4382,
+      "step": 910
+    },
+    {
+      "epoch": 0.898876404494382,
+      "grad_norm": 0.6300225853919983,
+      "learning_rate": 0.0002,
+      "loss": 0.4488,
+      "step": 920
+    },
+    {
+      "epoch": 0.9086468001954079,
+      "grad_norm": 0.9791533350944519,
+      "learning_rate": 0.0002,
+      "loss": 0.4638,
+      "step": 930
+    },
+    {
+      "epoch": 0.9184171958964338,
+      "grad_norm": 0.6965218186378479,
+      "learning_rate": 0.0002,
+      "loss": 0.446,
+      "step": 940
+    },
+    {
+      "epoch": 0.9281875915974597,
+      "grad_norm": 0.6066922545433044,
+      "learning_rate": 0.0002,
+      "loss": 0.4453,
+      "step": 950
+    },
+    {
+      "epoch": 0.9379579872984856,
+      "grad_norm": 0.8081962466239929,
+      "learning_rate": 0.0002,
+      "loss": 0.4471,
+      "step": 960
+    },
+    {
+      "epoch": 0.9477283829995115,
+      "grad_norm": 0.7755117416381836,
+      "learning_rate": 0.0002,
+      "loss": 0.4348,
+      "step": 970
+    },
+    {
+      "epoch": 0.9574987787005373,
+      "grad_norm": 0.7127223610877991,
+      "learning_rate": 0.0002,
+      "loss": 0.4423,
+      "step": 980
+    },
+    {
+      "epoch": 0.9672691744015632,
+      "grad_norm": 0.6947609186172485,
+      "learning_rate": 0.0002,
+      "loss": 0.4272,
+      "step": 990
+    },
+    {
+      "epoch": 0.9770395701025891,
+      "grad_norm": 1.0100330114364624,
+      "learning_rate": 0.0002,
+      "loss": 0.4262,
+      "step": 1000
+    },
+    {
+      "epoch": 0.986809965803615,
+      "grad_norm": 0.6727001667022705,
+      "learning_rate": 0.0002,
+      "loss": 0.4169,
+      "step": 1010
+    },
+    {
+      "epoch": 0.9965803615046409,
+      "grad_norm": 0.7834463119506836,
+      "learning_rate": 0.0002,
+      "loss": 0.4507,
+      "step": 1020
+    },
+    {
+      "epoch": 0.9995114802149487,
+      "eval_loss": 0.433101624250412,
+      "eval_runtime": 26.5222,
+      "eval_samples_per_second": 13.762,
+      "eval_steps_per_second": 1.734,
+      "step": 1023
+    },
+    {
+      "epoch": 1.006350757205667,
+      "grad_norm": 0.8070526123046875,
+      "learning_rate": 0.0002,
+      "loss": 0.4154,
+      "step": 1030
+    },
+    {
+      "epoch": 1.0161211529066927,
+      "grad_norm": 0.7301508784294128,
+      "learning_rate": 0.0002,
+      "loss": 0.3951,
+      "step": 1040
+    },
+    {
+      "epoch": 1.0258915486077187,
+      "grad_norm": 0.8212476372718811,
+      "learning_rate": 0.0002,
+      "loss": 0.4001,
+      "step": 1050
+    },
+    {
+      "epoch": 1.0356619443087445,
+      "grad_norm": 0.6269228458404541,
+      "learning_rate": 0.0002,
+      "loss": 0.3953,
+      "step": 1060
+    },
+    {
+      "epoch": 1.0454323400097705,
+      "grad_norm": 0.700432300567627,
+      "learning_rate": 0.0002,
+      "loss": 0.4147,
+      "step": 1070
+    },
+    {
+      "epoch": 1.0552027357107963,
+      "grad_norm": 0.9251010417938232,
+      "learning_rate": 0.0002,
+      "loss": 0.3857,
+      "step": 1080
+    },
+    {
+      "epoch": 1.0649731314118223,
+      "grad_norm": 0.6018561720848083,
+      "learning_rate": 0.0002,
+      "loss": 0.3955,
+      "step": 1090
+    },
+    {
+      "epoch": 1.074743527112848,
+      "grad_norm": 0.7045873403549194,
+      "learning_rate": 0.0002,
+      "loss": 0.4079,
+      "step": 1100
+    },
+    {
+      "epoch": 1.084513922813874,
+      "grad_norm": 0.7800339460372925,
+      "learning_rate": 0.0002,
+      "loss": 0.4005,
+      "step": 1110
+    },
+    {
+      "epoch": 1.0942843185148998,
+      "grad_norm": 0.7404900789260864,
+      "learning_rate": 0.0002,
+      "loss": 0.419,
+      "step": 1120
+    },
+    {
+      "epoch": 1.1040547142159258,
+      "grad_norm": 1.1851727962493896,
+      "learning_rate": 0.0002,
+      "loss": 0.4057,
+      "step": 1130
+    },
+    {
+      "epoch": 1.1138251099169516,
+      "grad_norm": 0.875406801700592,
+      "learning_rate": 0.0002,
+      "loss": 0.3966,
+      "step": 1140
+    },
+    {
+      "epoch": 1.1235955056179776,
+      "grad_norm": 0.9795705676078796,
+      "learning_rate": 0.0002,
+      "loss": 0.3863,
+      "step": 1150
+    },
+    {
+      "epoch": 1.1333659013190034,
+      "grad_norm": 0.7387922406196594,
+      "learning_rate": 0.0002,
+      "loss": 0.3991,
+      "step": 1160
+    },
+    {
+      "epoch": 1.1431362970200294,
+      "grad_norm": 0.6640482544898987,
+      "learning_rate": 0.0002,
+      "loss": 0.3914,
+      "step": 1170
+    },
+    {
+      "epoch": 1.1529066927210552,
+      "grad_norm": 0.6067684888839722,
+      "learning_rate": 0.0002,
+      "loss": 0.3809,
+      "step": 1180
+    },
+    {
+      "epoch": 1.1626770884220812,
+      "grad_norm": 0.7623337507247925,
+      "learning_rate": 0.0002,
+      "loss": 0.3915,
+      "step": 1190
+    },
+    {
+      "epoch": 1.172447484123107,
+      "grad_norm": 1.0410432815551758,
+      "learning_rate": 0.0002,
+      "loss": 0.3832,
+      "step": 1200
+    },
+    {
+      "epoch": 1.182217879824133,
+      "grad_norm": 0.7790178656578064,
+      "learning_rate": 0.0002,
+      "loss": 0.3875,
+      "step": 1210
+    },
+    {
+      "epoch": 1.1919882755251587,
+      "grad_norm": 0.7643477916717529,
+      "learning_rate": 0.0002,
+      "loss": 0.3869,
+      "step": 1220
+    },
+    {
+      "epoch": 1.2017586712261847,
+      "grad_norm": 1.2028473615646362,
+      "learning_rate": 0.0002,
+      "loss": 0.3719,
+      "step": 1230
+    },
+    {
+      "epoch": 1.2115290669272105,
+      "grad_norm": 0.787656307220459,
+      "learning_rate": 0.0002,
+      "loss": 0.3863,
+      "step": 1240
+    },
+    {
+      "epoch": 1.2212994626282365,
+      "grad_norm": 0.8074171543121338,
+      "learning_rate": 0.0002,
+      "loss": 0.3875,
+      "step": 1250
+    },
+    {
+      "epoch": 1.2310698583292623,
+      "grad_norm": 0.8488901853561401,
+      "learning_rate": 0.0002,
+      "loss": 0.3923,
+      "step": 1260
+    },
+    {
+      "epoch": 1.2408402540302883,
+      "grad_norm": 0.7454975247383118,
+      "learning_rate": 0.0002,
+      "loss": 0.3829,
+      "step": 1270
+    },
+    {
+      "epoch": 1.250610649731314,
+      "grad_norm": 0.6724955439567566,
+      "learning_rate": 0.0002,
+      "loss": 0.3981,
+      "step": 1280
+    },
+    {
+      "epoch": 1.26038104543234,
+      "grad_norm": 1.1912977695465088,
+      "learning_rate": 0.0002,
+      "loss": 0.383,
+      "step": 1290
+    },
+    {
+      "epoch": 1.2701514411333659,
+      "grad_norm": 0.7795814871788025,
+      "learning_rate": 0.0002,
+      "loss": 0.3837,
+      "step": 1300
+    },
+    {
+      "epoch": 1.2799218368343919,
+      "grad_norm": 0.672956645488739,
+      "learning_rate": 0.0002,
+      "loss": 0.3898,
+      "step": 1310
+    },
+    {
+      "epoch": 1.2896922325354176,
+      "grad_norm": 1.245808482170105,
+      "learning_rate": 0.0002,
+      "loss": 0.3849,
+      "step": 1320
+    },
+    {
+      "epoch": 1.2994626282364437,
+      "grad_norm": 0.9562020301818848,
+      "learning_rate": 0.0002,
+      "loss": 0.3877,
+      "step": 1330
+    },
+    {
+      "epoch": 1.3092330239374694,
+      "grad_norm": 1.2005938291549683,
+      "learning_rate": 0.0002,
+      "loss": 0.3711,
+      "step": 1340
+    },
+    {
+      "epoch": 1.3190034196384954,
+      "grad_norm": 0.7105128169059753,
+      "learning_rate": 0.0002,
+      "loss": 0.3761,
+      "step": 1350
+    },
+    {
+      "epoch": 1.3287738153395212,
+      "grad_norm": 0.9829772710800171,
+      "learning_rate": 0.0002,
+      "loss": 0.371,
+      "step": 1360
+    },
+    {
+      "epoch": 1.3385442110405472,
+      "grad_norm": 0.6548563241958618,
+      "learning_rate": 0.0002,
+      "loss": 0.3845,
+      "step": 1370
+    },
+    {
+      "epoch": 1.348314606741573,
+      "grad_norm": 0.877531111240387,
+      "learning_rate": 0.0002,
+      "loss": 0.3797,
+      "step": 1380
+    },
+    {
+      "epoch": 1.358085002442599,
+      "grad_norm": 0.6915368437767029,
+      "learning_rate": 0.0002,
+      "loss": 0.3757,
+      "step": 1390
+    },
+    {
+      "epoch": 1.3678553981436248,
+      "grad_norm": 0.6052316427230835,
+      "learning_rate": 0.0002,
+      "loss": 0.368,
+      "step": 1400
+    },
+    {
+      "epoch": 1.3776257938446508,
+      "grad_norm": 0.6086260080337524,
+      "learning_rate": 0.0002,
+      "loss": 0.3758,
+      "step": 1410
+    },
+    {
+      "epoch": 1.3873961895456766,
+      "grad_norm": 1.0432673692703247,
+      "learning_rate": 0.0002,
+      "loss": 0.3794,
+      "step": 1420
+    },
+    {
+      "epoch": 1.3971665852467026,
+      "grad_norm": 0.7252581715583801,
+      "learning_rate": 0.0002,
+      "loss": 0.3715,
+      "step": 1430
+    },
+    {
+      "epoch": 1.4069369809477283,
+      "grad_norm": 0.7926928997039795,
+      "learning_rate": 0.0002,
+      "loss": 0.3919,
+      "step": 1440
+    },
+    {
+      "epoch": 1.4167073766487543,
+      "grad_norm": 0.6464225649833679,
+      "learning_rate": 0.0002,
+      "loss": 0.3701,
+      "step": 1450
+    },
+    {
+      "epoch": 1.4264777723497801,
+      "grad_norm": 1.0563385486602783,
+      "learning_rate": 0.0002,
+      "loss": 0.3738,
+      "step": 1460
+    },
+    {
+      "epoch": 1.4362481680508061,
+      "grad_norm": 0.5497196316719055,
+      "learning_rate": 0.0002,
+      "loss": 0.3782,
+      "step": 1470
+    },
+    {
+      "epoch": 1.446018563751832,
+      "grad_norm": 0.7382678389549255,
+      "learning_rate": 0.0002,
+      "loss": 0.3668,
+      "step": 1480
+    },
+    {
+      "epoch": 1.455788959452858,
+      "grad_norm": 0.6264833807945251,
+      "learning_rate": 0.0002,
+      "loss": 0.3592,
+      "step": 1490
+    },
+    {
+      "epoch": 1.4655593551538837,
+      "grad_norm": 0.6722145080566406,
+      "learning_rate": 0.0002,
+      "loss": 0.3809,
+      "step": 1500
+    },
+    {
+      "epoch": 1.4753297508549097,
+      "grad_norm": 0.8594183921813965,
+      "learning_rate": 0.0002,
+      "loss": 0.3715,
+      "step": 1510
+    },
+    {
+      "epoch": 1.4851001465559355,
+      "grad_norm": 0.8588142395019531,
+      "learning_rate": 0.0002,
+      "loss": 0.354,
+      "step": 1520
+    },
+    {
+      "epoch": 1.4948705422569615,
+      "grad_norm": 0.8683834075927734,
+      "learning_rate": 0.0002,
+      "loss": 0.3654,
+      "step": 1530
+    },
+    {
+      "epoch": 1.5046409379579873,
+      "grad_norm": 0.7628163695335388,
+      "learning_rate": 0.0002,
+      "loss": 0.3647,
+      "step": 1540
+    },
+    {
+      "epoch": 1.5144113336590133,
+      "grad_norm": 0.7967382669448853,
+      "learning_rate": 0.0002,
+      "loss": 0.3666,
+      "step": 1550
+    },
+    {
+      "epoch": 1.524181729360039,
+      "grad_norm": 0.7065442800521851,
+      "learning_rate": 0.0002,
+      "loss": 0.361,
+      "step": 1560
+    },
+    {
+      "epoch": 1.5339521250610648,
+      "grad_norm": 0.6472197771072388,
+      "learning_rate": 0.0002,
+      "loss": 0.3623,
+      "step": 1570
+    },
+    {
+      "epoch": 1.5437225207620908,
+      "grad_norm": 1.105960488319397,
+      "learning_rate": 0.0002,
+      "loss": 0.3626,
+      "step": 1580
+    },
+    {
+      "epoch": 1.5534929164631168,
+      "grad_norm": 0.9730587601661682,
+      "learning_rate": 0.0002,
+      "loss": 0.3528,
+      "step": 1590
+    },
+    {
+      "epoch": 1.5632633121641426,
+      "grad_norm": 0.987910807132721,
+      "learning_rate": 0.0002,
+      "loss": 0.3739,
+      "step": 1600
+    },
+    {
+      "epoch": 1.5730337078651684,
+      "grad_norm": 0.9708227515220642,
+      "learning_rate": 0.0002,
+      "loss": 0.3546,
+      "step": 1610
+    },
+    {
+      "epoch": 1.5828041035661944,
+      "grad_norm": 0.6303295493125916,
+      "learning_rate": 0.0002,
+      "loss": 0.3653,
+      "step": 1620
+    },
+    {
+      "epoch": 1.5925744992672204,
+      "grad_norm": 1.0985002517700195,
+      "learning_rate": 0.0002,
+      "loss": 0.3639,
+      "step": 1630
+    },
+    {
+      "epoch": 1.6023448949682462,
+      "grad_norm": 0.839419960975647,
+      "learning_rate": 0.0002,
+      "loss": 0.3533,
+      "step": 1640
+    },
+    {
+      "epoch": 1.612115290669272,
+      "grad_norm": 0.7963409423828125,
+      "learning_rate": 0.0002,
+      "loss": 0.3544,
+      "step": 1650
+    },
+    {
+      "epoch": 1.621885686370298,
+      "grad_norm": 0.8074514269828796,
+      "learning_rate": 0.0002,
+      "loss": 0.3721,
+      "step": 1660
+    },
+    {
+      "epoch": 1.631656082071324,
+      "grad_norm": 0.8368266820907593,
+      "learning_rate": 0.0002,
+      "loss": 0.3573,
+      "step": 1670
+    },
+    {
+      "epoch": 1.6414264777723497,
+      "grad_norm": 0.6562672257423401,
+      "learning_rate": 0.0002,
+      "loss": 0.3556,
+      "step": 1680
+    },
+    {
+      "epoch": 1.6511968734733755,
+      "grad_norm": 0.5512149930000305,
+      "learning_rate": 0.0002,
+      "loss": 0.3593,
+      "step": 1690
+    },
+    {
+      "epoch": 1.6609672691744015,
+      "grad_norm": 0.5829663276672363,
+      "learning_rate": 0.0002,
+      "loss": 0.3626,
+      "step": 1700
+    },
+    {
+      "epoch": 1.6707376648754275,
+      "grad_norm": 0.8412625193595886,
+      "learning_rate": 0.0002,
+      "loss": 0.3526,
+      "step": 1710
+    },
+    {
+      "epoch": 1.6805080605764533,
+      "grad_norm": 0.8657066226005554,
+      "learning_rate": 0.0002,
+      "loss": 0.3593,
+      "step": 1720
+    },
+    {
+      "epoch": 1.690278456277479,
+      "grad_norm": 0.9691681861877441,
+      "learning_rate": 0.0002,
+      "loss": 0.3545,
+      "step": 1730
+    },
+    {
+      "epoch": 1.700048851978505,
+      "grad_norm": 0.641669511795044,
+      "learning_rate": 0.0002,
+      "loss": 0.3694,
+      "step": 1740
+    },
+    {
+      "epoch": 1.709819247679531,
+      "grad_norm": 0.7599552273750305,
+      "learning_rate": 0.0002,
+      "loss": 0.3594,
+      "step": 1750
+    },
+    {
+      "epoch": 1.7195896433805569,
+      "grad_norm": 0.7562308311462402,
+      "learning_rate": 0.0002,
+      "loss": 0.3563,
+      "step": 1760
+    },
+    {
+      "epoch": 1.7293600390815826,
+      "grad_norm": 0.6949060559272766,
+      "learning_rate": 0.0002,
+      "loss": 0.3741,
+      "step": 1770
+    },
+    {
+      "epoch": 1.7391304347826086,
+      "grad_norm": 1.1047314405441284,
+      "learning_rate": 0.0002,
+      "loss": 0.3444,
+      "step": 1780
+    },
+    {
+      "epoch": 1.7489008304836346,
+      "grad_norm": 0.9239255785942078,
+      "learning_rate": 0.0002,
+      "loss": 0.3602,
+      "step": 1790
+    },
+    {
+      "epoch": 1.7586712261846604,
+      "grad_norm": 0.6171822547912598,
+      "learning_rate": 0.0002,
+      "loss": 0.3464,
+      "step": 1800
+    },
+    {
+      "epoch": 1.7684416218856862,
+      "grad_norm": 0.8883067965507507,
+      "learning_rate": 0.0002,
+      "loss": 0.3504,
+      "step": 1810
+    },
+    {
+      "epoch": 1.7782120175867122,
+      "grad_norm": 0.8204503059387207,
+      "learning_rate": 0.0002,
+      "loss": 0.341,
+      "step": 1820
+    },
+    {
+      "epoch": 1.7879824132877382,
+      "grad_norm": 0.807534396648407,
+      "learning_rate": 0.0002,
+      "loss": 0.3455,
+      "step": 1830
+    },
+    {
+      "epoch": 1.797752808988764,
+      "grad_norm": 0.8063831329345703,
+      "learning_rate": 0.0002,
+      "loss": 0.3287,
+      "step": 1840
+    },
+    {
+      "epoch": 1.8075232046897898,
+      "grad_norm": 0.7789983749389648,
+      "learning_rate": 0.0002,
+      "loss": 0.3424,
+      "step": 1850
+    },
+    {
+      "epoch": 1.8172936003908158,
+      "grad_norm": 0.6771978735923767,
+      "learning_rate": 0.0002,
+      "loss": 0.3495,
+      "step": 1860
+    },
+    {
+      "epoch": 1.8270639960918418,
+      "grad_norm": 0.9140942096710205,
+      "learning_rate": 0.0002,
+      "loss": 0.3437,
+      "step": 1870
+    },
+    {
+      "epoch": 1.8368343917928676,
+      "grad_norm": 0.6635336875915527,
+      "learning_rate": 0.0002,
+      "loss": 0.3458,
+      "step": 1880
+    },
+    {
+      "epoch": 1.8466047874938933,
+      "grad_norm": 1.1987066268920898,
+      "learning_rate": 0.0002,
+      "loss": 0.3396,
+      "step": 1890
+    },
+    {
+      "epoch": 1.8563751831949193,
+      "grad_norm": 0.7020497918128967,
+      "learning_rate": 0.0002,
+      "loss": 0.3413,
+      "step": 1900
+    },
+    {
+      "epoch": 1.8661455788959453,
+      "grad_norm": 1.0113945007324219,
+      "learning_rate": 0.0002,
+      "loss": 0.3442,
+      "step": 1910
+    },
+    {
+      "epoch": 1.8759159745969711,
+      "grad_norm": 0.8227802515029907,
+      "learning_rate": 0.0002,
+      "loss": 0.3503,
+      "step": 1920
+    },
+    {
+      "epoch": 1.885686370297997,
+      "grad_norm": 0.8185329437255859,
+      "learning_rate": 0.0002,
+      "loss": 0.3565,
+      "step": 1930
+    },
+    {
+      "epoch": 1.895456765999023,
+      "grad_norm": 0.7708970904350281,
+      "learning_rate": 0.0002,
+      "loss": 0.335,
+      "step": 1940
+    },
+    {
+      "epoch": 1.905227161700049,
+      "grad_norm": 0.8888451457023621,
+      "learning_rate": 0.0002,
+      "loss": 0.3365,
+      "step": 1950
+    },
+    {
+      "epoch": 1.9149975574010747,
+      "grad_norm": 0.720267653465271,
+      "learning_rate": 0.0002,
+      "loss": 0.3342,
+      "step": 1960
+    },
+    {
+      "epoch": 1.9247679531021005,
+      "grad_norm": 0.888666570186615,
+      "learning_rate": 0.0002,
+      "loss": 0.3512,
+      "step": 1970
+    },
+    {
+      "epoch": 1.9345383488031265,
+      "grad_norm": 0.7471952438354492,
+      "learning_rate": 0.0002,
+      "loss": 0.3284,
+      "step": 1980
+    },
+    {
+      "epoch": 1.9443087445041525,
+      "grad_norm": 0.7166922092437744,
+      "learning_rate": 0.0002,
+      "loss": 0.3383,
+      "step": 1990
+    },
+    {
+      "epoch": 1.9540791402051783,
+      "grad_norm": 0.7097923159599304,
+      "learning_rate": 0.0002,
+      "loss": 0.3355,
+      "step": 2000
+    },
+    {
+      "epoch": 1.963849535906204,
+      "grad_norm": 0.8592363595962524,
+      "learning_rate": 0.0002,
+      "loss": 0.3282,
+      "step": 2010
+    },
+    {
+      "epoch": 1.97361993160723,
+      "grad_norm": 0.5352440476417542,
+      "learning_rate": 0.0002,
+      "loss": 0.3273,
+      "step": 2020
+    },
+    {
+      "epoch": 1.983390327308256,
+      "grad_norm": 1.0193064212799072,
+      "learning_rate": 0.0002,
+      "loss": 0.3387,
+      "step": 2030
+    },
+    {
+      "epoch": 1.9931607230092818,
+      "grad_norm": 0.7331683039665222,
+      "learning_rate": 0.0002,
+      "loss": 0.3277,
+      "step": 2040
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 0.3446754515171051,
+      "eval_runtime": 26.5209,
+      "eval_samples_per_second": 13.763,
+      "eval_steps_per_second": 1.734,
+      "step": 2047
+    },
+    {
+      "epoch": 2.0029311187103076,
+      "grad_norm": 0.5937952399253845,
+      "learning_rate": 0.0002,
+      "loss": 0.321,
+      "step": 2050
+    },
+    {
+      "epoch": 2.012701514411334,
+      "grad_norm": 0.7739789485931396,
+      "learning_rate": 0.0002,
+      "loss": 0.3193,
+      "step": 2060
+    },
+    {
+      "epoch": 2.0224719101123596,
+      "grad_norm": 0.8177487850189209,
+      "learning_rate": 0.0002,
+      "loss": 0.3082,
+      "step": 2070
+    },
+    {
+      "epoch": 2.0322423058133854,
+      "grad_norm": 0.8874511122703552,
+      "learning_rate": 0.0002,
+      "loss": 0.3124,
+      "step": 2080
+    },
+    {
+      "epoch": 2.042012701514411,
+      "grad_norm": 0.5704050660133362,
+      "learning_rate": 0.0002,
+      "loss": 0.3134,
+      "step": 2090
+    },
+    {
+      "epoch": 2.0517830972154374,
+      "grad_norm": 0.6900630593299866,
+      "learning_rate": 0.0002,
+      "loss": 0.3183,
+      "step": 2100
+    },
+    {
+      "epoch": 2.061553492916463,
+      "grad_norm": 0.6171090006828308,
+      "learning_rate": 0.0002,
+      "loss": 0.3299,
+      "step": 2110
+    },
+    {
+      "epoch": 2.071323888617489,
+      "grad_norm": 0.6837073564529419,
+      "learning_rate": 0.0002,
+      "loss": 0.3174,
+      "step": 2120
+    },
+    {
+      "epoch": 2.0810942843185147,
+      "grad_norm": 0.7657505869865417,
+      "learning_rate": 0.0002,
+      "loss": 0.3188,
+      "step": 2130
+    },
+    {
+      "epoch": 2.090864680019541,
+      "grad_norm": 0.6443445682525635,
+      "learning_rate": 0.0002,
+      "loss": 0.3106,
+      "step": 2140
+    },
+    {
+      "epoch": 2.1006350757205667,
+      "grad_norm": 0.7839877605438232,
+      "learning_rate": 0.0002,
+      "loss": 0.3122,
+      "step": 2150
+    },
+    {
+      "epoch": 2.1104054714215925,
+      "grad_norm": 0.6591543555259705,
+      "learning_rate": 0.0002,
+      "loss": 0.3075,
+      "step": 2160
+    },
+    {
+      "epoch": 2.1201758671226183,
+      "grad_norm": 0.4450279176235199,
+      "learning_rate": 0.0002,
+      "loss": 0.3156,
+      "step": 2170
+    },
+    {
+      "epoch": 2.1299462628236445,
+      "grad_norm": 0.7616181373596191,
+      "learning_rate": 0.0002,
+      "loss": 0.3166,
+      "step": 2180
+    },
+    {
+      "epoch": 2.1397166585246703,
+      "grad_norm": 0.9556062817573547,
+      "learning_rate": 0.0002,
+      "loss": 0.3222,
+      "step": 2190
+    },
+    {
+      "epoch": 2.149487054225696,
+      "grad_norm": 0.7944735288619995,
+      "learning_rate": 0.0002,
+      "loss": 0.3065,
+      "step": 2200
+    },
+    {
+      "epoch": 2.159257449926722,
+      "grad_norm": 0.8850461840629578,
+      "learning_rate": 0.0002,
+      "loss": 0.3182,
+      "step": 2210
+    },
+    {
+      "epoch": 2.169027845627748,
+      "grad_norm": 0.586155354976654,
+      "learning_rate": 0.0002,
+      "loss": 0.3116,
+      "step": 2220
+    },
+    {
+      "epoch": 2.178798241328774,
+      "grad_norm": 0.5621091723442078,
+      "learning_rate": 0.0002,
+      "loss": 0.3124,
+      "step": 2230
+    },
+    {
+      "epoch": 2.1885686370297996,
+      "grad_norm": 1.0284475088119507,
+      "learning_rate": 0.0002,
+      "loss": 0.3231,
+      "step": 2240
+    },
+    {
+      "epoch": 2.1983390327308254,
+      "grad_norm": 0.6767295002937317,
+      "learning_rate": 0.0002,
+      "loss": 0.313,
+      "step": 2250
+    },
+    {
+      "epoch": 2.2081094284318517,
+      "grad_norm": 1.5721969604492188,
+      "learning_rate": 0.0002,
+      "loss": 0.3058,
+      "step": 2260
+    },
+    {
+      "epoch": 2.2178798241328774,
+      "grad_norm": 0.6935747861862183,
+      "learning_rate": 0.0002,
+      "loss": 0.3184,
+      "step": 2270
+    },
+    {
+      "epoch": 2.227650219833903,
+      "grad_norm": 0.6964385509490967,
+      "learning_rate": 0.0002,
+      "loss": 0.3145,
+      "step": 2280
+    },
+    {
+      "epoch": 2.237420615534929,
+      "grad_norm": 0.7350403070449829,
+      "learning_rate": 0.0002,
+      "loss": 0.3196,
+      "step": 2290
+    },
+    {
+      "epoch": 2.247191011235955,
+      "grad_norm": 0.6564902663230896,
+      "learning_rate": 0.0002,
+      "loss": 0.3043,
+      "step": 2300
+    },
+    {
+      "epoch": 2.256961406936981,
+      "grad_norm": 0.6696506142616272,
+      "learning_rate": 0.0002,
+      "loss": 0.3092,
+      "step": 2310
+    },
+    {
+      "epoch": 2.2667318026380068,
+      "grad_norm": 0.5929620265960693,
+      "learning_rate": 0.0002,
+      "loss": 0.3163,
+      "step": 2320
+    },
+    {
+      "epoch": 2.2765021983390326,
+      "grad_norm": 0.7476680874824524,
+      "learning_rate": 0.0002,
+      "loss": 0.3156,
+      "step": 2330
+    },
+    {
+      "epoch": 2.286272594040059,
+      "grad_norm": 1.0137721300125122,
+      "learning_rate": 0.0002,
+      "loss": 0.3151,
+      "step": 2340
+    },
+    {
+      "epoch": 2.2960429897410846,
+      "grad_norm": 0.6992525458335876,
+      "learning_rate": 0.0002,
+      "loss": 0.308,
+      "step": 2350
+    },
+    {
+      "epoch": 2.3058133854421103,
+      "grad_norm": 0.572147786617279,
+      "learning_rate": 0.0002,
+      "loss": 0.3166,
+      "step": 2360
+    },
+    {
+      "epoch": 2.315583781143136,
+      "grad_norm": 0.6631198525428772,
+      "learning_rate": 0.0002,
+      "loss": 0.314,
+      "step": 2370
+    },
+    {
+      "epoch": 2.3253541768441623,
+      "grad_norm": 0.9330461025238037,
+      "learning_rate": 0.0002,
+      "loss": 0.308,
+      "step": 2380
+    },
+    {
+      "epoch": 2.335124572545188,
+      "grad_norm": 0.783240556716919,
+      "learning_rate": 0.0002,
+      "loss": 0.3266,
+      "step": 2390
+    },
+    {
+      "epoch": 2.344894968246214,
+      "grad_norm": 0.574898898601532,
+      "learning_rate": 0.0002,
+      "loss": 0.3166,
+      "step": 2400
+    },
+    {
+      "epoch": 2.3546653639472397,
+      "grad_norm": 0.6607279777526855,
+      "learning_rate": 0.0002,
+      "loss": 0.3119,
+      "step": 2410
+    },
+    {
+      "epoch": 2.364435759648266,
+      "grad_norm": 0.8342743515968323,
+      "learning_rate": 0.0002,
+      "loss": 0.3129,
+      "step": 2420
+    },
+    {
+      "epoch": 2.3742061553492917,
+      "grad_norm": 0.8198254108428955,
+      "learning_rate": 0.0002,
+      "loss": 0.315,
+      "step": 2430
+    },
+    {
+      "epoch": 2.3839765510503175,
+      "grad_norm": 0.9324616193771362,
+      "learning_rate": 0.0002,
+      "loss": 0.3107,
+      "step": 2440
+    },
+    {
+      "epoch": 2.3937469467513433,
+      "grad_norm": 0.8188948035240173,
+      "learning_rate": 0.0002,
+      "loss": 0.3,
+      "step": 2450
+    },
+    {
+      "epoch": 2.4035173424523695,
+      "grad_norm": 0.7812654376029968,
+      "learning_rate": 0.0002,
+      "loss": 0.3095,
+      "step": 2460
+    },
+    {
+      "epoch": 2.4132877381533953,
+      "grad_norm": 0.7986653447151184,
+      "learning_rate": 0.0002,
+      "loss": 0.2994,
+      "step": 2470
+    },
+    {
+      "epoch": 2.423058133854421,
+      "grad_norm": 0.6537502408027649,
+      "learning_rate": 0.0002,
+      "loss": 0.3095,
+      "step": 2480
+    },
+    {
+      "epoch": 2.432828529555447,
+      "grad_norm": 0.4680769741535187,
+      "learning_rate": 0.0002,
+      "loss": 0.3092,
+      "step": 2490
+    },
+    {
+      "epoch": 2.442598925256473,
+      "grad_norm": 1.0223482847213745,
+      "learning_rate": 0.0002,
+      "loss": 0.3117,
+      "step": 2500
+    },
+    {
+      "epoch": 2.452369320957499,
+      "grad_norm": 0.5865668654441833,
+      "learning_rate": 0.0002,
+      "loss": 0.3047,
+      "step": 2510
+    },
+    {
+      "epoch": 2.4621397166585246,
+      "grad_norm": 0.8539699912071228,
+      "learning_rate": 0.0002,
+      "loss": 0.3138,
+      "step": 2520
+    },
+    {
+      "epoch": 2.4719101123595504,
+      "grad_norm": 0.8653438687324524,
+      "learning_rate": 0.0002,
+      "loss": 0.3082,
+      "step": 2530
+    },
+    {
+      "epoch": 2.4816805080605766,
+      "grad_norm": 1.084686040878296,
+      "learning_rate": 0.0002,
+      "loss": 0.3098,
+      "step": 2540
+    },
+    {
+      "epoch": 2.4914509037616024,
+      "grad_norm": 0.8754410743713379,
+      "learning_rate": 0.0002,
+      "loss": 0.3139,
+      "step": 2550
+    },
+    {
+      "epoch": 2.501221299462628,
+      "grad_norm": 0.838127851486206,
+      "learning_rate": 0.0002,
+      "loss": 0.3066,
+      "step": 2560
+    },
+    {
+      "epoch": 2.5109916951636544,
+      "grad_norm": 0.7761465907096863,
+      "learning_rate": 0.0002,
+      "loss": 0.2933,
+      "step": 2570
+    },
+    {
+      "epoch": 2.52076209086468,
+      "grad_norm": 0.7373273372650146,
+      "learning_rate": 0.0002,
+      "loss": 0.2942,
+      "step": 2580
+    },
+    {
+      "epoch": 2.530532486565706,
+      "grad_norm": 0.7441604137420654,
+      "learning_rate": 0.0002,
+      "loss": 0.3079,
+      "step": 2590
+    },
+    {
+      "epoch": 2.5403028822667317,
+      "grad_norm": 0.7476372718811035,
+      "learning_rate": 0.0002,
+      "loss": 0.3048,
+      "step": 2600
+    },
+    {
+      "epoch": 2.5500732779677575,
+      "grad_norm": 0.860421895980835,
+      "learning_rate": 0.0002,
+      "loss": 0.2979,
+      "step": 2610
+    },
+    {
+      "epoch": 2.5598436736687837,
+      "grad_norm": 0.8230026364326477,
+      "learning_rate": 0.0002,
+      "loss": 0.3046,
+      "step": 2620
+    },
+    {
+      "epoch": 2.5696140693698095,
+      "grad_norm": 0.8646627068519592,
+      "learning_rate": 0.0002,
+      "loss": 0.3034,
+      "step": 2630
+    },
+    {
+      "epoch": 2.5793844650708353,
+      "grad_norm": 0.9704413414001465,
+      "learning_rate": 0.0002,
+      "loss": 0.3147,
+      "step": 2640
+    },
+    {
+      "epoch": 2.5891548607718615,
+      "grad_norm": 0.8837246298789978,
+      "learning_rate": 0.0002,
+      "loss": 0.3078,
+      "step": 2650
+    },
+    {
+      "epoch": 2.5989252564728873,
+      "grad_norm": 0.7060710191726685,
+      "learning_rate": 0.0002,
+      "loss": 0.3006,
+      "step": 2660
+    },
+    {
+      "epoch": 2.608695652173913,
+      "grad_norm": 0.7364303469657898,
+      "learning_rate": 0.0002,
+      "loss": 0.3024,
+      "step": 2670
+    },
+    {
+      "epoch": 2.618466047874939,
+      "grad_norm": 0.9422456622123718,
+      "learning_rate": 0.0002,
+      "loss": 0.3047,
+      "step": 2680
+    },
+    {
+      "epoch": 2.6282364435759646,
+      "grad_norm": 0.8265060186386108,
+      "learning_rate": 0.0002,
+      "loss": 0.3033,
+      "step": 2690
+    },
+    {
+      "epoch": 2.638006839276991,
+      "grad_norm": 0.6122261881828308,
+      "learning_rate": 0.0002,
+      "loss": 0.2949,
+      "step": 2700
+    },
+    {
+      "epoch": 2.6477772349780166,
+      "grad_norm": 0.7424021363258362,
+      "learning_rate": 0.0002,
+      "loss": 0.2978,
+      "step": 2710
+    },
+    {
+      "epoch": 2.6575476306790424,
+      "grad_norm": 0.6899349689483643,
+      "learning_rate": 0.0002,
+      "loss": 0.3078,
+      "step": 2720
+    },
+    {
+      "epoch": 2.6673180263800687,
+      "grad_norm": 0.8241371512413025,
+      "learning_rate": 0.0002,
+      "loss": 0.3059,
+      "step": 2730
+    },
+    {
+      "epoch": 2.6770884220810944,
+      "grad_norm": 0.7357944846153259,
+      "learning_rate": 0.0002,
+      "loss": 0.3169,
+      "step": 2740
+    },
+    {
+      "epoch": 2.68685881778212,
+      "grad_norm": 1.2319949865341187,
+      "learning_rate": 0.0002,
+      "loss": 0.3104,
+      "step": 2750
+    },
+    {
+      "epoch": 2.696629213483146,
+      "grad_norm": 0.6758335828781128,
+      "learning_rate": 0.0002,
+      "loss": 0.3016,
+      "step": 2760
+    },
+    {
+      "epoch": 2.7063996091841718,
+      "grad_norm": 0.666590690612793,
+      "learning_rate": 0.0002,
+      "loss": 0.3175,
+      "step": 2770
+    },
+    {
+      "epoch": 2.716170004885198,
+      "grad_norm": 0.765657365322113,
+      "learning_rate": 0.0002,
+      "loss": 0.3123,
+      "step": 2780
+    },
+    {
+      "epoch": 2.7259404005862238,
+      "grad_norm": 0.6624470949172974,
+      "learning_rate": 0.0002,
+      "loss": 0.2969,
+      "step": 2790
+    },
+    {
+      "epoch": 2.7357107962872496,
+      "grad_norm": 0.9891471266746521,
+      "learning_rate": 0.0002,
+      "loss": 0.3021,
+      "step": 2800
+    },
+    {
+      "epoch": 2.745481191988276,
+      "grad_norm": 0.590451180934906,
+      "learning_rate": 0.0002,
+      "loss": 0.307,
+      "step": 2810
+    },
+    {
+      "epoch": 2.7552515876893016,
+      "grad_norm": 0.5418292284011841,
+      "learning_rate": 0.0002,
+      "loss": 0.3084,
+      "step": 2820
+    },
+    {
+      "epoch": 2.7650219833903273,
+      "grad_norm": 0.9565151929855347,
+      "learning_rate": 0.0002,
+      "loss": 0.309,
+      "step": 2830
+    },
+    {
+      "epoch": 2.774792379091353,
+      "grad_norm": 0.7840000987052917,
+      "learning_rate": 0.0002,
+      "loss": 0.3046,
+      "step": 2840
+    },
+    {
+      "epoch": 2.784562774792379,
+      "grad_norm": 0.7269287705421448,
+      "learning_rate": 0.0002,
+      "loss": 0.2938,
+      "step": 2850
+    },
+    {
+      "epoch": 2.794333170493405,
+      "grad_norm": 0.6564769744873047,
+      "learning_rate": 0.0002,
+      "loss": 0.2945,
+      "step": 2860
+    },
+    {
+      "epoch": 2.804103566194431,
+      "grad_norm": 0.5916360020637512,
+      "learning_rate": 0.0002,
+      "loss": 0.2952,
+      "step": 2870
+    },
+    {
+      "epoch": 2.8138739618954567,
+      "grad_norm": 0.5752355456352234,
+      "learning_rate": 0.0002,
+      "loss": 0.2981,
+      "step": 2880
+    },
+    {
+      "epoch": 2.823644357596483,
+      "grad_norm": 0.9079744815826416,
+      "learning_rate": 0.0002,
+      "loss": 0.2976,
+      "step": 2890
+    },
+    {
+      "epoch": 2.8334147532975087,
+      "grad_norm": 0.6955378651618958,
+      "learning_rate": 0.0002,
+      "loss": 0.2967,
+      "step": 2900
+    },
+    {
+      "epoch": 2.8431851489985345,
+      "grad_norm": 0.5551539063453674,
+      "learning_rate": 0.0002,
+      "loss": 0.289,
+      "step": 2910
+    },
+    {
+      "epoch": 2.8529555446995603,
+      "grad_norm": 0.7029260396957397,
+      "learning_rate": 0.0002,
+      "loss": 0.3047,
+      "step": 2920
+    },
+    {
+      "epoch": 2.862725940400586,
+      "grad_norm": 1.002670168876648,
+      "learning_rate": 0.0002,
+      "loss": 0.2935,
+      "step": 2930
+    },
+    {
+      "epoch": 2.8724963361016123,
+      "grad_norm": 0.8380820751190186,
+      "learning_rate": 0.0002,
+      "loss": 0.3005,
+      "step": 2940
+    },
+    {
+      "epoch": 2.882266731802638,
+      "grad_norm": 0.658412754535675,
+      "learning_rate": 0.0002,
+      "loss": 0.2948,
+      "step": 2950
+    },
+    {
+      "epoch": 2.892037127503664,
+      "grad_norm": 0.9336162209510803,
+      "learning_rate": 0.0002,
+      "loss": 0.3003,
+      "step": 2960
+    },
+    {
+      "epoch": 2.90180752320469,
+      "grad_norm": 0.7143391370773315,
+      "learning_rate": 0.0002,
+      "loss": 0.2874,
+      "step": 2970
+    },
+    {
+      "epoch": 2.911577918905716,
+      "grad_norm": 0.5564678311347961,
+      "learning_rate": 0.0002,
+      "loss": 0.2975,
+      "step": 2980
+    },
+    {
+      "epoch": 2.9213483146067416,
+      "grad_norm": 1.1643658876419067,
+      "learning_rate": 0.0002,
+      "loss": 0.3045,
+      "step": 2990
+    },
+    {
+      "epoch": 2.9311187103077674,
+      "grad_norm": 0.6776673793792725,
+      "learning_rate": 0.0002,
+      "loss": 0.3027,
+      "step": 3000
+    },
+    {
+      "epoch": 2.940889106008793,
+      "grad_norm": 0.6123829483985901,
+      "learning_rate": 0.0002,
+      "loss": 0.2887,
+      "step": 3010
+    },
+    {
+      "epoch": 2.9506595017098194,
+      "grad_norm": 0.7569496631622314,
+      "learning_rate": 0.0002,
+      "loss": 0.2897,
+      "step": 3020
+    },
+    {
+      "epoch": 2.960429897410845,
+      "grad_norm": 0.6484465599060059,
+      "learning_rate": 0.0002,
+      "loss": 0.3023,
+      "step": 3030
+    },
+    {
+      "epoch": 2.970200293111871,
+      "grad_norm": 0.7745254039764404,
+      "learning_rate": 0.0002,
+      "loss": 0.2925,
+      "step": 3040
+    },
+    {
+      "epoch": 2.979970688812897,
+      "grad_norm": 0.6034068465232849,
+      "learning_rate": 0.0002,
+      "loss": 0.2946,
+      "step": 3050
+    },
+    {
+      "epoch": 2.989741084513923,
+      "grad_norm": 1.202962040901184,
+      "learning_rate": 0.0002,
+      "loss": 0.2935,
+      "step": 3060
+    },
+    {
+      "epoch": 2.9995114802149487,
+      "grad_norm": 0.8330838680267334,
+      "learning_rate": 0.0002,
+      "loss": 0.3045,
+      "step": 3070
+    },
+    {
+      "epoch": 2.9995114802149487,
+      "eval_loss": 0.30571895837783813,
+      "eval_runtime": 26.5297,
+      "eval_samples_per_second": 13.758,
+      "eval_steps_per_second": 1.734,
+      "step": 3070
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 8184,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.347173483544576e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-3070/training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:469618ae8560edd4a517eb99451fb8bc5c5f148706842d569488535fb05e84cb
+size 5560

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-4094/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-4094/adapter_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:43f91955986d5f645672d33784be22a794bc618a9382c323a3c2749b0f3a65ae
+size 109069176

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-4094/optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:02ba35ce47743bfbcc8eb51f146221b23dfc839209af54415950e4e4e4bacbb4
+size 55532666

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-4094/rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fe31f03bf55f7742c115caf760ba47051e21b556d1b1d1e5d5760ac992fa9bb6
+size 14244

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-4094/scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2eb0a611aea8c03604e4871ae8f30f2c46c2da4f9dd50a6f3adc320a593e4f99
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-4094/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-4094/tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-4094/tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-4094/trainer_state.json ADDED Viewed

	@@ -0,0 +1,2928 @@

+{
+  "best_metric": 0.289143443107605,
+  "best_model_checkpoint": "outputs-001/Mistral-7B-Instruct-v0.3_int4_flare-headlines_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.8-num-17256-sd-1/checkpoint-4094",
+  "epoch": 4.0,
+  "eval_steps": 10,
+  "global_step": 4094,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.009770395701025891,
+      "grad_norm": 1.1888047456741333,
+      "learning_rate": 0.0002,
+      "loss": 1.7474,
+      "step": 10
+    },
+    {
+      "epoch": 0.019540791402051783,
+      "grad_norm": 1.3118009567260742,
+      "learning_rate": 0.0002,
+      "loss": 1.157,
+      "step": 20
+    },
+    {
+      "epoch": 0.029311187103077674,
+      "grad_norm": 1.1254922151565552,
+      "learning_rate": 0.0002,
+      "loss": 0.9979,
+      "step": 30
+    },
+    {
+      "epoch": 0.039081582804103565,
+      "grad_norm": 0.9634686708450317,
+      "learning_rate": 0.0002,
+      "loss": 0.8859,
+      "step": 40
+    },
+    {
+      "epoch": 0.048851978505129456,
+      "grad_norm": 0.9101817607879639,
+      "learning_rate": 0.0002,
+      "loss": 0.7826,
+      "step": 50
+    },
+    {
+      "epoch": 0.05862237420615535,
+      "grad_norm": 1.0019943714141846,
+      "learning_rate": 0.0002,
+      "loss": 0.7358,
+      "step": 60
+    },
+    {
+      "epoch": 0.06839276990718124,
+      "grad_norm": 0.9201828837394714,
+      "learning_rate": 0.0002,
+      "loss": 0.6664,
+      "step": 70
+    },
+    {
+      "epoch": 0.07816316560820713,
+      "grad_norm": 0.9210318922996521,
+      "learning_rate": 0.0002,
+      "loss": 0.6785,
+      "step": 80
+    },
+    {
+      "epoch": 0.08793356130923302,
+      "grad_norm": 0.8079697489738464,
+      "learning_rate": 0.0002,
+      "loss": 0.652,
+      "step": 90
+    },
+    {
+      "epoch": 0.09770395701025891,
+      "grad_norm": 0.7530406713485718,
+      "learning_rate": 0.0002,
+      "loss": 0.6469,
+      "step": 100
+    },
+    {
+      "epoch": 0.1074743527112848,
+      "grad_norm": 0.8732273578643799,
+      "learning_rate": 0.0002,
+      "loss": 0.6604,
+      "step": 110
+    },
+    {
+      "epoch": 0.1172447484123107,
+      "grad_norm": 0.9163013696670532,
+      "learning_rate": 0.0002,
+      "loss": 0.6429,
+      "step": 120
+    },
+    {
+      "epoch": 0.1270151441133366,
+      "grad_norm": 0.5931605696678162,
+      "learning_rate": 0.0002,
+      "loss": 0.6269,
+      "step": 130
+    },
+    {
+      "epoch": 0.13678553981436248,
+      "grad_norm": 0.8782339692115784,
+      "learning_rate": 0.0002,
+      "loss": 0.6349,
+      "step": 140
+    },
+    {
+      "epoch": 0.14655593551538837,
+      "grad_norm": 0.6683491468429565,
+      "learning_rate": 0.0002,
+      "loss": 0.657,
+      "step": 150
+    },
+    {
+      "epoch": 0.15632633121641426,
+      "grad_norm": 0.7998592257499695,
+      "learning_rate": 0.0002,
+      "loss": 0.6315,
+      "step": 160
+    },
+    {
+      "epoch": 0.16609672691744015,
+      "grad_norm": 0.6159262657165527,
+      "learning_rate": 0.0002,
+      "loss": 0.6347,
+      "step": 170
+    },
+    {
+      "epoch": 0.17586712261846604,
+      "grad_norm": 0.671146035194397,
+      "learning_rate": 0.0002,
+      "loss": 0.6023,
+      "step": 180
+    },
+    {
+      "epoch": 0.18563751831949193,
+      "grad_norm": 0.5839019417762756,
+      "learning_rate": 0.0002,
+      "loss": 0.6101,
+      "step": 190
+    },
+    {
+      "epoch": 0.19540791402051783,
+      "grad_norm": 0.5090241432189941,
+      "learning_rate": 0.0002,
+      "loss": 0.6121,
+      "step": 200
+    },
+    {
+      "epoch": 0.20517830972154372,
+      "grad_norm": 0.652291476726532,
+      "learning_rate": 0.0002,
+      "loss": 0.6296,
+      "step": 210
+    },
+    {
+      "epoch": 0.2149487054225696,
+      "grad_norm": 0.6500856876373291,
+      "learning_rate": 0.0002,
+      "loss": 0.577,
+      "step": 220
+    },
+    {
+      "epoch": 0.2247191011235955,
+      "grad_norm": 0.6135480999946594,
+      "learning_rate": 0.0002,
+      "loss": 0.6186,
+      "step": 230
+    },
+    {
+      "epoch": 0.2344894968246214,
+      "grad_norm": 0.6102302074432373,
+      "learning_rate": 0.0002,
+      "loss": 0.6132,
+      "step": 240
+    },
+    {
+      "epoch": 0.24425989252564728,
+      "grad_norm": 0.6909783482551575,
+      "learning_rate": 0.0002,
+      "loss": 0.592,
+      "step": 250
+    },
+    {
+      "epoch": 0.2540302882266732,
+      "grad_norm": 0.5834446549415588,
+      "learning_rate": 0.0002,
+      "loss": 0.5832,
+      "step": 260
+    },
+    {
+      "epoch": 0.26380068392769906,
+      "grad_norm": 0.5275322198867798,
+      "learning_rate": 0.0002,
+      "loss": 0.6038,
+      "step": 270
+    },
+    {
+      "epoch": 0.27357107962872496,
+      "grad_norm": 0.5611422657966614,
+      "learning_rate": 0.0002,
+      "loss": 0.5469,
+      "step": 280
+    },
+    {
+      "epoch": 0.28334147532975085,
+      "grad_norm": 0.6549052596092224,
+      "learning_rate": 0.0002,
+      "loss": 0.552,
+      "step": 290
+    },
+    {
+      "epoch": 0.29311187103077674,
+      "grad_norm": 0.563291072845459,
+      "learning_rate": 0.0002,
+      "loss": 0.5609,
+      "step": 300
+    },
+    {
+      "epoch": 0.30288226673180263,
+      "grad_norm": 0.5598369240760803,
+      "learning_rate": 0.0002,
+      "loss": 0.5632,
+      "step": 310
+    },
+    {
+      "epoch": 0.3126526624328285,
+      "grad_norm": 0.6525678634643555,
+      "learning_rate": 0.0002,
+      "loss": 0.5627,
+      "step": 320
+    },
+    {
+      "epoch": 0.3224230581338544,
+      "grad_norm": 0.5190592408180237,
+      "learning_rate": 0.0002,
+      "loss": 0.5526,
+      "step": 330
+    },
+    {
+      "epoch": 0.3321934538348803,
+      "grad_norm": 0.45483070611953735,
+      "learning_rate": 0.0002,
+      "loss": 0.5698,
+      "step": 340
+    },
+    {
+      "epoch": 0.3419638495359062,
+      "grad_norm": 0.8094475865364075,
+      "learning_rate": 0.0002,
+      "loss": 0.5768,
+      "step": 350
+    },
+    {
+      "epoch": 0.3517342452369321,
+      "grad_norm": 0.5545358061790466,
+      "learning_rate": 0.0002,
+      "loss": 0.5555,
+      "step": 360
+    },
+    {
+      "epoch": 0.361504640937958,
+      "grad_norm": 0.6899498701095581,
+      "learning_rate": 0.0002,
+      "loss": 0.5529,
+      "step": 370
+    },
+    {
+      "epoch": 0.37127503663898387,
+      "grad_norm": 0.4584816098213196,
+      "learning_rate": 0.0002,
+      "loss": 0.556,
+      "step": 380
+    },
+    {
+      "epoch": 0.38104543234000976,
+      "grad_norm": 0.5436979532241821,
+      "learning_rate": 0.0002,
+      "loss": 0.5451,
+      "step": 390
+    },
+    {
+      "epoch": 0.39081582804103565,
+      "grad_norm": 0.7512422800064087,
+      "learning_rate": 0.0002,
+      "loss": 0.5377,
+      "step": 400
+    },
+    {
+      "epoch": 0.40058622374206154,
+      "grad_norm": 0.6394727826118469,
+      "learning_rate": 0.0002,
+      "loss": 0.5438,
+      "step": 410
+    },
+    {
+      "epoch": 0.41035661944308743,
+      "grad_norm": 0.5314047336578369,
+      "learning_rate": 0.0002,
+      "loss": 0.5535,
+      "step": 420
+    },
+    {
+      "epoch": 0.4201270151441133,
+      "grad_norm": 0.5658334493637085,
+      "learning_rate": 0.0002,
+      "loss": 0.5587,
+      "step": 430
+    },
+    {
+      "epoch": 0.4298974108451392,
+      "grad_norm": 0.5295330882072449,
+      "learning_rate": 0.0002,
+      "loss": 0.5219,
+      "step": 440
+    },
+    {
+      "epoch": 0.4396678065461651,
+      "grad_norm": 0.6460115313529968,
+      "learning_rate": 0.0002,
+      "loss": 0.522,
+      "step": 450
+    },
+    {
+      "epoch": 0.449438202247191,
+      "grad_norm": 0.512022852897644,
+      "learning_rate": 0.0002,
+      "loss": 0.5416,
+      "step": 460
+    },
+    {
+      "epoch": 0.4592085979482169,
+      "grad_norm": 0.7365363836288452,
+      "learning_rate": 0.0002,
+      "loss": 0.5256,
+      "step": 470
+    },
+    {
+      "epoch": 0.4689789936492428,
+      "grad_norm": 0.6292932629585266,
+      "learning_rate": 0.0002,
+      "loss": 0.5354,
+      "step": 480
+    },
+    {
+      "epoch": 0.4787493893502687,
+      "grad_norm": 0.6255582571029663,
+      "learning_rate": 0.0002,
+      "loss": 0.5436,
+      "step": 490
+    },
+    {
+      "epoch": 0.48851978505129456,
+      "grad_norm": 0.5599279403686523,
+      "learning_rate": 0.0002,
+      "loss": 0.5394,
+      "step": 500
+    },
+    {
+      "epoch": 0.49829018075232046,
+      "grad_norm": 0.573657751083374,
+      "learning_rate": 0.0002,
+      "loss": 0.5297,
+      "step": 510
+    },
+    {
+      "epoch": 0.5080605764533463,
+      "grad_norm": 0.6362313628196716,
+      "learning_rate": 0.0002,
+      "loss": 0.5299,
+      "step": 520
+    },
+    {
+      "epoch": 0.5178309721543722,
+      "grad_norm": 0.6360035538673401,
+      "learning_rate": 0.0002,
+      "loss": 0.5458,
+      "step": 530
+    },
+    {
+      "epoch": 0.5276013678553981,
+      "grad_norm": 0.7129001021385193,
+      "learning_rate": 0.0002,
+      "loss": 0.5228,
+      "step": 540
+    },
+    {
+      "epoch": 0.537371763556424,
+      "grad_norm": 0.5596054196357727,
+      "learning_rate": 0.0002,
+      "loss": 0.5091,
+      "step": 550
+    },
+    {
+      "epoch": 0.5471421592574499,
+      "grad_norm": 0.7081596851348877,
+      "learning_rate": 0.0002,
+      "loss": 0.5153,
+      "step": 560
+    },
+    {
+      "epoch": 0.5569125549584758,
+      "grad_norm": 0.6816760301589966,
+      "learning_rate": 0.0002,
+      "loss": 0.4999,
+      "step": 570
+    },
+    {
+      "epoch": 0.5666829506595017,
+      "grad_norm": 0.47695112228393555,
+      "learning_rate": 0.0002,
+      "loss": 0.4974,
+      "step": 580
+    },
+    {
+      "epoch": 0.5764533463605276,
+      "grad_norm": 0.7528041005134583,
+      "learning_rate": 0.0002,
+      "loss": 0.5247,
+      "step": 590
+    },
+    {
+      "epoch": 0.5862237420615535,
+      "grad_norm": 0.5452813506126404,
+      "learning_rate": 0.0002,
+      "loss": 0.5265,
+      "step": 600
+    },
+    {
+      "epoch": 0.5959941377625794,
+      "grad_norm": 0.6085044741630554,
+      "learning_rate": 0.0002,
+      "loss": 0.4965,
+      "step": 610
+    },
+    {
+      "epoch": 0.6057645334636053,
+      "grad_norm": 0.6745641231536865,
+      "learning_rate": 0.0002,
+      "loss": 0.4916,
+      "step": 620
+    },
+    {
+      "epoch": 0.6155349291646312,
+      "grad_norm": 0.647544264793396,
+      "learning_rate": 0.0002,
+      "loss": 0.5107,
+      "step": 630
+    },
+    {
+      "epoch": 0.625305324865657,
+      "grad_norm": 0.6123825311660767,
+      "learning_rate": 0.0002,
+      "loss": 0.4864,
+      "step": 640
+    },
+    {
+      "epoch": 0.6350757205666829,
+      "grad_norm": 0.5815364122390747,
+      "learning_rate": 0.0002,
+      "loss": 0.484,
+      "step": 650
+    },
+    {
+      "epoch": 0.6448461162677088,
+      "grad_norm": 0.6184095740318298,
+      "learning_rate": 0.0002,
+      "loss": 0.4966,
+      "step": 660
+    },
+    {
+      "epoch": 0.6546165119687347,
+      "grad_norm": 0.5856700539588928,
+      "learning_rate": 0.0002,
+      "loss": 0.4861,
+      "step": 670
+    },
+    {
+      "epoch": 0.6643869076697606,
+      "grad_norm": 0.6424922943115234,
+      "learning_rate": 0.0002,
+      "loss": 0.4964,
+      "step": 680
+    },
+    {
+      "epoch": 0.6741573033707865,
+      "grad_norm": 0.7051425576210022,
+      "learning_rate": 0.0002,
+      "loss": 0.5019,
+      "step": 690
+    },
+    {
+      "epoch": 0.6839276990718124,
+      "grad_norm": 0.6133471131324768,
+      "learning_rate": 0.0002,
+      "loss": 0.4649,
+      "step": 700
+    },
+    {
+      "epoch": 0.6936980947728383,
+      "grad_norm": 0.6933842897415161,
+      "learning_rate": 0.0002,
+      "loss": 0.4847,
+      "step": 710
+    },
+    {
+      "epoch": 0.7034684904738642,
+      "grad_norm": 0.6440989375114441,
+      "learning_rate": 0.0002,
+      "loss": 0.4945,
+      "step": 720
+    },
+    {
+      "epoch": 0.7132388861748901,
+      "grad_norm": 0.87819904088974,
+      "learning_rate": 0.0002,
+      "loss": 0.4777,
+      "step": 730
+    },
+    {
+      "epoch": 0.723009281875916,
+      "grad_norm": 0.6810497641563416,
+      "learning_rate": 0.0002,
+      "loss": 0.4914,
+      "step": 740
+    },
+    {
+      "epoch": 0.7327796775769418,
+      "grad_norm": 0.7822733521461487,
+      "learning_rate": 0.0002,
+      "loss": 0.4789,
+      "step": 750
+    },
+    {
+      "epoch": 0.7425500732779677,
+      "grad_norm": 0.6669152975082397,
+      "learning_rate": 0.0002,
+      "loss": 0.4615,
+      "step": 760
+    },
+    {
+      "epoch": 0.7523204689789936,
+      "grad_norm": 0.7351736426353455,
+      "learning_rate": 0.0002,
+      "loss": 0.4689,
+      "step": 770
+    },
+    {
+      "epoch": 0.7620908646800195,
+      "grad_norm": 1.0013558864593506,
+      "learning_rate": 0.0002,
+      "loss": 0.4629,
+      "step": 780
+    },
+    {
+      "epoch": 0.7718612603810454,
+      "grad_norm": 0.7465775609016418,
+      "learning_rate": 0.0002,
+      "loss": 0.4739,
+      "step": 790
+    },
+    {
+      "epoch": 0.7816316560820713,
+      "grad_norm": 1.0959300994873047,
+      "learning_rate": 0.0002,
+      "loss": 0.4635,
+      "step": 800
+    },
+    {
+      "epoch": 0.7914020517830972,
+      "grad_norm": 0.5292418599128723,
+      "learning_rate": 0.0002,
+      "loss": 0.4549,
+      "step": 810
+    },
+    {
+      "epoch": 0.8011724474841231,
+      "grad_norm": 0.6555328965187073,
+      "learning_rate": 0.0002,
+      "loss": 0.458,
+      "step": 820
+    },
+    {
+      "epoch": 0.810942843185149,
+      "grad_norm": 0.6462382674217224,
+      "learning_rate": 0.0002,
+      "loss": 0.488,
+      "step": 830
+    },
+    {
+      "epoch": 0.8207132388861749,
+      "grad_norm": 0.6840918064117432,
+      "learning_rate": 0.0002,
+      "loss": 0.4541,
+      "step": 840
+    },
+    {
+      "epoch": 0.8304836345872008,
+      "grad_norm": 0.5715351700782776,
+      "learning_rate": 0.0002,
+      "loss": 0.4509,
+      "step": 850
+    },
+    {
+      "epoch": 0.8402540302882266,
+      "grad_norm": 0.5583404898643494,
+      "learning_rate": 0.0002,
+      "loss": 0.4535,
+      "step": 860
+    },
+    {
+      "epoch": 0.8500244259892525,
+      "grad_norm": 0.8243112564086914,
+      "learning_rate": 0.0002,
+      "loss": 0.4533,
+      "step": 870
+    },
+    {
+      "epoch": 0.8597948216902784,
+      "grad_norm": 0.6543600559234619,
+      "learning_rate": 0.0002,
+      "loss": 0.4545,
+      "step": 880
+    },
+    {
+      "epoch": 0.8695652173913043,
+      "grad_norm": 0.6494827270507812,
+      "learning_rate": 0.0002,
+      "loss": 0.4814,
+      "step": 890
+    },
+    {
+      "epoch": 0.8793356130923302,
+      "grad_norm": 0.8458304405212402,
+      "learning_rate": 0.0002,
+      "loss": 0.4593,
+      "step": 900
+    },
+    {
+      "epoch": 0.8891060087933561,
+      "grad_norm": 0.6854186654090881,
+      "learning_rate": 0.0002,
+      "loss": 0.4382,
+      "step": 910
+    },
+    {
+      "epoch": 0.898876404494382,
+      "grad_norm": 0.6300225853919983,
+      "learning_rate": 0.0002,
+      "loss": 0.4488,
+      "step": 920
+    },
+    {
+      "epoch": 0.9086468001954079,
+      "grad_norm": 0.9791533350944519,
+      "learning_rate": 0.0002,
+      "loss": 0.4638,
+      "step": 930
+    },
+    {
+      "epoch": 0.9184171958964338,
+      "grad_norm": 0.6965218186378479,
+      "learning_rate": 0.0002,
+      "loss": 0.446,
+      "step": 940
+    },
+    {
+      "epoch": 0.9281875915974597,
+      "grad_norm": 0.6066922545433044,
+      "learning_rate": 0.0002,
+      "loss": 0.4453,
+      "step": 950
+    },
+    {
+      "epoch": 0.9379579872984856,
+      "grad_norm": 0.8081962466239929,
+      "learning_rate": 0.0002,
+      "loss": 0.4471,
+      "step": 960
+    },
+    {
+      "epoch": 0.9477283829995115,
+      "grad_norm": 0.7755117416381836,
+      "learning_rate": 0.0002,
+      "loss": 0.4348,
+      "step": 970
+    },
+    {
+      "epoch": 0.9574987787005373,
+      "grad_norm": 0.7127223610877991,
+      "learning_rate": 0.0002,
+      "loss": 0.4423,
+      "step": 980
+    },
+    {
+      "epoch": 0.9672691744015632,
+      "grad_norm": 0.6947609186172485,
+      "learning_rate": 0.0002,
+      "loss": 0.4272,
+      "step": 990
+    },
+    {
+      "epoch": 0.9770395701025891,
+      "grad_norm": 1.0100330114364624,
+      "learning_rate": 0.0002,
+      "loss": 0.4262,
+      "step": 1000
+    },
+    {
+      "epoch": 0.986809965803615,
+      "grad_norm": 0.6727001667022705,
+      "learning_rate": 0.0002,
+      "loss": 0.4169,
+      "step": 1010
+    },
+    {
+      "epoch": 0.9965803615046409,
+      "grad_norm": 0.7834463119506836,
+      "learning_rate": 0.0002,
+      "loss": 0.4507,
+      "step": 1020
+    },
+    {
+      "epoch": 0.9995114802149487,
+      "eval_loss": 0.433101624250412,
+      "eval_runtime": 26.5222,
+      "eval_samples_per_second": 13.762,
+      "eval_steps_per_second": 1.734,
+      "step": 1023
+    },
+    {
+      "epoch": 1.006350757205667,
+      "grad_norm": 0.8070526123046875,
+      "learning_rate": 0.0002,
+      "loss": 0.4154,
+      "step": 1030
+    },
+    {
+      "epoch": 1.0161211529066927,
+      "grad_norm": 0.7301508784294128,
+      "learning_rate": 0.0002,
+      "loss": 0.3951,
+      "step": 1040
+    },
+    {
+      "epoch": 1.0258915486077187,
+      "grad_norm": 0.8212476372718811,
+      "learning_rate": 0.0002,
+      "loss": 0.4001,
+      "step": 1050
+    },
+    {
+      "epoch": 1.0356619443087445,
+      "grad_norm": 0.6269228458404541,
+      "learning_rate": 0.0002,
+      "loss": 0.3953,
+      "step": 1060
+    },
+    {
+      "epoch": 1.0454323400097705,
+      "grad_norm": 0.700432300567627,
+      "learning_rate": 0.0002,
+      "loss": 0.4147,
+      "step": 1070
+    },
+    {
+      "epoch": 1.0552027357107963,
+      "grad_norm": 0.9251010417938232,
+      "learning_rate": 0.0002,
+      "loss": 0.3857,
+      "step": 1080
+    },
+    {
+      "epoch": 1.0649731314118223,
+      "grad_norm": 0.6018561720848083,
+      "learning_rate": 0.0002,
+      "loss": 0.3955,
+      "step": 1090
+    },
+    {
+      "epoch": 1.074743527112848,
+      "grad_norm": 0.7045873403549194,
+      "learning_rate": 0.0002,
+      "loss": 0.4079,
+      "step": 1100
+    },
+    {
+      "epoch": 1.084513922813874,
+      "grad_norm": 0.7800339460372925,
+      "learning_rate": 0.0002,
+      "loss": 0.4005,
+      "step": 1110
+    },
+    {
+      "epoch": 1.0942843185148998,
+      "grad_norm": 0.7404900789260864,
+      "learning_rate": 0.0002,
+      "loss": 0.419,
+      "step": 1120
+    },
+    {
+      "epoch": 1.1040547142159258,
+      "grad_norm": 1.1851727962493896,
+      "learning_rate": 0.0002,
+      "loss": 0.4057,
+      "step": 1130
+    },
+    {
+      "epoch": 1.1138251099169516,
+      "grad_norm": 0.875406801700592,
+      "learning_rate": 0.0002,
+      "loss": 0.3966,
+      "step": 1140
+    },
+    {
+      "epoch": 1.1235955056179776,
+      "grad_norm": 0.9795705676078796,
+      "learning_rate": 0.0002,
+      "loss": 0.3863,
+      "step": 1150
+    },
+    {
+      "epoch": 1.1333659013190034,
+      "grad_norm": 0.7387922406196594,
+      "learning_rate": 0.0002,
+      "loss": 0.3991,
+      "step": 1160
+    },
+    {
+      "epoch": 1.1431362970200294,
+      "grad_norm": 0.6640482544898987,
+      "learning_rate": 0.0002,
+      "loss": 0.3914,
+      "step": 1170
+    },
+    {
+      "epoch": 1.1529066927210552,
+      "grad_norm": 0.6067684888839722,
+      "learning_rate": 0.0002,
+      "loss": 0.3809,
+      "step": 1180
+    },
+    {
+      "epoch": 1.1626770884220812,
+      "grad_norm": 0.7623337507247925,
+      "learning_rate": 0.0002,
+      "loss": 0.3915,
+      "step": 1190
+    },
+    {
+      "epoch": 1.172447484123107,
+      "grad_norm": 1.0410432815551758,
+      "learning_rate": 0.0002,
+      "loss": 0.3832,
+      "step": 1200
+    },
+    {
+      "epoch": 1.182217879824133,
+      "grad_norm": 0.7790178656578064,
+      "learning_rate": 0.0002,
+      "loss": 0.3875,
+      "step": 1210
+    },
+    {
+      "epoch": 1.1919882755251587,
+      "grad_norm": 0.7643477916717529,
+      "learning_rate": 0.0002,
+      "loss": 0.3869,
+      "step": 1220
+    },
+    {
+      "epoch": 1.2017586712261847,
+      "grad_norm": 1.2028473615646362,
+      "learning_rate": 0.0002,
+      "loss": 0.3719,
+      "step": 1230
+    },
+    {
+      "epoch": 1.2115290669272105,
+      "grad_norm": 0.787656307220459,
+      "learning_rate": 0.0002,
+      "loss": 0.3863,
+      "step": 1240
+    },
+    {
+      "epoch": 1.2212994626282365,
+      "grad_norm": 0.8074171543121338,
+      "learning_rate": 0.0002,
+      "loss": 0.3875,
+      "step": 1250
+    },
+    {
+      "epoch": 1.2310698583292623,
+      "grad_norm": 0.8488901853561401,
+      "learning_rate": 0.0002,
+      "loss": 0.3923,
+      "step": 1260
+    },
+    {
+      "epoch": 1.2408402540302883,
+      "grad_norm": 0.7454975247383118,
+      "learning_rate": 0.0002,
+      "loss": 0.3829,
+      "step": 1270
+    },
+    {
+      "epoch": 1.250610649731314,
+      "grad_norm": 0.6724955439567566,
+      "learning_rate": 0.0002,
+      "loss": 0.3981,
+      "step": 1280
+    },
+    {
+      "epoch": 1.26038104543234,
+      "grad_norm": 1.1912977695465088,
+      "learning_rate": 0.0002,
+      "loss": 0.383,
+      "step": 1290
+    },
+    {
+      "epoch": 1.2701514411333659,
+      "grad_norm": 0.7795814871788025,
+      "learning_rate": 0.0002,
+      "loss": 0.3837,
+      "step": 1300
+    },
+    {
+      "epoch": 1.2799218368343919,
+      "grad_norm": 0.672956645488739,
+      "learning_rate": 0.0002,
+      "loss": 0.3898,
+      "step": 1310
+    },
+    {
+      "epoch": 1.2896922325354176,
+      "grad_norm": 1.245808482170105,
+      "learning_rate": 0.0002,
+      "loss": 0.3849,
+      "step": 1320
+    },
+    {
+      "epoch": 1.2994626282364437,
+      "grad_norm": 0.9562020301818848,
+      "learning_rate": 0.0002,
+      "loss": 0.3877,
+      "step": 1330
+    },
+    {
+      "epoch": 1.3092330239374694,
+      "grad_norm": 1.2005938291549683,
+      "learning_rate": 0.0002,
+      "loss": 0.3711,
+      "step": 1340
+    },
+    {
+      "epoch": 1.3190034196384954,
+      "grad_norm": 0.7105128169059753,
+      "learning_rate": 0.0002,
+      "loss": 0.3761,
+      "step": 1350
+    },
+    {
+      "epoch": 1.3287738153395212,
+      "grad_norm": 0.9829772710800171,
+      "learning_rate": 0.0002,
+      "loss": 0.371,
+      "step": 1360
+    },
+    {
+      "epoch": 1.3385442110405472,
+      "grad_norm": 0.6548563241958618,
+      "learning_rate": 0.0002,
+      "loss": 0.3845,
+      "step": 1370
+    },
+    {
+      "epoch": 1.348314606741573,
+      "grad_norm": 0.877531111240387,
+      "learning_rate": 0.0002,
+      "loss": 0.3797,
+      "step": 1380
+    },
+    {
+      "epoch": 1.358085002442599,
+      "grad_norm": 0.6915368437767029,
+      "learning_rate": 0.0002,
+      "loss": 0.3757,
+      "step": 1390
+    },
+    {
+      "epoch": 1.3678553981436248,
+      "grad_norm": 0.6052316427230835,
+      "learning_rate": 0.0002,
+      "loss": 0.368,
+      "step": 1400
+    },
+    {
+      "epoch": 1.3776257938446508,
+      "grad_norm": 0.6086260080337524,
+      "learning_rate": 0.0002,
+      "loss": 0.3758,
+      "step": 1410
+    },
+    {
+      "epoch": 1.3873961895456766,
+      "grad_norm": 1.0432673692703247,
+      "learning_rate": 0.0002,
+      "loss": 0.3794,
+      "step": 1420
+    },
+    {
+      "epoch": 1.3971665852467026,
+      "grad_norm": 0.7252581715583801,
+      "learning_rate": 0.0002,
+      "loss": 0.3715,
+      "step": 1430
+    },
+    {
+      "epoch": 1.4069369809477283,
+      "grad_norm": 0.7926928997039795,
+      "learning_rate": 0.0002,
+      "loss": 0.3919,
+      "step": 1440
+    },
+    {
+      "epoch": 1.4167073766487543,
+      "grad_norm": 0.6464225649833679,
+      "learning_rate": 0.0002,
+      "loss": 0.3701,
+      "step": 1450
+    },
+    {
+      "epoch": 1.4264777723497801,
+      "grad_norm": 1.0563385486602783,
+      "learning_rate": 0.0002,
+      "loss": 0.3738,
+      "step": 1460
+    },
+    {
+      "epoch": 1.4362481680508061,
+      "grad_norm": 0.5497196316719055,
+      "learning_rate": 0.0002,
+      "loss": 0.3782,
+      "step": 1470
+    },
+    {
+      "epoch": 1.446018563751832,
+      "grad_norm": 0.7382678389549255,
+      "learning_rate": 0.0002,
+      "loss": 0.3668,
+      "step": 1480
+    },
+    {
+      "epoch": 1.455788959452858,
+      "grad_norm": 0.6264833807945251,
+      "learning_rate": 0.0002,
+      "loss": 0.3592,
+      "step": 1490
+    },
+    {
+      "epoch": 1.4655593551538837,
+      "grad_norm": 0.6722145080566406,
+      "learning_rate": 0.0002,
+      "loss": 0.3809,
+      "step": 1500
+    },
+    {
+      "epoch": 1.4753297508549097,
+      "grad_norm": 0.8594183921813965,
+      "learning_rate": 0.0002,
+      "loss": 0.3715,
+      "step": 1510
+    },
+    {
+      "epoch": 1.4851001465559355,
+      "grad_norm": 0.8588142395019531,
+      "learning_rate": 0.0002,
+      "loss": 0.354,
+      "step": 1520
+    },
+    {
+      "epoch": 1.4948705422569615,
+      "grad_norm": 0.8683834075927734,
+      "learning_rate": 0.0002,
+      "loss": 0.3654,
+      "step": 1530
+    },
+    {
+      "epoch": 1.5046409379579873,
+      "grad_norm": 0.7628163695335388,
+      "learning_rate": 0.0002,
+      "loss": 0.3647,
+      "step": 1540
+    },
+    {
+      "epoch": 1.5144113336590133,
+      "grad_norm": 0.7967382669448853,
+      "learning_rate": 0.0002,
+      "loss": 0.3666,
+      "step": 1550
+    },
+    {
+      "epoch": 1.524181729360039,
+      "grad_norm": 0.7065442800521851,
+      "learning_rate": 0.0002,
+      "loss": 0.361,
+      "step": 1560
+    },
+    {
+      "epoch": 1.5339521250610648,
+      "grad_norm": 0.6472197771072388,
+      "learning_rate": 0.0002,
+      "loss": 0.3623,
+      "step": 1570
+    },
+    {
+      "epoch": 1.5437225207620908,
+      "grad_norm": 1.105960488319397,
+      "learning_rate": 0.0002,
+      "loss": 0.3626,
+      "step": 1580
+    },
+    {
+      "epoch": 1.5534929164631168,
+      "grad_norm": 0.9730587601661682,
+      "learning_rate": 0.0002,
+      "loss": 0.3528,
+      "step": 1590
+    },
+    {
+      "epoch": 1.5632633121641426,
+      "grad_norm": 0.987910807132721,
+      "learning_rate": 0.0002,
+      "loss": 0.3739,
+      "step": 1600
+    },
+    {
+      "epoch": 1.5730337078651684,
+      "grad_norm": 0.9708227515220642,
+      "learning_rate": 0.0002,
+      "loss": 0.3546,
+      "step": 1610
+    },
+    {
+      "epoch": 1.5828041035661944,
+      "grad_norm": 0.6303295493125916,
+      "learning_rate": 0.0002,
+      "loss": 0.3653,
+      "step": 1620
+    },
+    {
+      "epoch": 1.5925744992672204,
+      "grad_norm": 1.0985002517700195,
+      "learning_rate": 0.0002,
+      "loss": 0.3639,
+      "step": 1630
+    },
+    {
+      "epoch": 1.6023448949682462,
+      "grad_norm": 0.839419960975647,
+      "learning_rate": 0.0002,
+      "loss": 0.3533,
+      "step": 1640
+    },
+    {
+      "epoch": 1.612115290669272,
+      "grad_norm": 0.7963409423828125,
+      "learning_rate": 0.0002,
+      "loss": 0.3544,
+      "step": 1650
+    },
+    {
+      "epoch": 1.621885686370298,
+      "grad_norm": 0.8074514269828796,
+      "learning_rate": 0.0002,
+      "loss": 0.3721,
+      "step": 1660
+    },
+    {
+      "epoch": 1.631656082071324,
+      "grad_norm": 0.8368266820907593,
+      "learning_rate": 0.0002,
+      "loss": 0.3573,
+      "step": 1670
+    },
+    {
+      "epoch": 1.6414264777723497,
+      "grad_norm": 0.6562672257423401,
+      "learning_rate": 0.0002,
+      "loss": 0.3556,
+      "step": 1680
+    },
+    {
+      "epoch": 1.6511968734733755,
+      "grad_norm": 0.5512149930000305,
+      "learning_rate": 0.0002,
+      "loss": 0.3593,
+      "step": 1690
+    },
+    {
+      "epoch": 1.6609672691744015,
+      "grad_norm": 0.5829663276672363,
+      "learning_rate": 0.0002,
+      "loss": 0.3626,
+      "step": 1700
+    },
+    {
+      "epoch": 1.6707376648754275,
+      "grad_norm": 0.8412625193595886,
+      "learning_rate": 0.0002,
+      "loss": 0.3526,
+      "step": 1710
+    },
+    {
+      "epoch": 1.6805080605764533,
+      "grad_norm": 0.8657066226005554,
+      "learning_rate": 0.0002,
+      "loss": 0.3593,
+      "step": 1720
+    },
+    {
+      "epoch": 1.690278456277479,
+      "grad_norm": 0.9691681861877441,
+      "learning_rate": 0.0002,
+      "loss": 0.3545,
+      "step": 1730
+    },
+    {
+      "epoch": 1.700048851978505,
+      "grad_norm": 0.641669511795044,
+      "learning_rate": 0.0002,
+      "loss": 0.3694,
+      "step": 1740
+    },
+    {
+      "epoch": 1.709819247679531,
+      "grad_norm": 0.7599552273750305,
+      "learning_rate": 0.0002,
+      "loss": 0.3594,
+      "step": 1750
+    },
+    {
+      "epoch": 1.7195896433805569,
+      "grad_norm": 0.7562308311462402,
+      "learning_rate": 0.0002,
+      "loss": 0.3563,
+      "step": 1760
+    },
+    {
+      "epoch": 1.7293600390815826,
+      "grad_norm": 0.6949060559272766,
+      "learning_rate": 0.0002,
+      "loss": 0.3741,
+      "step": 1770
+    },
+    {
+      "epoch": 1.7391304347826086,
+      "grad_norm": 1.1047314405441284,
+      "learning_rate": 0.0002,
+      "loss": 0.3444,
+      "step": 1780
+    },
+    {
+      "epoch": 1.7489008304836346,
+      "grad_norm": 0.9239255785942078,
+      "learning_rate": 0.0002,
+      "loss": 0.3602,
+      "step": 1790
+    },
+    {
+      "epoch": 1.7586712261846604,
+      "grad_norm": 0.6171822547912598,
+      "learning_rate": 0.0002,
+      "loss": 0.3464,
+      "step": 1800
+    },
+    {
+      "epoch": 1.7684416218856862,
+      "grad_norm": 0.8883067965507507,
+      "learning_rate": 0.0002,
+      "loss": 0.3504,
+      "step": 1810
+    },
+    {
+      "epoch": 1.7782120175867122,
+      "grad_norm": 0.8204503059387207,
+      "learning_rate": 0.0002,
+      "loss": 0.341,
+      "step": 1820
+    },
+    {
+      "epoch": 1.7879824132877382,
+      "grad_norm": 0.807534396648407,
+      "learning_rate": 0.0002,
+      "loss": 0.3455,
+      "step": 1830
+    },
+    {
+      "epoch": 1.797752808988764,
+      "grad_norm": 0.8063831329345703,
+      "learning_rate": 0.0002,
+      "loss": 0.3287,
+      "step": 1840
+    },
+    {
+      "epoch": 1.8075232046897898,
+      "grad_norm": 0.7789983749389648,
+      "learning_rate": 0.0002,
+      "loss": 0.3424,
+      "step": 1850
+    },
+    {
+      "epoch": 1.8172936003908158,
+      "grad_norm": 0.6771978735923767,
+      "learning_rate": 0.0002,
+      "loss": 0.3495,
+      "step": 1860
+    },
+    {
+      "epoch": 1.8270639960918418,
+      "grad_norm": 0.9140942096710205,
+      "learning_rate": 0.0002,
+      "loss": 0.3437,
+      "step": 1870
+    },
+    {
+      "epoch": 1.8368343917928676,
+      "grad_norm": 0.6635336875915527,
+      "learning_rate": 0.0002,
+      "loss": 0.3458,
+      "step": 1880
+    },
+    {
+      "epoch": 1.8466047874938933,
+      "grad_norm": 1.1987066268920898,
+      "learning_rate": 0.0002,
+      "loss": 0.3396,
+      "step": 1890
+    },
+    {
+      "epoch": 1.8563751831949193,
+      "grad_norm": 0.7020497918128967,
+      "learning_rate": 0.0002,
+      "loss": 0.3413,
+      "step": 1900
+    },
+    {
+      "epoch": 1.8661455788959453,
+      "grad_norm": 1.0113945007324219,
+      "learning_rate": 0.0002,
+      "loss": 0.3442,
+      "step": 1910
+    },
+    {
+      "epoch": 1.8759159745969711,
+      "grad_norm": 0.8227802515029907,
+      "learning_rate": 0.0002,
+      "loss": 0.3503,
+      "step": 1920
+    },
+    {
+      "epoch": 1.885686370297997,
+      "grad_norm": 0.8185329437255859,
+      "learning_rate": 0.0002,
+      "loss": 0.3565,
+      "step": 1930
+    },
+    {
+      "epoch": 1.895456765999023,
+      "grad_norm": 0.7708970904350281,
+      "learning_rate": 0.0002,
+      "loss": 0.335,
+      "step": 1940
+    },
+    {
+      "epoch": 1.905227161700049,
+      "grad_norm": 0.8888451457023621,
+      "learning_rate": 0.0002,
+      "loss": 0.3365,
+      "step": 1950
+    },
+    {
+      "epoch": 1.9149975574010747,
+      "grad_norm": 0.720267653465271,
+      "learning_rate": 0.0002,
+      "loss": 0.3342,
+      "step": 1960
+    },
+    {
+      "epoch": 1.9247679531021005,
+      "grad_norm": 0.888666570186615,
+      "learning_rate": 0.0002,
+      "loss": 0.3512,
+      "step": 1970
+    },
+    {
+      "epoch": 1.9345383488031265,
+      "grad_norm": 0.7471952438354492,
+      "learning_rate": 0.0002,
+      "loss": 0.3284,
+      "step": 1980
+    },
+    {
+      "epoch": 1.9443087445041525,
+      "grad_norm": 0.7166922092437744,
+      "learning_rate": 0.0002,
+      "loss": 0.3383,
+      "step": 1990
+    },
+    {
+      "epoch": 1.9540791402051783,
+      "grad_norm": 0.7097923159599304,
+      "learning_rate": 0.0002,
+      "loss": 0.3355,
+      "step": 2000
+    },
+    {
+      "epoch": 1.963849535906204,
+      "grad_norm": 0.8592363595962524,
+      "learning_rate": 0.0002,
+      "loss": 0.3282,
+      "step": 2010
+    },
+    {
+      "epoch": 1.97361993160723,
+      "grad_norm": 0.5352440476417542,
+      "learning_rate": 0.0002,
+      "loss": 0.3273,
+      "step": 2020
+    },
+    {
+      "epoch": 1.983390327308256,
+      "grad_norm": 1.0193064212799072,
+      "learning_rate": 0.0002,
+      "loss": 0.3387,
+      "step": 2030
+    },
+    {
+      "epoch": 1.9931607230092818,
+      "grad_norm": 0.7331683039665222,
+      "learning_rate": 0.0002,
+      "loss": 0.3277,
+      "step": 2040
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 0.3446754515171051,
+      "eval_runtime": 26.5209,
+      "eval_samples_per_second": 13.763,
+      "eval_steps_per_second": 1.734,
+      "step": 2047
+    },
+    {
+      "epoch": 2.0029311187103076,
+      "grad_norm": 0.5937952399253845,
+      "learning_rate": 0.0002,
+      "loss": 0.321,
+      "step": 2050
+    },
+    {
+      "epoch": 2.012701514411334,
+      "grad_norm": 0.7739789485931396,
+      "learning_rate": 0.0002,
+      "loss": 0.3193,
+      "step": 2060
+    },
+    {
+      "epoch": 2.0224719101123596,
+      "grad_norm": 0.8177487850189209,
+      "learning_rate": 0.0002,
+      "loss": 0.3082,
+      "step": 2070
+    },
+    {
+      "epoch": 2.0322423058133854,
+      "grad_norm": 0.8874511122703552,
+      "learning_rate": 0.0002,
+      "loss": 0.3124,
+      "step": 2080
+    },
+    {
+      "epoch": 2.042012701514411,
+      "grad_norm": 0.5704050660133362,
+      "learning_rate": 0.0002,
+      "loss": 0.3134,
+      "step": 2090
+    },
+    {
+      "epoch": 2.0517830972154374,
+      "grad_norm": 0.6900630593299866,
+      "learning_rate": 0.0002,
+      "loss": 0.3183,
+      "step": 2100
+    },
+    {
+      "epoch": 2.061553492916463,
+      "grad_norm": 0.6171090006828308,
+      "learning_rate": 0.0002,
+      "loss": 0.3299,
+      "step": 2110
+    },
+    {
+      "epoch": 2.071323888617489,
+      "grad_norm": 0.6837073564529419,
+      "learning_rate": 0.0002,
+      "loss": 0.3174,
+      "step": 2120
+    },
+    {
+      "epoch": 2.0810942843185147,
+      "grad_norm": 0.7657505869865417,
+      "learning_rate": 0.0002,
+      "loss": 0.3188,
+      "step": 2130
+    },
+    {
+      "epoch": 2.090864680019541,
+      "grad_norm": 0.6443445682525635,
+      "learning_rate": 0.0002,
+      "loss": 0.3106,
+      "step": 2140
+    },
+    {
+      "epoch": 2.1006350757205667,
+      "grad_norm": 0.7839877605438232,
+      "learning_rate": 0.0002,
+      "loss": 0.3122,
+      "step": 2150
+    },
+    {
+      "epoch": 2.1104054714215925,
+      "grad_norm": 0.6591543555259705,
+      "learning_rate": 0.0002,
+      "loss": 0.3075,
+      "step": 2160
+    },
+    {
+      "epoch": 2.1201758671226183,
+      "grad_norm": 0.4450279176235199,
+      "learning_rate": 0.0002,
+      "loss": 0.3156,
+      "step": 2170
+    },
+    {
+      "epoch": 2.1299462628236445,
+      "grad_norm": 0.7616181373596191,
+      "learning_rate": 0.0002,
+      "loss": 0.3166,
+      "step": 2180
+    },
+    {
+      "epoch": 2.1397166585246703,
+      "grad_norm": 0.9556062817573547,
+      "learning_rate": 0.0002,
+      "loss": 0.3222,
+      "step": 2190
+    },
+    {
+      "epoch": 2.149487054225696,
+      "grad_norm": 0.7944735288619995,
+      "learning_rate": 0.0002,
+      "loss": 0.3065,
+      "step": 2200
+    },
+    {
+      "epoch": 2.159257449926722,
+      "grad_norm": 0.8850461840629578,
+      "learning_rate": 0.0002,
+      "loss": 0.3182,
+      "step": 2210
+    },
+    {
+      "epoch": 2.169027845627748,
+      "grad_norm": 0.586155354976654,
+      "learning_rate": 0.0002,
+      "loss": 0.3116,
+      "step": 2220
+    },
+    {
+      "epoch": 2.178798241328774,
+      "grad_norm": 0.5621091723442078,
+      "learning_rate": 0.0002,
+      "loss": 0.3124,
+      "step": 2230
+    },
+    {
+      "epoch": 2.1885686370297996,
+      "grad_norm": 1.0284475088119507,
+      "learning_rate": 0.0002,
+      "loss": 0.3231,
+      "step": 2240
+    },
+    {
+      "epoch": 2.1983390327308254,
+      "grad_norm": 0.6767295002937317,
+      "learning_rate": 0.0002,
+      "loss": 0.313,
+      "step": 2250
+    },
+    {
+      "epoch": 2.2081094284318517,
+      "grad_norm": 1.5721969604492188,
+      "learning_rate": 0.0002,
+      "loss": 0.3058,
+      "step": 2260
+    },
+    {
+      "epoch": 2.2178798241328774,
+      "grad_norm": 0.6935747861862183,
+      "learning_rate": 0.0002,
+      "loss": 0.3184,
+      "step": 2270
+    },
+    {
+      "epoch": 2.227650219833903,
+      "grad_norm": 0.6964385509490967,
+      "learning_rate": 0.0002,
+      "loss": 0.3145,
+      "step": 2280
+    },
+    {
+      "epoch": 2.237420615534929,
+      "grad_norm": 0.7350403070449829,
+      "learning_rate": 0.0002,
+      "loss": 0.3196,
+      "step": 2290
+    },
+    {
+      "epoch": 2.247191011235955,
+      "grad_norm": 0.6564902663230896,
+      "learning_rate": 0.0002,
+      "loss": 0.3043,
+      "step": 2300
+    },
+    {
+      "epoch": 2.256961406936981,
+      "grad_norm": 0.6696506142616272,
+      "learning_rate": 0.0002,
+      "loss": 0.3092,
+      "step": 2310
+    },
+    {
+      "epoch": 2.2667318026380068,
+      "grad_norm": 0.5929620265960693,
+      "learning_rate": 0.0002,
+      "loss": 0.3163,
+      "step": 2320
+    },
+    {
+      "epoch": 2.2765021983390326,
+      "grad_norm": 0.7476680874824524,
+      "learning_rate": 0.0002,
+      "loss": 0.3156,
+      "step": 2330
+    },
+    {
+      "epoch": 2.286272594040059,
+      "grad_norm": 1.0137721300125122,
+      "learning_rate": 0.0002,
+      "loss": 0.3151,
+      "step": 2340
+    },
+    {
+      "epoch": 2.2960429897410846,
+      "grad_norm": 0.6992525458335876,
+      "learning_rate": 0.0002,
+      "loss": 0.308,
+      "step": 2350
+    },
+    {
+      "epoch": 2.3058133854421103,
+      "grad_norm": 0.572147786617279,
+      "learning_rate": 0.0002,
+      "loss": 0.3166,
+      "step": 2360
+    },
+    {
+      "epoch": 2.315583781143136,
+      "grad_norm": 0.6631198525428772,
+      "learning_rate": 0.0002,
+      "loss": 0.314,
+      "step": 2370
+    },
+    {
+      "epoch": 2.3253541768441623,
+      "grad_norm": 0.9330461025238037,
+      "learning_rate": 0.0002,
+      "loss": 0.308,
+      "step": 2380
+    },
+    {
+      "epoch": 2.335124572545188,
+      "grad_norm": 0.783240556716919,
+      "learning_rate": 0.0002,
+      "loss": 0.3266,
+      "step": 2390
+    },
+    {
+      "epoch": 2.344894968246214,
+      "grad_norm": 0.574898898601532,
+      "learning_rate": 0.0002,
+      "loss": 0.3166,
+      "step": 2400
+    },
+    {
+      "epoch": 2.3546653639472397,
+      "grad_norm": 0.6607279777526855,
+      "learning_rate": 0.0002,
+      "loss": 0.3119,
+      "step": 2410
+    },
+    {
+      "epoch": 2.364435759648266,
+      "grad_norm": 0.8342743515968323,
+      "learning_rate": 0.0002,
+      "loss": 0.3129,
+      "step": 2420
+    },
+    {
+      "epoch": 2.3742061553492917,
+      "grad_norm": 0.8198254108428955,
+      "learning_rate": 0.0002,
+      "loss": 0.315,
+      "step": 2430
+    },
+    {
+      "epoch": 2.3839765510503175,
+      "grad_norm": 0.9324616193771362,
+      "learning_rate": 0.0002,
+      "loss": 0.3107,
+      "step": 2440
+    },
+    {
+      "epoch": 2.3937469467513433,
+      "grad_norm": 0.8188948035240173,
+      "learning_rate": 0.0002,
+      "loss": 0.3,
+      "step": 2450
+    },
+    {
+      "epoch": 2.4035173424523695,
+      "grad_norm": 0.7812654376029968,
+      "learning_rate": 0.0002,
+      "loss": 0.3095,
+      "step": 2460
+    },
+    {
+      "epoch": 2.4132877381533953,
+      "grad_norm": 0.7986653447151184,
+      "learning_rate": 0.0002,
+      "loss": 0.2994,
+      "step": 2470
+    },
+    {
+      "epoch": 2.423058133854421,
+      "grad_norm": 0.6537502408027649,
+      "learning_rate": 0.0002,
+      "loss": 0.3095,
+      "step": 2480
+    },
+    {
+      "epoch": 2.432828529555447,
+      "grad_norm": 0.4680769741535187,
+      "learning_rate": 0.0002,
+      "loss": 0.3092,
+      "step": 2490
+    },
+    {
+      "epoch": 2.442598925256473,
+      "grad_norm": 1.0223482847213745,
+      "learning_rate": 0.0002,
+      "loss": 0.3117,
+      "step": 2500
+    },
+    {
+      "epoch": 2.452369320957499,
+      "grad_norm": 0.5865668654441833,
+      "learning_rate": 0.0002,
+      "loss": 0.3047,
+      "step": 2510
+    },
+    {
+      "epoch": 2.4621397166585246,
+      "grad_norm": 0.8539699912071228,
+      "learning_rate": 0.0002,
+      "loss": 0.3138,
+      "step": 2520
+    },
+    {
+      "epoch": 2.4719101123595504,
+      "grad_norm": 0.8653438687324524,
+      "learning_rate": 0.0002,
+      "loss": 0.3082,
+      "step": 2530
+    },
+    {
+      "epoch": 2.4816805080605766,
+      "grad_norm": 1.084686040878296,
+      "learning_rate": 0.0002,
+      "loss": 0.3098,
+      "step": 2540
+    },
+    {
+      "epoch": 2.4914509037616024,
+      "grad_norm": 0.8754410743713379,
+      "learning_rate": 0.0002,
+      "loss": 0.3139,
+      "step": 2550
+    },
+    {
+      "epoch": 2.501221299462628,
+      "grad_norm": 0.838127851486206,
+      "learning_rate": 0.0002,
+      "loss": 0.3066,
+      "step": 2560
+    },
+    {
+      "epoch": 2.5109916951636544,
+      "grad_norm": 0.7761465907096863,
+      "learning_rate": 0.0002,
+      "loss": 0.2933,
+      "step": 2570
+    },
+    {
+      "epoch": 2.52076209086468,
+      "grad_norm": 0.7373273372650146,
+      "learning_rate": 0.0002,
+      "loss": 0.2942,
+      "step": 2580
+    },
+    {
+      "epoch": 2.530532486565706,
+      "grad_norm": 0.7441604137420654,
+      "learning_rate": 0.0002,
+      "loss": 0.3079,
+      "step": 2590
+    },
+    {
+      "epoch": 2.5403028822667317,
+      "grad_norm": 0.7476372718811035,
+      "learning_rate": 0.0002,
+      "loss": 0.3048,
+      "step": 2600
+    },
+    {
+      "epoch": 2.5500732779677575,
+      "grad_norm": 0.860421895980835,
+      "learning_rate": 0.0002,
+      "loss": 0.2979,
+      "step": 2610
+    },
+    {
+      "epoch": 2.5598436736687837,
+      "grad_norm": 0.8230026364326477,
+      "learning_rate": 0.0002,
+      "loss": 0.3046,
+      "step": 2620
+    },
+    {
+      "epoch": 2.5696140693698095,
+      "grad_norm": 0.8646627068519592,
+      "learning_rate": 0.0002,
+      "loss": 0.3034,
+      "step": 2630
+    },
+    {
+      "epoch": 2.5793844650708353,
+      "grad_norm": 0.9704413414001465,
+      "learning_rate": 0.0002,
+      "loss": 0.3147,
+      "step": 2640
+    },
+    {
+      "epoch": 2.5891548607718615,
+      "grad_norm": 0.8837246298789978,
+      "learning_rate": 0.0002,
+      "loss": 0.3078,
+      "step": 2650
+    },
+    {
+      "epoch": 2.5989252564728873,
+      "grad_norm": 0.7060710191726685,
+      "learning_rate": 0.0002,
+      "loss": 0.3006,
+      "step": 2660
+    },
+    {
+      "epoch": 2.608695652173913,
+      "grad_norm": 0.7364303469657898,
+      "learning_rate": 0.0002,
+      "loss": 0.3024,
+      "step": 2670
+    },
+    {
+      "epoch": 2.618466047874939,
+      "grad_norm": 0.9422456622123718,
+      "learning_rate": 0.0002,
+      "loss": 0.3047,
+      "step": 2680
+    },
+    {
+      "epoch": 2.6282364435759646,
+      "grad_norm": 0.8265060186386108,
+      "learning_rate": 0.0002,
+      "loss": 0.3033,
+      "step": 2690
+    },
+    {
+      "epoch": 2.638006839276991,
+      "grad_norm": 0.6122261881828308,
+      "learning_rate": 0.0002,
+      "loss": 0.2949,
+      "step": 2700
+    },
+    {
+      "epoch": 2.6477772349780166,
+      "grad_norm": 0.7424021363258362,
+      "learning_rate": 0.0002,
+      "loss": 0.2978,
+      "step": 2710
+    },
+    {
+      "epoch": 2.6575476306790424,
+      "grad_norm": 0.6899349689483643,
+      "learning_rate": 0.0002,
+      "loss": 0.3078,
+      "step": 2720
+    },
+    {
+      "epoch": 2.6673180263800687,
+      "grad_norm": 0.8241371512413025,
+      "learning_rate": 0.0002,
+      "loss": 0.3059,
+      "step": 2730
+    },
+    {
+      "epoch": 2.6770884220810944,
+      "grad_norm": 0.7357944846153259,
+      "learning_rate": 0.0002,
+      "loss": 0.3169,
+      "step": 2740
+    },
+    {
+      "epoch": 2.68685881778212,
+      "grad_norm": 1.2319949865341187,
+      "learning_rate": 0.0002,
+      "loss": 0.3104,
+      "step": 2750
+    },
+    {
+      "epoch": 2.696629213483146,
+      "grad_norm": 0.6758335828781128,
+      "learning_rate": 0.0002,
+      "loss": 0.3016,
+      "step": 2760
+    },
+    {
+      "epoch": 2.7063996091841718,
+      "grad_norm": 0.666590690612793,
+      "learning_rate": 0.0002,
+      "loss": 0.3175,
+      "step": 2770
+    },
+    {
+      "epoch": 2.716170004885198,
+      "grad_norm": 0.765657365322113,
+      "learning_rate": 0.0002,
+      "loss": 0.3123,
+      "step": 2780
+    },
+    {
+      "epoch": 2.7259404005862238,
+      "grad_norm": 0.6624470949172974,
+      "learning_rate": 0.0002,
+      "loss": 0.2969,
+      "step": 2790
+    },
+    {
+      "epoch": 2.7357107962872496,
+      "grad_norm": 0.9891471266746521,
+      "learning_rate": 0.0002,
+      "loss": 0.3021,
+      "step": 2800
+    },
+    {
+      "epoch": 2.745481191988276,
+      "grad_norm": 0.590451180934906,
+      "learning_rate": 0.0002,
+      "loss": 0.307,
+      "step": 2810
+    },
+    {
+      "epoch": 2.7552515876893016,
+      "grad_norm": 0.5418292284011841,
+      "learning_rate": 0.0002,
+      "loss": 0.3084,
+      "step": 2820
+    },
+    {
+      "epoch": 2.7650219833903273,
+      "grad_norm": 0.9565151929855347,
+      "learning_rate": 0.0002,
+      "loss": 0.309,
+      "step": 2830
+    },
+    {
+      "epoch": 2.774792379091353,
+      "grad_norm": 0.7840000987052917,
+      "learning_rate": 0.0002,
+      "loss": 0.3046,
+      "step": 2840
+    },
+    {
+      "epoch": 2.784562774792379,
+      "grad_norm": 0.7269287705421448,
+      "learning_rate": 0.0002,
+      "loss": 0.2938,
+      "step": 2850
+    },
+    {
+      "epoch": 2.794333170493405,
+      "grad_norm": 0.6564769744873047,
+      "learning_rate": 0.0002,
+      "loss": 0.2945,
+      "step": 2860
+    },
+    {
+      "epoch": 2.804103566194431,
+      "grad_norm": 0.5916360020637512,
+      "learning_rate": 0.0002,
+      "loss": 0.2952,
+      "step": 2870
+    },
+    {
+      "epoch": 2.8138739618954567,
+      "grad_norm": 0.5752355456352234,
+      "learning_rate": 0.0002,
+      "loss": 0.2981,
+      "step": 2880
+    },
+    {
+      "epoch": 2.823644357596483,
+      "grad_norm": 0.9079744815826416,
+      "learning_rate": 0.0002,
+      "loss": 0.2976,
+      "step": 2890
+    },
+    {
+      "epoch": 2.8334147532975087,
+      "grad_norm": 0.6955378651618958,
+      "learning_rate": 0.0002,
+      "loss": 0.2967,
+      "step": 2900
+    },
+    {
+      "epoch": 2.8431851489985345,
+      "grad_norm": 0.5551539063453674,
+      "learning_rate": 0.0002,
+      "loss": 0.289,
+      "step": 2910
+    },
+    {
+      "epoch": 2.8529555446995603,
+      "grad_norm": 0.7029260396957397,
+      "learning_rate": 0.0002,
+      "loss": 0.3047,
+      "step": 2920
+    },
+    {
+      "epoch": 2.862725940400586,
+      "grad_norm": 1.002670168876648,
+      "learning_rate": 0.0002,
+      "loss": 0.2935,
+      "step": 2930
+    },
+    {
+      "epoch": 2.8724963361016123,
+      "grad_norm": 0.8380820751190186,
+      "learning_rate": 0.0002,
+      "loss": 0.3005,
+      "step": 2940
+    },
+    {
+      "epoch": 2.882266731802638,
+      "grad_norm": 0.658412754535675,
+      "learning_rate": 0.0002,
+      "loss": 0.2948,
+      "step": 2950
+    },
+    {
+      "epoch": 2.892037127503664,
+      "grad_norm": 0.9336162209510803,
+      "learning_rate": 0.0002,
+      "loss": 0.3003,
+      "step": 2960
+    },
+    {
+      "epoch": 2.90180752320469,
+      "grad_norm": 0.7143391370773315,
+      "learning_rate": 0.0002,
+      "loss": 0.2874,
+      "step": 2970
+    },
+    {
+      "epoch": 2.911577918905716,
+      "grad_norm": 0.5564678311347961,
+      "learning_rate": 0.0002,
+      "loss": 0.2975,
+      "step": 2980
+    },
+    {
+      "epoch": 2.9213483146067416,
+      "grad_norm": 1.1643658876419067,
+      "learning_rate": 0.0002,
+      "loss": 0.3045,
+      "step": 2990
+    },
+    {
+      "epoch": 2.9311187103077674,
+      "grad_norm": 0.6776673793792725,
+      "learning_rate": 0.0002,
+      "loss": 0.3027,
+      "step": 3000
+    },
+    {
+      "epoch": 2.940889106008793,
+      "grad_norm": 0.6123829483985901,
+      "learning_rate": 0.0002,
+      "loss": 0.2887,
+      "step": 3010
+    },
+    {
+      "epoch": 2.9506595017098194,
+      "grad_norm": 0.7569496631622314,
+      "learning_rate": 0.0002,
+      "loss": 0.2897,
+      "step": 3020
+    },
+    {
+      "epoch": 2.960429897410845,
+      "grad_norm": 0.6484465599060059,
+      "learning_rate": 0.0002,
+      "loss": 0.3023,
+      "step": 3030
+    },
+    {
+      "epoch": 2.970200293111871,
+      "grad_norm": 0.7745254039764404,
+      "learning_rate": 0.0002,
+      "loss": 0.2925,
+      "step": 3040
+    },
+    {
+      "epoch": 2.979970688812897,
+      "grad_norm": 0.6034068465232849,
+      "learning_rate": 0.0002,
+      "loss": 0.2946,
+      "step": 3050
+    },
+    {
+      "epoch": 2.989741084513923,
+      "grad_norm": 1.202962040901184,
+      "learning_rate": 0.0002,
+      "loss": 0.2935,
+      "step": 3060
+    },
+    {
+      "epoch": 2.9995114802149487,
+      "grad_norm": 0.8330838680267334,
+      "learning_rate": 0.0002,
+      "loss": 0.3045,
+      "step": 3070
+    },
+    {
+      "epoch": 2.9995114802149487,
+      "eval_loss": 0.30571895837783813,
+      "eval_runtime": 26.5297,
+      "eval_samples_per_second": 13.758,
+      "eval_steps_per_second": 1.734,
+      "step": 3070
+    },
+    {
+      "epoch": 3.0092818759159745,
+      "grad_norm": 0.7035648226737976,
+      "learning_rate": 0.0002,
+      "loss": 0.2876,
+      "step": 3080
+    },
+    {
+      "epoch": 3.0190522716170003,
+      "grad_norm": 1.0382764339447021,
+      "learning_rate": 0.0002,
+      "loss": 0.2739,
+      "step": 3090
+    },
+    {
+      "epoch": 3.0288226673180265,
+      "grad_norm": 0.7345609068870544,
+      "learning_rate": 0.0002,
+      "loss": 0.278,
+      "step": 3100
+    },
+    {
+      "epoch": 3.0385930630190523,
+      "grad_norm": 0.8979442119598389,
+      "learning_rate": 0.0002,
+      "loss": 0.2761,
+      "step": 3110
+    },
+    {
+      "epoch": 3.048363458720078,
+      "grad_norm": 0.940156102180481,
+      "learning_rate": 0.0002,
+      "loss": 0.2774,
+      "step": 3120
+    },
+    {
+      "epoch": 3.058133854421104,
+      "grad_norm": 0.6340954303741455,
+      "learning_rate": 0.0002,
+      "loss": 0.2787,
+      "step": 3130
+    },
+    {
+      "epoch": 3.06790425012213,
+      "grad_norm": 1.4032169580459595,
+      "learning_rate": 0.0002,
+      "loss": 0.276,
+      "step": 3140
+    },
+    {
+      "epoch": 3.077674645823156,
+      "grad_norm": 0.7248355746269226,
+      "learning_rate": 0.0002,
+      "loss": 0.2784,
+      "step": 3150
+    },
+    {
+      "epoch": 3.0874450415241816,
+      "grad_norm": 0.9100632667541504,
+      "learning_rate": 0.0002,
+      "loss": 0.2811,
+      "step": 3160
+    },
+    {
+      "epoch": 3.0972154372252074,
+      "grad_norm": 0.47295379638671875,
+      "learning_rate": 0.0002,
+      "loss": 0.2858,
+      "step": 3170
+    },
+    {
+      "epoch": 3.1069858329262336,
+      "grad_norm": 0.7997456789016724,
+      "learning_rate": 0.0002,
+      "loss": 0.2852,
+      "step": 3180
+    },
+    {
+      "epoch": 3.1167562286272594,
+      "grad_norm": 0.6676840782165527,
+      "learning_rate": 0.0002,
+      "loss": 0.269,
+      "step": 3190
+    },
+    {
+      "epoch": 3.126526624328285,
+      "grad_norm": 0.6773821115493774,
+      "learning_rate": 0.0002,
+      "loss": 0.29,
+      "step": 3200
+    },
+    {
+      "epoch": 3.136297020029311,
+      "grad_norm": 0.49832019209861755,
+      "learning_rate": 0.0002,
+      "loss": 0.2896,
+      "step": 3210
+    },
+    {
+      "epoch": 3.146067415730337,
+      "grad_norm": 0.7048546671867371,
+      "learning_rate": 0.0002,
+      "loss": 0.2885,
+      "step": 3220
+    },
+    {
+      "epoch": 3.155837811431363,
+      "grad_norm": 0.9097464084625244,
+      "learning_rate": 0.0002,
+      "loss": 0.2785,
+      "step": 3230
+    },
+    {
+      "epoch": 3.1656082071323888,
+      "grad_norm": 0.7356650233268738,
+      "learning_rate": 0.0002,
+      "loss": 0.28,
+      "step": 3240
+    },
+    {
+      "epoch": 3.1753786028334146,
+      "grad_norm": 0.5919857621192932,
+      "learning_rate": 0.0002,
+      "loss": 0.2813,
+      "step": 3250
+    },
+    {
+      "epoch": 3.185148998534441,
+      "grad_norm": 0.7269758582115173,
+      "learning_rate": 0.0002,
+      "loss": 0.2803,
+      "step": 3260
+    },
+    {
+      "epoch": 3.1949193942354666,
+      "grad_norm": 0.6074000597000122,
+      "learning_rate": 0.0002,
+      "loss": 0.2782,
+      "step": 3270
+    },
+    {
+      "epoch": 3.2046897899364923,
+      "grad_norm": 0.7818130850791931,
+      "learning_rate": 0.0002,
+      "loss": 0.2866,
+      "step": 3280
+    },
+    {
+      "epoch": 3.214460185637518,
+      "grad_norm": 0.7337279915809631,
+      "learning_rate": 0.0002,
+      "loss": 0.2717,
+      "step": 3290
+    },
+    {
+      "epoch": 3.2242305813385443,
+      "grad_norm": 0.5730321407318115,
+      "learning_rate": 0.0002,
+      "loss": 0.2769,
+      "step": 3300
+    },
+    {
+      "epoch": 3.23400097703957,
+      "grad_norm": 0.7278021574020386,
+      "learning_rate": 0.0002,
+      "loss": 0.2786,
+      "step": 3310
+    },
+    {
+      "epoch": 3.243771372740596,
+      "grad_norm": 0.7152529358863831,
+      "learning_rate": 0.0002,
+      "loss": 0.2869,
+      "step": 3320
+    },
+    {
+      "epoch": 3.2535417684416217,
+      "grad_norm": 0.884472131729126,
+      "learning_rate": 0.0002,
+      "loss": 0.2782,
+      "step": 3330
+    },
+    {
+      "epoch": 3.263312164142648,
+      "grad_norm": 0.8212921023368835,
+      "learning_rate": 0.0002,
+      "loss": 0.2844,
+      "step": 3340
+    },
+    {
+      "epoch": 3.2730825598436737,
+      "grad_norm": 0.917287290096283,
+      "learning_rate": 0.0002,
+      "loss": 0.2843,
+      "step": 3350
+    },
+    {
+      "epoch": 3.2828529555446995,
+      "grad_norm": 0.7095558047294617,
+      "learning_rate": 0.0002,
+      "loss": 0.2788,
+      "step": 3360
+    },
+    {
+      "epoch": 3.2926233512457257,
+      "grad_norm": 0.5871877074241638,
+      "learning_rate": 0.0002,
+      "loss": 0.2877,
+      "step": 3370
+    },
+    {
+      "epoch": 3.3023937469467515,
+      "grad_norm": 1.0710159540176392,
+      "learning_rate": 0.0002,
+      "loss": 0.2858,
+      "step": 3380
+    },
+    {
+      "epoch": 3.3121641426477773,
+      "grad_norm": 0.7994568347930908,
+      "learning_rate": 0.0002,
+      "loss": 0.2803,
+      "step": 3390
+    },
+    {
+      "epoch": 3.321934538348803,
+      "grad_norm": 0.7846646308898926,
+      "learning_rate": 0.0002,
+      "loss": 0.3245,
+      "step": 3400
+    },
+    {
+      "epoch": 3.331704934049829,
+      "grad_norm": 1.0486291646957397,
+      "learning_rate": 0.0002,
+      "loss": 0.2871,
+      "step": 3410
+    },
+    {
+      "epoch": 3.341475329750855,
+      "grad_norm": 0.901267945766449,
+      "learning_rate": 0.0002,
+      "loss": 0.2871,
+      "step": 3420
+    },
+    {
+      "epoch": 3.351245725451881,
+      "grad_norm": 0.7573235034942627,
+      "learning_rate": 0.0002,
+      "loss": 0.2841,
+      "step": 3430
+    },
+    {
+      "epoch": 3.3610161211529066,
+      "grad_norm": 0.8427221179008484,
+      "learning_rate": 0.0002,
+      "loss": 0.2874,
+      "step": 3440
+    },
+    {
+      "epoch": 3.370786516853933,
+      "grad_norm": 0.7547389268875122,
+      "learning_rate": 0.0002,
+      "loss": 0.2806,
+      "step": 3450
+    },
+    {
+      "epoch": 3.3805569125549586,
+      "grad_norm": 0.9360662698745728,
+      "learning_rate": 0.0002,
+      "loss": 0.2763,
+      "step": 3460
+    },
+    {
+      "epoch": 3.3903273082559844,
+      "grad_norm": 0.6213487982749939,
+      "learning_rate": 0.0002,
+      "loss": 0.2816,
+      "step": 3470
+    },
+    {
+      "epoch": 3.40009770395701,
+      "grad_norm": 1.4937654733657837,
+      "learning_rate": 0.0002,
+      "loss": 0.2805,
+      "step": 3480
+    },
+    {
+      "epoch": 3.409868099658036,
+      "grad_norm": 1.0794259309768677,
+      "learning_rate": 0.0002,
+      "loss": 0.3484,
+      "step": 3490
+    },
+    {
+      "epoch": 3.419638495359062,
+      "grad_norm": 0.929327666759491,
+      "learning_rate": 0.0002,
+      "loss": 0.327,
+      "step": 3500
+    },
+    {
+      "epoch": 3.429408891060088,
+      "grad_norm": 0.741318941116333,
+      "learning_rate": 0.0002,
+      "loss": 0.2933,
+      "step": 3510
+    },
+    {
+      "epoch": 3.4391792867611137,
+      "grad_norm": 0.8972593545913696,
+      "learning_rate": 0.0002,
+      "loss": 0.2873,
+      "step": 3520
+    },
+    {
+      "epoch": 3.44894968246214,
+      "grad_norm": 1.035099744796753,
+      "learning_rate": 0.0002,
+      "loss": 0.2836,
+      "step": 3530
+    },
+    {
+      "epoch": 3.4587200781631657,
+      "grad_norm": 0.744045078754425,
+      "learning_rate": 0.0002,
+      "loss": 0.2768,
+      "step": 3540
+    },
+    {
+      "epoch": 3.4684904738641915,
+      "grad_norm": 1.013269066810608,
+      "learning_rate": 0.0002,
+      "loss": 0.2911,
+      "step": 3550
+    },
+    {
+      "epoch": 3.4782608695652173,
+      "grad_norm": 0.667107880115509,
+      "learning_rate": 0.0002,
+      "loss": 0.2819,
+      "step": 3560
+    },
+    {
+      "epoch": 3.488031265266243,
+      "grad_norm": 0.7778298258781433,
+      "learning_rate": 0.0002,
+      "loss": 0.2779,
+      "step": 3570
+    },
+    {
+      "epoch": 3.4978016609672693,
+      "grad_norm": 0.7953827977180481,
+      "learning_rate": 0.0002,
+      "loss": 0.2808,
+      "step": 3580
+    },
+    {
+      "epoch": 3.507572056668295,
+      "grad_norm": 0.6064241528511047,
+      "learning_rate": 0.0002,
+      "loss": 0.2804,
+      "step": 3590
+    },
+    {
+      "epoch": 3.517342452369321,
+      "grad_norm": 0.7711805105209351,
+      "learning_rate": 0.0002,
+      "loss": 0.2928,
+      "step": 3600
+    },
+    {
+      "epoch": 3.527112848070347,
+      "grad_norm": 0.4379819631576538,
+      "learning_rate": 0.0002,
+      "loss": 0.2978,
+      "step": 3610
+    },
+    {
+      "epoch": 3.536883243771373,
+      "grad_norm": 0.7208490967750549,
+      "learning_rate": 0.0002,
+      "loss": 0.2861,
+      "step": 3620
+    },
+    {
+      "epoch": 3.5466536394723986,
+      "grad_norm": 0.5875769257545471,
+      "learning_rate": 0.0002,
+      "loss": 0.2917,
+      "step": 3630
+    },
+    {
+      "epoch": 3.5564240351734244,
+      "grad_norm": 0.8589478135108948,
+      "learning_rate": 0.0002,
+      "loss": 0.2671,
+      "step": 3640
+    },
+    {
+      "epoch": 3.56619443087445,
+      "grad_norm": 0.7626174092292786,
+      "learning_rate": 0.0002,
+      "loss": 0.2769,
+      "step": 3650
+    },
+    {
+      "epoch": 3.5759648265754764,
+      "grad_norm": 1.1809124946594238,
+      "learning_rate": 0.0002,
+      "loss": 0.2863,
+      "step": 3660
+    },
+    {
+      "epoch": 3.585735222276502,
+      "grad_norm": 0.8219048976898193,
+      "learning_rate": 0.0002,
+      "loss": 0.2788,
+      "step": 3670
+    },
+    {
+      "epoch": 3.595505617977528,
+      "grad_norm": 1.075877070426941,
+      "learning_rate": 0.0002,
+      "loss": 0.2725,
+      "step": 3680
+    },
+    {
+      "epoch": 3.605276013678554,
+      "grad_norm": 1.0180445909500122,
+      "learning_rate": 0.0002,
+      "loss": 0.2788,
+      "step": 3690
+    },
+    {
+      "epoch": 3.61504640937958,
+      "grad_norm": 0.812706470489502,
+      "learning_rate": 0.0002,
+      "loss": 0.2714,
+      "step": 3700
+    },
+    {
+      "epoch": 3.6248168050806058,
+      "grad_norm": 0.606896698474884,
+      "learning_rate": 0.0002,
+      "loss": 0.2799,
+      "step": 3710
+    },
+    {
+      "epoch": 3.6345872007816316,
+      "grad_norm": 0.5841220617294312,
+      "learning_rate": 0.0002,
+      "loss": 0.284,
+      "step": 3720
+    },
+    {
+      "epoch": 3.6443575964826573,
+      "grad_norm": 0.9902899265289307,
+      "learning_rate": 0.0002,
+      "loss": 0.2758,
+      "step": 3730
+    },
+    {
+      "epoch": 3.6541279921836836,
+      "grad_norm": 0.6956594586372375,
+      "learning_rate": 0.0002,
+      "loss": 0.2741,
+      "step": 3740
+    },
+    {
+      "epoch": 3.6638983878847093,
+      "grad_norm": 1.011510968208313,
+      "learning_rate": 0.0002,
+      "loss": 0.28,
+      "step": 3750
+    },
+    {
+      "epoch": 3.673668783585735,
+      "grad_norm": 0.6990731954574585,
+      "learning_rate": 0.0002,
+      "loss": 0.2682,
+      "step": 3760
+    },
+    {
+      "epoch": 3.6834391792867613,
+      "grad_norm": 0.7399393916130066,
+      "learning_rate": 0.0002,
+      "loss": 0.2834,
+      "step": 3770
+    },
+    {
+      "epoch": 3.693209574987787,
+      "grad_norm": 1.0586392879486084,
+      "learning_rate": 0.0002,
+      "loss": 0.2852,
+      "step": 3780
+    },
+    {
+      "epoch": 3.702979970688813,
+      "grad_norm": 0.6087884306907654,
+      "learning_rate": 0.0002,
+      "loss": 0.2772,
+      "step": 3790
+    },
+    {
+      "epoch": 3.7127503663898387,
+      "grad_norm": 0.7378975749015808,
+      "learning_rate": 0.0002,
+      "loss": 0.2767,
+      "step": 3800
+    },
+    {
+      "epoch": 3.7225207620908645,
+      "grad_norm": 0.6609274744987488,
+      "learning_rate": 0.0002,
+      "loss": 0.2781,
+      "step": 3810
+    },
+    {
+      "epoch": 3.7322911577918907,
+      "grad_norm": 1.0175760984420776,
+      "learning_rate": 0.0002,
+      "loss": 0.2675,
+      "step": 3820
+    },
+    {
+      "epoch": 3.7420615534929165,
+      "grad_norm": 0.730687141418457,
+      "learning_rate": 0.0002,
+      "loss": 0.2719,
+      "step": 3830
+    },
+    {
+      "epoch": 3.7518319491939423,
+      "grad_norm": 0.7158323526382446,
+      "learning_rate": 0.0002,
+      "loss": 0.2915,
+      "step": 3840
+    },
+    {
+      "epoch": 3.7616023448949685,
+      "grad_norm": 0.8573526740074158,
+      "learning_rate": 0.0002,
+      "loss": 0.2854,
+      "step": 3850
+    },
+    {
+      "epoch": 3.7713727405959943,
+      "grad_norm": 1.04916512966156,
+      "learning_rate": 0.0002,
+      "loss": 0.274,
+      "step": 3860
+    },
+    {
+      "epoch": 3.78114313629702,
+      "grad_norm": 0.9968064427375793,
+      "learning_rate": 0.0002,
+      "loss": 0.2776,
+      "step": 3870
+    },
+    {
+      "epoch": 3.790913531998046,
+      "grad_norm": 0.8024522662162781,
+      "learning_rate": 0.0002,
+      "loss": 0.2816,
+      "step": 3880
+    },
+    {
+      "epoch": 3.8006839276990716,
+      "grad_norm": 0.6639657616615295,
+      "learning_rate": 0.0002,
+      "loss": 0.2733,
+      "step": 3890
+    },
+    {
+      "epoch": 3.810454323400098,
+      "grad_norm": 0.856477677822113,
+      "learning_rate": 0.0002,
+      "loss": 0.2735,
+      "step": 3900
+    },
+    {
+      "epoch": 3.8202247191011236,
+      "grad_norm": 0.6947850584983826,
+      "learning_rate": 0.0002,
+      "loss": 0.274,
+      "step": 3910
+    },
+    {
+      "epoch": 3.8299951148021494,
+      "grad_norm": 0.8612431287765503,
+      "learning_rate": 0.0002,
+      "loss": 0.2682,
+      "step": 3920
+    },
+    {
+      "epoch": 3.8397655105031756,
+      "grad_norm": 0.6200122833251953,
+      "learning_rate": 0.0002,
+      "loss": 0.2725,
+      "step": 3930
+    },
+    {
+      "epoch": 3.8495359062042014,
+      "grad_norm": 0.7116451859474182,
+      "learning_rate": 0.0002,
+      "loss": 0.2786,
+      "step": 3940
+    },
+    {
+      "epoch": 3.859306301905227,
+      "grad_norm": 0.6768040657043457,
+      "learning_rate": 0.0002,
+      "loss": 0.2719,
+      "step": 3950
+    },
+    {
+      "epoch": 3.869076697606253,
+      "grad_norm": 0.7205768823623657,
+      "learning_rate": 0.0002,
+      "loss": 0.2806,
+      "step": 3960
+    },
+    {
+      "epoch": 3.8788470933072787,
+      "grad_norm": 0.6989039778709412,
+      "learning_rate": 0.0002,
+      "loss": 0.2806,
+      "step": 3970
+    },
+    {
+      "epoch": 3.888617489008305,
+      "grad_norm": 0.6655344367027283,
+      "learning_rate": 0.0002,
+      "loss": 0.2865,
+      "step": 3980
+    },
+    {
+      "epoch": 3.8983878847093307,
+      "grad_norm": 0.7526548504829407,
+      "learning_rate": 0.0002,
+      "loss": 0.2796,
+      "step": 3990
+    },
+    {
+      "epoch": 3.9081582804103565,
+      "grad_norm": 0.8535363078117371,
+      "learning_rate": 0.0002,
+      "loss": 0.2849,
+      "step": 4000
+    },
+    {
+      "epoch": 3.9179286761113827,
+      "grad_norm": 0.8054668307304382,
+      "learning_rate": 0.0002,
+      "loss": 0.2773,
+      "step": 4010
+    },
+    {
+      "epoch": 3.9276990718124085,
+      "grad_norm": 0.664475679397583,
+      "learning_rate": 0.0002,
+      "loss": 0.2794,
+      "step": 4020
+    },
+    {
+      "epoch": 3.9374694675134343,
+      "grad_norm": 0.8805311322212219,
+      "learning_rate": 0.0002,
+      "loss": 0.273,
+      "step": 4030
+    },
+    {
+      "epoch": 3.94723986321446,
+      "grad_norm": 0.47290244698524475,
+      "learning_rate": 0.0002,
+      "loss": 0.2825,
+      "step": 4040
+    },
+    {
+      "epoch": 3.957010258915486,
+      "grad_norm": 0.9041091799736023,
+      "learning_rate": 0.0002,
+      "loss": 0.2723,
+      "step": 4050
+    },
+    {
+      "epoch": 3.966780654616512,
+      "grad_norm": 0.9564446210861206,
+      "learning_rate": 0.0002,
+      "loss": 0.2838,
+      "step": 4060
+    },
+    {
+      "epoch": 3.976551050317538,
+      "grad_norm": 0.6496501564979553,
+      "learning_rate": 0.0002,
+      "loss": 0.2799,
+      "step": 4070
+    },
+    {
+      "epoch": 3.9863214460185636,
+      "grad_norm": 0.7228884100914001,
+      "learning_rate": 0.0002,
+      "loss": 0.2781,
+      "step": 4080
+    },
+    {
+      "epoch": 3.99609184171959,
+      "grad_norm": 1.1264238357543945,
+      "learning_rate": 0.0002,
+      "loss": 0.2896,
+      "step": 4090
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 0.289143443107605,
+      "eval_runtime": 26.5206,
+      "eval_samples_per_second": 13.763,
+      "eval_steps_per_second": 1.735,
+      "step": 4094
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 8184,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.796231311392768e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}