MilaWang committed on Mar 28

Commit

525a5f2

verified ·

1 Parent(s): 0691674

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-1230/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-1230/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-1230/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-1230/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-1230/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-1230/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-1230/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-1230/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-1230/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-1230/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-1230/trainer_state.json +910 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-1230/training_args.bin +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-1845/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-1845/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-1845/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-1845/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-1845/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-1845/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-1845/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-1845/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-1845/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-1845/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-1845/trainer_state.json +1345 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-1845/training_args.bin +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-2460/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-2460/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-2460/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-2460/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-2460/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-2460/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-2460/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-2460/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-2460/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-2460/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-2460/trainer_state.json +1787 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-2460/training_args.bin +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-3075/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-3075/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-3075/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-3075/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-3075/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-3075/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-3075/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-3075/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-3075/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-3075/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-3075/trainer_state.json +2222 -0

Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/adapter_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7b486b358aea1479761012918bfb040fdf031818a91dc8796bed62083123e7c6
+size 109069176

Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-1230/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-1230/adapter_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f7df5fa654fead9c6cc35e3c044bb0dd46b0b51a9222e5855bcc13a1805dac4a
+size 109069176

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:77af8e8d3ff1e84aca498ab331b63f37d398f4e4419523222aa09428aafaa1eb
+size 55532666

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:96072bc59d4e66f3b13852c2b1a567b5da80eafb4d1d12d51714de2b320ea949
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5000a5052c8d090ffd908477f337cfaa88890bbadf7e2e9c16cda94eee348adf
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,910 @@

+{
+  "best_metric": 1.7831426858901978,
+  "best_model_checkpoint": "outputs-001/Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-615",
+  "epoch": 2.0,
+  "eval_steps": 10,
+  "global_step": 1230,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.016260162601626018,
+      "grad_norm": 1.903406023979187,
+      "learning_rate": 0.0002,
+      "loss": 2.6243,
+      "step": 10
+    },
+    {
+      "epoch": 0.032520325203252036,
+      "grad_norm": 1.0100678205490112,
+      "learning_rate": 0.0002,
+      "loss": 2.2084,
+      "step": 20
+    },
+    {
+      "epoch": 0.04878048780487805,
+      "grad_norm": 0.7413098812103271,
+      "learning_rate": 0.0002,
+      "loss": 2.0423,
+      "step": 30
+    },
+    {
+      "epoch": 0.06504065040650407,
+      "grad_norm": 0.7540805339813232,
+      "learning_rate": 0.0002,
+      "loss": 2.0647,
+      "step": 40
+    },
+    {
+      "epoch": 0.08130081300813008,
+      "grad_norm": 0.6508658528327942,
+      "learning_rate": 0.0002,
+      "loss": 2.0926,
+      "step": 50
+    },
+    {
+      "epoch": 0.0975609756097561,
+      "grad_norm": 0.7228319048881531,
+      "learning_rate": 0.0002,
+      "loss": 1.878,
+      "step": 60
+    },
+    {
+      "epoch": 0.11382113821138211,
+      "grad_norm": 0.6510937213897705,
+      "learning_rate": 0.0002,
+      "loss": 1.8672,
+      "step": 70
+    },
+    {
+      "epoch": 0.13008130081300814,
+      "grad_norm": 0.7238746881484985,
+      "learning_rate": 0.0002,
+      "loss": 1.8592,
+      "step": 80
+    },
+    {
+      "epoch": 0.14634146341463414,
+      "grad_norm": 0.7530466318130493,
+      "learning_rate": 0.0002,
+      "loss": 1.8541,
+      "step": 90
+    },
+    {
+      "epoch": 0.16260162601626016,
+      "grad_norm": 0.622166097164154,
+      "learning_rate": 0.0002,
+      "loss": 1.8245,
+      "step": 100
+    },
+    {
+      "epoch": 0.17886178861788618,
+      "grad_norm": 0.6180148720741272,
+      "learning_rate": 0.0002,
+      "loss": 1.7581,
+      "step": 110
+    },
+    {
+      "epoch": 0.1951219512195122,
+      "grad_norm": 0.6221362352371216,
+      "learning_rate": 0.0002,
+      "loss": 1.7741,
+      "step": 120
+    },
+    {
+      "epoch": 0.21138211382113822,
+      "grad_norm": 0.569580078125,
+      "learning_rate": 0.0002,
+      "loss": 1.7575,
+      "step": 130
+    },
+    {
+      "epoch": 0.22764227642276422,
+      "grad_norm": 0.6962840557098389,
+      "learning_rate": 0.0002,
+      "loss": 1.7833,
+      "step": 140
+    },
+    {
+      "epoch": 0.24390243902439024,
+      "grad_norm": 0.644322395324707,
+      "learning_rate": 0.0002,
+      "loss": 1.8329,
+      "step": 150
+    },
+    {
+      "epoch": 0.2601626016260163,
+      "grad_norm": 0.5970060229301453,
+      "learning_rate": 0.0002,
+      "loss": 1.7794,
+      "step": 160
+    },
+    {
+      "epoch": 0.2764227642276423,
+      "grad_norm": 0.6249210834503174,
+      "learning_rate": 0.0002,
+      "loss": 1.8521,
+      "step": 170
+    },
+    {
+      "epoch": 0.2926829268292683,
+      "grad_norm": 0.7134785652160645,
+      "learning_rate": 0.0002,
+      "loss": 1.8066,
+      "step": 180
+    },
+    {
+      "epoch": 0.3089430894308943,
+      "grad_norm": 0.5477158427238464,
+      "learning_rate": 0.0002,
+      "loss": 1.8815,
+      "step": 190
+    },
+    {
+      "epoch": 0.3252032520325203,
+      "grad_norm": 0.6054863333702087,
+      "learning_rate": 0.0002,
+      "loss": 1.7222,
+      "step": 200
+    },
+    {
+      "epoch": 0.34146341463414637,
+      "grad_norm": 0.5664568543434143,
+      "learning_rate": 0.0002,
+      "loss": 1.7598,
+      "step": 210
+    },
+    {
+      "epoch": 0.35772357723577236,
+      "grad_norm": 0.5942816734313965,
+      "learning_rate": 0.0002,
+      "loss": 1.7688,
+      "step": 220
+    },
+    {
+      "epoch": 0.37398373983739835,
+      "grad_norm": 0.6311767101287842,
+      "learning_rate": 0.0002,
+      "loss": 1.7715,
+      "step": 230
+    },
+    {
+      "epoch": 0.3902439024390244,
+      "grad_norm": 0.6614870429039001,
+      "learning_rate": 0.0002,
+      "loss": 1.7663,
+      "step": 240
+    },
+    {
+      "epoch": 0.4065040650406504,
+      "grad_norm": 0.5644984841346741,
+      "learning_rate": 0.0002,
+      "loss": 1.8045,
+      "step": 250
+    },
+    {
+      "epoch": 0.42276422764227645,
+      "grad_norm": 0.7260110974311829,
+      "learning_rate": 0.0002,
+      "loss": 1.7364,
+      "step": 260
+    },
+    {
+      "epoch": 0.43902439024390244,
+      "grad_norm": 0.6733413934707642,
+      "learning_rate": 0.0002,
+      "loss": 1.7606,
+      "step": 270
+    },
+    {
+      "epoch": 0.45528455284552843,
+      "grad_norm": 0.5211837887763977,
+      "learning_rate": 0.0002,
+      "loss": 1.8432,
+      "step": 280
+    },
+    {
+      "epoch": 0.4715447154471545,
+      "grad_norm": 0.5538370013237,
+      "learning_rate": 0.0002,
+      "loss": 1.9166,
+      "step": 290
+    },
+    {
+      "epoch": 0.4878048780487805,
+      "grad_norm": 0.5429130792617798,
+      "learning_rate": 0.0002,
+      "loss": 1.8391,
+      "step": 300
+    },
+    {
+      "epoch": 0.5040650406504065,
+      "grad_norm": 0.517801821231842,
+      "learning_rate": 0.0002,
+      "loss": 1.8072,
+      "step": 310
+    },
+    {
+      "epoch": 0.5203252032520326,
+      "grad_norm": 0.6029635667800903,
+      "learning_rate": 0.0002,
+      "loss": 1.8045,
+      "step": 320
+    },
+    {
+      "epoch": 0.5365853658536586,
+      "grad_norm": 0.506401002407074,
+      "learning_rate": 0.0002,
+      "loss": 1.7734,
+      "step": 330
+    },
+    {
+      "epoch": 0.5528455284552846,
+      "grad_norm": 0.5226597189903259,
+      "learning_rate": 0.0002,
+      "loss": 1.7923,
+      "step": 340
+    },
+    {
+      "epoch": 0.5691056910569106,
+      "grad_norm": 0.5899750590324402,
+      "learning_rate": 0.0002,
+      "loss": 1.7625,
+      "step": 350
+    },
+    {
+      "epoch": 0.5853658536585366,
+      "grad_norm": 0.6185210943222046,
+      "learning_rate": 0.0002,
+      "loss": 1.828,
+      "step": 360
+    },
+    {
+      "epoch": 0.6016260162601627,
+      "grad_norm": 0.8088458180427551,
+      "learning_rate": 0.0002,
+      "loss": 1.8358,
+      "step": 370
+    },
+    {
+      "epoch": 0.6178861788617886,
+      "grad_norm": 0.509591817855835,
+      "learning_rate": 0.0002,
+      "loss": 1.8351,
+      "step": 380
+    },
+    {
+      "epoch": 0.6341463414634146,
+      "grad_norm": 0.5209569931030273,
+      "learning_rate": 0.0002,
+      "loss": 1.7849,
+      "step": 390
+    },
+    {
+      "epoch": 0.6504065040650406,
+      "grad_norm": 0.50320965051651,
+      "learning_rate": 0.0002,
+      "loss": 1.7925,
+      "step": 400
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 0.5555663108825684,
+      "learning_rate": 0.0002,
+      "loss": 1.795,
+      "step": 410
+    },
+    {
+      "epoch": 0.6829268292682927,
+      "grad_norm": 0.5865469574928284,
+      "learning_rate": 0.0002,
+      "loss": 1.7562,
+      "step": 420
+    },
+    {
+      "epoch": 0.6991869918699187,
+      "grad_norm": 0.5288474559783936,
+      "learning_rate": 0.0002,
+      "loss": 1.7869,
+      "step": 430
+    },
+    {
+      "epoch": 0.7154471544715447,
+      "grad_norm": 0.5364211797714233,
+      "learning_rate": 0.0002,
+      "loss": 1.8046,
+      "step": 440
+    },
+    {
+      "epoch": 0.7317073170731707,
+      "grad_norm": 0.5877127051353455,
+      "learning_rate": 0.0002,
+      "loss": 1.8124,
+      "step": 450
+    },
+    {
+      "epoch": 0.7479674796747967,
+      "grad_norm": 0.5993741154670715,
+      "learning_rate": 0.0002,
+      "loss": 1.7938,
+      "step": 460
+    },
+    {
+      "epoch": 0.7642276422764228,
+      "grad_norm": 0.4871112108230591,
+      "learning_rate": 0.0002,
+      "loss": 1.8034,
+      "step": 470
+    },
+    {
+      "epoch": 0.7804878048780488,
+      "grad_norm": 0.5300846099853516,
+      "learning_rate": 0.0002,
+      "loss": 1.7798,
+      "step": 480
+    },
+    {
+      "epoch": 0.7967479674796748,
+      "grad_norm": 0.5623212456703186,
+      "learning_rate": 0.0002,
+      "loss": 1.7772,
+      "step": 490
+    },
+    {
+      "epoch": 0.8130081300813008,
+      "grad_norm": 0.5131309032440186,
+      "learning_rate": 0.0002,
+      "loss": 1.7207,
+      "step": 500
+    },
+    {
+      "epoch": 0.8292682926829268,
+      "grad_norm": 0.49512147903442383,
+      "learning_rate": 0.0002,
+      "loss": 1.7143,
+      "step": 510
+    },
+    {
+      "epoch": 0.8455284552845529,
+      "grad_norm": 0.6260727643966675,
+      "learning_rate": 0.0002,
+      "loss": 1.8032,
+      "step": 520
+    },
+    {
+      "epoch": 0.8617886178861789,
+      "grad_norm": 0.5796844959259033,
+      "learning_rate": 0.0002,
+      "loss": 1.8292,
+      "step": 530
+    },
+    {
+      "epoch": 0.8780487804878049,
+      "grad_norm": 0.615927517414093,
+      "learning_rate": 0.0002,
+      "loss": 1.7775,
+      "step": 540
+    },
+    {
+      "epoch": 0.8943089430894309,
+      "grad_norm": 0.5230891704559326,
+      "learning_rate": 0.0002,
+      "loss": 1.7254,
+      "step": 550
+    },
+    {
+      "epoch": 0.9105691056910569,
+      "grad_norm": 0.5990992784500122,
+      "learning_rate": 0.0002,
+      "loss": 1.8126,
+      "step": 560
+    },
+    {
+      "epoch": 0.926829268292683,
+      "grad_norm": 0.538957417011261,
+      "learning_rate": 0.0002,
+      "loss": 1.8551,
+      "step": 570
+    },
+    {
+      "epoch": 0.943089430894309,
+      "grad_norm": 0.556900680065155,
+      "learning_rate": 0.0002,
+      "loss": 1.791,
+      "step": 580
+    },
+    {
+      "epoch": 0.959349593495935,
+      "grad_norm": 0.6459956765174866,
+      "learning_rate": 0.0002,
+      "loss": 1.8799,
+      "step": 590
+    },
+    {
+      "epoch": 0.975609756097561,
+      "grad_norm": 0.5648245215415955,
+      "learning_rate": 0.0002,
+      "loss": 1.774,
+      "step": 600
+    },
+    {
+      "epoch": 0.991869918699187,
+      "grad_norm": 0.5341294407844543,
+      "learning_rate": 0.0002,
+      "loss": 1.7746,
+      "step": 610
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 1.7831426858901978,
+      "eval_runtime": 98.419,
+      "eval_samples_per_second": 5.416,
+      "eval_steps_per_second": 0.681,
+      "step": 615
+    },
+    {
+      "epoch": 1.008130081300813,
+      "grad_norm": 0.49698150157928467,
+      "learning_rate": 0.0002,
+      "loss": 1.7212,
+      "step": 620
+    },
+    {
+      "epoch": 1.024390243902439,
+      "grad_norm": 0.696890652179718,
+      "learning_rate": 0.0002,
+      "loss": 1.7379,
+      "step": 630
+    },
+    {
+      "epoch": 1.040650406504065,
+      "grad_norm": 0.5939123630523682,
+      "learning_rate": 0.0002,
+      "loss": 1.6391,
+      "step": 640
+    },
+    {
+      "epoch": 1.056910569105691,
+      "grad_norm": 0.5630994439125061,
+      "learning_rate": 0.0002,
+      "loss": 1.712,
+      "step": 650
+    },
+    {
+      "epoch": 1.0731707317073171,
+      "grad_norm": 0.5783666968345642,
+      "learning_rate": 0.0002,
+      "loss": 1.6401,
+      "step": 660
+    },
+    {
+      "epoch": 1.089430894308943,
+      "grad_norm": 0.6006693840026855,
+      "learning_rate": 0.0002,
+      "loss": 1.7158,
+      "step": 670
+    },
+    {
+      "epoch": 1.1056910569105691,
+      "grad_norm": 0.6544332504272461,
+      "learning_rate": 0.0002,
+      "loss": 1.7434,
+      "step": 680
+    },
+    {
+      "epoch": 1.1219512195121952,
+      "grad_norm": 0.6734776496887207,
+      "learning_rate": 0.0002,
+      "loss": 1.6562,
+      "step": 690
+    },
+    {
+      "epoch": 1.1382113821138211,
+      "grad_norm": 0.6067698001861572,
+      "learning_rate": 0.0002,
+      "loss": 1.6724,
+      "step": 700
+    },
+    {
+      "epoch": 1.1544715447154472,
+      "grad_norm": 0.6639267802238464,
+      "learning_rate": 0.0002,
+      "loss": 1.6932,
+      "step": 710
+    },
+    {
+      "epoch": 1.170731707317073,
+      "grad_norm": 0.5179714560508728,
+      "learning_rate": 0.0002,
+      "loss": 1.6562,
+      "step": 720
+    },
+    {
+      "epoch": 1.1869918699186992,
+      "grad_norm": 0.7320363521575928,
+      "learning_rate": 0.0002,
+      "loss": 1.6811,
+      "step": 730
+    },
+    {
+      "epoch": 1.203252032520325,
+      "grad_norm": 0.689231276512146,
+      "learning_rate": 0.0002,
+      "loss": 1.5619,
+      "step": 740
+    },
+    {
+      "epoch": 1.2195121951219512,
+      "grad_norm": 0.6605235934257507,
+      "learning_rate": 0.0002,
+      "loss": 1.5688,
+      "step": 750
+    },
+    {
+      "epoch": 1.2357723577235773,
+      "grad_norm": 0.7013542056083679,
+      "learning_rate": 0.0002,
+      "loss": 1.7045,
+      "step": 760
+    },
+    {
+      "epoch": 1.2520325203252032,
+      "grad_norm": 0.6349928975105286,
+      "learning_rate": 0.0002,
+      "loss": 1.6857,
+      "step": 770
+    },
+    {
+      "epoch": 1.2682926829268293,
+      "grad_norm": 0.6362272500991821,
+      "learning_rate": 0.0002,
+      "loss": 1.6767,
+      "step": 780
+    },
+    {
+      "epoch": 1.2845528455284554,
+      "grad_norm": 0.6152030229568481,
+      "learning_rate": 0.0002,
+      "loss": 1.6594,
+      "step": 790
+    },
+    {
+      "epoch": 1.3008130081300813,
+      "grad_norm": 0.6406176686286926,
+      "learning_rate": 0.0002,
+      "loss": 1.7542,
+      "step": 800
+    },
+    {
+      "epoch": 1.3170731707317074,
+      "grad_norm": 0.6099124550819397,
+      "learning_rate": 0.0002,
+      "loss": 1.7243,
+      "step": 810
+    },
+    {
+      "epoch": 1.3333333333333333,
+      "grad_norm": 0.6298971772193909,
+      "learning_rate": 0.0002,
+      "loss": 1.6642,
+      "step": 820
+    },
+    {
+      "epoch": 1.3495934959349594,
+      "grad_norm": 0.775223433971405,
+      "learning_rate": 0.0002,
+      "loss": 1.6901,
+      "step": 830
+    },
+    {
+      "epoch": 1.3658536585365852,
+      "grad_norm": 0.7261736392974854,
+      "learning_rate": 0.0002,
+      "loss": 1.6284,
+      "step": 840
+    },
+    {
+      "epoch": 1.3821138211382114,
+      "grad_norm": 0.6321929097175598,
+      "learning_rate": 0.0002,
+      "loss": 1.6671,
+      "step": 850
+    },
+    {
+      "epoch": 1.3983739837398375,
+      "grad_norm": 0.7564281225204468,
+      "learning_rate": 0.0002,
+      "loss": 1.7036,
+      "step": 860
+    },
+    {
+      "epoch": 1.4146341463414633,
+      "grad_norm": 0.6329448819160461,
+      "learning_rate": 0.0002,
+      "loss": 1.7014,
+      "step": 870
+    },
+    {
+      "epoch": 1.4308943089430894,
+      "grad_norm": 0.6288684606552124,
+      "learning_rate": 0.0002,
+      "loss": 1.6398,
+      "step": 880
+    },
+    {
+      "epoch": 1.4471544715447155,
+      "grad_norm": 0.6165404915809631,
+      "learning_rate": 0.0002,
+      "loss": 1.673,
+      "step": 890
+    },
+    {
+      "epoch": 1.4634146341463414,
+      "grad_norm": 0.6124468445777893,
+      "learning_rate": 0.0002,
+      "loss": 1.6668,
+      "step": 900
+    },
+    {
+      "epoch": 1.4796747967479675,
+      "grad_norm": 0.7038629651069641,
+      "learning_rate": 0.0002,
+      "loss": 1.6879,
+      "step": 910
+    },
+    {
+      "epoch": 1.4959349593495934,
+      "grad_norm": 0.5755146145820618,
+      "learning_rate": 0.0002,
+      "loss": 1.6701,
+      "step": 920
+    },
+    {
+      "epoch": 1.5121951219512195,
+      "grad_norm": 0.7639156579971313,
+      "learning_rate": 0.0002,
+      "loss": 1.7244,
+      "step": 930
+    },
+    {
+      "epoch": 1.5284552845528454,
+      "grad_norm": 0.6948140859603882,
+      "learning_rate": 0.0002,
+      "loss": 1.6836,
+      "step": 940
+    },
+    {
+      "epoch": 1.5447154471544715,
+      "grad_norm": 0.6887956261634827,
+      "learning_rate": 0.0002,
+      "loss": 1.6479,
+      "step": 950
+    },
+    {
+      "epoch": 1.5609756097560976,
+      "grad_norm": 0.7226824164390564,
+      "learning_rate": 0.0002,
+      "loss": 1.7285,
+      "step": 960
+    },
+    {
+      "epoch": 1.5772357723577235,
+      "grad_norm": 0.6753950715065002,
+      "learning_rate": 0.0002,
+      "loss": 1.6214,
+      "step": 970
+    },
+    {
+      "epoch": 1.5934959349593496,
+      "grad_norm": 0.6580971479415894,
+      "learning_rate": 0.0002,
+      "loss": 1.7283,
+      "step": 980
+    },
+    {
+      "epoch": 1.6097560975609757,
+      "grad_norm": 0.7157843112945557,
+      "learning_rate": 0.0002,
+      "loss": 1.6671,
+      "step": 990
+    },
+    {
+      "epoch": 1.6260162601626016,
+      "grad_norm": 0.6736738681793213,
+      "learning_rate": 0.0002,
+      "loss": 1.645,
+      "step": 1000
+    },
+    {
+      "epoch": 1.6422764227642277,
+      "grad_norm": 0.5271940231323242,
+      "learning_rate": 0.0002,
+      "loss": 1.6589,
+      "step": 1010
+    },
+    {
+      "epoch": 1.6585365853658538,
+      "grad_norm": 0.6378998160362244,
+      "learning_rate": 0.0002,
+      "loss": 1.7358,
+      "step": 1020
+    },
+    {
+      "epoch": 1.6747967479674797,
+      "grad_norm": 0.6498209834098816,
+      "learning_rate": 0.0002,
+      "loss": 1.6924,
+      "step": 1030
+    },
+    {
+      "epoch": 1.6910569105691056,
+      "grad_norm": 0.7050761580467224,
+      "learning_rate": 0.0002,
+      "loss": 1.6253,
+      "step": 1040
+    },
+    {
+      "epoch": 1.7073170731707317,
+      "grad_norm": 0.7122200131416321,
+      "learning_rate": 0.0002,
+      "loss": 1.7146,
+      "step": 1050
+    },
+    {
+      "epoch": 1.7235772357723578,
+      "grad_norm": 0.6705704927444458,
+      "learning_rate": 0.0002,
+      "loss": 1.6511,
+      "step": 1060
+    },
+    {
+      "epoch": 1.7398373983739837,
+      "grad_norm": 0.6859356760978699,
+      "learning_rate": 0.0002,
+      "loss": 1.6506,
+      "step": 1070
+    },
+    {
+      "epoch": 1.7560975609756098,
+      "grad_norm": 0.6540971994400024,
+      "learning_rate": 0.0002,
+      "loss": 1.6562,
+      "step": 1080
+    },
+    {
+      "epoch": 1.7723577235772359,
+      "grad_norm": 0.6297651529312134,
+      "learning_rate": 0.0002,
+      "loss": 1.6627,
+      "step": 1090
+    },
+    {
+      "epoch": 1.7886178861788617,
+      "grad_norm": 0.6645651459693909,
+      "learning_rate": 0.0002,
+      "loss": 1.704,
+      "step": 1100
+    },
+    {
+      "epoch": 1.8048780487804879,
+      "grad_norm": 0.6450296640396118,
+      "learning_rate": 0.0002,
+      "loss": 1.6908,
+      "step": 1110
+    },
+    {
+      "epoch": 1.821138211382114,
+      "grad_norm": 0.7785659432411194,
+      "learning_rate": 0.0002,
+      "loss": 1.7642,
+      "step": 1120
+    },
+    {
+      "epoch": 1.8373983739837398,
+      "grad_norm": 0.6845982670783997,
+      "learning_rate": 0.0002,
+      "loss": 1.6773,
+      "step": 1130
+    },
+    {
+      "epoch": 1.8536585365853657,
+      "grad_norm": 0.699683666229248,
+      "learning_rate": 0.0002,
+      "loss": 1.6879,
+      "step": 1140
+    },
+    {
+      "epoch": 1.8699186991869918,
+      "grad_norm": 0.6600332856178284,
+      "learning_rate": 0.0002,
+      "loss": 1.7162,
+      "step": 1150
+    },
+    {
+      "epoch": 1.886178861788618,
+      "grad_norm": 0.7301949262619019,
+      "learning_rate": 0.0002,
+      "loss": 1.7291,
+      "step": 1160
+    },
+    {
+      "epoch": 1.9024390243902438,
+      "grad_norm": 0.8183556795120239,
+      "learning_rate": 0.0002,
+      "loss": 1.6874,
+      "step": 1170
+    },
+    {
+      "epoch": 1.91869918699187,
+      "grad_norm": 0.7122833132743835,
+      "learning_rate": 0.0002,
+      "loss": 1.6779,
+      "step": 1180
+    },
+    {
+      "epoch": 1.934959349593496,
+      "grad_norm": 0.6391404271125793,
+      "learning_rate": 0.0002,
+      "loss": 1.7361,
+      "step": 1190
+    },
+    {
+      "epoch": 1.951219512195122,
+      "grad_norm": 0.6136474013328552,
+      "learning_rate": 0.0002,
+      "loss": 1.7188,
+      "step": 1200
+    },
+    {
+      "epoch": 1.967479674796748,
+      "grad_norm": 0.7704503536224365,
+      "learning_rate": 0.0002,
+      "loss": 1.6536,
+      "step": 1210
+    },
+    {
+      "epoch": 1.9837398373983741,
+      "grad_norm": 0.6155434846878052,
+      "learning_rate": 0.0002,
+      "loss": 1.6735,
+      "step": 1220
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.6262536644935608,
+      "learning_rate": 0.0002,
+      "loss": 1.6534,
+      "step": 1230
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.7886285781860352,
+      "eval_runtime": 98.7888,
+      "eval_samples_per_second": 5.395,
+      "eval_steps_per_second": 0.678,
+      "step": 1230
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 4920,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 5.3965913849856e+16,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:794da760599f5a7e302e2faa616ba0185215c069e7fa3436832bde34c7f2ec7b
+size 5560

Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-1845/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:45a8e9d5384381b3e8903830b6e34c51316cfca1cad200aac53e47837137e391
+size 109069176

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2f22849adf6fbc0690380f8ad61bd0e75641c0a742ae078d0df0dd5709cdd960
+size 55532666

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f506141fde9fefcbfd2fbdf5037d513aea18a05dfdfb5d36fd7515adf2079d2e
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d9ecf3cbe2bdeeb71fbeaf36b416466e300cc4d67381f2ab44ce5a65f3569ea2
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,1345 @@

+{
+  "best_metric": 1.7831426858901978,
+  "best_model_checkpoint": "outputs-001/Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-615",
+  "epoch": 3.0,
+  "eval_steps": 10,
+  "global_step": 1845,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.016260162601626018,
+      "grad_norm": 1.903406023979187,
+      "learning_rate": 0.0002,
+      "loss": 2.6243,
+      "step": 10
+    },
+    {
+      "epoch": 0.032520325203252036,
+      "grad_norm": 1.0100678205490112,
+      "learning_rate": 0.0002,
+      "loss": 2.2084,
+      "step": 20
+    },
+    {
+      "epoch": 0.04878048780487805,
+      "grad_norm": 0.7413098812103271,
+      "learning_rate": 0.0002,
+      "loss": 2.0423,
+      "step": 30
+    },
+    {
+      "epoch": 0.06504065040650407,
+      "grad_norm": 0.7540805339813232,
+      "learning_rate": 0.0002,
+      "loss": 2.0647,
+      "step": 40
+    },
+    {
+      "epoch": 0.08130081300813008,
+      "grad_norm": 0.6508658528327942,
+      "learning_rate": 0.0002,
+      "loss": 2.0926,
+      "step": 50
+    },
+    {
+      "epoch": 0.0975609756097561,
+      "grad_norm": 0.7228319048881531,
+      "learning_rate": 0.0002,
+      "loss": 1.878,
+      "step": 60
+    },
+    {
+      "epoch": 0.11382113821138211,
+      "grad_norm": 0.6510937213897705,
+      "learning_rate": 0.0002,
+      "loss": 1.8672,
+      "step": 70
+    },
+    {
+      "epoch": 0.13008130081300814,
+      "grad_norm": 0.7238746881484985,
+      "learning_rate": 0.0002,
+      "loss": 1.8592,
+      "step": 80
+    },
+    {
+      "epoch": 0.14634146341463414,
+      "grad_norm": 0.7530466318130493,
+      "learning_rate": 0.0002,
+      "loss": 1.8541,
+      "step": 90
+    },
+    {
+      "epoch": 0.16260162601626016,
+      "grad_norm": 0.622166097164154,
+      "learning_rate": 0.0002,
+      "loss": 1.8245,
+      "step": 100
+    },
+    {
+      "epoch": 0.17886178861788618,
+      "grad_norm": 0.6180148720741272,
+      "learning_rate": 0.0002,
+      "loss": 1.7581,
+      "step": 110
+    },
+    {
+      "epoch": 0.1951219512195122,
+      "grad_norm": 0.6221362352371216,
+      "learning_rate": 0.0002,
+      "loss": 1.7741,
+      "step": 120
+    },
+    {
+      "epoch": 0.21138211382113822,
+      "grad_norm": 0.569580078125,
+      "learning_rate": 0.0002,
+      "loss": 1.7575,
+      "step": 130
+    },
+    {
+      "epoch": 0.22764227642276422,
+      "grad_norm": 0.6962840557098389,
+      "learning_rate": 0.0002,
+      "loss": 1.7833,
+      "step": 140
+    },
+    {
+      "epoch": 0.24390243902439024,
+      "grad_norm": 0.644322395324707,
+      "learning_rate": 0.0002,
+      "loss": 1.8329,
+      "step": 150
+    },
+    {
+      "epoch": 0.2601626016260163,
+      "grad_norm": 0.5970060229301453,
+      "learning_rate": 0.0002,
+      "loss": 1.7794,
+      "step": 160
+    },
+    {
+      "epoch": 0.2764227642276423,
+      "grad_norm": 0.6249210834503174,
+      "learning_rate": 0.0002,
+      "loss": 1.8521,
+      "step": 170
+    },
+    {
+      "epoch": 0.2926829268292683,
+      "grad_norm": 0.7134785652160645,
+      "learning_rate": 0.0002,
+      "loss": 1.8066,
+      "step": 180
+    },
+    {
+      "epoch": 0.3089430894308943,
+      "grad_norm": 0.5477158427238464,
+      "learning_rate": 0.0002,
+      "loss": 1.8815,
+      "step": 190
+    },
+    {
+      "epoch": 0.3252032520325203,
+      "grad_norm": 0.6054863333702087,
+      "learning_rate": 0.0002,
+      "loss": 1.7222,
+      "step": 200
+    },
+    {
+      "epoch": 0.34146341463414637,
+      "grad_norm": 0.5664568543434143,
+      "learning_rate": 0.0002,
+      "loss": 1.7598,
+      "step": 210
+    },
+    {
+      "epoch": 0.35772357723577236,
+      "grad_norm": 0.5942816734313965,
+      "learning_rate": 0.0002,
+      "loss": 1.7688,
+      "step": 220
+    },
+    {
+      "epoch": 0.37398373983739835,
+      "grad_norm": 0.6311767101287842,
+      "learning_rate": 0.0002,
+      "loss": 1.7715,
+      "step": 230
+    },
+    {
+      "epoch": 0.3902439024390244,
+      "grad_norm": 0.6614870429039001,
+      "learning_rate": 0.0002,
+      "loss": 1.7663,
+      "step": 240
+    },
+    {
+      "epoch": 0.4065040650406504,
+      "grad_norm": 0.5644984841346741,
+      "learning_rate": 0.0002,
+      "loss": 1.8045,
+      "step": 250
+    },
+    {
+      "epoch": 0.42276422764227645,
+      "grad_norm": 0.7260110974311829,
+      "learning_rate": 0.0002,
+      "loss": 1.7364,
+      "step": 260
+    },
+    {
+      "epoch": 0.43902439024390244,
+      "grad_norm": 0.6733413934707642,
+      "learning_rate": 0.0002,
+      "loss": 1.7606,
+      "step": 270
+    },
+    {
+      "epoch": 0.45528455284552843,
+      "grad_norm": 0.5211837887763977,
+      "learning_rate": 0.0002,
+      "loss": 1.8432,
+      "step": 280
+    },
+    {
+      "epoch": 0.4715447154471545,
+      "grad_norm": 0.5538370013237,
+      "learning_rate": 0.0002,
+      "loss": 1.9166,
+      "step": 290
+    },
+    {
+      "epoch": 0.4878048780487805,
+      "grad_norm": 0.5429130792617798,
+      "learning_rate": 0.0002,
+      "loss": 1.8391,
+      "step": 300
+    },
+    {
+      "epoch": 0.5040650406504065,
+      "grad_norm": 0.517801821231842,
+      "learning_rate": 0.0002,
+      "loss": 1.8072,
+      "step": 310
+    },
+    {
+      "epoch": 0.5203252032520326,
+      "grad_norm": 0.6029635667800903,
+      "learning_rate": 0.0002,
+      "loss": 1.8045,
+      "step": 320
+    },
+    {
+      "epoch": 0.5365853658536586,
+      "grad_norm": 0.506401002407074,
+      "learning_rate": 0.0002,
+      "loss": 1.7734,
+      "step": 330
+    },
+    {
+      "epoch": 0.5528455284552846,
+      "grad_norm": 0.5226597189903259,
+      "learning_rate": 0.0002,
+      "loss": 1.7923,
+      "step": 340
+    },
+    {
+      "epoch": 0.5691056910569106,
+      "grad_norm": 0.5899750590324402,
+      "learning_rate": 0.0002,
+      "loss": 1.7625,
+      "step": 350
+    },
+    {
+      "epoch": 0.5853658536585366,
+      "grad_norm": 0.6185210943222046,
+      "learning_rate": 0.0002,
+      "loss": 1.828,
+      "step": 360
+    },
+    {
+      "epoch": 0.6016260162601627,
+      "grad_norm": 0.8088458180427551,
+      "learning_rate": 0.0002,
+      "loss": 1.8358,
+      "step": 370
+    },
+    {
+      "epoch": 0.6178861788617886,
+      "grad_norm": 0.509591817855835,
+      "learning_rate": 0.0002,
+      "loss": 1.8351,
+      "step": 380
+    },
+    {
+      "epoch": 0.6341463414634146,
+      "grad_norm": 0.5209569931030273,
+      "learning_rate": 0.0002,
+      "loss": 1.7849,
+      "step": 390
+    },
+    {
+      "epoch": 0.6504065040650406,
+      "grad_norm": 0.50320965051651,
+      "learning_rate": 0.0002,
+      "loss": 1.7925,
+      "step": 400
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 0.5555663108825684,
+      "learning_rate": 0.0002,
+      "loss": 1.795,
+      "step": 410
+    },
+    {
+      "epoch": 0.6829268292682927,
+      "grad_norm": 0.5865469574928284,
+      "learning_rate": 0.0002,
+      "loss": 1.7562,
+      "step": 420
+    },
+    {
+      "epoch": 0.6991869918699187,
+      "grad_norm": 0.5288474559783936,
+      "learning_rate": 0.0002,
+      "loss": 1.7869,
+      "step": 430
+    },
+    {
+      "epoch": 0.7154471544715447,
+      "grad_norm": 0.5364211797714233,
+      "learning_rate": 0.0002,
+      "loss": 1.8046,
+      "step": 440
+    },
+    {
+      "epoch": 0.7317073170731707,
+      "grad_norm": 0.5877127051353455,
+      "learning_rate": 0.0002,
+      "loss": 1.8124,
+      "step": 450
+    },
+    {
+      "epoch": 0.7479674796747967,
+      "grad_norm": 0.5993741154670715,
+      "learning_rate": 0.0002,
+      "loss": 1.7938,
+      "step": 460
+    },
+    {
+      "epoch": 0.7642276422764228,
+      "grad_norm": 0.4871112108230591,
+      "learning_rate": 0.0002,
+      "loss": 1.8034,
+      "step": 470
+    },
+    {
+      "epoch": 0.7804878048780488,
+      "grad_norm": 0.5300846099853516,
+      "learning_rate": 0.0002,
+      "loss": 1.7798,
+      "step": 480
+    },
+    {
+      "epoch": 0.7967479674796748,
+      "grad_norm": 0.5623212456703186,
+      "learning_rate": 0.0002,
+      "loss": 1.7772,
+      "step": 490
+    },
+    {
+      "epoch": 0.8130081300813008,
+      "grad_norm": 0.5131309032440186,
+      "learning_rate": 0.0002,
+      "loss": 1.7207,
+      "step": 500
+    },
+    {
+      "epoch": 0.8292682926829268,
+      "grad_norm": 0.49512147903442383,
+      "learning_rate": 0.0002,
+      "loss": 1.7143,
+      "step": 510
+    },
+    {
+      "epoch": 0.8455284552845529,
+      "grad_norm": 0.6260727643966675,
+      "learning_rate": 0.0002,
+      "loss": 1.8032,
+      "step": 520
+    },
+    {
+      "epoch": 0.8617886178861789,
+      "grad_norm": 0.5796844959259033,
+      "learning_rate": 0.0002,
+      "loss": 1.8292,
+      "step": 530
+    },
+    {
+      "epoch": 0.8780487804878049,
+      "grad_norm": 0.615927517414093,
+      "learning_rate": 0.0002,
+      "loss": 1.7775,
+      "step": 540
+    },
+    {
+      "epoch": 0.8943089430894309,
+      "grad_norm": 0.5230891704559326,
+      "learning_rate": 0.0002,
+      "loss": 1.7254,
+      "step": 550
+    },
+    {
+      "epoch": 0.9105691056910569,
+      "grad_norm": 0.5990992784500122,
+      "learning_rate": 0.0002,
+      "loss": 1.8126,
+      "step": 560
+    },
+    {
+      "epoch": 0.926829268292683,
+      "grad_norm": 0.538957417011261,
+      "learning_rate": 0.0002,
+      "loss": 1.8551,
+      "step": 570
+    },
+    {
+      "epoch": 0.943089430894309,
+      "grad_norm": 0.556900680065155,
+      "learning_rate": 0.0002,
+      "loss": 1.791,
+      "step": 580
+    },
+    {
+      "epoch": 0.959349593495935,
+      "grad_norm": 0.6459956765174866,
+      "learning_rate": 0.0002,
+      "loss": 1.8799,
+      "step": 590
+    },
+    {
+      "epoch": 0.975609756097561,
+      "grad_norm": 0.5648245215415955,
+      "learning_rate": 0.0002,
+      "loss": 1.774,
+      "step": 600
+    },
+    {
+      "epoch": 0.991869918699187,
+      "grad_norm": 0.5341294407844543,
+      "learning_rate": 0.0002,
+      "loss": 1.7746,
+      "step": 610
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 1.7831426858901978,
+      "eval_runtime": 98.419,
+      "eval_samples_per_second": 5.416,
+      "eval_steps_per_second": 0.681,
+      "step": 615
+    },
+    {
+      "epoch": 1.008130081300813,
+      "grad_norm": 0.49698150157928467,
+      "learning_rate": 0.0002,
+      "loss": 1.7212,
+      "step": 620
+    },
+    {
+      "epoch": 1.024390243902439,
+      "grad_norm": 0.696890652179718,
+      "learning_rate": 0.0002,
+      "loss": 1.7379,
+      "step": 630
+    },
+    {
+      "epoch": 1.040650406504065,
+      "grad_norm": 0.5939123630523682,
+      "learning_rate": 0.0002,
+      "loss": 1.6391,
+      "step": 640
+    },
+    {
+      "epoch": 1.056910569105691,
+      "grad_norm": 0.5630994439125061,
+      "learning_rate": 0.0002,
+      "loss": 1.712,
+      "step": 650
+    },
+    {
+      "epoch": 1.0731707317073171,
+      "grad_norm": 0.5783666968345642,
+      "learning_rate": 0.0002,
+      "loss": 1.6401,
+      "step": 660
+    },
+    {
+      "epoch": 1.089430894308943,
+      "grad_norm": 0.6006693840026855,
+      "learning_rate": 0.0002,
+      "loss": 1.7158,
+      "step": 670
+    },
+    {
+      "epoch": 1.1056910569105691,
+      "grad_norm": 0.6544332504272461,
+      "learning_rate": 0.0002,
+      "loss": 1.7434,
+      "step": 680
+    },
+    {
+      "epoch": 1.1219512195121952,
+      "grad_norm": 0.6734776496887207,
+      "learning_rate": 0.0002,
+      "loss": 1.6562,
+      "step": 690
+    },
+    {
+      "epoch": 1.1382113821138211,
+      "grad_norm": 0.6067698001861572,
+      "learning_rate": 0.0002,
+      "loss": 1.6724,
+      "step": 700
+    },
+    {
+      "epoch": 1.1544715447154472,
+      "grad_norm": 0.6639267802238464,
+      "learning_rate": 0.0002,
+      "loss": 1.6932,
+      "step": 710
+    },
+    {
+      "epoch": 1.170731707317073,
+      "grad_norm": 0.5179714560508728,
+      "learning_rate": 0.0002,
+      "loss": 1.6562,
+      "step": 720
+    },
+    {
+      "epoch": 1.1869918699186992,
+      "grad_norm": 0.7320363521575928,
+      "learning_rate": 0.0002,
+      "loss": 1.6811,
+      "step": 730
+    },
+    {
+      "epoch": 1.203252032520325,
+      "grad_norm": 0.689231276512146,
+      "learning_rate": 0.0002,
+      "loss": 1.5619,
+      "step": 740
+    },
+    {
+      "epoch": 1.2195121951219512,
+      "grad_norm": 0.6605235934257507,
+      "learning_rate": 0.0002,
+      "loss": 1.5688,
+      "step": 750
+    },
+    {
+      "epoch": 1.2357723577235773,
+      "grad_norm": 0.7013542056083679,
+      "learning_rate": 0.0002,
+      "loss": 1.7045,
+      "step": 760
+    },
+    {
+      "epoch": 1.2520325203252032,
+      "grad_norm": 0.6349928975105286,
+      "learning_rate": 0.0002,
+      "loss": 1.6857,
+      "step": 770
+    },
+    {
+      "epoch": 1.2682926829268293,
+      "grad_norm": 0.6362272500991821,
+      "learning_rate": 0.0002,
+      "loss": 1.6767,
+      "step": 780
+    },
+    {
+      "epoch": 1.2845528455284554,
+      "grad_norm": 0.6152030229568481,
+      "learning_rate": 0.0002,
+      "loss": 1.6594,
+      "step": 790
+    },
+    {
+      "epoch": 1.3008130081300813,
+      "grad_norm": 0.6406176686286926,
+      "learning_rate": 0.0002,
+      "loss": 1.7542,
+      "step": 800
+    },
+    {
+      "epoch": 1.3170731707317074,
+      "grad_norm": 0.6099124550819397,
+      "learning_rate": 0.0002,
+      "loss": 1.7243,
+      "step": 810
+    },
+    {
+      "epoch": 1.3333333333333333,
+      "grad_norm": 0.6298971772193909,
+      "learning_rate": 0.0002,
+      "loss": 1.6642,
+      "step": 820
+    },
+    {
+      "epoch": 1.3495934959349594,
+      "grad_norm": 0.775223433971405,
+      "learning_rate": 0.0002,
+      "loss": 1.6901,
+      "step": 830
+    },
+    {
+      "epoch": 1.3658536585365852,
+      "grad_norm": 0.7261736392974854,
+      "learning_rate": 0.0002,
+      "loss": 1.6284,
+      "step": 840
+    },
+    {
+      "epoch": 1.3821138211382114,
+      "grad_norm": 0.6321929097175598,
+      "learning_rate": 0.0002,
+      "loss": 1.6671,
+      "step": 850
+    },
+    {
+      "epoch": 1.3983739837398375,
+      "grad_norm": 0.7564281225204468,
+      "learning_rate": 0.0002,
+      "loss": 1.7036,
+      "step": 860
+    },
+    {
+      "epoch": 1.4146341463414633,
+      "grad_norm": 0.6329448819160461,
+      "learning_rate": 0.0002,
+      "loss": 1.7014,
+      "step": 870
+    },
+    {
+      "epoch": 1.4308943089430894,
+      "grad_norm": 0.6288684606552124,
+      "learning_rate": 0.0002,
+      "loss": 1.6398,
+      "step": 880
+    },
+    {
+      "epoch": 1.4471544715447155,
+      "grad_norm": 0.6165404915809631,
+      "learning_rate": 0.0002,
+      "loss": 1.673,
+      "step": 890
+    },
+    {
+      "epoch": 1.4634146341463414,
+      "grad_norm": 0.6124468445777893,
+      "learning_rate": 0.0002,
+      "loss": 1.6668,
+      "step": 900
+    },
+    {
+      "epoch": 1.4796747967479675,
+      "grad_norm": 0.7038629651069641,
+      "learning_rate": 0.0002,
+      "loss": 1.6879,
+      "step": 910
+    },
+    {
+      "epoch": 1.4959349593495934,
+      "grad_norm": 0.5755146145820618,
+      "learning_rate": 0.0002,
+      "loss": 1.6701,
+      "step": 920
+    },
+    {
+      "epoch": 1.5121951219512195,
+      "grad_norm": 0.7639156579971313,
+      "learning_rate": 0.0002,
+      "loss": 1.7244,
+      "step": 930
+    },
+    {
+      "epoch": 1.5284552845528454,
+      "grad_norm": 0.6948140859603882,
+      "learning_rate": 0.0002,
+      "loss": 1.6836,
+      "step": 940
+    },
+    {
+      "epoch": 1.5447154471544715,
+      "grad_norm": 0.6887956261634827,
+      "learning_rate": 0.0002,
+      "loss": 1.6479,
+      "step": 950
+    },
+    {
+      "epoch": 1.5609756097560976,
+      "grad_norm": 0.7226824164390564,
+      "learning_rate": 0.0002,
+      "loss": 1.7285,
+      "step": 960
+    },
+    {
+      "epoch": 1.5772357723577235,
+      "grad_norm": 0.6753950715065002,
+      "learning_rate": 0.0002,
+      "loss": 1.6214,
+      "step": 970
+    },
+    {
+      "epoch": 1.5934959349593496,
+      "grad_norm": 0.6580971479415894,
+      "learning_rate": 0.0002,
+      "loss": 1.7283,
+      "step": 980
+    },
+    {
+      "epoch": 1.6097560975609757,
+      "grad_norm": 0.7157843112945557,
+      "learning_rate": 0.0002,
+      "loss": 1.6671,
+      "step": 990
+    },
+    {
+      "epoch": 1.6260162601626016,
+      "grad_norm": 0.6736738681793213,
+      "learning_rate": 0.0002,
+      "loss": 1.645,
+      "step": 1000
+    },
+    {
+      "epoch": 1.6422764227642277,
+      "grad_norm": 0.5271940231323242,
+      "learning_rate": 0.0002,
+      "loss": 1.6589,
+      "step": 1010
+    },
+    {
+      "epoch": 1.6585365853658538,
+      "grad_norm": 0.6378998160362244,
+      "learning_rate": 0.0002,
+      "loss": 1.7358,
+      "step": 1020
+    },
+    {
+      "epoch": 1.6747967479674797,
+      "grad_norm": 0.6498209834098816,
+      "learning_rate": 0.0002,
+      "loss": 1.6924,
+      "step": 1030
+    },
+    {
+      "epoch": 1.6910569105691056,
+      "grad_norm": 0.7050761580467224,
+      "learning_rate": 0.0002,
+      "loss": 1.6253,
+      "step": 1040
+    },
+    {
+      "epoch": 1.7073170731707317,
+      "grad_norm": 0.7122200131416321,
+      "learning_rate": 0.0002,
+      "loss": 1.7146,
+      "step": 1050
+    },
+    {
+      "epoch": 1.7235772357723578,
+      "grad_norm": 0.6705704927444458,
+      "learning_rate": 0.0002,
+      "loss": 1.6511,
+      "step": 1060
+    },
+    {
+      "epoch": 1.7398373983739837,
+      "grad_norm": 0.6859356760978699,
+      "learning_rate": 0.0002,
+      "loss": 1.6506,
+      "step": 1070
+    },
+    {
+      "epoch": 1.7560975609756098,
+      "grad_norm": 0.6540971994400024,
+      "learning_rate": 0.0002,
+      "loss": 1.6562,
+      "step": 1080
+    },
+    {
+      "epoch": 1.7723577235772359,
+      "grad_norm": 0.6297651529312134,
+      "learning_rate": 0.0002,
+      "loss": 1.6627,
+      "step": 1090
+    },
+    {
+      "epoch": 1.7886178861788617,
+      "grad_norm": 0.6645651459693909,
+      "learning_rate": 0.0002,
+      "loss": 1.704,
+      "step": 1100
+    },
+    {
+      "epoch": 1.8048780487804879,
+      "grad_norm": 0.6450296640396118,
+      "learning_rate": 0.0002,
+      "loss": 1.6908,
+      "step": 1110
+    },
+    {
+      "epoch": 1.821138211382114,
+      "grad_norm": 0.7785659432411194,
+      "learning_rate": 0.0002,
+      "loss": 1.7642,
+      "step": 1120
+    },
+    {
+      "epoch": 1.8373983739837398,
+      "grad_norm": 0.6845982670783997,
+      "learning_rate": 0.0002,
+      "loss": 1.6773,
+      "step": 1130
+    },
+    {
+      "epoch": 1.8536585365853657,
+      "grad_norm": 0.699683666229248,
+      "learning_rate": 0.0002,
+      "loss": 1.6879,
+      "step": 1140
+    },
+    {
+      "epoch": 1.8699186991869918,
+      "grad_norm": 0.6600332856178284,
+      "learning_rate": 0.0002,
+      "loss": 1.7162,
+      "step": 1150
+    },
+    {
+      "epoch": 1.886178861788618,
+      "grad_norm": 0.7301949262619019,
+      "learning_rate": 0.0002,
+      "loss": 1.7291,
+      "step": 1160
+    },
+    {
+      "epoch": 1.9024390243902438,
+      "grad_norm": 0.8183556795120239,
+      "learning_rate": 0.0002,
+      "loss": 1.6874,
+      "step": 1170
+    },
+    {
+      "epoch": 1.91869918699187,
+      "grad_norm": 0.7122833132743835,
+      "learning_rate": 0.0002,
+      "loss": 1.6779,
+      "step": 1180
+    },
+    {
+      "epoch": 1.934959349593496,
+      "grad_norm": 0.6391404271125793,
+      "learning_rate": 0.0002,
+      "loss": 1.7361,
+      "step": 1190
+    },
+    {
+      "epoch": 1.951219512195122,
+      "grad_norm": 0.6136474013328552,
+      "learning_rate": 0.0002,
+      "loss": 1.7188,
+      "step": 1200
+    },
+    {
+      "epoch": 1.967479674796748,
+      "grad_norm": 0.7704503536224365,
+      "learning_rate": 0.0002,
+      "loss": 1.6536,
+      "step": 1210
+    },
+    {
+      "epoch": 1.9837398373983741,
+      "grad_norm": 0.6155434846878052,
+      "learning_rate": 0.0002,
+      "loss": 1.6735,
+      "step": 1220
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.6262536644935608,
+      "learning_rate": 0.0002,
+      "loss": 1.6534,
+      "step": 1230
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.7886285781860352,
+      "eval_runtime": 98.7888,
+      "eval_samples_per_second": 5.395,
+      "eval_steps_per_second": 0.678,
+      "step": 1230
+    },
+    {
+      "epoch": 2.016260162601626,
+      "grad_norm": 0.9827656149864197,
+      "learning_rate": 0.0002,
+      "loss": 1.5135,
+      "step": 1240
+    },
+    {
+      "epoch": 2.032520325203252,
+      "grad_norm": 0.8443078398704529,
+      "learning_rate": 0.0002,
+      "loss": 1.5396,
+      "step": 1250
+    },
+    {
+      "epoch": 2.048780487804878,
+      "grad_norm": 0.9006391763687134,
+      "learning_rate": 0.0002,
+      "loss": 1.5326,
+      "step": 1260
+    },
+    {
+      "epoch": 2.065040650406504,
+      "grad_norm": 0.7900105714797974,
+      "learning_rate": 0.0002,
+      "loss": 1.5176,
+      "step": 1270
+    },
+    {
+      "epoch": 2.08130081300813,
+      "grad_norm": 0.6430686116218567,
+      "learning_rate": 0.0002,
+      "loss": 1.5578,
+      "step": 1280
+    },
+    {
+      "epoch": 2.097560975609756,
+      "grad_norm": 0.8998992443084717,
+      "learning_rate": 0.0002,
+      "loss": 1.5453,
+      "step": 1290
+    },
+    {
+      "epoch": 2.113821138211382,
+      "grad_norm": 0.7658976316452026,
+      "learning_rate": 0.0002,
+      "loss": 1.5051,
+      "step": 1300
+    },
+    {
+      "epoch": 2.130081300813008,
+      "grad_norm": 0.9033166766166687,
+      "learning_rate": 0.0002,
+      "loss": 1.5005,
+      "step": 1310
+    },
+    {
+      "epoch": 2.1463414634146343,
+      "grad_norm": 0.7942133545875549,
+      "learning_rate": 0.0002,
+      "loss": 1.5517,
+      "step": 1320
+    },
+    {
+      "epoch": 2.16260162601626,
+      "grad_norm": 0.8496367931365967,
+      "learning_rate": 0.0002,
+      "loss": 1.5248,
+      "step": 1330
+    },
+    {
+      "epoch": 2.178861788617886,
+      "grad_norm": 0.8638061881065369,
+      "learning_rate": 0.0002,
+      "loss": 1.4887,
+      "step": 1340
+    },
+    {
+      "epoch": 2.1951219512195124,
+      "grad_norm": 0.9003657102584839,
+      "learning_rate": 0.0002,
+      "loss": 1.5023,
+      "step": 1350
+    },
+    {
+      "epoch": 2.2113821138211383,
+      "grad_norm": 0.8387648463249207,
+      "learning_rate": 0.0002,
+      "loss": 1.4904,
+      "step": 1360
+    },
+    {
+      "epoch": 2.227642276422764,
+      "grad_norm": 0.7598716616630554,
+      "learning_rate": 0.0002,
+      "loss": 1.5553,
+      "step": 1370
+    },
+    {
+      "epoch": 2.2439024390243905,
+      "grad_norm": 0.872882604598999,
+      "learning_rate": 0.0002,
+      "loss": 1.4723,
+      "step": 1380
+    },
+    {
+      "epoch": 2.2601626016260163,
+      "grad_norm": 0.8919313549995422,
+      "learning_rate": 0.0002,
+      "loss": 1.5121,
+      "step": 1390
+    },
+    {
+      "epoch": 2.2764227642276422,
+      "grad_norm": 0.9646918773651123,
+      "learning_rate": 0.0002,
+      "loss": 1.5162,
+      "step": 1400
+    },
+    {
+      "epoch": 2.292682926829268,
+      "grad_norm": 0.8501992225646973,
+      "learning_rate": 0.0002,
+      "loss": 1.5163,
+      "step": 1410
+    },
+    {
+      "epoch": 2.3089430894308944,
+      "grad_norm": 0.7517067790031433,
+      "learning_rate": 0.0002,
+      "loss": 1.5096,
+      "step": 1420
+    },
+    {
+      "epoch": 2.3252032520325203,
+      "grad_norm": 0.9097304940223694,
+      "learning_rate": 0.0002,
+      "loss": 1.5359,
+      "step": 1430
+    },
+    {
+      "epoch": 2.341463414634146,
+      "grad_norm": 0.8515191674232483,
+      "learning_rate": 0.0002,
+      "loss": 1.4843,
+      "step": 1440
+    },
+    {
+      "epoch": 2.3577235772357725,
+      "grad_norm": 0.8925113677978516,
+      "learning_rate": 0.0002,
+      "loss": 1.5021,
+      "step": 1450
+    },
+    {
+      "epoch": 2.3739837398373984,
+      "grad_norm": 1.0194441080093384,
+      "learning_rate": 0.0002,
+      "loss": 1.4235,
+      "step": 1460
+    },
+    {
+      "epoch": 2.3902439024390243,
+      "grad_norm": 0.9004436731338501,
+      "learning_rate": 0.0002,
+      "loss": 1.5778,
+      "step": 1470
+    },
+    {
+      "epoch": 2.40650406504065,
+      "grad_norm": 0.9552311897277832,
+      "learning_rate": 0.0002,
+      "loss": 1.5623,
+      "step": 1480
+    },
+    {
+      "epoch": 2.4227642276422765,
+      "grad_norm": 0.9185764789581299,
+      "learning_rate": 0.0002,
+      "loss": 1.5507,
+      "step": 1490
+    },
+    {
+      "epoch": 2.4390243902439024,
+      "grad_norm": 0.7935037016868591,
+      "learning_rate": 0.0002,
+      "loss": 1.5058,
+      "step": 1500
+    },
+    {
+      "epoch": 2.4552845528455283,
+      "grad_norm": 0.8124602437019348,
+      "learning_rate": 0.0002,
+      "loss": 1.5374,
+      "step": 1510
+    },
+    {
+      "epoch": 2.4715447154471546,
+      "grad_norm": 0.7927430272102356,
+      "learning_rate": 0.0002,
+      "loss": 1.4553,
+      "step": 1520
+    },
+    {
+      "epoch": 2.4878048780487805,
+      "grad_norm": 0.9143779873847961,
+      "learning_rate": 0.0002,
+      "loss": 1.5135,
+      "step": 1530
+    },
+    {
+      "epoch": 2.5040650406504064,
+      "grad_norm": 0.938185453414917,
+      "learning_rate": 0.0002,
+      "loss": 1.4842,
+      "step": 1540
+    },
+    {
+      "epoch": 2.5203252032520327,
+      "grad_norm": 0.9858708381652832,
+      "learning_rate": 0.0002,
+      "loss": 1.5983,
+      "step": 1550
+    },
+    {
+      "epoch": 2.5365853658536586,
+      "grad_norm": 0.9211642742156982,
+      "learning_rate": 0.0002,
+      "loss": 1.5464,
+      "step": 1560
+    },
+    {
+      "epoch": 2.5528455284552845,
+      "grad_norm": 0.9824395775794983,
+      "learning_rate": 0.0002,
+      "loss": 1.5293,
+      "step": 1570
+    },
+    {
+      "epoch": 2.569105691056911,
+      "grad_norm": 0.916930615901947,
+      "learning_rate": 0.0002,
+      "loss": 1.5559,
+      "step": 1580
+    },
+    {
+      "epoch": 2.5853658536585367,
+      "grad_norm": 0.9336596727371216,
+      "learning_rate": 0.0002,
+      "loss": 1.5581,
+      "step": 1590
+    },
+    {
+      "epoch": 2.6016260162601625,
+      "grad_norm": 0.9006481170654297,
+      "learning_rate": 0.0002,
+      "loss": 1.5379,
+      "step": 1600
+    },
+    {
+      "epoch": 2.617886178861789,
+      "grad_norm": 0.8296214938163757,
+      "learning_rate": 0.0002,
+      "loss": 1.5254,
+      "step": 1610
+    },
+    {
+      "epoch": 2.6341463414634148,
+      "grad_norm": 1.0448366403579712,
+      "learning_rate": 0.0002,
+      "loss": 1.5782,
+      "step": 1620
+    },
+    {
+      "epoch": 2.6504065040650406,
+      "grad_norm": 0.8174839019775391,
+      "learning_rate": 0.0002,
+      "loss": 1.5523,
+      "step": 1630
+    },
+    {
+      "epoch": 2.6666666666666665,
+      "grad_norm": 0.873572051525116,
+      "learning_rate": 0.0002,
+      "loss": 1.4434,
+      "step": 1640
+    },
+    {
+      "epoch": 2.682926829268293,
+      "grad_norm": 0.9270642995834351,
+      "learning_rate": 0.0002,
+      "loss": 1.4723,
+      "step": 1650
+    },
+    {
+      "epoch": 2.6991869918699187,
+      "grad_norm": 0.8988297581672668,
+      "learning_rate": 0.0002,
+      "loss": 1.4736,
+      "step": 1660
+    },
+    {
+      "epoch": 2.7154471544715446,
+      "grad_norm": 0.8537285923957825,
+      "learning_rate": 0.0002,
+      "loss": 1.52,
+      "step": 1670
+    },
+    {
+      "epoch": 2.7317073170731705,
+      "grad_norm": 0.7982168793678284,
+      "learning_rate": 0.0002,
+      "loss": 1.5073,
+      "step": 1680
+    },
+    {
+      "epoch": 2.747967479674797,
+      "grad_norm": 0.9140633940696716,
+      "learning_rate": 0.0002,
+      "loss": 1.5357,
+      "step": 1690
+    },
+    {
+      "epoch": 2.7642276422764227,
+      "grad_norm": 0.8485862016677856,
+      "learning_rate": 0.0002,
+      "loss": 1.5182,
+      "step": 1700
+    },
+    {
+      "epoch": 2.7804878048780486,
+      "grad_norm": 1.3670072555541992,
+      "learning_rate": 0.0002,
+      "loss": 1.5273,
+      "step": 1710
+    },
+    {
+      "epoch": 2.796747967479675,
+      "grad_norm": 0.8846588134765625,
+      "learning_rate": 0.0002,
+      "loss": 1.492,
+      "step": 1720
+    },
+    {
+      "epoch": 2.813008130081301,
+      "grad_norm": 1.0143219232559204,
+      "learning_rate": 0.0002,
+      "loss": 1.5286,
+      "step": 1730
+    },
+    {
+      "epoch": 2.8292682926829267,
+      "grad_norm": 0.9646075367927551,
+      "learning_rate": 0.0002,
+      "loss": 1.5253,
+      "step": 1740
+    },
+    {
+      "epoch": 2.845528455284553,
+      "grad_norm": 0.9912563562393188,
+      "learning_rate": 0.0002,
+      "loss": 1.5865,
+      "step": 1750
+    },
+    {
+      "epoch": 2.861788617886179,
+      "grad_norm": 0.8160223364830017,
+      "learning_rate": 0.0002,
+      "loss": 1.5266,
+      "step": 1760
+    },
+    {
+      "epoch": 2.8780487804878048,
+      "grad_norm": 0.8553791642189026,
+      "learning_rate": 0.0002,
+      "loss": 1.5542,
+      "step": 1770
+    },
+    {
+      "epoch": 2.894308943089431,
+      "grad_norm": 0.8816639184951782,
+      "learning_rate": 0.0002,
+      "loss": 1.5592,
+      "step": 1780
+    },
+    {
+      "epoch": 2.910569105691057,
+      "grad_norm": 0.829551637172699,
+      "learning_rate": 0.0002,
+      "loss": 1.5443,
+      "step": 1790
+    },
+    {
+      "epoch": 2.926829268292683,
+      "grad_norm": 1.0520497560501099,
+      "learning_rate": 0.0002,
+      "loss": 1.5111,
+      "step": 1800
+    },
+    {
+      "epoch": 2.943089430894309,
+      "grad_norm": 0.8627844452857971,
+      "learning_rate": 0.0002,
+      "loss": 1.509,
+      "step": 1810
+    },
+    {
+      "epoch": 2.959349593495935,
+      "grad_norm": 0.8868018388748169,
+      "learning_rate": 0.0002,
+      "loss": 1.5119,
+      "step": 1820
+    },
+    {
+      "epoch": 2.975609756097561,
+      "grad_norm": 1.047621250152588,
+      "learning_rate": 0.0002,
+      "loss": 1.5956,
+      "step": 1830
+    },
+    {
+      "epoch": 2.991869918699187,
+      "grad_norm": 1.122131109237671,
+      "learning_rate": 0.0002,
+      "loss": 1.5189,
+      "step": 1840
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 1.8374383449554443,
+      "eval_runtime": 98.0056,
+      "eval_samples_per_second": 5.438,
+      "eval_steps_per_second": 0.684,
+      "step": 1845
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 4920,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 8.0948870774784e+16,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:794da760599f5a7e302e2faa616ba0185215c069e7fa3436832bde34c7f2ec7b
+size 5560

Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-2460/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1b86b1157344ee03d1ea9e53ecb7441f26c937f3f7a3e6a087620639cc0b4576
+size 109069176

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ac93e172748bfa648efe9fbc93d295ff4861413dcf819b2a0d68b6db6157a6bd
+size 55532666

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:95e15f5fb27ea7d5bd0ba228e7d4894788a0b2e16eaaff348a1a0c2de0b0980c
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6b03d97ce7b3c60cec4ded43deb4819d236c41cd836fa7210a3124936d0cd1e7
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,1787 @@

+{
+  "best_metric": 1.7831426858901978,
+  "best_model_checkpoint": "outputs-001/Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-615",
+  "epoch": 4.0,
+  "eval_steps": 10,
+  "global_step": 2460,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.016260162601626018,
+      "grad_norm": 1.903406023979187,
+      "learning_rate": 0.0002,
+      "loss": 2.6243,
+      "step": 10
+    },
+    {
+      "epoch": 0.032520325203252036,
+      "grad_norm": 1.0100678205490112,
+      "learning_rate": 0.0002,
+      "loss": 2.2084,
+      "step": 20
+    },
+    {
+      "epoch": 0.04878048780487805,
+      "grad_norm": 0.7413098812103271,
+      "learning_rate": 0.0002,
+      "loss": 2.0423,
+      "step": 30
+    },
+    {
+      "epoch": 0.06504065040650407,
+      "grad_norm": 0.7540805339813232,
+      "learning_rate": 0.0002,
+      "loss": 2.0647,
+      "step": 40
+    },
+    {
+      "epoch": 0.08130081300813008,
+      "grad_norm": 0.6508658528327942,
+      "learning_rate": 0.0002,
+      "loss": 2.0926,
+      "step": 50
+    },
+    {
+      "epoch": 0.0975609756097561,
+      "grad_norm": 0.7228319048881531,
+      "learning_rate": 0.0002,
+      "loss": 1.878,
+      "step": 60
+    },
+    {
+      "epoch": 0.11382113821138211,
+      "grad_norm": 0.6510937213897705,
+      "learning_rate": 0.0002,
+      "loss": 1.8672,
+      "step": 70
+    },
+    {
+      "epoch": 0.13008130081300814,
+      "grad_norm": 0.7238746881484985,
+      "learning_rate": 0.0002,
+      "loss": 1.8592,
+      "step": 80
+    },
+    {
+      "epoch": 0.14634146341463414,
+      "grad_norm": 0.7530466318130493,
+      "learning_rate": 0.0002,
+      "loss": 1.8541,
+      "step": 90
+    },
+    {
+      "epoch": 0.16260162601626016,
+      "grad_norm": 0.622166097164154,
+      "learning_rate": 0.0002,
+      "loss": 1.8245,
+      "step": 100
+    },
+    {
+      "epoch": 0.17886178861788618,
+      "grad_norm": 0.6180148720741272,
+      "learning_rate": 0.0002,
+      "loss": 1.7581,
+      "step": 110
+    },
+    {
+      "epoch": 0.1951219512195122,
+      "grad_norm": 0.6221362352371216,
+      "learning_rate": 0.0002,
+      "loss": 1.7741,
+      "step": 120
+    },
+    {
+      "epoch": 0.21138211382113822,
+      "grad_norm": 0.569580078125,
+      "learning_rate": 0.0002,
+      "loss": 1.7575,
+      "step": 130
+    },
+    {
+      "epoch": 0.22764227642276422,
+      "grad_norm": 0.6962840557098389,
+      "learning_rate": 0.0002,
+      "loss": 1.7833,
+      "step": 140
+    },
+    {
+      "epoch": 0.24390243902439024,
+      "grad_norm": 0.644322395324707,
+      "learning_rate": 0.0002,
+      "loss": 1.8329,
+      "step": 150
+    },
+    {
+      "epoch": 0.2601626016260163,
+      "grad_norm": 0.5970060229301453,
+      "learning_rate": 0.0002,
+      "loss": 1.7794,
+      "step": 160
+    },
+    {
+      "epoch": 0.2764227642276423,
+      "grad_norm": 0.6249210834503174,
+      "learning_rate": 0.0002,
+      "loss": 1.8521,
+      "step": 170
+    },
+    {
+      "epoch": 0.2926829268292683,
+      "grad_norm": 0.7134785652160645,
+      "learning_rate": 0.0002,
+      "loss": 1.8066,
+      "step": 180
+    },
+    {
+      "epoch": 0.3089430894308943,
+      "grad_norm": 0.5477158427238464,
+      "learning_rate": 0.0002,
+      "loss": 1.8815,
+      "step": 190
+    },
+    {
+      "epoch": 0.3252032520325203,
+      "grad_norm": 0.6054863333702087,
+      "learning_rate": 0.0002,
+      "loss": 1.7222,
+      "step": 200
+    },
+    {
+      "epoch": 0.34146341463414637,
+      "grad_norm": 0.5664568543434143,
+      "learning_rate": 0.0002,
+      "loss": 1.7598,
+      "step": 210
+    },
+    {
+      "epoch": 0.35772357723577236,
+      "grad_norm": 0.5942816734313965,
+      "learning_rate": 0.0002,
+      "loss": 1.7688,
+      "step": 220
+    },
+    {
+      "epoch": 0.37398373983739835,
+      "grad_norm": 0.6311767101287842,
+      "learning_rate": 0.0002,
+      "loss": 1.7715,
+      "step": 230
+    },
+    {
+      "epoch": 0.3902439024390244,
+      "grad_norm": 0.6614870429039001,
+      "learning_rate": 0.0002,
+      "loss": 1.7663,
+      "step": 240
+    },
+    {
+      "epoch": 0.4065040650406504,
+      "grad_norm": 0.5644984841346741,
+      "learning_rate": 0.0002,
+      "loss": 1.8045,
+      "step": 250
+    },
+    {
+      "epoch": 0.42276422764227645,
+      "grad_norm": 0.7260110974311829,
+      "learning_rate": 0.0002,
+      "loss": 1.7364,
+      "step": 260
+    },
+    {
+      "epoch": 0.43902439024390244,
+      "grad_norm": 0.6733413934707642,
+      "learning_rate": 0.0002,
+      "loss": 1.7606,
+      "step": 270
+    },
+    {
+      "epoch": 0.45528455284552843,
+      "grad_norm": 0.5211837887763977,
+      "learning_rate": 0.0002,
+      "loss": 1.8432,
+      "step": 280
+    },
+    {
+      "epoch": 0.4715447154471545,
+      "grad_norm": 0.5538370013237,
+      "learning_rate": 0.0002,
+      "loss": 1.9166,
+      "step": 290
+    },
+    {
+      "epoch": 0.4878048780487805,
+      "grad_norm": 0.5429130792617798,
+      "learning_rate": 0.0002,
+      "loss": 1.8391,
+      "step": 300
+    },
+    {
+      "epoch": 0.5040650406504065,
+      "grad_norm": 0.517801821231842,
+      "learning_rate": 0.0002,
+      "loss": 1.8072,
+      "step": 310
+    },
+    {
+      "epoch": 0.5203252032520326,
+      "grad_norm": 0.6029635667800903,
+      "learning_rate": 0.0002,
+      "loss": 1.8045,
+      "step": 320
+    },
+    {
+      "epoch": 0.5365853658536586,
+      "grad_norm": 0.506401002407074,
+      "learning_rate": 0.0002,
+      "loss": 1.7734,
+      "step": 330
+    },
+    {
+      "epoch": 0.5528455284552846,
+      "grad_norm": 0.5226597189903259,
+      "learning_rate": 0.0002,
+      "loss": 1.7923,
+      "step": 340
+    },
+    {
+      "epoch": 0.5691056910569106,
+      "grad_norm": 0.5899750590324402,
+      "learning_rate": 0.0002,
+      "loss": 1.7625,
+      "step": 350
+    },
+    {
+      "epoch": 0.5853658536585366,
+      "grad_norm": 0.6185210943222046,
+      "learning_rate": 0.0002,
+      "loss": 1.828,
+      "step": 360
+    },
+    {
+      "epoch": 0.6016260162601627,
+      "grad_norm": 0.8088458180427551,
+      "learning_rate": 0.0002,
+      "loss": 1.8358,
+      "step": 370
+    },
+    {
+      "epoch": 0.6178861788617886,
+      "grad_norm": 0.509591817855835,
+      "learning_rate": 0.0002,
+      "loss": 1.8351,
+      "step": 380
+    },
+    {
+      "epoch": 0.6341463414634146,
+      "grad_norm": 0.5209569931030273,
+      "learning_rate": 0.0002,
+      "loss": 1.7849,
+      "step": 390
+    },
+    {
+      "epoch": 0.6504065040650406,
+      "grad_norm": 0.50320965051651,
+      "learning_rate": 0.0002,
+      "loss": 1.7925,
+      "step": 400
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 0.5555663108825684,
+      "learning_rate": 0.0002,
+      "loss": 1.795,
+      "step": 410
+    },
+    {
+      "epoch": 0.6829268292682927,
+      "grad_norm": 0.5865469574928284,
+      "learning_rate": 0.0002,
+      "loss": 1.7562,
+      "step": 420
+    },
+    {
+      "epoch": 0.6991869918699187,
+      "grad_norm": 0.5288474559783936,
+      "learning_rate": 0.0002,
+      "loss": 1.7869,
+      "step": 430
+    },
+    {
+      "epoch": 0.7154471544715447,
+      "grad_norm": 0.5364211797714233,
+      "learning_rate": 0.0002,
+      "loss": 1.8046,
+      "step": 440
+    },
+    {
+      "epoch": 0.7317073170731707,
+      "grad_norm": 0.5877127051353455,
+      "learning_rate": 0.0002,
+      "loss": 1.8124,
+      "step": 450
+    },
+    {
+      "epoch": 0.7479674796747967,
+      "grad_norm": 0.5993741154670715,
+      "learning_rate": 0.0002,
+      "loss": 1.7938,
+      "step": 460
+    },
+    {
+      "epoch": 0.7642276422764228,
+      "grad_norm": 0.4871112108230591,
+      "learning_rate": 0.0002,
+      "loss": 1.8034,
+      "step": 470
+    },
+    {
+      "epoch": 0.7804878048780488,
+      "grad_norm": 0.5300846099853516,
+      "learning_rate": 0.0002,
+      "loss": 1.7798,
+      "step": 480
+    },
+    {
+      "epoch": 0.7967479674796748,
+      "grad_norm": 0.5623212456703186,
+      "learning_rate": 0.0002,
+      "loss": 1.7772,
+      "step": 490
+    },
+    {
+      "epoch": 0.8130081300813008,
+      "grad_norm": 0.5131309032440186,
+      "learning_rate": 0.0002,
+      "loss": 1.7207,
+      "step": 500
+    },
+    {
+      "epoch": 0.8292682926829268,
+      "grad_norm": 0.49512147903442383,
+      "learning_rate": 0.0002,
+      "loss": 1.7143,
+      "step": 510
+    },
+    {
+      "epoch": 0.8455284552845529,
+      "grad_norm": 0.6260727643966675,
+      "learning_rate": 0.0002,
+      "loss": 1.8032,
+      "step": 520
+    },
+    {
+      "epoch": 0.8617886178861789,
+      "grad_norm": 0.5796844959259033,
+      "learning_rate": 0.0002,
+      "loss": 1.8292,
+      "step": 530
+    },
+    {
+      "epoch": 0.8780487804878049,
+      "grad_norm": 0.615927517414093,
+      "learning_rate": 0.0002,
+      "loss": 1.7775,
+      "step": 540
+    },
+    {
+      "epoch": 0.8943089430894309,
+      "grad_norm": 0.5230891704559326,
+      "learning_rate": 0.0002,
+      "loss": 1.7254,
+      "step": 550
+    },
+    {
+      "epoch": 0.9105691056910569,
+      "grad_norm": 0.5990992784500122,
+      "learning_rate": 0.0002,
+      "loss": 1.8126,
+      "step": 560
+    },
+    {
+      "epoch": 0.926829268292683,
+      "grad_norm": 0.538957417011261,
+      "learning_rate": 0.0002,
+      "loss": 1.8551,
+      "step": 570
+    },
+    {
+      "epoch": 0.943089430894309,
+      "grad_norm": 0.556900680065155,
+      "learning_rate": 0.0002,
+      "loss": 1.791,
+      "step": 580
+    },
+    {
+      "epoch": 0.959349593495935,
+      "grad_norm": 0.6459956765174866,
+      "learning_rate": 0.0002,
+      "loss": 1.8799,
+      "step": 590
+    },
+    {
+      "epoch": 0.975609756097561,
+      "grad_norm": 0.5648245215415955,
+      "learning_rate": 0.0002,
+      "loss": 1.774,
+      "step": 600
+    },
+    {
+      "epoch": 0.991869918699187,
+      "grad_norm": 0.5341294407844543,
+      "learning_rate": 0.0002,
+      "loss": 1.7746,
+      "step": 610
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 1.7831426858901978,
+      "eval_runtime": 98.419,
+      "eval_samples_per_second": 5.416,
+      "eval_steps_per_second": 0.681,
+      "step": 615
+    },
+    {
+      "epoch": 1.008130081300813,
+      "grad_norm": 0.49698150157928467,
+      "learning_rate": 0.0002,
+      "loss": 1.7212,
+      "step": 620
+    },
+    {
+      "epoch": 1.024390243902439,
+      "grad_norm": 0.696890652179718,
+      "learning_rate": 0.0002,
+      "loss": 1.7379,
+      "step": 630
+    },
+    {
+      "epoch": 1.040650406504065,
+      "grad_norm": 0.5939123630523682,
+      "learning_rate": 0.0002,
+      "loss": 1.6391,
+      "step": 640
+    },
+    {
+      "epoch": 1.056910569105691,
+      "grad_norm": 0.5630994439125061,
+      "learning_rate": 0.0002,
+      "loss": 1.712,
+      "step": 650
+    },
+    {
+      "epoch": 1.0731707317073171,
+      "grad_norm": 0.5783666968345642,
+      "learning_rate": 0.0002,
+      "loss": 1.6401,
+      "step": 660
+    },
+    {
+      "epoch": 1.089430894308943,
+      "grad_norm": 0.6006693840026855,
+      "learning_rate": 0.0002,
+      "loss": 1.7158,
+      "step": 670
+    },
+    {
+      "epoch": 1.1056910569105691,
+      "grad_norm": 0.6544332504272461,
+      "learning_rate": 0.0002,
+      "loss": 1.7434,
+      "step": 680
+    },
+    {
+      "epoch": 1.1219512195121952,
+      "grad_norm": 0.6734776496887207,
+      "learning_rate": 0.0002,
+      "loss": 1.6562,
+      "step": 690
+    },
+    {
+      "epoch": 1.1382113821138211,
+      "grad_norm": 0.6067698001861572,
+      "learning_rate": 0.0002,
+      "loss": 1.6724,
+      "step": 700
+    },
+    {
+      "epoch": 1.1544715447154472,
+      "grad_norm": 0.6639267802238464,
+      "learning_rate": 0.0002,
+      "loss": 1.6932,
+      "step": 710
+    },
+    {
+      "epoch": 1.170731707317073,
+      "grad_norm": 0.5179714560508728,
+      "learning_rate": 0.0002,
+      "loss": 1.6562,
+      "step": 720
+    },
+    {
+      "epoch": 1.1869918699186992,
+      "grad_norm": 0.7320363521575928,
+      "learning_rate": 0.0002,
+      "loss": 1.6811,
+      "step": 730
+    },
+    {
+      "epoch": 1.203252032520325,
+      "grad_norm": 0.689231276512146,
+      "learning_rate": 0.0002,
+      "loss": 1.5619,
+      "step": 740
+    },
+    {
+      "epoch": 1.2195121951219512,
+      "grad_norm": 0.6605235934257507,
+      "learning_rate": 0.0002,
+      "loss": 1.5688,
+      "step": 750
+    },
+    {
+      "epoch": 1.2357723577235773,
+      "grad_norm": 0.7013542056083679,
+      "learning_rate": 0.0002,
+      "loss": 1.7045,
+      "step": 760
+    },
+    {
+      "epoch": 1.2520325203252032,
+      "grad_norm": 0.6349928975105286,
+      "learning_rate": 0.0002,
+      "loss": 1.6857,
+      "step": 770
+    },
+    {
+      "epoch": 1.2682926829268293,
+      "grad_norm": 0.6362272500991821,
+      "learning_rate": 0.0002,
+      "loss": 1.6767,
+      "step": 780
+    },
+    {
+      "epoch": 1.2845528455284554,
+      "grad_norm": 0.6152030229568481,
+      "learning_rate": 0.0002,
+      "loss": 1.6594,
+      "step": 790
+    },
+    {
+      "epoch": 1.3008130081300813,
+      "grad_norm": 0.6406176686286926,
+      "learning_rate": 0.0002,
+      "loss": 1.7542,
+      "step": 800
+    },
+    {
+      "epoch": 1.3170731707317074,
+      "grad_norm": 0.6099124550819397,
+      "learning_rate": 0.0002,
+      "loss": 1.7243,
+      "step": 810
+    },
+    {
+      "epoch": 1.3333333333333333,
+      "grad_norm": 0.6298971772193909,
+      "learning_rate": 0.0002,
+      "loss": 1.6642,
+      "step": 820
+    },
+    {
+      "epoch": 1.3495934959349594,
+      "grad_norm": 0.775223433971405,
+      "learning_rate": 0.0002,
+      "loss": 1.6901,
+      "step": 830
+    },
+    {
+      "epoch": 1.3658536585365852,
+      "grad_norm": 0.7261736392974854,
+      "learning_rate": 0.0002,
+      "loss": 1.6284,
+      "step": 840
+    },
+    {
+      "epoch": 1.3821138211382114,
+      "grad_norm": 0.6321929097175598,
+      "learning_rate": 0.0002,
+      "loss": 1.6671,
+      "step": 850
+    },
+    {
+      "epoch": 1.3983739837398375,
+      "grad_norm": 0.7564281225204468,
+      "learning_rate": 0.0002,
+      "loss": 1.7036,
+      "step": 860
+    },
+    {
+      "epoch": 1.4146341463414633,
+      "grad_norm": 0.6329448819160461,
+      "learning_rate": 0.0002,
+      "loss": 1.7014,
+      "step": 870
+    },
+    {
+      "epoch": 1.4308943089430894,
+      "grad_norm": 0.6288684606552124,
+      "learning_rate": 0.0002,
+      "loss": 1.6398,
+      "step": 880
+    },
+    {
+      "epoch": 1.4471544715447155,
+      "grad_norm": 0.6165404915809631,
+      "learning_rate": 0.0002,
+      "loss": 1.673,
+      "step": 890
+    },
+    {
+      "epoch": 1.4634146341463414,
+      "grad_norm": 0.6124468445777893,
+      "learning_rate": 0.0002,
+      "loss": 1.6668,
+      "step": 900
+    },
+    {
+      "epoch": 1.4796747967479675,
+      "grad_norm": 0.7038629651069641,
+      "learning_rate": 0.0002,
+      "loss": 1.6879,
+      "step": 910
+    },
+    {
+      "epoch": 1.4959349593495934,
+      "grad_norm": 0.5755146145820618,
+      "learning_rate": 0.0002,
+      "loss": 1.6701,
+      "step": 920
+    },
+    {
+      "epoch": 1.5121951219512195,
+      "grad_norm": 0.7639156579971313,
+      "learning_rate": 0.0002,
+      "loss": 1.7244,
+      "step": 930
+    },
+    {
+      "epoch": 1.5284552845528454,
+      "grad_norm": 0.6948140859603882,
+      "learning_rate": 0.0002,
+      "loss": 1.6836,
+      "step": 940
+    },
+    {
+      "epoch": 1.5447154471544715,
+      "grad_norm": 0.6887956261634827,
+      "learning_rate": 0.0002,
+      "loss": 1.6479,
+      "step": 950
+    },
+    {
+      "epoch": 1.5609756097560976,
+      "grad_norm": 0.7226824164390564,
+      "learning_rate": 0.0002,
+      "loss": 1.7285,
+      "step": 960
+    },
+    {
+      "epoch": 1.5772357723577235,
+      "grad_norm": 0.6753950715065002,
+      "learning_rate": 0.0002,
+      "loss": 1.6214,
+      "step": 970
+    },
+    {
+      "epoch": 1.5934959349593496,
+      "grad_norm": 0.6580971479415894,
+      "learning_rate": 0.0002,
+      "loss": 1.7283,
+      "step": 980
+    },
+    {
+      "epoch": 1.6097560975609757,
+      "grad_norm": 0.7157843112945557,
+      "learning_rate": 0.0002,
+      "loss": 1.6671,
+      "step": 990
+    },
+    {
+      "epoch": 1.6260162601626016,
+      "grad_norm": 0.6736738681793213,
+      "learning_rate": 0.0002,
+      "loss": 1.645,
+      "step": 1000
+    },
+    {
+      "epoch": 1.6422764227642277,
+      "grad_norm": 0.5271940231323242,
+      "learning_rate": 0.0002,
+      "loss": 1.6589,
+      "step": 1010
+    },
+    {
+      "epoch": 1.6585365853658538,
+      "grad_norm": 0.6378998160362244,
+      "learning_rate": 0.0002,
+      "loss": 1.7358,
+      "step": 1020
+    },
+    {
+      "epoch": 1.6747967479674797,
+      "grad_norm": 0.6498209834098816,
+      "learning_rate": 0.0002,
+      "loss": 1.6924,
+      "step": 1030
+    },
+    {
+      "epoch": 1.6910569105691056,
+      "grad_norm": 0.7050761580467224,
+      "learning_rate": 0.0002,
+      "loss": 1.6253,
+      "step": 1040
+    },
+    {
+      "epoch": 1.7073170731707317,
+      "grad_norm": 0.7122200131416321,
+      "learning_rate": 0.0002,
+      "loss": 1.7146,
+      "step": 1050
+    },
+    {
+      "epoch": 1.7235772357723578,
+      "grad_norm": 0.6705704927444458,
+      "learning_rate": 0.0002,
+      "loss": 1.6511,
+      "step": 1060
+    },
+    {
+      "epoch": 1.7398373983739837,
+      "grad_norm": 0.6859356760978699,
+      "learning_rate": 0.0002,
+      "loss": 1.6506,
+      "step": 1070
+    },
+    {
+      "epoch": 1.7560975609756098,
+      "grad_norm": 0.6540971994400024,
+      "learning_rate": 0.0002,
+      "loss": 1.6562,
+      "step": 1080
+    },
+    {
+      "epoch": 1.7723577235772359,
+      "grad_norm": 0.6297651529312134,
+      "learning_rate": 0.0002,
+      "loss": 1.6627,
+      "step": 1090
+    },
+    {
+      "epoch": 1.7886178861788617,
+      "grad_norm": 0.6645651459693909,
+      "learning_rate": 0.0002,
+      "loss": 1.704,
+      "step": 1100
+    },
+    {
+      "epoch": 1.8048780487804879,
+      "grad_norm": 0.6450296640396118,
+      "learning_rate": 0.0002,
+      "loss": 1.6908,
+      "step": 1110
+    },
+    {
+      "epoch": 1.821138211382114,
+      "grad_norm": 0.7785659432411194,
+      "learning_rate": 0.0002,
+      "loss": 1.7642,
+      "step": 1120
+    },
+    {
+      "epoch": 1.8373983739837398,
+      "grad_norm": 0.6845982670783997,
+      "learning_rate": 0.0002,
+      "loss": 1.6773,
+      "step": 1130
+    },
+    {
+      "epoch": 1.8536585365853657,
+      "grad_norm": 0.699683666229248,
+      "learning_rate": 0.0002,
+      "loss": 1.6879,
+      "step": 1140
+    },
+    {
+      "epoch": 1.8699186991869918,
+      "grad_norm": 0.6600332856178284,
+      "learning_rate": 0.0002,
+      "loss": 1.7162,
+      "step": 1150
+    },
+    {
+      "epoch": 1.886178861788618,
+      "grad_norm": 0.7301949262619019,
+      "learning_rate": 0.0002,
+      "loss": 1.7291,
+      "step": 1160
+    },
+    {
+      "epoch": 1.9024390243902438,
+      "grad_norm": 0.8183556795120239,
+      "learning_rate": 0.0002,
+      "loss": 1.6874,
+      "step": 1170
+    },
+    {
+      "epoch": 1.91869918699187,
+      "grad_norm": 0.7122833132743835,
+      "learning_rate": 0.0002,
+      "loss": 1.6779,
+      "step": 1180
+    },
+    {
+      "epoch": 1.934959349593496,
+      "grad_norm": 0.6391404271125793,
+      "learning_rate": 0.0002,
+      "loss": 1.7361,
+      "step": 1190
+    },
+    {
+      "epoch": 1.951219512195122,
+      "grad_norm": 0.6136474013328552,
+      "learning_rate": 0.0002,
+      "loss": 1.7188,
+      "step": 1200
+    },
+    {
+      "epoch": 1.967479674796748,
+      "grad_norm": 0.7704503536224365,
+      "learning_rate": 0.0002,
+      "loss": 1.6536,
+      "step": 1210
+    },
+    {
+      "epoch": 1.9837398373983741,
+      "grad_norm": 0.6155434846878052,
+      "learning_rate": 0.0002,
+      "loss": 1.6735,
+      "step": 1220
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.6262536644935608,
+      "learning_rate": 0.0002,
+      "loss": 1.6534,
+      "step": 1230
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.7886285781860352,
+      "eval_runtime": 98.7888,
+      "eval_samples_per_second": 5.395,
+      "eval_steps_per_second": 0.678,
+      "step": 1230
+    },
+    {
+      "epoch": 2.016260162601626,
+      "grad_norm": 0.9827656149864197,
+      "learning_rate": 0.0002,
+      "loss": 1.5135,
+      "step": 1240
+    },
+    {
+      "epoch": 2.032520325203252,
+      "grad_norm": 0.8443078398704529,
+      "learning_rate": 0.0002,
+      "loss": 1.5396,
+      "step": 1250
+    },
+    {
+      "epoch": 2.048780487804878,
+      "grad_norm": 0.9006391763687134,
+      "learning_rate": 0.0002,
+      "loss": 1.5326,
+      "step": 1260
+    },
+    {
+      "epoch": 2.065040650406504,
+      "grad_norm": 0.7900105714797974,
+      "learning_rate": 0.0002,
+      "loss": 1.5176,
+      "step": 1270
+    },
+    {
+      "epoch": 2.08130081300813,
+      "grad_norm": 0.6430686116218567,
+      "learning_rate": 0.0002,
+      "loss": 1.5578,
+      "step": 1280
+    },
+    {
+      "epoch": 2.097560975609756,
+      "grad_norm": 0.8998992443084717,
+      "learning_rate": 0.0002,
+      "loss": 1.5453,
+      "step": 1290
+    },
+    {
+      "epoch": 2.113821138211382,
+      "grad_norm": 0.7658976316452026,
+      "learning_rate": 0.0002,
+      "loss": 1.5051,
+      "step": 1300
+    },
+    {
+      "epoch": 2.130081300813008,
+      "grad_norm": 0.9033166766166687,
+      "learning_rate": 0.0002,
+      "loss": 1.5005,
+      "step": 1310
+    },
+    {
+      "epoch": 2.1463414634146343,
+      "grad_norm": 0.7942133545875549,
+      "learning_rate": 0.0002,
+      "loss": 1.5517,
+      "step": 1320
+    },
+    {
+      "epoch": 2.16260162601626,
+      "grad_norm": 0.8496367931365967,
+      "learning_rate": 0.0002,
+      "loss": 1.5248,
+      "step": 1330
+    },
+    {
+      "epoch": 2.178861788617886,
+      "grad_norm": 0.8638061881065369,
+      "learning_rate": 0.0002,
+      "loss": 1.4887,
+      "step": 1340
+    },
+    {
+      "epoch": 2.1951219512195124,
+      "grad_norm": 0.9003657102584839,
+      "learning_rate": 0.0002,
+      "loss": 1.5023,
+      "step": 1350
+    },
+    {
+      "epoch": 2.2113821138211383,
+      "grad_norm": 0.8387648463249207,
+      "learning_rate": 0.0002,
+      "loss": 1.4904,
+      "step": 1360
+    },
+    {
+      "epoch": 2.227642276422764,
+      "grad_norm": 0.7598716616630554,
+      "learning_rate": 0.0002,
+      "loss": 1.5553,
+      "step": 1370
+    },
+    {
+      "epoch": 2.2439024390243905,
+      "grad_norm": 0.872882604598999,
+      "learning_rate": 0.0002,
+      "loss": 1.4723,
+      "step": 1380
+    },
+    {
+      "epoch": 2.2601626016260163,
+      "grad_norm": 0.8919313549995422,
+      "learning_rate": 0.0002,
+      "loss": 1.5121,
+      "step": 1390
+    },
+    {
+      "epoch": 2.2764227642276422,
+      "grad_norm": 0.9646918773651123,
+      "learning_rate": 0.0002,
+      "loss": 1.5162,
+      "step": 1400
+    },
+    {
+      "epoch": 2.292682926829268,
+      "grad_norm": 0.8501992225646973,
+      "learning_rate": 0.0002,
+      "loss": 1.5163,
+      "step": 1410
+    },
+    {
+      "epoch": 2.3089430894308944,
+      "grad_norm": 0.7517067790031433,
+      "learning_rate": 0.0002,
+      "loss": 1.5096,
+      "step": 1420
+    },
+    {
+      "epoch": 2.3252032520325203,
+      "grad_norm": 0.9097304940223694,
+      "learning_rate": 0.0002,
+      "loss": 1.5359,
+      "step": 1430
+    },
+    {
+      "epoch": 2.341463414634146,
+      "grad_norm": 0.8515191674232483,
+      "learning_rate": 0.0002,
+      "loss": 1.4843,
+      "step": 1440
+    },
+    {
+      "epoch": 2.3577235772357725,
+      "grad_norm": 0.8925113677978516,
+      "learning_rate": 0.0002,
+      "loss": 1.5021,
+      "step": 1450
+    },
+    {
+      "epoch": 2.3739837398373984,
+      "grad_norm": 1.0194441080093384,
+      "learning_rate": 0.0002,
+      "loss": 1.4235,
+      "step": 1460
+    },
+    {
+      "epoch": 2.3902439024390243,
+      "grad_norm": 0.9004436731338501,
+      "learning_rate": 0.0002,
+      "loss": 1.5778,
+      "step": 1470
+    },
+    {
+      "epoch": 2.40650406504065,
+      "grad_norm": 0.9552311897277832,
+      "learning_rate": 0.0002,
+      "loss": 1.5623,
+      "step": 1480
+    },
+    {
+      "epoch": 2.4227642276422765,
+      "grad_norm": 0.9185764789581299,
+      "learning_rate": 0.0002,
+      "loss": 1.5507,
+      "step": 1490
+    },
+    {
+      "epoch": 2.4390243902439024,
+      "grad_norm": 0.7935037016868591,
+      "learning_rate": 0.0002,
+      "loss": 1.5058,
+      "step": 1500
+    },
+    {
+      "epoch": 2.4552845528455283,
+      "grad_norm": 0.8124602437019348,
+      "learning_rate": 0.0002,
+      "loss": 1.5374,
+      "step": 1510
+    },
+    {
+      "epoch": 2.4715447154471546,
+      "grad_norm": 0.7927430272102356,
+      "learning_rate": 0.0002,
+      "loss": 1.4553,
+      "step": 1520
+    },
+    {
+      "epoch": 2.4878048780487805,
+      "grad_norm": 0.9143779873847961,
+      "learning_rate": 0.0002,
+      "loss": 1.5135,
+      "step": 1530
+    },
+    {
+      "epoch": 2.5040650406504064,
+      "grad_norm": 0.938185453414917,
+      "learning_rate": 0.0002,
+      "loss": 1.4842,
+      "step": 1540
+    },
+    {
+      "epoch": 2.5203252032520327,
+      "grad_norm": 0.9858708381652832,
+      "learning_rate": 0.0002,
+      "loss": 1.5983,
+      "step": 1550
+    },
+    {
+      "epoch": 2.5365853658536586,
+      "grad_norm": 0.9211642742156982,
+      "learning_rate": 0.0002,
+      "loss": 1.5464,
+      "step": 1560
+    },
+    {
+      "epoch": 2.5528455284552845,
+      "grad_norm": 0.9824395775794983,
+      "learning_rate": 0.0002,
+      "loss": 1.5293,
+      "step": 1570
+    },
+    {
+      "epoch": 2.569105691056911,
+      "grad_norm": 0.916930615901947,
+      "learning_rate": 0.0002,
+      "loss": 1.5559,
+      "step": 1580
+    },
+    {
+      "epoch": 2.5853658536585367,
+      "grad_norm": 0.9336596727371216,
+      "learning_rate": 0.0002,
+      "loss": 1.5581,
+      "step": 1590
+    },
+    {
+      "epoch": 2.6016260162601625,
+      "grad_norm": 0.9006481170654297,
+      "learning_rate": 0.0002,
+      "loss": 1.5379,
+      "step": 1600
+    },
+    {
+      "epoch": 2.617886178861789,
+      "grad_norm": 0.8296214938163757,
+      "learning_rate": 0.0002,
+      "loss": 1.5254,
+      "step": 1610
+    },
+    {
+      "epoch": 2.6341463414634148,
+      "grad_norm": 1.0448366403579712,
+      "learning_rate": 0.0002,
+      "loss": 1.5782,
+      "step": 1620
+    },
+    {
+      "epoch": 2.6504065040650406,
+      "grad_norm": 0.8174839019775391,
+      "learning_rate": 0.0002,
+      "loss": 1.5523,
+      "step": 1630
+    },
+    {
+      "epoch": 2.6666666666666665,
+      "grad_norm": 0.873572051525116,
+      "learning_rate": 0.0002,
+      "loss": 1.4434,
+      "step": 1640
+    },
+    {
+      "epoch": 2.682926829268293,
+      "grad_norm": 0.9270642995834351,
+      "learning_rate": 0.0002,
+      "loss": 1.4723,
+      "step": 1650
+    },
+    {
+      "epoch": 2.6991869918699187,
+      "grad_norm": 0.8988297581672668,
+      "learning_rate": 0.0002,
+      "loss": 1.4736,
+      "step": 1660
+    },
+    {
+      "epoch": 2.7154471544715446,
+      "grad_norm": 0.8537285923957825,
+      "learning_rate": 0.0002,
+      "loss": 1.52,
+      "step": 1670
+    },
+    {
+      "epoch": 2.7317073170731705,
+      "grad_norm": 0.7982168793678284,
+      "learning_rate": 0.0002,
+      "loss": 1.5073,
+      "step": 1680
+    },
+    {
+      "epoch": 2.747967479674797,
+      "grad_norm": 0.9140633940696716,
+      "learning_rate": 0.0002,
+      "loss": 1.5357,
+      "step": 1690
+    },
+    {
+      "epoch": 2.7642276422764227,
+      "grad_norm": 0.8485862016677856,
+      "learning_rate": 0.0002,
+      "loss": 1.5182,
+      "step": 1700
+    },
+    {
+      "epoch": 2.7804878048780486,
+      "grad_norm": 1.3670072555541992,
+      "learning_rate": 0.0002,
+      "loss": 1.5273,
+      "step": 1710
+    },
+    {
+      "epoch": 2.796747967479675,
+      "grad_norm": 0.8846588134765625,
+      "learning_rate": 0.0002,
+      "loss": 1.492,
+      "step": 1720
+    },
+    {
+      "epoch": 2.813008130081301,
+      "grad_norm": 1.0143219232559204,
+      "learning_rate": 0.0002,
+      "loss": 1.5286,
+      "step": 1730
+    },
+    {
+      "epoch": 2.8292682926829267,
+      "grad_norm": 0.9646075367927551,
+      "learning_rate": 0.0002,
+      "loss": 1.5253,
+      "step": 1740
+    },
+    {
+      "epoch": 2.845528455284553,
+      "grad_norm": 0.9912563562393188,
+      "learning_rate": 0.0002,
+      "loss": 1.5865,
+      "step": 1750
+    },
+    {
+      "epoch": 2.861788617886179,
+      "grad_norm": 0.8160223364830017,
+      "learning_rate": 0.0002,
+      "loss": 1.5266,
+      "step": 1760
+    },
+    {
+      "epoch": 2.8780487804878048,
+      "grad_norm": 0.8553791642189026,
+      "learning_rate": 0.0002,
+      "loss": 1.5542,
+      "step": 1770
+    },
+    {
+      "epoch": 2.894308943089431,
+      "grad_norm": 0.8816639184951782,
+      "learning_rate": 0.0002,
+      "loss": 1.5592,
+      "step": 1780
+    },
+    {
+      "epoch": 2.910569105691057,
+      "grad_norm": 0.829551637172699,
+      "learning_rate": 0.0002,
+      "loss": 1.5443,
+      "step": 1790
+    },
+    {
+      "epoch": 2.926829268292683,
+      "grad_norm": 1.0520497560501099,
+      "learning_rate": 0.0002,
+      "loss": 1.5111,
+      "step": 1800
+    },
+    {
+      "epoch": 2.943089430894309,
+      "grad_norm": 0.8627844452857971,
+      "learning_rate": 0.0002,
+      "loss": 1.509,
+      "step": 1810
+    },
+    {
+      "epoch": 2.959349593495935,
+      "grad_norm": 0.8868018388748169,
+      "learning_rate": 0.0002,
+      "loss": 1.5119,
+      "step": 1820
+    },
+    {
+      "epoch": 2.975609756097561,
+      "grad_norm": 1.047621250152588,
+      "learning_rate": 0.0002,
+      "loss": 1.5956,
+      "step": 1830
+    },
+    {
+      "epoch": 2.991869918699187,
+      "grad_norm": 1.122131109237671,
+      "learning_rate": 0.0002,
+      "loss": 1.5189,
+      "step": 1840
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 1.8374383449554443,
+      "eval_runtime": 98.0056,
+      "eval_samples_per_second": 5.438,
+      "eval_steps_per_second": 0.684,
+      "step": 1845
+    },
+    {
+      "epoch": 3.008130081300813,
+      "grad_norm": 0.9361767172813416,
+      "learning_rate": 0.0002,
+      "loss": 1.3996,
+      "step": 1850
+    },
+    {
+      "epoch": 3.024390243902439,
+      "grad_norm": 1.0564402341842651,
+      "learning_rate": 0.0002,
+      "loss": 1.3122,
+      "step": 1860
+    },
+    {
+      "epoch": 3.040650406504065,
+      "grad_norm": 1.2450026273727417,
+      "learning_rate": 0.0002,
+      "loss": 1.2512,
+      "step": 1870
+    },
+    {
+      "epoch": 3.0569105691056913,
+      "grad_norm": 1.082606554031372,
+      "learning_rate": 0.0002,
+      "loss": 1.2585,
+      "step": 1880
+    },
+    {
+      "epoch": 3.073170731707317,
+      "grad_norm": 1.1582257747650146,
+      "learning_rate": 0.0002,
+      "loss": 1.2827,
+      "step": 1890
+    },
+    {
+      "epoch": 3.089430894308943,
+      "grad_norm": 1.1113696098327637,
+      "learning_rate": 0.0002,
+      "loss": 1.2865,
+      "step": 1900
+    },
+    {
+      "epoch": 3.105691056910569,
+      "grad_norm": 1.1716952323913574,
+      "learning_rate": 0.0002,
+      "loss": 1.2867,
+      "step": 1910
+    },
+    {
+      "epoch": 3.1219512195121952,
+      "grad_norm": 1.1270506381988525,
+      "learning_rate": 0.0002,
+      "loss": 1.286,
+      "step": 1920
+    },
+    {
+      "epoch": 3.138211382113821,
+      "grad_norm": 1.1955605745315552,
+      "learning_rate": 0.0002,
+      "loss": 1.3074,
+      "step": 1930
+    },
+    {
+      "epoch": 3.154471544715447,
+      "grad_norm": 1.246848464012146,
+      "learning_rate": 0.0002,
+      "loss": 1.2752,
+      "step": 1940
+    },
+    {
+      "epoch": 3.1707317073170733,
+      "grad_norm": 1.2208205461502075,
+      "learning_rate": 0.0002,
+      "loss": 1.3422,
+      "step": 1950
+    },
+    {
+      "epoch": 3.186991869918699,
+      "grad_norm": 1.1758005619049072,
+      "learning_rate": 0.0002,
+      "loss": 1.2618,
+      "step": 1960
+    },
+    {
+      "epoch": 3.203252032520325,
+      "grad_norm": 1.2697960138320923,
+      "learning_rate": 0.0002,
+      "loss": 1.3094,
+      "step": 1970
+    },
+    {
+      "epoch": 3.2195121951219514,
+      "grad_norm": 1.0855997800827026,
+      "learning_rate": 0.0002,
+      "loss": 1.3714,
+      "step": 1980
+    },
+    {
+      "epoch": 3.2357723577235773,
+      "grad_norm": 1.1054189205169678,
+      "learning_rate": 0.0002,
+      "loss": 1.2866,
+      "step": 1990
+    },
+    {
+      "epoch": 3.252032520325203,
+      "grad_norm": 1.2496592998504639,
+      "learning_rate": 0.0002,
+      "loss": 1.3057,
+      "step": 2000
+    },
+    {
+      "epoch": 3.2682926829268295,
+      "grad_norm": 1.215553641319275,
+      "learning_rate": 0.0002,
+      "loss": 1.3868,
+      "step": 2010
+    },
+    {
+      "epoch": 3.2845528455284554,
+      "grad_norm": 1.1711665391921997,
+      "learning_rate": 0.0002,
+      "loss": 1.2866,
+      "step": 2020
+    },
+    {
+      "epoch": 3.3008130081300813,
+      "grad_norm": 1.493438959121704,
+      "learning_rate": 0.0002,
+      "loss": 1.2969,
+      "step": 2030
+    },
+    {
+      "epoch": 3.317073170731707,
+      "grad_norm": 1.1202969551086426,
+      "learning_rate": 0.0002,
+      "loss": 1.3032,
+      "step": 2040
+    },
+    {
+      "epoch": 3.3333333333333335,
+      "grad_norm": 1.1334387063980103,
+      "learning_rate": 0.0002,
+      "loss": 1.3257,
+      "step": 2050
+    },
+    {
+      "epoch": 3.3495934959349594,
+      "grad_norm": 1.2813389301300049,
+      "learning_rate": 0.0002,
+      "loss": 1.2823,
+      "step": 2060
+    },
+    {
+      "epoch": 3.3658536585365852,
+      "grad_norm": 1.1317278146743774,
+      "learning_rate": 0.0002,
+      "loss": 1.2892,
+      "step": 2070
+    },
+    {
+      "epoch": 3.3821138211382116,
+      "grad_norm": 1.4018956422805786,
+      "learning_rate": 0.0002,
+      "loss": 1.2731,
+      "step": 2080
+    },
+    {
+      "epoch": 3.3983739837398375,
+      "grad_norm": 1.1856937408447266,
+      "learning_rate": 0.0002,
+      "loss": 1.3279,
+      "step": 2090
+    },
+    {
+      "epoch": 3.4146341463414633,
+      "grad_norm": 1.480185627937317,
+      "learning_rate": 0.0002,
+      "loss": 1.2903,
+      "step": 2100
+    },
+    {
+      "epoch": 3.430894308943089,
+      "grad_norm": 1.3945696353912354,
+      "learning_rate": 0.0002,
+      "loss": 1.3713,
+      "step": 2110
+    },
+    {
+      "epoch": 3.4471544715447155,
+      "grad_norm": 1.5409419536590576,
+      "learning_rate": 0.0002,
+      "loss": 1.3327,
+      "step": 2120
+    },
+    {
+      "epoch": 3.4634146341463414,
+      "grad_norm": 1.3170857429504395,
+      "learning_rate": 0.0002,
+      "loss": 1.3456,
+      "step": 2130
+    },
+    {
+      "epoch": 3.4796747967479673,
+      "grad_norm": 1.1793901920318604,
+      "learning_rate": 0.0002,
+      "loss": 1.3129,
+      "step": 2140
+    },
+    {
+      "epoch": 3.4959349593495936,
+      "grad_norm": 1.3043832778930664,
+      "learning_rate": 0.0002,
+      "loss": 1.3356,
+      "step": 2150
+    },
+    {
+      "epoch": 3.5121951219512195,
+      "grad_norm": 1.2157930135726929,
+      "learning_rate": 0.0002,
+      "loss": 1.2893,
+      "step": 2160
+    },
+    {
+      "epoch": 3.5284552845528454,
+      "grad_norm": 1.2139101028442383,
+      "learning_rate": 0.0002,
+      "loss": 1.3606,
+      "step": 2170
+    },
+    {
+      "epoch": 3.5447154471544717,
+      "grad_norm": 1.0714174509048462,
+      "learning_rate": 0.0002,
+      "loss": 1.2897,
+      "step": 2180
+    },
+    {
+      "epoch": 3.5609756097560976,
+      "grad_norm": 1.1357146501541138,
+      "learning_rate": 0.0002,
+      "loss": 1.3398,
+      "step": 2190
+    },
+    {
+      "epoch": 3.5772357723577235,
+      "grad_norm": 1.216141939163208,
+      "learning_rate": 0.0002,
+      "loss": 1.2829,
+      "step": 2200
+    },
+    {
+      "epoch": 3.59349593495935,
+      "grad_norm": 1.2001926898956299,
+      "learning_rate": 0.0002,
+      "loss": 1.3411,
+      "step": 2210
+    },
+    {
+      "epoch": 3.6097560975609757,
+      "grad_norm": 1.355756163597107,
+      "learning_rate": 0.0002,
+      "loss": 1.2804,
+      "step": 2220
+    },
+    {
+      "epoch": 3.6260162601626016,
+      "grad_norm": 1.1870149374008179,
+      "learning_rate": 0.0002,
+      "loss": 1.3732,
+      "step": 2230
+    },
+    {
+      "epoch": 3.642276422764228,
+      "grad_norm": 1.0973352193832397,
+      "learning_rate": 0.0002,
+      "loss": 1.4334,
+      "step": 2240
+    },
+    {
+      "epoch": 3.658536585365854,
+      "grad_norm": 1.110839605331421,
+      "learning_rate": 0.0002,
+      "loss": 1.3987,
+      "step": 2250
+    },
+    {
+      "epoch": 3.6747967479674797,
+      "grad_norm": 1.1280663013458252,
+      "learning_rate": 0.0002,
+      "loss": 1.3316,
+      "step": 2260
+    },
+    {
+      "epoch": 3.6910569105691056,
+      "grad_norm": 1.3871443271636963,
+      "learning_rate": 0.0002,
+      "loss": 1.2897,
+      "step": 2270
+    },
+    {
+      "epoch": 3.7073170731707314,
+      "grad_norm": 1.384059190750122,
+      "learning_rate": 0.0002,
+      "loss": 1.3784,
+      "step": 2280
+    },
+    {
+      "epoch": 3.7235772357723578,
+      "grad_norm": 1.422131896018982,
+      "learning_rate": 0.0002,
+      "loss": 1.3288,
+      "step": 2290
+    },
+    {
+      "epoch": 3.7398373983739837,
+      "grad_norm": 1.2262955904006958,
+      "learning_rate": 0.0002,
+      "loss": 1.342,
+      "step": 2300
+    },
+    {
+      "epoch": 3.7560975609756095,
+      "grad_norm": 1.4098708629608154,
+      "learning_rate": 0.0002,
+      "loss": 1.3674,
+      "step": 2310
+    },
+    {
+      "epoch": 3.772357723577236,
+      "grad_norm": 1.3726389408111572,
+      "learning_rate": 0.0002,
+      "loss": 1.4156,
+      "step": 2320
+    },
+    {
+      "epoch": 3.7886178861788617,
+      "grad_norm": 1.2945446968078613,
+      "learning_rate": 0.0002,
+      "loss": 1.3083,
+      "step": 2330
+    },
+    {
+      "epoch": 3.8048780487804876,
+      "grad_norm": 1.2011241912841797,
+      "learning_rate": 0.0002,
+      "loss": 1.3631,
+      "step": 2340
+    },
+    {
+      "epoch": 3.821138211382114,
+      "grad_norm": 1.158033847808838,
+      "learning_rate": 0.0002,
+      "loss": 1.3888,
+      "step": 2350
+    },
+    {
+      "epoch": 3.83739837398374,
+      "grad_norm": 1.2479424476623535,
+      "learning_rate": 0.0002,
+      "loss": 1.3159,
+      "step": 2360
+    },
+    {
+      "epoch": 3.8536585365853657,
+      "grad_norm": 1.253841519355774,
+      "learning_rate": 0.0002,
+      "loss": 1.3116,
+      "step": 2370
+    },
+    {
+      "epoch": 3.869918699186992,
+      "grad_norm": 1.2509289979934692,
+      "learning_rate": 0.0002,
+      "loss": 1.3943,
+      "step": 2380
+    },
+    {
+      "epoch": 3.886178861788618,
+      "grad_norm": 1.529388666152954,
+      "learning_rate": 0.0002,
+      "loss": 1.3717,
+      "step": 2390
+    },
+    {
+      "epoch": 3.902439024390244,
+      "grad_norm": 1.241012692451477,
+      "learning_rate": 0.0002,
+      "loss": 1.3875,
+      "step": 2400
+    },
+    {
+      "epoch": 3.91869918699187,
+      "grad_norm": 1.4315979480743408,
+      "learning_rate": 0.0002,
+      "loss": 1.3352,
+      "step": 2410
+    },
+    {
+      "epoch": 3.934959349593496,
+      "grad_norm": 1.6688332557678223,
+      "learning_rate": 0.0002,
+      "loss": 1.4241,
+      "step": 2420
+    },
+    {
+      "epoch": 3.951219512195122,
+      "grad_norm": 1.3832660913467407,
+      "learning_rate": 0.0002,
+      "loss": 1.3261,
+      "step": 2430
+    },
+    {
+      "epoch": 3.9674796747967482,
+      "grad_norm": 1.3022568225860596,
+      "learning_rate": 0.0002,
+      "loss": 1.3334,
+      "step": 2440
+    },
+    {
+      "epoch": 3.983739837398374,
+      "grad_norm": 1.3116395473480225,
+      "learning_rate": 0.0002,
+      "loss": 1.4051,
+      "step": 2450
+    },
+    {
+      "epoch": 4.0,
+      "grad_norm": 1.2045269012451172,
+      "learning_rate": 0.0002,
+      "loss": 1.3712,
+      "step": 2460
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 1.938527226448059,
+      "eval_runtime": 95.315,
+      "eval_samples_per_second": 5.592,
+      "eval_steps_per_second": 0.703,
+      "step": 2460
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 4920,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.07931827699712e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:794da760599f5a7e302e2faa616ba0185215c069e7fa3436832bde34c7f2ec7b
+size 5560

Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-3075/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c8674660ff49bcd692030c221252811675aa81a534da5a208b5bb33b3d5a97d
+size 109069176

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:14220cf1ae4a7dd06df1aee1a7c6b48d2108005d1e79c4179894677d1c7cec21
+size 55532666

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:724dae46b62ffc002919a41c5db975411c8394998e5786353d82c87f5ee642b2
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c8f2c5f9e3d9d5442b1bfc19ef5ae200cd4874a0f1fe07649f57069a5c04c331
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,2222 @@

+{
+  "best_metric": 1.7831426858901978,
+  "best_model_checkpoint": "outputs-001/Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.4-num-2811-sd-1/checkpoint-615",
+  "epoch": 5.0,
+  "eval_steps": 10,
+  "global_step": 3075,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.016260162601626018,
+      "grad_norm": 1.903406023979187,
+      "learning_rate": 0.0002,
+      "loss": 2.6243,
+      "step": 10
+    },
+    {
+      "epoch": 0.032520325203252036,
+      "grad_norm": 1.0100678205490112,
+      "learning_rate": 0.0002,
+      "loss": 2.2084,
+      "step": 20
+    },
+    {
+      "epoch": 0.04878048780487805,
+      "grad_norm": 0.7413098812103271,
+      "learning_rate": 0.0002,
+      "loss": 2.0423,
+      "step": 30
+    },
+    {
+      "epoch": 0.06504065040650407,
+      "grad_norm": 0.7540805339813232,
+      "learning_rate": 0.0002,
+      "loss": 2.0647,
+      "step": 40
+    },
+    {
+      "epoch": 0.08130081300813008,
+      "grad_norm": 0.6508658528327942,
+      "learning_rate": 0.0002,
+      "loss": 2.0926,
+      "step": 50
+    },
+    {
+      "epoch": 0.0975609756097561,
+      "grad_norm": 0.7228319048881531,
+      "learning_rate": 0.0002,
+      "loss": 1.878,
+      "step": 60
+    },
+    {
+      "epoch": 0.11382113821138211,
+      "grad_norm": 0.6510937213897705,
+      "learning_rate": 0.0002,
+      "loss": 1.8672,
+      "step": 70
+    },
+    {
+      "epoch": 0.13008130081300814,
+      "grad_norm": 0.7238746881484985,
+      "learning_rate": 0.0002,
+      "loss": 1.8592,
+      "step": 80
+    },
+    {
+      "epoch": 0.14634146341463414,
+      "grad_norm": 0.7530466318130493,
+      "learning_rate": 0.0002,
+      "loss": 1.8541,
+      "step": 90
+    },
+    {
+      "epoch": 0.16260162601626016,
+      "grad_norm": 0.622166097164154,
+      "learning_rate": 0.0002,
+      "loss": 1.8245,
+      "step": 100
+    },
+    {
+      "epoch": 0.17886178861788618,
+      "grad_norm": 0.6180148720741272,
+      "learning_rate": 0.0002,
+      "loss": 1.7581,
+      "step": 110
+    },
+    {
+      "epoch": 0.1951219512195122,
+      "grad_norm": 0.6221362352371216,
+      "learning_rate": 0.0002,
+      "loss": 1.7741,
+      "step": 120
+    },
+    {
+      "epoch": 0.21138211382113822,
+      "grad_norm": 0.569580078125,
+      "learning_rate": 0.0002,
+      "loss": 1.7575,
+      "step": 130
+    },
+    {
+      "epoch": 0.22764227642276422,
+      "grad_norm": 0.6962840557098389,
+      "learning_rate": 0.0002,
+      "loss": 1.7833,
+      "step": 140
+    },
+    {
+      "epoch": 0.24390243902439024,
+      "grad_norm": 0.644322395324707,
+      "learning_rate": 0.0002,
+      "loss": 1.8329,
+      "step": 150
+    },
+    {
+      "epoch": 0.2601626016260163,
+      "grad_norm": 0.5970060229301453,
+      "learning_rate": 0.0002,
+      "loss": 1.7794,
+      "step": 160
+    },
+    {
+      "epoch": 0.2764227642276423,
+      "grad_norm": 0.6249210834503174,
+      "learning_rate": 0.0002,
+      "loss": 1.8521,
+      "step": 170
+    },
+    {
+      "epoch": 0.2926829268292683,
+      "grad_norm": 0.7134785652160645,
+      "learning_rate": 0.0002,
+      "loss": 1.8066,
+      "step": 180
+    },
+    {
+      "epoch": 0.3089430894308943,
+      "grad_norm": 0.5477158427238464,
+      "learning_rate": 0.0002,
+      "loss": 1.8815,
+      "step": 190
+    },
+    {
+      "epoch": 0.3252032520325203,
+      "grad_norm": 0.6054863333702087,
+      "learning_rate": 0.0002,
+      "loss": 1.7222,
+      "step": 200
+    },
+    {
+      "epoch": 0.34146341463414637,
+      "grad_norm": 0.5664568543434143,
+      "learning_rate": 0.0002,
+      "loss": 1.7598,
+      "step": 210
+    },
+    {
+      "epoch": 0.35772357723577236,
+      "grad_norm": 0.5942816734313965,
+      "learning_rate": 0.0002,
+      "loss": 1.7688,
+      "step": 220
+    },
+    {
+      "epoch": 0.37398373983739835,
+      "grad_norm": 0.6311767101287842,
+      "learning_rate": 0.0002,
+      "loss": 1.7715,
+      "step": 230
+    },
+    {
+      "epoch": 0.3902439024390244,
+      "grad_norm": 0.6614870429039001,
+      "learning_rate": 0.0002,
+      "loss": 1.7663,
+      "step": 240
+    },
+    {
+      "epoch": 0.4065040650406504,
+      "grad_norm": 0.5644984841346741,
+      "learning_rate": 0.0002,
+      "loss": 1.8045,
+      "step": 250
+    },
+    {
+      "epoch": 0.42276422764227645,
+      "grad_norm": 0.7260110974311829,
+      "learning_rate": 0.0002,
+      "loss": 1.7364,
+      "step": 260
+    },
+    {
+      "epoch": 0.43902439024390244,
+      "grad_norm": 0.6733413934707642,
+      "learning_rate": 0.0002,
+      "loss": 1.7606,
+      "step": 270
+    },
+    {
+      "epoch": 0.45528455284552843,
+      "grad_norm": 0.5211837887763977,
+      "learning_rate": 0.0002,
+      "loss": 1.8432,
+      "step": 280
+    },
+    {
+      "epoch": 0.4715447154471545,
+      "grad_norm": 0.5538370013237,
+      "learning_rate": 0.0002,
+      "loss": 1.9166,
+      "step": 290
+    },
+    {
+      "epoch": 0.4878048780487805,
+      "grad_norm": 0.5429130792617798,
+      "learning_rate": 0.0002,
+      "loss": 1.8391,
+      "step": 300
+    },
+    {
+      "epoch": 0.5040650406504065,
+      "grad_norm": 0.517801821231842,
+      "learning_rate": 0.0002,
+      "loss": 1.8072,
+      "step": 310
+    },
+    {
+      "epoch": 0.5203252032520326,
+      "grad_norm": 0.6029635667800903,
+      "learning_rate": 0.0002,
+      "loss": 1.8045,
+      "step": 320
+    },
+    {
+      "epoch": 0.5365853658536586,
+      "grad_norm": 0.506401002407074,
+      "learning_rate": 0.0002,
+      "loss": 1.7734,
+      "step": 330
+    },
+    {
+      "epoch": 0.5528455284552846,
+      "grad_norm": 0.5226597189903259,
+      "learning_rate": 0.0002,
+      "loss": 1.7923,
+      "step": 340
+    },
+    {
+      "epoch": 0.5691056910569106,
+      "grad_norm": 0.5899750590324402,
+      "learning_rate": 0.0002,
+      "loss": 1.7625,
+      "step": 350
+    },
+    {
+      "epoch": 0.5853658536585366,
+      "grad_norm": 0.6185210943222046,
+      "learning_rate": 0.0002,
+      "loss": 1.828,
+      "step": 360
+    },
+    {
+      "epoch": 0.6016260162601627,
+      "grad_norm": 0.8088458180427551,
+      "learning_rate": 0.0002,
+      "loss": 1.8358,
+      "step": 370
+    },
+    {
+      "epoch": 0.6178861788617886,
+      "grad_norm": 0.509591817855835,
+      "learning_rate": 0.0002,
+      "loss": 1.8351,
+      "step": 380
+    },
+    {
+      "epoch": 0.6341463414634146,
+      "grad_norm": 0.5209569931030273,
+      "learning_rate": 0.0002,
+      "loss": 1.7849,
+      "step": 390
+    },
+    {
+      "epoch": 0.6504065040650406,
+      "grad_norm": 0.50320965051651,
+      "learning_rate": 0.0002,
+      "loss": 1.7925,
+      "step": 400
+    },
+    {
+      "epoch": 0.6666666666666666,
+      "grad_norm": 0.5555663108825684,
+      "learning_rate": 0.0002,
+      "loss": 1.795,
+      "step": 410
+    },
+    {
+      "epoch": 0.6829268292682927,
+      "grad_norm": 0.5865469574928284,
+      "learning_rate": 0.0002,
+      "loss": 1.7562,
+      "step": 420
+    },
+    {
+      "epoch": 0.6991869918699187,
+      "grad_norm": 0.5288474559783936,
+      "learning_rate": 0.0002,
+      "loss": 1.7869,
+      "step": 430
+    },
+    {
+      "epoch": 0.7154471544715447,
+      "grad_norm": 0.5364211797714233,
+      "learning_rate": 0.0002,
+      "loss": 1.8046,
+      "step": 440
+    },
+    {
+      "epoch": 0.7317073170731707,
+      "grad_norm": 0.5877127051353455,
+      "learning_rate": 0.0002,
+      "loss": 1.8124,
+      "step": 450
+    },
+    {
+      "epoch": 0.7479674796747967,
+      "grad_norm": 0.5993741154670715,
+      "learning_rate": 0.0002,
+      "loss": 1.7938,
+      "step": 460
+    },
+    {
+      "epoch": 0.7642276422764228,
+      "grad_norm": 0.4871112108230591,
+      "learning_rate": 0.0002,
+      "loss": 1.8034,
+      "step": 470
+    },
+    {
+      "epoch": 0.7804878048780488,
+      "grad_norm": 0.5300846099853516,
+      "learning_rate": 0.0002,
+      "loss": 1.7798,
+      "step": 480
+    },
+    {
+      "epoch": 0.7967479674796748,
+      "grad_norm": 0.5623212456703186,
+      "learning_rate": 0.0002,
+      "loss": 1.7772,
+      "step": 490
+    },
+    {
+      "epoch": 0.8130081300813008,
+      "grad_norm": 0.5131309032440186,
+      "learning_rate": 0.0002,
+      "loss": 1.7207,
+      "step": 500
+    },
+    {
+      "epoch": 0.8292682926829268,
+      "grad_norm": 0.49512147903442383,
+      "learning_rate": 0.0002,
+      "loss": 1.7143,
+      "step": 510
+    },
+    {
+      "epoch": 0.8455284552845529,
+      "grad_norm": 0.6260727643966675,
+      "learning_rate": 0.0002,
+      "loss": 1.8032,
+      "step": 520
+    },
+    {
+      "epoch": 0.8617886178861789,
+      "grad_norm": 0.5796844959259033,
+      "learning_rate": 0.0002,
+      "loss": 1.8292,
+      "step": 530
+    },
+    {
+      "epoch": 0.8780487804878049,
+      "grad_norm": 0.615927517414093,
+      "learning_rate": 0.0002,
+      "loss": 1.7775,
+      "step": 540
+    },
+    {
+      "epoch": 0.8943089430894309,
+      "grad_norm": 0.5230891704559326,
+      "learning_rate": 0.0002,
+      "loss": 1.7254,
+      "step": 550
+    },
+    {
+      "epoch": 0.9105691056910569,
+      "grad_norm": 0.5990992784500122,
+      "learning_rate": 0.0002,
+      "loss": 1.8126,
+      "step": 560
+    },
+    {
+      "epoch": 0.926829268292683,
+      "grad_norm": 0.538957417011261,
+      "learning_rate": 0.0002,
+      "loss": 1.8551,
+      "step": 570
+    },
+    {
+      "epoch": 0.943089430894309,
+      "grad_norm": 0.556900680065155,
+      "learning_rate": 0.0002,
+      "loss": 1.791,
+      "step": 580
+    },
+    {
+      "epoch": 0.959349593495935,
+      "grad_norm": 0.6459956765174866,
+      "learning_rate": 0.0002,
+      "loss": 1.8799,
+      "step": 590
+    },
+    {
+      "epoch": 0.975609756097561,
+      "grad_norm": 0.5648245215415955,
+      "learning_rate": 0.0002,
+      "loss": 1.774,
+      "step": 600
+    },
+    {
+      "epoch": 0.991869918699187,
+      "grad_norm": 0.5341294407844543,
+      "learning_rate": 0.0002,
+      "loss": 1.7746,
+      "step": 610
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 1.7831426858901978,
+      "eval_runtime": 98.419,
+      "eval_samples_per_second": 5.416,
+      "eval_steps_per_second": 0.681,
+      "step": 615
+    },
+    {
+      "epoch": 1.008130081300813,
+      "grad_norm": 0.49698150157928467,
+      "learning_rate": 0.0002,
+      "loss": 1.7212,
+      "step": 620
+    },
+    {
+      "epoch": 1.024390243902439,
+      "grad_norm": 0.696890652179718,
+      "learning_rate": 0.0002,
+      "loss": 1.7379,
+      "step": 630
+    },
+    {
+      "epoch": 1.040650406504065,
+      "grad_norm": 0.5939123630523682,
+      "learning_rate": 0.0002,
+      "loss": 1.6391,
+      "step": 640
+    },
+    {
+      "epoch": 1.056910569105691,
+      "grad_norm": 0.5630994439125061,
+      "learning_rate": 0.0002,
+      "loss": 1.712,
+      "step": 650
+    },
+    {
+      "epoch": 1.0731707317073171,
+      "grad_norm": 0.5783666968345642,
+      "learning_rate": 0.0002,
+      "loss": 1.6401,
+      "step": 660
+    },
+    {
+      "epoch": 1.089430894308943,
+      "grad_norm": 0.6006693840026855,
+      "learning_rate": 0.0002,
+      "loss": 1.7158,
+      "step": 670
+    },
+    {
+      "epoch": 1.1056910569105691,
+      "grad_norm": 0.6544332504272461,
+      "learning_rate": 0.0002,
+      "loss": 1.7434,
+      "step": 680
+    },
+    {
+      "epoch": 1.1219512195121952,
+      "grad_norm": 0.6734776496887207,
+      "learning_rate": 0.0002,
+      "loss": 1.6562,
+      "step": 690
+    },
+    {
+      "epoch": 1.1382113821138211,
+      "grad_norm": 0.6067698001861572,
+      "learning_rate": 0.0002,
+      "loss": 1.6724,
+      "step": 700
+    },
+    {
+      "epoch": 1.1544715447154472,
+      "grad_norm": 0.6639267802238464,
+      "learning_rate": 0.0002,
+      "loss": 1.6932,
+      "step": 710
+    },
+    {
+      "epoch": 1.170731707317073,
+      "grad_norm": 0.5179714560508728,
+      "learning_rate": 0.0002,
+      "loss": 1.6562,
+      "step": 720
+    },
+    {
+      "epoch": 1.1869918699186992,
+      "grad_norm": 0.7320363521575928,
+      "learning_rate": 0.0002,
+      "loss": 1.6811,
+      "step": 730
+    },
+    {
+      "epoch": 1.203252032520325,
+      "grad_norm": 0.689231276512146,
+      "learning_rate": 0.0002,
+      "loss": 1.5619,
+      "step": 740
+    },
+    {
+      "epoch": 1.2195121951219512,
+      "grad_norm": 0.6605235934257507,
+      "learning_rate": 0.0002,
+      "loss": 1.5688,
+      "step": 750
+    },
+    {
+      "epoch": 1.2357723577235773,
+      "grad_norm": 0.7013542056083679,
+      "learning_rate": 0.0002,
+      "loss": 1.7045,
+      "step": 760
+    },
+    {
+      "epoch": 1.2520325203252032,
+      "grad_norm": 0.6349928975105286,
+      "learning_rate": 0.0002,
+      "loss": 1.6857,
+      "step": 770
+    },
+    {
+      "epoch": 1.2682926829268293,
+      "grad_norm": 0.6362272500991821,
+      "learning_rate": 0.0002,
+      "loss": 1.6767,
+      "step": 780
+    },
+    {
+      "epoch": 1.2845528455284554,
+      "grad_norm": 0.6152030229568481,
+      "learning_rate": 0.0002,
+      "loss": 1.6594,
+      "step": 790
+    },
+    {
+      "epoch": 1.3008130081300813,
+      "grad_norm": 0.6406176686286926,
+      "learning_rate": 0.0002,
+      "loss": 1.7542,
+      "step": 800
+    },
+    {
+      "epoch": 1.3170731707317074,
+      "grad_norm": 0.6099124550819397,
+      "learning_rate": 0.0002,
+      "loss": 1.7243,
+      "step": 810
+    },
+    {
+      "epoch": 1.3333333333333333,
+      "grad_norm": 0.6298971772193909,
+      "learning_rate": 0.0002,
+      "loss": 1.6642,
+      "step": 820
+    },
+    {
+      "epoch": 1.3495934959349594,
+      "grad_norm": 0.775223433971405,
+      "learning_rate": 0.0002,
+      "loss": 1.6901,
+      "step": 830
+    },
+    {
+      "epoch": 1.3658536585365852,
+      "grad_norm": 0.7261736392974854,
+      "learning_rate": 0.0002,
+      "loss": 1.6284,
+      "step": 840
+    },
+    {
+      "epoch": 1.3821138211382114,
+      "grad_norm": 0.6321929097175598,
+      "learning_rate": 0.0002,
+      "loss": 1.6671,
+      "step": 850
+    },
+    {
+      "epoch": 1.3983739837398375,
+      "grad_norm": 0.7564281225204468,
+      "learning_rate": 0.0002,
+      "loss": 1.7036,
+      "step": 860
+    },
+    {
+      "epoch": 1.4146341463414633,
+      "grad_norm": 0.6329448819160461,
+      "learning_rate": 0.0002,
+      "loss": 1.7014,
+      "step": 870
+    },
+    {
+      "epoch": 1.4308943089430894,
+      "grad_norm": 0.6288684606552124,
+      "learning_rate": 0.0002,
+      "loss": 1.6398,
+      "step": 880
+    },
+    {
+      "epoch": 1.4471544715447155,
+      "grad_norm": 0.6165404915809631,
+      "learning_rate": 0.0002,
+      "loss": 1.673,
+      "step": 890
+    },
+    {
+      "epoch": 1.4634146341463414,
+      "grad_norm": 0.6124468445777893,
+      "learning_rate": 0.0002,
+      "loss": 1.6668,
+      "step": 900
+    },
+    {
+      "epoch": 1.4796747967479675,
+      "grad_norm": 0.7038629651069641,
+      "learning_rate": 0.0002,
+      "loss": 1.6879,
+      "step": 910
+    },
+    {
+      "epoch": 1.4959349593495934,
+      "grad_norm": 0.5755146145820618,
+      "learning_rate": 0.0002,
+      "loss": 1.6701,
+      "step": 920
+    },
+    {
+      "epoch": 1.5121951219512195,
+      "grad_norm": 0.7639156579971313,
+      "learning_rate": 0.0002,
+      "loss": 1.7244,
+      "step": 930
+    },
+    {
+      "epoch": 1.5284552845528454,
+      "grad_norm": 0.6948140859603882,
+      "learning_rate": 0.0002,
+      "loss": 1.6836,
+      "step": 940
+    },
+    {
+      "epoch": 1.5447154471544715,
+      "grad_norm": 0.6887956261634827,
+      "learning_rate": 0.0002,
+      "loss": 1.6479,
+      "step": 950
+    },
+    {
+      "epoch": 1.5609756097560976,
+      "grad_norm": 0.7226824164390564,
+      "learning_rate": 0.0002,
+      "loss": 1.7285,
+      "step": 960
+    },
+    {
+      "epoch": 1.5772357723577235,
+      "grad_norm": 0.6753950715065002,
+      "learning_rate": 0.0002,
+      "loss": 1.6214,
+      "step": 970
+    },
+    {
+      "epoch": 1.5934959349593496,
+      "grad_norm": 0.6580971479415894,
+      "learning_rate": 0.0002,
+      "loss": 1.7283,
+      "step": 980
+    },
+    {
+      "epoch": 1.6097560975609757,
+      "grad_norm": 0.7157843112945557,
+      "learning_rate": 0.0002,
+      "loss": 1.6671,
+      "step": 990
+    },
+    {
+      "epoch": 1.6260162601626016,
+      "grad_norm": 0.6736738681793213,
+      "learning_rate": 0.0002,
+      "loss": 1.645,
+      "step": 1000
+    },
+    {
+      "epoch": 1.6422764227642277,
+      "grad_norm": 0.5271940231323242,
+      "learning_rate": 0.0002,
+      "loss": 1.6589,
+      "step": 1010
+    },
+    {
+      "epoch": 1.6585365853658538,
+      "grad_norm": 0.6378998160362244,
+      "learning_rate": 0.0002,
+      "loss": 1.7358,
+      "step": 1020
+    },
+    {
+      "epoch": 1.6747967479674797,
+      "grad_norm": 0.6498209834098816,
+      "learning_rate": 0.0002,
+      "loss": 1.6924,
+      "step": 1030
+    },
+    {
+      "epoch": 1.6910569105691056,
+      "grad_norm": 0.7050761580467224,
+      "learning_rate": 0.0002,
+      "loss": 1.6253,
+      "step": 1040
+    },
+    {
+      "epoch": 1.7073170731707317,
+      "grad_norm": 0.7122200131416321,
+      "learning_rate": 0.0002,
+      "loss": 1.7146,
+      "step": 1050
+    },
+    {
+      "epoch": 1.7235772357723578,
+      "grad_norm": 0.6705704927444458,
+      "learning_rate": 0.0002,
+      "loss": 1.6511,
+      "step": 1060
+    },
+    {
+      "epoch": 1.7398373983739837,
+      "grad_norm": 0.6859356760978699,
+      "learning_rate": 0.0002,
+      "loss": 1.6506,
+      "step": 1070
+    },
+    {
+      "epoch": 1.7560975609756098,
+      "grad_norm": 0.6540971994400024,
+      "learning_rate": 0.0002,
+      "loss": 1.6562,
+      "step": 1080
+    },
+    {
+      "epoch": 1.7723577235772359,
+      "grad_norm": 0.6297651529312134,
+      "learning_rate": 0.0002,
+      "loss": 1.6627,
+      "step": 1090
+    },
+    {
+      "epoch": 1.7886178861788617,
+      "grad_norm": 0.6645651459693909,
+      "learning_rate": 0.0002,
+      "loss": 1.704,
+      "step": 1100
+    },
+    {
+      "epoch": 1.8048780487804879,
+      "grad_norm": 0.6450296640396118,
+      "learning_rate": 0.0002,
+      "loss": 1.6908,
+      "step": 1110
+    },
+    {
+      "epoch": 1.821138211382114,
+      "grad_norm": 0.7785659432411194,
+      "learning_rate": 0.0002,
+      "loss": 1.7642,
+      "step": 1120
+    },
+    {
+      "epoch": 1.8373983739837398,
+      "grad_norm": 0.6845982670783997,
+      "learning_rate": 0.0002,
+      "loss": 1.6773,
+      "step": 1130
+    },
+    {
+      "epoch": 1.8536585365853657,
+      "grad_norm": 0.699683666229248,
+      "learning_rate": 0.0002,
+      "loss": 1.6879,
+      "step": 1140
+    },
+    {
+      "epoch": 1.8699186991869918,
+      "grad_norm": 0.6600332856178284,
+      "learning_rate": 0.0002,
+      "loss": 1.7162,
+      "step": 1150
+    },
+    {
+      "epoch": 1.886178861788618,
+      "grad_norm": 0.7301949262619019,
+      "learning_rate": 0.0002,
+      "loss": 1.7291,
+      "step": 1160
+    },
+    {
+      "epoch": 1.9024390243902438,
+      "grad_norm": 0.8183556795120239,
+      "learning_rate": 0.0002,
+      "loss": 1.6874,
+      "step": 1170
+    },
+    {
+      "epoch": 1.91869918699187,
+      "grad_norm": 0.7122833132743835,
+      "learning_rate": 0.0002,
+      "loss": 1.6779,
+      "step": 1180
+    },
+    {
+      "epoch": 1.934959349593496,
+      "grad_norm": 0.6391404271125793,
+      "learning_rate": 0.0002,
+      "loss": 1.7361,
+      "step": 1190
+    },
+    {
+      "epoch": 1.951219512195122,
+      "grad_norm": 0.6136474013328552,
+      "learning_rate": 0.0002,
+      "loss": 1.7188,
+      "step": 1200
+    },
+    {
+      "epoch": 1.967479674796748,
+      "grad_norm": 0.7704503536224365,
+      "learning_rate": 0.0002,
+      "loss": 1.6536,
+      "step": 1210
+    },
+    {
+      "epoch": 1.9837398373983741,
+      "grad_norm": 0.6155434846878052,
+      "learning_rate": 0.0002,
+      "loss": 1.6735,
+      "step": 1220
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.6262536644935608,
+      "learning_rate": 0.0002,
+      "loss": 1.6534,
+      "step": 1230
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.7886285781860352,
+      "eval_runtime": 98.7888,
+      "eval_samples_per_second": 5.395,
+      "eval_steps_per_second": 0.678,
+      "step": 1230
+    },
+    {
+      "epoch": 2.016260162601626,
+      "grad_norm": 0.9827656149864197,
+      "learning_rate": 0.0002,
+      "loss": 1.5135,
+      "step": 1240
+    },
+    {
+      "epoch": 2.032520325203252,
+      "grad_norm": 0.8443078398704529,
+      "learning_rate": 0.0002,
+      "loss": 1.5396,
+      "step": 1250
+    },
+    {
+      "epoch": 2.048780487804878,
+      "grad_norm": 0.9006391763687134,
+      "learning_rate": 0.0002,
+      "loss": 1.5326,
+      "step": 1260
+    },
+    {
+      "epoch": 2.065040650406504,
+      "grad_norm": 0.7900105714797974,
+      "learning_rate": 0.0002,
+      "loss": 1.5176,
+      "step": 1270
+    },
+    {
+      "epoch": 2.08130081300813,
+      "grad_norm": 0.6430686116218567,
+      "learning_rate": 0.0002,
+      "loss": 1.5578,
+      "step": 1280
+    },
+    {
+      "epoch": 2.097560975609756,
+      "grad_norm": 0.8998992443084717,
+      "learning_rate": 0.0002,
+      "loss": 1.5453,
+      "step": 1290
+    },
+    {
+      "epoch": 2.113821138211382,
+      "grad_norm": 0.7658976316452026,
+      "learning_rate": 0.0002,
+      "loss": 1.5051,
+      "step": 1300
+    },
+    {
+      "epoch": 2.130081300813008,
+      "grad_norm": 0.9033166766166687,
+      "learning_rate": 0.0002,
+      "loss": 1.5005,
+      "step": 1310
+    },
+    {
+      "epoch": 2.1463414634146343,
+      "grad_norm": 0.7942133545875549,
+      "learning_rate": 0.0002,
+      "loss": 1.5517,
+      "step": 1320
+    },
+    {
+      "epoch": 2.16260162601626,
+      "grad_norm": 0.8496367931365967,
+      "learning_rate": 0.0002,
+      "loss": 1.5248,
+      "step": 1330
+    },
+    {
+      "epoch": 2.178861788617886,
+      "grad_norm": 0.8638061881065369,
+      "learning_rate": 0.0002,
+      "loss": 1.4887,
+      "step": 1340
+    },
+    {
+      "epoch": 2.1951219512195124,
+      "grad_norm": 0.9003657102584839,
+      "learning_rate": 0.0002,
+      "loss": 1.5023,
+      "step": 1350
+    },
+    {
+      "epoch": 2.2113821138211383,
+      "grad_norm": 0.8387648463249207,
+      "learning_rate": 0.0002,
+      "loss": 1.4904,
+      "step": 1360
+    },
+    {
+      "epoch": 2.227642276422764,
+      "grad_norm": 0.7598716616630554,
+      "learning_rate": 0.0002,
+      "loss": 1.5553,
+      "step": 1370
+    },
+    {
+      "epoch": 2.2439024390243905,
+      "grad_norm": 0.872882604598999,
+      "learning_rate": 0.0002,
+      "loss": 1.4723,
+      "step": 1380
+    },
+    {
+      "epoch": 2.2601626016260163,
+      "grad_norm": 0.8919313549995422,
+      "learning_rate": 0.0002,
+      "loss": 1.5121,
+      "step": 1390
+    },
+    {
+      "epoch": 2.2764227642276422,
+      "grad_norm": 0.9646918773651123,
+      "learning_rate": 0.0002,
+      "loss": 1.5162,
+      "step": 1400
+    },
+    {
+      "epoch": 2.292682926829268,
+      "grad_norm": 0.8501992225646973,
+      "learning_rate": 0.0002,
+      "loss": 1.5163,
+      "step": 1410
+    },
+    {
+      "epoch": 2.3089430894308944,
+      "grad_norm": 0.7517067790031433,
+      "learning_rate": 0.0002,
+      "loss": 1.5096,
+      "step": 1420
+    },
+    {
+      "epoch": 2.3252032520325203,
+      "grad_norm": 0.9097304940223694,
+      "learning_rate": 0.0002,
+      "loss": 1.5359,
+      "step": 1430
+    },
+    {
+      "epoch": 2.341463414634146,
+      "grad_norm": 0.8515191674232483,
+      "learning_rate": 0.0002,
+      "loss": 1.4843,
+      "step": 1440
+    },
+    {
+      "epoch": 2.3577235772357725,
+      "grad_norm": 0.8925113677978516,
+      "learning_rate": 0.0002,
+      "loss": 1.5021,
+      "step": 1450
+    },
+    {
+      "epoch": 2.3739837398373984,
+      "grad_norm": 1.0194441080093384,
+      "learning_rate": 0.0002,
+      "loss": 1.4235,
+      "step": 1460
+    },
+    {
+      "epoch": 2.3902439024390243,
+      "grad_norm": 0.9004436731338501,
+      "learning_rate": 0.0002,
+      "loss": 1.5778,
+      "step": 1470
+    },
+    {
+      "epoch": 2.40650406504065,
+      "grad_norm": 0.9552311897277832,
+      "learning_rate": 0.0002,
+      "loss": 1.5623,
+      "step": 1480
+    },
+    {
+      "epoch": 2.4227642276422765,
+      "grad_norm": 0.9185764789581299,
+      "learning_rate": 0.0002,
+      "loss": 1.5507,
+      "step": 1490
+    },
+    {
+      "epoch": 2.4390243902439024,
+      "grad_norm": 0.7935037016868591,
+      "learning_rate": 0.0002,
+      "loss": 1.5058,
+      "step": 1500
+    },
+    {
+      "epoch": 2.4552845528455283,
+      "grad_norm": 0.8124602437019348,
+      "learning_rate": 0.0002,
+      "loss": 1.5374,
+      "step": 1510
+    },
+    {
+      "epoch": 2.4715447154471546,
+      "grad_norm": 0.7927430272102356,
+      "learning_rate": 0.0002,
+      "loss": 1.4553,
+      "step": 1520
+    },
+    {
+      "epoch": 2.4878048780487805,
+      "grad_norm": 0.9143779873847961,
+      "learning_rate": 0.0002,
+      "loss": 1.5135,
+      "step": 1530
+    },
+    {
+      "epoch": 2.5040650406504064,
+      "grad_norm": 0.938185453414917,
+      "learning_rate": 0.0002,
+      "loss": 1.4842,
+      "step": 1540
+    },
+    {
+      "epoch": 2.5203252032520327,
+      "grad_norm": 0.9858708381652832,
+      "learning_rate": 0.0002,
+      "loss": 1.5983,
+      "step": 1550
+    },
+    {
+      "epoch": 2.5365853658536586,
+      "grad_norm": 0.9211642742156982,
+      "learning_rate": 0.0002,
+      "loss": 1.5464,
+      "step": 1560
+    },
+    {
+      "epoch": 2.5528455284552845,
+      "grad_norm": 0.9824395775794983,
+      "learning_rate": 0.0002,
+      "loss": 1.5293,
+      "step": 1570
+    },
+    {
+      "epoch": 2.569105691056911,
+      "grad_norm": 0.916930615901947,
+      "learning_rate": 0.0002,
+      "loss": 1.5559,
+      "step": 1580
+    },
+    {
+      "epoch": 2.5853658536585367,
+      "grad_norm": 0.9336596727371216,
+      "learning_rate": 0.0002,
+      "loss": 1.5581,
+      "step": 1590
+    },
+    {
+      "epoch": 2.6016260162601625,
+      "grad_norm": 0.9006481170654297,
+      "learning_rate": 0.0002,
+      "loss": 1.5379,
+      "step": 1600
+    },
+    {
+      "epoch": 2.617886178861789,
+      "grad_norm": 0.8296214938163757,
+      "learning_rate": 0.0002,
+      "loss": 1.5254,
+      "step": 1610
+    },
+    {
+      "epoch": 2.6341463414634148,
+      "grad_norm": 1.0448366403579712,
+      "learning_rate": 0.0002,
+      "loss": 1.5782,
+      "step": 1620
+    },
+    {
+      "epoch": 2.6504065040650406,
+      "grad_norm": 0.8174839019775391,
+      "learning_rate": 0.0002,
+      "loss": 1.5523,
+      "step": 1630
+    },
+    {
+      "epoch": 2.6666666666666665,
+      "grad_norm": 0.873572051525116,
+      "learning_rate": 0.0002,
+      "loss": 1.4434,
+      "step": 1640
+    },
+    {
+      "epoch": 2.682926829268293,
+      "grad_norm": 0.9270642995834351,
+      "learning_rate": 0.0002,
+      "loss": 1.4723,
+      "step": 1650
+    },
+    {
+      "epoch": 2.6991869918699187,
+      "grad_norm": 0.8988297581672668,
+      "learning_rate": 0.0002,
+      "loss": 1.4736,
+      "step": 1660
+    },
+    {
+      "epoch": 2.7154471544715446,
+      "grad_norm": 0.8537285923957825,
+      "learning_rate": 0.0002,
+      "loss": 1.52,
+      "step": 1670
+    },
+    {
+      "epoch": 2.7317073170731705,
+      "grad_norm": 0.7982168793678284,
+      "learning_rate": 0.0002,
+      "loss": 1.5073,
+      "step": 1680
+    },
+    {
+      "epoch": 2.747967479674797,
+      "grad_norm": 0.9140633940696716,
+      "learning_rate": 0.0002,
+      "loss": 1.5357,
+      "step": 1690
+    },
+    {
+      "epoch": 2.7642276422764227,
+      "grad_norm": 0.8485862016677856,
+      "learning_rate": 0.0002,
+      "loss": 1.5182,
+      "step": 1700
+    },
+    {
+      "epoch": 2.7804878048780486,
+      "grad_norm": 1.3670072555541992,
+      "learning_rate": 0.0002,
+      "loss": 1.5273,
+      "step": 1710
+    },
+    {
+      "epoch": 2.796747967479675,
+      "grad_norm": 0.8846588134765625,
+      "learning_rate": 0.0002,
+      "loss": 1.492,
+      "step": 1720
+    },
+    {
+      "epoch": 2.813008130081301,
+      "grad_norm": 1.0143219232559204,
+      "learning_rate": 0.0002,
+      "loss": 1.5286,
+      "step": 1730
+    },
+    {
+      "epoch": 2.8292682926829267,
+      "grad_norm": 0.9646075367927551,
+      "learning_rate": 0.0002,
+      "loss": 1.5253,
+      "step": 1740
+    },
+    {
+      "epoch": 2.845528455284553,
+      "grad_norm": 0.9912563562393188,
+      "learning_rate": 0.0002,
+      "loss": 1.5865,
+      "step": 1750
+    },
+    {
+      "epoch": 2.861788617886179,
+      "grad_norm": 0.8160223364830017,
+      "learning_rate": 0.0002,
+      "loss": 1.5266,
+      "step": 1760
+    },
+    {
+      "epoch": 2.8780487804878048,
+      "grad_norm": 0.8553791642189026,
+      "learning_rate": 0.0002,
+      "loss": 1.5542,
+      "step": 1770
+    },
+    {
+      "epoch": 2.894308943089431,
+      "grad_norm": 0.8816639184951782,
+      "learning_rate": 0.0002,
+      "loss": 1.5592,
+      "step": 1780
+    },
+    {
+      "epoch": 2.910569105691057,
+      "grad_norm": 0.829551637172699,
+      "learning_rate": 0.0002,
+      "loss": 1.5443,
+      "step": 1790
+    },
+    {
+      "epoch": 2.926829268292683,
+      "grad_norm": 1.0520497560501099,
+      "learning_rate": 0.0002,
+      "loss": 1.5111,
+      "step": 1800
+    },
+    {
+      "epoch": 2.943089430894309,
+      "grad_norm": 0.8627844452857971,
+      "learning_rate": 0.0002,
+      "loss": 1.509,
+      "step": 1810
+    },
+    {
+      "epoch": 2.959349593495935,
+      "grad_norm": 0.8868018388748169,
+      "learning_rate": 0.0002,
+      "loss": 1.5119,
+      "step": 1820
+    },
+    {
+      "epoch": 2.975609756097561,
+      "grad_norm": 1.047621250152588,
+      "learning_rate": 0.0002,
+      "loss": 1.5956,
+      "step": 1830
+    },
+    {
+      "epoch": 2.991869918699187,
+      "grad_norm": 1.122131109237671,
+      "learning_rate": 0.0002,
+      "loss": 1.5189,
+      "step": 1840
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 1.8374383449554443,
+      "eval_runtime": 98.0056,
+      "eval_samples_per_second": 5.438,
+      "eval_steps_per_second": 0.684,
+      "step": 1845
+    },
+    {
+      "epoch": 3.008130081300813,
+      "grad_norm": 0.9361767172813416,
+      "learning_rate": 0.0002,
+      "loss": 1.3996,
+      "step": 1850
+    },
+    {
+      "epoch": 3.024390243902439,
+      "grad_norm": 1.0564402341842651,
+      "learning_rate": 0.0002,
+      "loss": 1.3122,
+      "step": 1860
+    },
+    {
+      "epoch": 3.040650406504065,
+      "grad_norm": 1.2450026273727417,
+      "learning_rate": 0.0002,
+      "loss": 1.2512,
+      "step": 1870
+    },
+    {
+      "epoch": 3.0569105691056913,
+      "grad_norm": 1.082606554031372,
+      "learning_rate": 0.0002,
+      "loss": 1.2585,
+      "step": 1880
+    },
+    {
+      "epoch": 3.073170731707317,
+      "grad_norm": 1.1582257747650146,
+      "learning_rate": 0.0002,
+      "loss": 1.2827,
+      "step": 1890
+    },
+    {
+      "epoch": 3.089430894308943,
+      "grad_norm": 1.1113696098327637,
+      "learning_rate": 0.0002,
+      "loss": 1.2865,
+      "step": 1900
+    },
+    {
+      "epoch": 3.105691056910569,
+      "grad_norm": 1.1716952323913574,
+      "learning_rate": 0.0002,
+      "loss": 1.2867,
+      "step": 1910
+    },
+    {
+      "epoch": 3.1219512195121952,
+      "grad_norm": 1.1270506381988525,
+      "learning_rate": 0.0002,
+      "loss": 1.286,
+      "step": 1920
+    },
+    {
+      "epoch": 3.138211382113821,
+      "grad_norm": 1.1955605745315552,
+      "learning_rate": 0.0002,
+      "loss": 1.3074,
+      "step": 1930
+    },
+    {
+      "epoch": 3.154471544715447,
+      "grad_norm": 1.246848464012146,
+      "learning_rate": 0.0002,
+      "loss": 1.2752,
+      "step": 1940
+    },
+    {
+      "epoch": 3.1707317073170733,
+      "grad_norm": 1.2208205461502075,
+      "learning_rate": 0.0002,
+      "loss": 1.3422,
+      "step": 1950
+    },
+    {
+      "epoch": 3.186991869918699,
+      "grad_norm": 1.1758005619049072,
+      "learning_rate": 0.0002,
+      "loss": 1.2618,
+      "step": 1960
+    },
+    {
+      "epoch": 3.203252032520325,
+      "grad_norm": 1.2697960138320923,
+      "learning_rate": 0.0002,
+      "loss": 1.3094,
+      "step": 1970
+    },
+    {
+      "epoch": 3.2195121951219514,
+      "grad_norm": 1.0855997800827026,
+      "learning_rate": 0.0002,
+      "loss": 1.3714,
+      "step": 1980
+    },
+    {
+      "epoch": 3.2357723577235773,
+      "grad_norm": 1.1054189205169678,
+      "learning_rate": 0.0002,
+      "loss": 1.2866,
+      "step": 1990
+    },
+    {
+      "epoch": 3.252032520325203,
+      "grad_norm": 1.2496592998504639,
+      "learning_rate": 0.0002,
+      "loss": 1.3057,
+      "step": 2000
+    },
+    {
+      "epoch": 3.2682926829268295,
+      "grad_norm": 1.215553641319275,
+      "learning_rate": 0.0002,
+      "loss": 1.3868,
+      "step": 2010
+    },
+    {
+      "epoch": 3.2845528455284554,
+      "grad_norm": 1.1711665391921997,
+      "learning_rate": 0.0002,
+      "loss": 1.2866,
+      "step": 2020
+    },
+    {
+      "epoch": 3.3008130081300813,
+      "grad_norm": 1.493438959121704,
+      "learning_rate": 0.0002,
+      "loss": 1.2969,
+      "step": 2030
+    },
+    {
+      "epoch": 3.317073170731707,
+      "grad_norm": 1.1202969551086426,
+      "learning_rate": 0.0002,
+      "loss": 1.3032,
+      "step": 2040
+    },
+    {
+      "epoch": 3.3333333333333335,
+      "grad_norm": 1.1334387063980103,
+      "learning_rate": 0.0002,
+      "loss": 1.3257,
+      "step": 2050
+    },
+    {
+      "epoch": 3.3495934959349594,
+      "grad_norm": 1.2813389301300049,
+      "learning_rate": 0.0002,
+      "loss": 1.2823,
+      "step": 2060
+    },
+    {
+      "epoch": 3.3658536585365852,
+      "grad_norm": 1.1317278146743774,
+      "learning_rate": 0.0002,
+      "loss": 1.2892,
+      "step": 2070
+    },
+    {
+      "epoch": 3.3821138211382116,
+      "grad_norm": 1.4018956422805786,
+      "learning_rate": 0.0002,
+      "loss": 1.2731,
+      "step": 2080
+    },
+    {
+      "epoch": 3.3983739837398375,
+      "grad_norm": 1.1856937408447266,
+      "learning_rate": 0.0002,
+      "loss": 1.3279,
+      "step": 2090
+    },
+    {
+      "epoch": 3.4146341463414633,
+      "grad_norm": 1.480185627937317,
+      "learning_rate": 0.0002,
+      "loss": 1.2903,
+      "step": 2100
+    },
+    {
+      "epoch": 3.430894308943089,
+      "grad_norm": 1.3945696353912354,
+      "learning_rate": 0.0002,
+      "loss": 1.3713,
+      "step": 2110
+    },
+    {
+      "epoch": 3.4471544715447155,
+      "grad_norm": 1.5409419536590576,
+      "learning_rate": 0.0002,
+      "loss": 1.3327,
+      "step": 2120
+    },
+    {
+      "epoch": 3.4634146341463414,
+      "grad_norm": 1.3170857429504395,
+      "learning_rate": 0.0002,
+      "loss": 1.3456,
+      "step": 2130
+    },
+    {
+      "epoch": 3.4796747967479673,
+      "grad_norm": 1.1793901920318604,
+      "learning_rate": 0.0002,
+      "loss": 1.3129,
+      "step": 2140
+    },
+    {
+      "epoch": 3.4959349593495936,
+      "grad_norm": 1.3043832778930664,
+      "learning_rate": 0.0002,
+      "loss": 1.3356,
+      "step": 2150
+    },
+    {
+      "epoch": 3.5121951219512195,
+      "grad_norm": 1.2157930135726929,
+      "learning_rate": 0.0002,
+      "loss": 1.2893,
+      "step": 2160
+    },
+    {
+      "epoch": 3.5284552845528454,
+      "grad_norm": 1.2139101028442383,
+      "learning_rate": 0.0002,
+      "loss": 1.3606,
+      "step": 2170
+    },
+    {
+      "epoch": 3.5447154471544717,
+      "grad_norm": 1.0714174509048462,
+      "learning_rate": 0.0002,
+      "loss": 1.2897,
+      "step": 2180
+    },
+    {
+      "epoch": 3.5609756097560976,
+      "grad_norm": 1.1357146501541138,
+      "learning_rate": 0.0002,
+      "loss": 1.3398,
+      "step": 2190
+    },
+    {
+      "epoch": 3.5772357723577235,
+      "grad_norm": 1.216141939163208,
+      "learning_rate": 0.0002,
+      "loss": 1.2829,
+      "step": 2200
+    },
+    {
+      "epoch": 3.59349593495935,
+      "grad_norm": 1.2001926898956299,
+      "learning_rate": 0.0002,
+      "loss": 1.3411,
+      "step": 2210
+    },
+    {
+      "epoch": 3.6097560975609757,
+      "grad_norm": 1.355756163597107,
+      "learning_rate": 0.0002,
+      "loss": 1.2804,
+      "step": 2220
+    },
+    {
+      "epoch": 3.6260162601626016,
+      "grad_norm": 1.1870149374008179,
+      "learning_rate": 0.0002,
+      "loss": 1.3732,
+      "step": 2230
+    },
+    {
+      "epoch": 3.642276422764228,
+      "grad_norm": 1.0973352193832397,
+      "learning_rate": 0.0002,
+      "loss": 1.4334,
+      "step": 2240
+    },
+    {
+      "epoch": 3.658536585365854,
+      "grad_norm": 1.110839605331421,
+      "learning_rate": 0.0002,
+      "loss": 1.3987,
+      "step": 2250
+    },
+    {
+      "epoch": 3.6747967479674797,
+      "grad_norm": 1.1280663013458252,
+      "learning_rate": 0.0002,
+      "loss": 1.3316,
+      "step": 2260
+    },
+    {
+      "epoch": 3.6910569105691056,
+      "grad_norm": 1.3871443271636963,
+      "learning_rate": 0.0002,
+      "loss": 1.2897,
+      "step": 2270
+    },
+    {
+      "epoch": 3.7073170731707314,
+      "grad_norm": 1.384059190750122,
+      "learning_rate": 0.0002,
+      "loss": 1.3784,
+      "step": 2280
+    },
+    {
+      "epoch": 3.7235772357723578,
+      "grad_norm": 1.422131896018982,
+      "learning_rate": 0.0002,
+      "loss": 1.3288,
+      "step": 2290
+    },
+    {
+      "epoch": 3.7398373983739837,
+      "grad_norm": 1.2262955904006958,
+      "learning_rate": 0.0002,
+      "loss": 1.342,
+      "step": 2300
+    },
+    {
+      "epoch": 3.7560975609756095,
+      "grad_norm": 1.4098708629608154,
+      "learning_rate": 0.0002,
+      "loss": 1.3674,
+      "step": 2310
+    },
+    {
+      "epoch": 3.772357723577236,
+      "grad_norm": 1.3726389408111572,
+      "learning_rate": 0.0002,
+      "loss": 1.4156,
+      "step": 2320
+    },
+    {
+      "epoch": 3.7886178861788617,
+      "grad_norm": 1.2945446968078613,
+      "learning_rate": 0.0002,
+      "loss": 1.3083,
+      "step": 2330
+    },
+    {
+      "epoch": 3.8048780487804876,
+      "grad_norm": 1.2011241912841797,
+      "learning_rate": 0.0002,
+      "loss": 1.3631,
+      "step": 2340
+    },
+    {
+      "epoch": 3.821138211382114,
+      "grad_norm": 1.158033847808838,
+      "learning_rate": 0.0002,
+      "loss": 1.3888,
+      "step": 2350
+    },
+    {
+      "epoch": 3.83739837398374,
+      "grad_norm": 1.2479424476623535,
+      "learning_rate": 0.0002,
+      "loss": 1.3159,
+      "step": 2360
+    },
+    {
+      "epoch": 3.8536585365853657,
+      "grad_norm": 1.253841519355774,
+      "learning_rate": 0.0002,
+      "loss": 1.3116,
+      "step": 2370
+    },
+    {
+      "epoch": 3.869918699186992,
+      "grad_norm": 1.2509289979934692,
+      "learning_rate": 0.0002,
+      "loss": 1.3943,
+      "step": 2380
+    },
+    {
+      "epoch": 3.886178861788618,
+      "grad_norm": 1.529388666152954,
+      "learning_rate": 0.0002,
+      "loss": 1.3717,
+      "step": 2390
+    },
+    {
+      "epoch": 3.902439024390244,
+      "grad_norm": 1.241012692451477,
+      "learning_rate": 0.0002,
+      "loss": 1.3875,
+      "step": 2400
+    },
+    {
+      "epoch": 3.91869918699187,
+      "grad_norm": 1.4315979480743408,
+      "learning_rate": 0.0002,
+      "loss": 1.3352,
+      "step": 2410
+    },
+    {
+      "epoch": 3.934959349593496,
+      "grad_norm": 1.6688332557678223,
+      "learning_rate": 0.0002,
+      "loss": 1.4241,
+      "step": 2420
+    },
+    {
+      "epoch": 3.951219512195122,
+      "grad_norm": 1.3832660913467407,
+      "learning_rate": 0.0002,
+      "loss": 1.3261,
+      "step": 2430
+    },
+    {
+      "epoch": 3.9674796747967482,
+      "grad_norm": 1.3022568225860596,
+      "learning_rate": 0.0002,
+      "loss": 1.3334,
+      "step": 2440
+    },
+    {
+      "epoch": 3.983739837398374,
+      "grad_norm": 1.3116395473480225,
+      "learning_rate": 0.0002,
+      "loss": 1.4051,
+      "step": 2450
+    },
+    {
+      "epoch": 4.0,
+      "grad_norm": 1.2045269012451172,
+      "learning_rate": 0.0002,
+      "loss": 1.3712,
+      "step": 2460
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 1.938527226448059,
+      "eval_runtime": 95.315,
+      "eval_samples_per_second": 5.592,
+      "eval_steps_per_second": 0.703,
+      "step": 2460
+    },
+    {
+      "epoch": 4.016260162601626,
+      "grad_norm": 2.3368349075317383,
+      "learning_rate": 0.0002,
+      "loss": 1.089,
+      "step": 2470
+    },
+    {
+      "epoch": 4.032520325203252,
+      "grad_norm": 1.5294667482376099,
+      "learning_rate": 0.0002,
+      "loss": 1.067,
+      "step": 2480
+    },
+    {
+      "epoch": 4.048780487804878,
+      "grad_norm": 1.6742061376571655,
+      "learning_rate": 0.0002,
+      "loss": 1.0322,
+      "step": 2490
+    },
+    {
+      "epoch": 4.065040650406504,
+      "grad_norm": 1.766839623451233,
+      "learning_rate": 0.0002,
+      "loss": 1.0097,
+      "step": 2500
+    },
+    {
+      "epoch": 4.08130081300813,
+      "grad_norm": 1.632996916770935,
+      "learning_rate": 0.0002,
+      "loss": 1.15,
+      "step": 2510
+    },
+    {
+      "epoch": 4.097560975609756,
+      "grad_norm": 1.37165367603302,
+      "learning_rate": 0.0002,
+      "loss": 1.057,
+      "step": 2520
+    },
+    {
+      "epoch": 4.1138211382113825,
+      "grad_norm": 1.4612709283828735,
+      "learning_rate": 0.0002,
+      "loss": 1.128,
+      "step": 2530
+    },
+    {
+      "epoch": 4.130081300813008,
+      "grad_norm": 1.7394530773162842,
+      "learning_rate": 0.0002,
+      "loss": 1.0034,
+      "step": 2540
+    },
+    {
+      "epoch": 4.146341463414634,
+      "grad_norm": 1.8255715370178223,
+      "learning_rate": 0.0002,
+      "loss": 1.0831,
+      "step": 2550
+    },
+    {
+      "epoch": 4.16260162601626,
+      "grad_norm": 2.00886869430542,
+      "learning_rate": 0.0002,
+      "loss": 1.0942,
+      "step": 2560
+    },
+    {
+      "epoch": 4.178861788617886,
+      "grad_norm": 1.6969596147537231,
+      "learning_rate": 0.0002,
+      "loss": 1.0548,
+      "step": 2570
+    },
+    {
+      "epoch": 4.195121951219512,
+      "grad_norm": 1.9369271993637085,
+      "learning_rate": 0.0002,
+      "loss": 1.1523,
+      "step": 2580
+    },
+    {
+      "epoch": 4.211382113821138,
+      "grad_norm": 1.6654353141784668,
+      "learning_rate": 0.0002,
+      "loss": 1.0759,
+      "step": 2590
+    },
+    {
+      "epoch": 4.227642276422764,
+      "grad_norm": 1.621569275856018,
+      "learning_rate": 0.0002,
+      "loss": 1.1495,
+      "step": 2600
+    },
+    {
+      "epoch": 4.2439024390243905,
+      "grad_norm": 1.6566373109817505,
+      "learning_rate": 0.0002,
+      "loss": 1.0638,
+      "step": 2610
+    },
+    {
+      "epoch": 4.260162601626016,
+      "grad_norm": 1.7170981168746948,
+      "learning_rate": 0.0002,
+      "loss": 1.1289,
+      "step": 2620
+    },
+    {
+      "epoch": 4.276422764227642,
+      "grad_norm": 1.5868020057678223,
+      "learning_rate": 0.0002,
+      "loss": 1.0647,
+      "step": 2630
+    },
+    {
+      "epoch": 4.2926829268292686,
+      "grad_norm": 1.6616328954696655,
+      "learning_rate": 0.0002,
+      "loss": 1.1076,
+      "step": 2640
+    },
+    {
+      "epoch": 4.308943089430894,
+      "grad_norm": 1.7867165803909302,
+      "learning_rate": 0.0002,
+      "loss": 1.0803,
+      "step": 2650
+    },
+    {
+      "epoch": 4.32520325203252,
+      "grad_norm": 1.7968727350234985,
+      "learning_rate": 0.0002,
+      "loss": 1.0377,
+      "step": 2660
+    },
+    {
+      "epoch": 4.341463414634147,
+      "grad_norm": 1.5795880556106567,
+      "learning_rate": 0.0002,
+      "loss": 1.1096,
+      "step": 2670
+    },
+    {
+      "epoch": 4.357723577235772,
+      "grad_norm": 1.5703494548797607,
+      "learning_rate": 0.0002,
+      "loss": 1.1472,
+      "step": 2680
+    },
+    {
+      "epoch": 4.373983739837398,
+      "grad_norm": 1.6967381238937378,
+      "learning_rate": 0.0002,
+      "loss": 1.0789,
+      "step": 2690
+    },
+    {
+      "epoch": 4.390243902439025,
+      "grad_norm": 1.5561623573303223,
+      "learning_rate": 0.0002,
+      "loss": 1.1253,
+      "step": 2700
+    },
+    {
+      "epoch": 4.40650406504065,
+      "grad_norm": 1.9618488550186157,
+      "learning_rate": 0.0002,
+      "loss": 1.1493,
+      "step": 2710
+    },
+    {
+      "epoch": 4.4227642276422765,
+      "grad_norm": 1.4679653644561768,
+      "learning_rate": 0.0002,
+      "loss": 1.0674,
+      "step": 2720
+    },
+    {
+      "epoch": 4.439024390243903,
+      "grad_norm": 1.6527636051177979,
+      "learning_rate": 0.0002,
+      "loss": 1.1486,
+      "step": 2730
+    },
+    {
+      "epoch": 4.455284552845528,
+      "grad_norm": 1.9380215406417847,
+      "learning_rate": 0.0002,
+      "loss": 1.081,
+      "step": 2740
+    },
+    {
+      "epoch": 4.471544715447155,
+      "grad_norm": 1.8928139209747314,
+      "learning_rate": 0.0002,
+      "loss": 1.1709,
+      "step": 2750
+    },
+    {
+      "epoch": 4.487804878048781,
+      "grad_norm": 1.6719214916229248,
+      "learning_rate": 0.0002,
+      "loss": 1.121,
+      "step": 2760
+    },
+    {
+      "epoch": 4.504065040650406,
+      "grad_norm": 1.5200358629226685,
+      "learning_rate": 0.0002,
+      "loss": 1.1578,
+      "step": 2770
+    },
+    {
+      "epoch": 4.520325203252033,
+      "grad_norm": 1.6545467376708984,
+      "learning_rate": 0.0002,
+      "loss": 1.1576,
+      "step": 2780
+    },
+    {
+      "epoch": 4.536585365853659,
+      "grad_norm": 1.569615364074707,
+      "learning_rate": 0.0002,
+      "loss": 1.1388,
+      "step": 2790
+    },
+    {
+      "epoch": 4.5528455284552845,
+      "grad_norm": 1.7238937616348267,
+      "learning_rate": 0.0002,
+      "loss": 1.1024,
+      "step": 2800
+    },
+    {
+      "epoch": 4.569105691056911,
+      "grad_norm": 1.8149088621139526,
+      "learning_rate": 0.0002,
+      "loss": 1.1747,
+      "step": 2810
+    },
+    {
+      "epoch": 4.585365853658536,
+      "grad_norm": 1.876002311706543,
+      "learning_rate": 0.0002,
+      "loss": 1.1397,
+      "step": 2820
+    },
+    {
+      "epoch": 4.6016260162601625,
+      "grad_norm": 1.938772439956665,
+      "learning_rate": 0.0002,
+      "loss": 1.091,
+      "step": 2830
+    },
+    {
+      "epoch": 4.617886178861789,
+      "grad_norm": 1.5655368566513062,
+      "learning_rate": 0.0002,
+      "loss": 1.0954,
+      "step": 2840
+    },
+    {
+      "epoch": 4.634146341463414,
+      "grad_norm": 1.8196513652801514,
+      "learning_rate": 0.0002,
+      "loss": 1.2074,
+      "step": 2850
+    },
+    {
+      "epoch": 4.650406504065041,
+      "grad_norm": 1.6780239343643188,
+      "learning_rate": 0.0002,
+      "loss": 1.1261,
+      "step": 2860
+    },
+    {
+      "epoch": 4.666666666666667,
+      "grad_norm": 1.445952296257019,
+      "learning_rate": 0.0002,
+      "loss": 1.1654,
+      "step": 2870
+    },
+    {
+      "epoch": 4.682926829268292,
+      "grad_norm": 1.7116491794586182,
+      "learning_rate": 0.0002,
+      "loss": 1.1576,
+      "step": 2880
+    },
+    {
+      "epoch": 4.699186991869919,
+      "grad_norm": 1.8259165287017822,
+      "learning_rate": 0.0002,
+      "loss": 1.1766,
+      "step": 2890
+    },
+    {
+      "epoch": 4.715447154471545,
+      "grad_norm": 1.690813660621643,
+      "learning_rate": 0.0002,
+      "loss": 1.123,
+      "step": 2900
+    },
+    {
+      "epoch": 4.7317073170731705,
+      "grad_norm": 1.964525818824768,
+      "learning_rate": 0.0002,
+      "loss": 1.15,
+      "step": 2910
+    },
+    {
+      "epoch": 4.747967479674797,
+      "grad_norm": 1.6966286897659302,
+      "learning_rate": 0.0002,
+      "loss": 1.1764,
+      "step": 2920
+    },
+    {
+      "epoch": 4.764227642276423,
+      "grad_norm": 1.8009082078933716,
+      "learning_rate": 0.0002,
+      "loss": 1.115,
+      "step": 2930
+    },
+    {
+      "epoch": 4.780487804878049,
+      "grad_norm": 1.5503566265106201,
+      "learning_rate": 0.0002,
+      "loss": 1.0741,
+      "step": 2940
+    },
+    {
+      "epoch": 4.796747967479675,
+      "grad_norm": 1.616410493850708,
+      "learning_rate": 0.0002,
+      "loss": 1.1681,
+      "step": 2950
+    },
+    {
+      "epoch": 4.8130081300813,
+      "grad_norm": 1.8752009868621826,
+      "learning_rate": 0.0002,
+      "loss": 1.1404,
+      "step": 2960
+    },
+    {
+      "epoch": 4.829268292682927,
+      "grad_norm": 1.6281180381774902,
+      "learning_rate": 0.0002,
+      "loss": 1.1325,
+      "step": 2970
+    },
+    {
+      "epoch": 4.845528455284553,
+      "grad_norm": 1.6588609218597412,
+      "learning_rate": 0.0002,
+      "loss": 1.0928,
+      "step": 2980
+    },
+    {
+      "epoch": 4.861788617886178,
+      "grad_norm": 1.7978718280792236,
+      "learning_rate": 0.0002,
+      "loss": 1.1603,
+      "step": 2990
+    },
+    {
+      "epoch": 4.878048780487805,
+      "grad_norm": 1.5647393465042114,
+      "learning_rate": 0.0002,
+      "loss": 1.1617,
+      "step": 3000
+    },
+    {
+      "epoch": 4.894308943089431,
+      "grad_norm": 1.5811057090759277,
+      "learning_rate": 0.0002,
+      "loss": 1.1394,
+      "step": 3010
+    },
+    {
+      "epoch": 4.9105691056910565,
+      "grad_norm": 1.9754141569137573,
+      "learning_rate": 0.0002,
+      "loss": 1.1287,
+      "step": 3020
+    },
+    {
+      "epoch": 4.926829268292683,
+      "grad_norm": 1.591244101524353,
+      "learning_rate": 0.0002,
+      "loss": 1.2431,
+      "step": 3030
+    },
+    {
+      "epoch": 4.943089430894309,
+      "grad_norm": 1.7036725282669067,
+      "learning_rate": 0.0002,
+      "loss": 1.128,
+      "step": 3040
+    },
+    {
+      "epoch": 4.959349593495935,
+      "grad_norm": 1.8453916311264038,
+      "learning_rate": 0.0002,
+      "loss": 1.1828,
+      "step": 3050
+    },
+    {
+      "epoch": 4.975609756097561,
+      "grad_norm": 1.3091868162155151,
+      "learning_rate": 0.0002,
+      "loss": 1.1542,
+      "step": 3060
+    },
+    {
+      "epoch": 4.991869918699187,
+      "grad_norm": 1.7609132528305054,
+      "learning_rate": 0.0002,
+      "loss": 1.1855,
+      "step": 3070
+    },
+    {
+      "epoch": 5.0,
+      "eval_loss": 2.074026346206665,
+      "eval_runtime": 98.3931,
+      "eval_samples_per_second": 5.417,
+      "eval_steps_per_second": 0.681,
+      "step": 3075
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 4920,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.3491478462464e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}