MilaWang committed on Mar 28

Commit

50c8637

verified ·

1 Parent(s): 21670e5

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-1386/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-1386/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-1386/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-1386/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-1386/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-1386/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-1386/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-1386/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-1386/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-1386/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-1386/trainer_state.json +1023 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-1386/training_args.bin +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-1848/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-1848/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-1848/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-1848/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-1848/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-1848/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-1848/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-1848/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-1848/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-1848/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-1848/trainer_state.json +1353 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-1848/training_args.bin +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-2310/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-2310/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-2310/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-2310/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-2310/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-2310/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-2310/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-2310/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-2310/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-2310/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-2310/trainer_state.json +1690 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-2310/training_args.bin +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-2772/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-2772/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-2772/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-2772/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-2772/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-2772/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-2772/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-2772/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-2772/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-2772/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-2772/trainer_state.json +2020 -0

Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/adapter_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d7ef31f28a2864f9fd912d8347cc8ec69e66e1e28eca8b94c118f3681fdabf4a
+size 109069176

Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-1386/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-1386/adapter_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5885009d3afa60a99e03636cf99f23bab6c6c58ec194086469b27e4c52199fb4
+size 109069176

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:127f67d4db533203fb2fb44b827bf2235acf6444d7af9fd4d5d1a15c64674388
+size 55532666

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ae3e79f7bd64822d835e92384e077f0fb05113f7887dae147256d3de6f245d5e
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:237814caa6cb88953c1896aef7b5d4120ff5268c8f006284225db712de184866
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,1023 @@

+{
+  "best_metric": 1.7847579717636108,
+  "best_model_checkpoint": "outputs-001/Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-462",
+  "epoch": 3.0,
+  "eval_steps": 10,
+  "global_step": 1386,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.021645021645021644,
+      "grad_norm": 1.200374722480774,
+      "learning_rate": 0.0002,
+      "loss": 2.5092,
+      "step": 10
+    },
+    {
+      "epoch": 0.04329004329004329,
+      "grad_norm": 0.974091112613678,
+      "learning_rate": 0.0002,
+      "loss": 2.2672,
+      "step": 20
+    },
+    {
+      "epoch": 0.06493506493506493,
+      "grad_norm": 0.9070103168487549,
+      "learning_rate": 0.0002,
+      "loss": 2.1445,
+      "step": 30
+    },
+    {
+      "epoch": 0.08658008658008658,
+      "grad_norm": 0.6892510056495667,
+      "learning_rate": 0.0002,
+      "loss": 2.0634,
+      "step": 40
+    },
+    {
+      "epoch": 0.10822510822510822,
+      "grad_norm": 0.7840355038642883,
+      "learning_rate": 0.0002,
+      "loss": 2.039,
+      "step": 50
+    },
+    {
+      "epoch": 0.12987012987012986,
+      "grad_norm": 0.8381665349006653,
+      "learning_rate": 0.0002,
+      "loss": 1.9527,
+      "step": 60
+    },
+    {
+      "epoch": 0.15151515151515152,
+      "grad_norm": 0.6969044804573059,
+      "learning_rate": 0.0002,
+      "loss": 1.8852,
+      "step": 70
+    },
+    {
+      "epoch": 0.17316017316017315,
+      "grad_norm": 0.6608849763870239,
+      "learning_rate": 0.0002,
+      "loss": 1.8263,
+      "step": 80
+    },
+    {
+      "epoch": 0.19480519480519481,
+      "grad_norm": 0.6329185962677002,
+      "learning_rate": 0.0002,
+      "loss": 1.825,
+      "step": 90
+    },
+    {
+      "epoch": 0.21645021645021645,
+      "grad_norm": 0.723852276802063,
+      "learning_rate": 0.0002,
+      "loss": 1.8256,
+      "step": 100
+    },
+    {
+      "epoch": 0.23809523809523808,
+      "grad_norm": 0.8358765840530396,
+      "learning_rate": 0.0002,
+      "loss": 1.8758,
+      "step": 110
+    },
+    {
+      "epoch": 0.2597402597402597,
+      "grad_norm": 0.6025514006614685,
+      "learning_rate": 0.0002,
+      "loss": 1.8468,
+      "step": 120
+    },
+    {
+      "epoch": 0.2813852813852814,
+      "grad_norm": 0.5782386064529419,
+      "learning_rate": 0.0002,
+      "loss": 1.7487,
+      "step": 130
+    },
+    {
+      "epoch": 0.30303030303030304,
+      "grad_norm": 0.8589595556259155,
+      "learning_rate": 0.0002,
+      "loss": 1.7717,
+      "step": 140
+    },
+    {
+      "epoch": 0.3246753246753247,
+      "grad_norm": 0.5718036890029907,
+      "learning_rate": 0.0002,
+      "loss": 1.7726,
+      "step": 150
+    },
+    {
+      "epoch": 0.3463203463203463,
+      "grad_norm": 0.632756769657135,
+      "learning_rate": 0.0002,
+      "loss": 1.784,
+      "step": 160
+    },
+    {
+      "epoch": 0.36796536796536794,
+      "grad_norm": 0.5307920575141907,
+      "learning_rate": 0.0002,
+      "loss": 1.8176,
+      "step": 170
+    },
+    {
+      "epoch": 0.38961038961038963,
+      "grad_norm": 0.5692276358604431,
+      "learning_rate": 0.0002,
+      "loss": 1.7744,
+      "step": 180
+    },
+    {
+      "epoch": 0.41125541125541126,
+      "grad_norm": 0.6083813309669495,
+      "learning_rate": 0.0002,
+      "loss": 1.8075,
+      "step": 190
+    },
+    {
+      "epoch": 0.4329004329004329,
+      "grad_norm": 0.7849981188774109,
+      "learning_rate": 0.0002,
+      "loss": 1.8718,
+      "step": 200
+    },
+    {
+      "epoch": 0.45454545454545453,
+      "grad_norm": 0.6536546945571899,
+      "learning_rate": 0.0002,
+      "loss": 1.7946,
+      "step": 210
+    },
+    {
+      "epoch": 0.47619047619047616,
+      "grad_norm": 0.5180730223655701,
+      "learning_rate": 0.0002,
+      "loss": 1.8174,
+      "step": 220
+    },
+    {
+      "epoch": 0.49783549783549785,
+      "grad_norm": 0.5796821713447571,
+      "learning_rate": 0.0002,
+      "loss": 1.7857,
+      "step": 230
+    },
+    {
+      "epoch": 0.5194805194805194,
+      "grad_norm": 0.6185894012451172,
+      "learning_rate": 0.0002,
+      "loss": 1.8062,
+      "step": 240
+    },
+    {
+      "epoch": 0.5411255411255411,
+      "grad_norm": 0.6040953397750854,
+      "learning_rate": 0.0002,
+      "loss": 1.825,
+      "step": 250
+    },
+    {
+      "epoch": 0.5627705627705628,
+      "grad_norm": 0.6005431413650513,
+      "learning_rate": 0.0002,
+      "loss": 1.7785,
+      "step": 260
+    },
+    {
+      "epoch": 0.5844155844155844,
+      "grad_norm": 0.6693951487541199,
+      "learning_rate": 0.0002,
+      "loss": 1.8444,
+      "step": 270
+    },
+    {
+      "epoch": 0.6060606060606061,
+      "grad_norm": 0.5105443596839905,
+      "learning_rate": 0.0002,
+      "loss": 1.8471,
+      "step": 280
+    },
+    {
+      "epoch": 0.6277056277056277,
+      "grad_norm": 0.5175243616104126,
+      "learning_rate": 0.0002,
+      "loss": 1.8274,
+      "step": 290
+    },
+    {
+      "epoch": 0.6493506493506493,
+      "grad_norm": 0.4775221049785614,
+      "learning_rate": 0.0002,
+      "loss": 1.81,
+      "step": 300
+    },
+    {
+      "epoch": 0.670995670995671,
+      "grad_norm": 0.9106342792510986,
+      "learning_rate": 0.0002,
+      "loss": 1.7816,
+      "step": 310
+    },
+    {
+      "epoch": 0.6926406926406926,
+      "grad_norm": 1.9134571552276611,
+      "learning_rate": 0.0002,
+      "loss": 1.7764,
+      "step": 320
+    },
+    {
+      "epoch": 0.7142857142857143,
+      "grad_norm": 0.6287537217140198,
+      "learning_rate": 0.0002,
+      "loss": 1.7877,
+      "step": 330
+    },
+    {
+      "epoch": 0.7359307359307359,
+      "grad_norm": 0.5587132573127747,
+      "learning_rate": 0.0002,
+      "loss": 1.8499,
+      "step": 340
+    },
+    {
+      "epoch": 0.7575757575757576,
+      "grad_norm": 0.5827193260192871,
+      "learning_rate": 0.0002,
+      "loss": 1.7328,
+      "step": 350
+    },
+    {
+      "epoch": 0.7792207792207793,
+      "grad_norm": 0.572600781917572,
+      "learning_rate": 0.0002,
+      "loss": 1.8022,
+      "step": 360
+    },
+    {
+      "epoch": 0.8008658008658008,
+      "grad_norm": 0.6280586123466492,
+      "learning_rate": 0.0002,
+      "loss": 1.88,
+      "step": 370
+    },
+    {
+      "epoch": 0.8225108225108225,
+      "grad_norm": 0.6878819465637207,
+      "learning_rate": 0.0002,
+      "loss": 1.8116,
+      "step": 380
+    },
+    {
+      "epoch": 0.8441558441558441,
+      "grad_norm": 0.5876027345657349,
+      "learning_rate": 0.0002,
+      "loss": 1.8042,
+      "step": 390
+    },
+    {
+      "epoch": 0.8658008658008658,
+      "grad_norm": 0.5249695777893066,
+      "learning_rate": 0.0002,
+      "loss": 1.7501,
+      "step": 400
+    },
+    {
+      "epoch": 0.8874458874458875,
+      "grad_norm": 0.5510677695274353,
+      "learning_rate": 0.0002,
+      "loss": 1.7599,
+      "step": 410
+    },
+    {
+      "epoch": 0.9090909090909091,
+      "grad_norm": 0.6817089915275574,
+      "learning_rate": 0.0002,
+      "loss": 1.7737,
+      "step": 420
+    },
+    {
+      "epoch": 0.9307359307359307,
+      "grad_norm": 0.5116859078407288,
+      "learning_rate": 0.0002,
+      "loss": 1.7857,
+      "step": 430
+    },
+    {
+      "epoch": 0.9523809523809523,
+      "grad_norm": 0.5427846312522888,
+      "learning_rate": 0.0002,
+      "loss": 1.7361,
+      "step": 440
+    },
+    {
+      "epoch": 0.974025974025974,
+      "grad_norm": 0.5605915784835815,
+      "learning_rate": 0.0002,
+      "loss": 1.7812,
+      "step": 450
+    },
+    {
+      "epoch": 0.9956709956709957,
+      "grad_norm": 0.5166691541671753,
+      "learning_rate": 0.0002,
+      "loss": 1.7699,
+      "step": 460
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 1.7847579717636108,
+      "eval_runtime": 144.877,
+      "eval_samples_per_second": 3.679,
+      "eval_steps_per_second": 0.462,
+      "step": 462
+    },
+    {
+      "epoch": 1.0173160173160174,
+      "grad_norm": 0.5665210485458374,
+      "learning_rate": 0.0002,
+      "loss": 1.6725,
+      "step": 470
+    },
+    {
+      "epoch": 1.0389610389610389,
+      "grad_norm": 1.0514339208602905,
+      "learning_rate": 0.0002,
+      "loss": 1.6996,
+      "step": 480
+    },
+    {
+      "epoch": 1.0606060606060606,
+      "grad_norm": 0.5494309663772583,
+      "learning_rate": 0.0002,
+      "loss": 1.6573,
+      "step": 490
+    },
+    {
+      "epoch": 1.0822510822510822,
+      "grad_norm": 0.557016909122467,
+      "learning_rate": 0.0002,
+      "loss": 1.7314,
+      "step": 500
+    },
+    {
+      "epoch": 1.103896103896104,
+      "grad_norm": 0.6585943102836609,
+      "learning_rate": 0.0002,
+      "loss": 1.7284,
+      "step": 510
+    },
+    {
+      "epoch": 1.1255411255411256,
+      "grad_norm": 0.6703357696533203,
+      "learning_rate": 0.0002,
+      "loss": 1.7575,
+      "step": 520
+    },
+    {
+      "epoch": 1.1471861471861473,
+      "grad_norm": 1.9358264207839966,
+      "learning_rate": 0.0002,
+      "loss": 1.7013,
+      "step": 530
+    },
+    {
+      "epoch": 1.1688311688311688,
+      "grad_norm": 0.6128601431846619,
+      "learning_rate": 0.0002,
+      "loss": 1.6914,
+      "step": 540
+    },
+    {
+      "epoch": 1.1904761904761905,
+      "grad_norm": 0.6610239744186401,
+      "learning_rate": 0.0002,
+      "loss": 1.6358,
+      "step": 550
+    },
+    {
+      "epoch": 1.2121212121212122,
+      "grad_norm": 0.6083669662475586,
+      "learning_rate": 0.0002,
+      "loss": 1.7122,
+      "step": 560
+    },
+    {
+      "epoch": 1.2337662337662338,
+      "grad_norm": 0.7784225940704346,
+      "learning_rate": 0.0002,
+      "loss": 1.6771,
+      "step": 570
+    },
+    {
+      "epoch": 1.2554112554112553,
+      "grad_norm": 0.6141694784164429,
+      "learning_rate": 0.0002,
+      "loss": 1.6372,
+      "step": 580
+    },
+    {
+      "epoch": 1.277056277056277,
+      "grad_norm": 0.6129311323165894,
+      "learning_rate": 0.0002,
+      "loss": 1.6795,
+      "step": 590
+    },
+    {
+      "epoch": 1.2987012987012987,
+      "grad_norm": 0.6802751421928406,
+      "learning_rate": 0.0002,
+      "loss": 1.6664,
+      "step": 600
+    },
+    {
+      "epoch": 1.3203463203463204,
+      "grad_norm": 0.6065750122070312,
+      "learning_rate": 0.0002,
+      "loss": 1.6555,
+      "step": 610
+    },
+    {
+      "epoch": 1.341991341991342,
+      "grad_norm": 0.6713075637817383,
+      "learning_rate": 0.0002,
+      "loss": 1.679,
+      "step": 620
+    },
+    {
+      "epoch": 1.3636363636363638,
+      "grad_norm": 0.627552330493927,
+      "learning_rate": 0.0002,
+      "loss": 1.7412,
+      "step": 630
+    },
+    {
+      "epoch": 1.3852813852813852,
+      "grad_norm": 0.6579778790473938,
+      "learning_rate": 0.0002,
+      "loss": 1.6477,
+      "step": 640
+    },
+    {
+      "epoch": 1.406926406926407,
+      "grad_norm": 0.6381745934486389,
+      "learning_rate": 0.0002,
+      "loss": 1.7282,
+      "step": 650
+    },
+    {
+      "epoch": 1.4285714285714286,
+      "grad_norm": 0.7358919382095337,
+      "learning_rate": 0.0002,
+      "loss": 1.7218,
+      "step": 660
+    },
+    {
+      "epoch": 1.4502164502164503,
+      "grad_norm": 0.6294736266136169,
+      "learning_rate": 0.0002,
+      "loss": 1.7046,
+      "step": 670
+    },
+    {
+      "epoch": 1.4718614718614718,
+      "grad_norm": 0.6542870998382568,
+      "learning_rate": 0.0002,
+      "loss": 1.7143,
+      "step": 680
+    },
+    {
+      "epoch": 1.4935064935064934,
+      "grad_norm": 0.6003480553627014,
+      "learning_rate": 0.0002,
+      "loss": 1.7417,
+      "step": 690
+    },
+    {
+      "epoch": 1.5151515151515151,
+      "grad_norm": 0.8322144150733948,
+      "learning_rate": 0.0002,
+      "loss": 1.6487,
+      "step": 700
+    },
+    {
+      "epoch": 1.5367965367965368,
+      "grad_norm": 0.6853126287460327,
+      "learning_rate": 0.0002,
+      "loss": 1.7217,
+      "step": 710
+    },
+    {
+      "epoch": 1.5584415584415585,
+      "grad_norm": 0.6571378707885742,
+      "learning_rate": 0.0002,
+      "loss": 1.6888,
+      "step": 720
+    },
+    {
+      "epoch": 1.5800865800865802,
+      "grad_norm": 0.6957149505615234,
+      "learning_rate": 0.0002,
+      "loss": 1.7111,
+      "step": 730
+    },
+    {
+      "epoch": 1.601731601731602,
+      "grad_norm": 0.6495681405067444,
+      "learning_rate": 0.0002,
+      "loss": 1.7361,
+      "step": 740
+    },
+    {
+      "epoch": 1.6233766233766234,
+      "grad_norm": 0.6954384446144104,
+      "learning_rate": 0.0002,
+      "loss": 1.5709,
+      "step": 750
+    },
+    {
+      "epoch": 1.645021645021645,
+      "grad_norm": 0.7402207851409912,
+      "learning_rate": 0.0002,
+      "loss": 1.6851,
+      "step": 760
+    },
+    {
+      "epoch": 1.6666666666666665,
+      "grad_norm": 0.6827481985092163,
+      "learning_rate": 0.0002,
+      "loss": 1.6651,
+      "step": 770
+    },
+    {
+      "epoch": 1.6883116883116882,
+      "grad_norm": 0.6176769733428955,
+      "learning_rate": 0.0002,
+      "loss": 1.6827,
+      "step": 780
+    },
+    {
+      "epoch": 1.70995670995671,
+      "grad_norm": 0.6565108299255371,
+      "learning_rate": 0.0002,
+      "loss": 1.6291,
+      "step": 790
+    },
+    {
+      "epoch": 1.7316017316017316,
+      "grad_norm": 0.6303038001060486,
+      "learning_rate": 0.0002,
+      "loss": 1.6805,
+      "step": 800
+    },
+    {
+      "epoch": 1.7532467532467533,
+      "grad_norm": 0.6866182684898376,
+      "learning_rate": 0.0002,
+      "loss": 1.7321,
+      "step": 810
+    },
+    {
+      "epoch": 1.774891774891775,
+      "grad_norm": 0.7522535920143127,
+      "learning_rate": 0.0002,
+      "loss": 1.6847,
+      "step": 820
+    },
+    {
+      "epoch": 1.7965367965367967,
+      "grad_norm": 0.7703698873519897,
+      "learning_rate": 0.0002,
+      "loss": 1.679,
+      "step": 830
+    },
+    {
+      "epoch": 1.8181818181818183,
+      "grad_norm": 0.5955503582954407,
+      "learning_rate": 0.0002,
+      "loss": 1.6817,
+      "step": 840
+    },
+    {
+      "epoch": 1.8398268398268398,
+      "grad_norm": 0.707340657711029,
+      "learning_rate": 0.0002,
+      "loss": 1.7159,
+      "step": 850
+    },
+    {
+      "epoch": 1.8614718614718615,
+      "grad_norm": 0.7305465936660767,
+      "learning_rate": 0.0002,
+      "loss": 1.709,
+      "step": 860
+    },
+    {
+      "epoch": 1.883116883116883,
+      "grad_norm": 0.667972207069397,
+      "learning_rate": 0.0002,
+      "loss": 1.71,
+      "step": 870
+    },
+    {
+      "epoch": 1.9047619047619047,
+      "grad_norm": 0.654872477054596,
+      "learning_rate": 0.0002,
+      "loss": 1.7051,
+      "step": 880
+    },
+    {
+      "epoch": 1.9264069264069263,
+      "grad_norm": 0.6718705296516418,
+      "learning_rate": 0.0002,
+      "loss": 1.6316,
+      "step": 890
+    },
+    {
+      "epoch": 1.948051948051948,
+      "grad_norm": 0.6363692879676819,
+      "learning_rate": 0.0002,
+      "loss": 1.623,
+      "step": 900
+    },
+    {
+      "epoch": 1.9696969696969697,
+      "grad_norm": 0.6861362457275391,
+      "learning_rate": 0.0002,
+      "loss": 1.6725,
+      "step": 910
+    },
+    {
+      "epoch": 1.9913419913419914,
+      "grad_norm": 0.6531493067741394,
+      "learning_rate": 0.0002,
+      "loss": 1.6833,
+      "step": 920
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.7908068895339966,
+      "eval_runtime": 144.0281,
+      "eval_samples_per_second": 3.701,
+      "eval_steps_per_second": 0.465,
+      "step": 924
+    },
+    {
+      "epoch": 2.012987012987013,
+      "grad_norm": 0.6030914187431335,
+      "learning_rate": 0.0002,
+      "loss": 1.5922,
+      "step": 930
+    },
+    {
+      "epoch": 2.034632034632035,
+      "grad_norm": 0.7416430711746216,
+      "learning_rate": 0.0002,
+      "loss": 1.5215,
+      "step": 940
+    },
+    {
+      "epoch": 2.0562770562770565,
+      "grad_norm": 0.7020093202590942,
+      "learning_rate": 0.0002,
+      "loss": 1.5759,
+      "step": 950
+    },
+    {
+      "epoch": 2.0779220779220777,
+      "grad_norm": 0.8007868528366089,
+      "learning_rate": 0.0002,
+      "loss": 1.4751,
+      "step": 960
+    },
+    {
+      "epoch": 2.0995670995670994,
+      "grad_norm": 0.7111671566963196,
+      "learning_rate": 0.0002,
+      "loss": 1.4808,
+      "step": 970
+    },
+    {
+      "epoch": 2.121212121212121,
+      "grad_norm": 0.7257682085037231,
+      "learning_rate": 0.0002,
+      "loss": 1.53,
+      "step": 980
+    },
+    {
+      "epoch": 2.142857142857143,
+      "grad_norm": 0.8737282156944275,
+      "learning_rate": 0.0002,
+      "loss": 1.5097,
+      "step": 990
+    },
+    {
+      "epoch": 2.1645021645021645,
+      "grad_norm": 0.9281378984451294,
+      "learning_rate": 0.0002,
+      "loss": 1.537,
+      "step": 1000
+    },
+    {
+      "epoch": 2.186147186147186,
+      "grad_norm": 1.0217959880828857,
+      "learning_rate": 0.0002,
+      "loss": 1.5223,
+      "step": 1010
+    },
+    {
+      "epoch": 2.207792207792208,
+      "grad_norm": 0.8430958986282349,
+      "learning_rate": 0.0002,
+      "loss": 1.4253,
+      "step": 1020
+    },
+    {
+      "epoch": 2.2294372294372296,
+      "grad_norm": 0.8123440742492676,
+      "learning_rate": 0.0002,
+      "loss": 1.5294,
+      "step": 1030
+    },
+    {
+      "epoch": 2.2510822510822512,
+      "grad_norm": 0.9429558515548706,
+      "learning_rate": 0.0002,
+      "loss": 1.5167,
+      "step": 1040
+    },
+    {
+      "epoch": 2.2727272727272725,
+      "grad_norm": 0.811696469783783,
+      "learning_rate": 0.0002,
+      "loss": 1.4711,
+      "step": 1050
+    },
+    {
+      "epoch": 2.2943722943722946,
+      "grad_norm": 0.8424768447875977,
+      "learning_rate": 0.0002,
+      "loss": 1.4656,
+      "step": 1060
+    },
+    {
+      "epoch": 2.316017316017316,
+      "grad_norm": 0.8870340585708618,
+      "learning_rate": 0.0002,
+      "loss": 1.5618,
+      "step": 1070
+    },
+    {
+      "epoch": 2.3376623376623376,
+      "grad_norm": 0.8600393533706665,
+      "learning_rate": 0.0002,
+      "loss": 1.5368,
+      "step": 1080
+    },
+    {
+      "epoch": 2.3593073593073592,
+      "grad_norm": 0.8447834253311157,
+      "learning_rate": 0.0002,
+      "loss": 1.5028,
+      "step": 1090
+    },
+    {
+      "epoch": 2.380952380952381,
+      "grad_norm": 0.9303842186927795,
+      "learning_rate": 0.0002,
+      "loss": 1.4885,
+      "step": 1100
+    },
+    {
+      "epoch": 2.4025974025974026,
+      "grad_norm": 0.8144819140434265,
+      "learning_rate": 0.0002,
+      "loss": 1.5163,
+      "step": 1110
+    },
+    {
+      "epoch": 2.4242424242424243,
+      "grad_norm": 0.92924964427948,
+      "learning_rate": 0.0002,
+      "loss": 1.4805,
+      "step": 1120
+    },
+    {
+      "epoch": 2.445887445887446,
+      "grad_norm": 0.8560649156570435,
+      "learning_rate": 0.0002,
+      "loss": 1.4608,
+      "step": 1130
+    },
+    {
+      "epoch": 2.4675324675324677,
+      "grad_norm": 0.8532574772834778,
+      "learning_rate": 0.0002,
+      "loss": 1.5541,
+      "step": 1140
+    },
+    {
+      "epoch": 2.4891774891774894,
+      "grad_norm": 0.8702793717384338,
+      "learning_rate": 0.0002,
+      "loss": 1.5607,
+      "step": 1150
+    },
+    {
+      "epoch": 2.5108225108225106,
+      "grad_norm": 0.9125854969024658,
+      "learning_rate": 0.0002,
+      "loss": 1.5194,
+      "step": 1160
+    },
+    {
+      "epoch": 2.5324675324675323,
+      "grad_norm": 0.9579735398292542,
+      "learning_rate": 0.0002,
+      "loss": 1.5953,
+      "step": 1170
+    },
+    {
+      "epoch": 2.554112554112554,
+      "grad_norm": 0.8561005592346191,
+      "learning_rate": 0.0002,
+      "loss": 1.5088,
+      "step": 1180
+    },
+    {
+      "epoch": 2.5757575757575757,
+      "grad_norm": 0.9103630185127258,
+      "learning_rate": 0.0002,
+      "loss": 1.5636,
+      "step": 1190
+    },
+    {
+      "epoch": 2.5974025974025974,
+      "grad_norm": 0.8527248501777649,
+      "learning_rate": 0.0002,
+      "loss": 1.5497,
+      "step": 1200
+    },
+    {
+      "epoch": 2.619047619047619,
+      "grad_norm": 0.8368656039237976,
+      "learning_rate": 0.0002,
+      "loss": 1.5845,
+      "step": 1210
+    },
+    {
+      "epoch": 2.6406926406926408,
+      "grad_norm": 0.9644360542297363,
+      "learning_rate": 0.0002,
+      "loss": 1.574,
+      "step": 1220
+    },
+    {
+      "epoch": 2.6623376623376624,
+      "grad_norm": 0.9691457748413086,
+      "learning_rate": 0.0002,
+      "loss": 1.5623,
+      "step": 1230
+    },
+    {
+      "epoch": 2.683982683982684,
+      "grad_norm": 0.8851862549781799,
+      "learning_rate": 0.0002,
+      "loss": 1.5894,
+      "step": 1240
+    },
+    {
+      "epoch": 2.7056277056277054,
+      "grad_norm": 1.0715088844299316,
+      "learning_rate": 0.0002,
+      "loss": 1.5251,
+      "step": 1250
+    },
+    {
+      "epoch": 2.7272727272727275,
+      "grad_norm": 0.8532006740570068,
+      "learning_rate": 0.0002,
+      "loss": 1.5903,
+      "step": 1260
+    },
+    {
+      "epoch": 2.7489177489177488,
+      "grad_norm": 0.9172760248184204,
+      "learning_rate": 0.0002,
+      "loss": 1.5261,
+      "step": 1270
+    },
+    {
+      "epoch": 2.7705627705627704,
+      "grad_norm": 0.8991577625274658,
+      "learning_rate": 0.0002,
+      "loss": 1.5029,
+      "step": 1280
+    },
+    {
+      "epoch": 2.792207792207792,
+      "grad_norm": 0.8205381631851196,
+      "learning_rate": 0.0002,
+      "loss": 1.5207,
+      "step": 1290
+    },
+    {
+      "epoch": 2.813852813852814,
+      "grad_norm": 0.9733313918113708,
+      "learning_rate": 0.0002,
+      "loss": 1.5328,
+      "step": 1300
+    },
+    {
+      "epoch": 2.8354978354978355,
+      "grad_norm": 1.0313537120819092,
+      "learning_rate": 0.0002,
+      "loss": 1.5373,
+      "step": 1310
+    },
+    {
+      "epoch": 2.857142857142857,
+      "grad_norm": 0.8865208625793457,
+      "learning_rate": 0.0002,
+      "loss": 1.4832,
+      "step": 1320
+    },
+    {
+      "epoch": 2.878787878787879,
+      "grad_norm": 1.1407958269119263,
+      "learning_rate": 0.0002,
+      "loss": 1.5297,
+      "step": 1330
+    },
+    {
+      "epoch": 2.9004329004329006,
+      "grad_norm": 0.879891574382782,
+      "learning_rate": 0.0002,
+      "loss": 1.5435,
+      "step": 1340
+    },
+    {
+      "epoch": 2.9220779220779223,
+      "grad_norm": 0.9538708925247192,
+      "learning_rate": 0.0002,
+      "loss": 1.5523,
+      "step": 1350
+    },
+    {
+      "epoch": 2.9437229437229435,
+      "grad_norm": 0.7732896208763123,
+      "learning_rate": 0.0002,
+      "loss": 1.4881,
+      "step": 1360
+    },
+    {
+      "epoch": 2.965367965367965,
+      "grad_norm": 0.9062705636024475,
+      "learning_rate": 0.0002,
+      "loss": 1.4959,
+      "step": 1370
+    },
+    {
+      "epoch": 2.987012987012987,
+      "grad_norm": 0.9082673192024231,
+      "learning_rate": 0.0002,
+      "loss": 1.5508,
+      "step": 1380
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 1.842921257019043,
+      "eval_runtime": 138.5715,
+      "eval_samples_per_second": 3.846,
+      "eval_steps_per_second": 0.484,
+      "step": 1386
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 3696,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 6.08103712161792e+16,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d349b8fe4d8871a8479d9bb3b1cba8d39e96d113c8c86cdb28a7fc6969f53ba6
+size 5560

Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-1848/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5504f650277854d192b97753ca606c988a632c2cbcd62d444cab21f0ab82a141
+size 109069176

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:12c0ccaca6638a1a73230fc361bc5359206353e084c132ed1e15188820c42965
+size 55532666

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c7c9519a423472b0e09caffb8a1a47938b6b04cd535a5ec771672c03e6a23ebc
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:30fcae4a89d4c234120b0406d6521923488e077e6c4aa142e4c91c84ecd612c3
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,1353 @@

+{
+  "best_metric": 1.7847579717636108,
+  "best_model_checkpoint": "outputs-001/Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-462",
+  "epoch": 4.0,
+  "eval_steps": 10,
+  "global_step": 1848,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.021645021645021644,
+      "grad_norm": 1.200374722480774,
+      "learning_rate": 0.0002,
+      "loss": 2.5092,
+      "step": 10
+    },
+    {
+      "epoch": 0.04329004329004329,
+      "grad_norm": 0.974091112613678,
+      "learning_rate": 0.0002,
+      "loss": 2.2672,
+      "step": 20
+    },
+    {
+      "epoch": 0.06493506493506493,
+      "grad_norm": 0.9070103168487549,
+      "learning_rate": 0.0002,
+      "loss": 2.1445,
+      "step": 30
+    },
+    {
+      "epoch": 0.08658008658008658,
+      "grad_norm": 0.6892510056495667,
+      "learning_rate": 0.0002,
+      "loss": 2.0634,
+      "step": 40
+    },
+    {
+      "epoch": 0.10822510822510822,
+      "grad_norm": 0.7840355038642883,
+      "learning_rate": 0.0002,
+      "loss": 2.039,
+      "step": 50
+    },
+    {
+      "epoch": 0.12987012987012986,
+      "grad_norm": 0.8381665349006653,
+      "learning_rate": 0.0002,
+      "loss": 1.9527,
+      "step": 60
+    },
+    {
+      "epoch": 0.15151515151515152,
+      "grad_norm": 0.6969044804573059,
+      "learning_rate": 0.0002,
+      "loss": 1.8852,
+      "step": 70
+    },
+    {
+      "epoch": 0.17316017316017315,
+      "grad_norm": 0.6608849763870239,
+      "learning_rate": 0.0002,
+      "loss": 1.8263,
+      "step": 80
+    },
+    {
+      "epoch": 0.19480519480519481,
+      "grad_norm": 0.6329185962677002,
+      "learning_rate": 0.0002,
+      "loss": 1.825,
+      "step": 90
+    },
+    {
+      "epoch": 0.21645021645021645,
+      "grad_norm": 0.723852276802063,
+      "learning_rate": 0.0002,
+      "loss": 1.8256,
+      "step": 100
+    },
+    {
+      "epoch": 0.23809523809523808,
+      "grad_norm": 0.8358765840530396,
+      "learning_rate": 0.0002,
+      "loss": 1.8758,
+      "step": 110
+    },
+    {
+      "epoch": 0.2597402597402597,
+      "grad_norm": 0.6025514006614685,
+      "learning_rate": 0.0002,
+      "loss": 1.8468,
+      "step": 120
+    },
+    {
+      "epoch": 0.2813852813852814,
+      "grad_norm": 0.5782386064529419,
+      "learning_rate": 0.0002,
+      "loss": 1.7487,
+      "step": 130
+    },
+    {
+      "epoch": 0.30303030303030304,
+      "grad_norm": 0.8589595556259155,
+      "learning_rate": 0.0002,
+      "loss": 1.7717,
+      "step": 140
+    },
+    {
+      "epoch": 0.3246753246753247,
+      "grad_norm": 0.5718036890029907,
+      "learning_rate": 0.0002,
+      "loss": 1.7726,
+      "step": 150
+    },
+    {
+      "epoch": 0.3463203463203463,
+      "grad_norm": 0.632756769657135,
+      "learning_rate": 0.0002,
+      "loss": 1.784,
+      "step": 160
+    },
+    {
+      "epoch": 0.36796536796536794,
+      "grad_norm": 0.5307920575141907,
+      "learning_rate": 0.0002,
+      "loss": 1.8176,
+      "step": 170
+    },
+    {
+      "epoch": 0.38961038961038963,
+      "grad_norm": 0.5692276358604431,
+      "learning_rate": 0.0002,
+      "loss": 1.7744,
+      "step": 180
+    },
+    {
+      "epoch": 0.41125541125541126,
+      "grad_norm": 0.6083813309669495,
+      "learning_rate": 0.0002,
+      "loss": 1.8075,
+      "step": 190
+    },
+    {
+      "epoch": 0.4329004329004329,
+      "grad_norm": 0.7849981188774109,
+      "learning_rate": 0.0002,
+      "loss": 1.8718,
+      "step": 200
+    },
+    {
+      "epoch": 0.45454545454545453,
+      "grad_norm": 0.6536546945571899,
+      "learning_rate": 0.0002,
+      "loss": 1.7946,
+      "step": 210
+    },
+    {
+      "epoch": 0.47619047619047616,
+      "grad_norm": 0.5180730223655701,
+      "learning_rate": 0.0002,
+      "loss": 1.8174,
+      "step": 220
+    },
+    {
+      "epoch": 0.49783549783549785,
+      "grad_norm": 0.5796821713447571,
+      "learning_rate": 0.0002,
+      "loss": 1.7857,
+      "step": 230
+    },
+    {
+      "epoch": 0.5194805194805194,
+      "grad_norm": 0.6185894012451172,
+      "learning_rate": 0.0002,
+      "loss": 1.8062,
+      "step": 240
+    },
+    {
+      "epoch": 0.5411255411255411,
+      "grad_norm": 0.6040953397750854,
+      "learning_rate": 0.0002,
+      "loss": 1.825,
+      "step": 250
+    },
+    {
+      "epoch": 0.5627705627705628,
+      "grad_norm": 0.6005431413650513,
+      "learning_rate": 0.0002,
+      "loss": 1.7785,
+      "step": 260
+    },
+    {
+      "epoch": 0.5844155844155844,
+      "grad_norm": 0.6693951487541199,
+      "learning_rate": 0.0002,
+      "loss": 1.8444,
+      "step": 270
+    },
+    {
+      "epoch": 0.6060606060606061,
+      "grad_norm": 0.5105443596839905,
+      "learning_rate": 0.0002,
+      "loss": 1.8471,
+      "step": 280
+    },
+    {
+      "epoch": 0.6277056277056277,
+      "grad_norm": 0.5175243616104126,
+      "learning_rate": 0.0002,
+      "loss": 1.8274,
+      "step": 290
+    },
+    {
+      "epoch": 0.6493506493506493,
+      "grad_norm": 0.4775221049785614,
+      "learning_rate": 0.0002,
+      "loss": 1.81,
+      "step": 300
+    },
+    {
+      "epoch": 0.670995670995671,
+      "grad_norm": 0.9106342792510986,
+      "learning_rate": 0.0002,
+      "loss": 1.7816,
+      "step": 310
+    },
+    {
+      "epoch": 0.6926406926406926,
+      "grad_norm": 1.9134571552276611,
+      "learning_rate": 0.0002,
+      "loss": 1.7764,
+      "step": 320
+    },
+    {
+      "epoch": 0.7142857142857143,
+      "grad_norm": 0.6287537217140198,
+      "learning_rate": 0.0002,
+      "loss": 1.7877,
+      "step": 330
+    },
+    {
+      "epoch": 0.7359307359307359,
+      "grad_norm": 0.5587132573127747,
+      "learning_rate": 0.0002,
+      "loss": 1.8499,
+      "step": 340
+    },
+    {
+      "epoch": 0.7575757575757576,
+      "grad_norm": 0.5827193260192871,
+      "learning_rate": 0.0002,
+      "loss": 1.7328,
+      "step": 350
+    },
+    {
+      "epoch": 0.7792207792207793,
+      "grad_norm": 0.572600781917572,
+      "learning_rate": 0.0002,
+      "loss": 1.8022,
+      "step": 360
+    },
+    {
+      "epoch": 0.8008658008658008,
+      "grad_norm": 0.6280586123466492,
+      "learning_rate": 0.0002,
+      "loss": 1.88,
+      "step": 370
+    },
+    {
+      "epoch": 0.8225108225108225,
+      "grad_norm": 0.6878819465637207,
+      "learning_rate": 0.0002,
+      "loss": 1.8116,
+      "step": 380
+    },
+    {
+      "epoch": 0.8441558441558441,
+      "grad_norm": 0.5876027345657349,
+      "learning_rate": 0.0002,
+      "loss": 1.8042,
+      "step": 390
+    },
+    {
+      "epoch": 0.8658008658008658,
+      "grad_norm": 0.5249695777893066,
+      "learning_rate": 0.0002,
+      "loss": 1.7501,
+      "step": 400
+    },
+    {
+      "epoch": 0.8874458874458875,
+      "grad_norm": 0.5510677695274353,
+      "learning_rate": 0.0002,
+      "loss": 1.7599,
+      "step": 410
+    },
+    {
+      "epoch": 0.9090909090909091,
+      "grad_norm": 0.6817089915275574,
+      "learning_rate": 0.0002,
+      "loss": 1.7737,
+      "step": 420
+    },
+    {
+      "epoch": 0.9307359307359307,
+      "grad_norm": 0.5116859078407288,
+      "learning_rate": 0.0002,
+      "loss": 1.7857,
+      "step": 430
+    },
+    {
+      "epoch": 0.9523809523809523,
+      "grad_norm": 0.5427846312522888,
+      "learning_rate": 0.0002,
+      "loss": 1.7361,
+      "step": 440
+    },
+    {
+      "epoch": 0.974025974025974,
+      "grad_norm": 0.5605915784835815,
+      "learning_rate": 0.0002,
+      "loss": 1.7812,
+      "step": 450
+    },
+    {
+      "epoch": 0.9956709956709957,
+      "grad_norm": 0.5166691541671753,
+      "learning_rate": 0.0002,
+      "loss": 1.7699,
+      "step": 460
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 1.7847579717636108,
+      "eval_runtime": 144.877,
+      "eval_samples_per_second": 3.679,
+      "eval_steps_per_second": 0.462,
+      "step": 462
+    },
+    {
+      "epoch": 1.0173160173160174,
+      "grad_norm": 0.5665210485458374,
+      "learning_rate": 0.0002,
+      "loss": 1.6725,
+      "step": 470
+    },
+    {
+      "epoch": 1.0389610389610389,
+      "grad_norm": 1.0514339208602905,
+      "learning_rate": 0.0002,
+      "loss": 1.6996,
+      "step": 480
+    },
+    {
+      "epoch": 1.0606060606060606,
+      "grad_norm": 0.5494309663772583,
+      "learning_rate": 0.0002,
+      "loss": 1.6573,
+      "step": 490
+    },
+    {
+      "epoch": 1.0822510822510822,
+      "grad_norm": 0.557016909122467,
+      "learning_rate": 0.0002,
+      "loss": 1.7314,
+      "step": 500
+    },
+    {
+      "epoch": 1.103896103896104,
+      "grad_norm": 0.6585943102836609,
+      "learning_rate": 0.0002,
+      "loss": 1.7284,
+      "step": 510
+    },
+    {
+      "epoch": 1.1255411255411256,
+      "grad_norm": 0.6703357696533203,
+      "learning_rate": 0.0002,
+      "loss": 1.7575,
+      "step": 520
+    },
+    {
+      "epoch": 1.1471861471861473,
+      "grad_norm": 1.9358264207839966,
+      "learning_rate": 0.0002,
+      "loss": 1.7013,
+      "step": 530
+    },
+    {
+      "epoch": 1.1688311688311688,
+      "grad_norm": 0.6128601431846619,
+      "learning_rate": 0.0002,
+      "loss": 1.6914,
+      "step": 540
+    },
+    {
+      "epoch": 1.1904761904761905,
+      "grad_norm": 0.6610239744186401,
+      "learning_rate": 0.0002,
+      "loss": 1.6358,
+      "step": 550
+    },
+    {
+      "epoch": 1.2121212121212122,
+      "grad_norm": 0.6083669662475586,
+      "learning_rate": 0.0002,
+      "loss": 1.7122,
+      "step": 560
+    },
+    {
+      "epoch": 1.2337662337662338,
+      "grad_norm": 0.7784225940704346,
+      "learning_rate": 0.0002,
+      "loss": 1.6771,
+      "step": 570
+    },
+    {
+      "epoch": 1.2554112554112553,
+      "grad_norm": 0.6141694784164429,
+      "learning_rate": 0.0002,
+      "loss": 1.6372,
+      "step": 580
+    },
+    {
+      "epoch": 1.277056277056277,
+      "grad_norm": 0.6129311323165894,
+      "learning_rate": 0.0002,
+      "loss": 1.6795,
+      "step": 590
+    },
+    {
+      "epoch": 1.2987012987012987,
+      "grad_norm": 0.6802751421928406,
+      "learning_rate": 0.0002,
+      "loss": 1.6664,
+      "step": 600
+    },
+    {
+      "epoch": 1.3203463203463204,
+      "grad_norm": 0.6065750122070312,
+      "learning_rate": 0.0002,
+      "loss": 1.6555,
+      "step": 610
+    },
+    {
+      "epoch": 1.341991341991342,
+      "grad_norm": 0.6713075637817383,
+      "learning_rate": 0.0002,
+      "loss": 1.679,
+      "step": 620
+    },
+    {
+      "epoch": 1.3636363636363638,
+      "grad_norm": 0.627552330493927,
+      "learning_rate": 0.0002,
+      "loss": 1.7412,
+      "step": 630
+    },
+    {
+      "epoch": 1.3852813852813852,
+      "grad_norm": 0.6579778790473938,
+      "learning_rate": 0.0002,
+      "loss": 1.6477,
+      "step": 640
+    },
+    {
+      "epoch": 1.406926406926407,
+      "grad_norm": 0.6381745934486389,
+      "learning_rate": 0.0002,
+      "loss": 1.7282,
+      "step": 650
+    },
+    {
+      "epoch": 1.4285714285714286,
+      "grad_norm": 0.7358919382095337,
+      "learning_rate": 0.0002,
+      "loss": 1.7218,
+      "step": 660
+    },
+    {
+      "epoch": 1.4502164502164503,
+      "grad_norm": 0.6294736266136169,
+      "learning_rate": 0.0002,
+      "loss": 1.7046,
+      "step": 670
+    },
+    {
+      "epoch": 1.4718614718614718,
+      "grad_norm": 0.6542870998382568,
+      "learning_rate": 0.0002,
+      "loss": 1.7143,
+      "step": 680
+    },
+    {
+      "epoch": 1.4935064935064934,
+      "grad_norm": 0.6003480553627014,
+      "learning_rate": 0.0002,
+      "loss": 1.7417,
+      "step": 690
+    },
+    {
+      "epoch": 1.5151515151515151,
+      "grad_norm": 0.8322144150733948,
+      "learning_rate": 0.0002,
+      "loss": 1.6487,
+      "step": 700
+    },
+    {
+      "epoch": 1.5367965367965368,
+      "grad_norm": 0.6853126287460327,
+      "learning_rate": 0.0002,
+      "loss": 1.7217,
+      "step": 710
+    },
+    {
+      "epoch": 1.5584415584415585,
+      "grad_norm": 0.6571378707885742,
+      "learning_rate": 0.0002,
+      "loss": 1.6888,
+      "step": 720
+    },
+    {
+      "epoch": 1.5800865800865802,
+      "grad_norm": 0.6957149505615234,
+      "learning_rate": 0.0002,
+      "loss": 1.7111,
+      "step": 730
+    },
+    {
+      "epoch": 1.601731601731602,
+      "grad_norm": 0.6495681405067444,
+      "learning_rate": 0.0002,
+      "loss": 1.7361,
+      "step": 740
+    },
+    {
+      "epoch": 1.6233766233766234,
+      "grad_norm": 0.6954384446144104,
+      "learning_rate": 0.0002,
+      "loss": 1.5709,
+      "step": 750
+    },
+    {
+      "epoch": 1.645021645021645,
+      "grad_norm": 0.7402207851409912,
+      "learning_rate": 0.0002,
+      "loss": 1.6851,
+      "step": 760
+    },
+    {
+      "epoch": 1.6666666666666665,
+      "grad_norm": 0.6827481985092163,
+      "learning_rate": 0.0002,
+      "loss": 1.6651,
+      "step": 770
+    },
+    {
+      "epoch": 1.6883116883116882,
+      "grad_norm": 0.6176769733428955,
+      "learning_rate": 0.0002,
+      "loss": 1.6827,
+      "step": 780
+    },
+    {
+      "epoch": 1.70995670995671,
+      "grad_norm": 0.6565108299255371,
+      "learning_rate": 0.0002,
+      "loss": 1.6291,
+      "step": 790
+    },
+    {
+      "epoch": 1.7316017316017316,
+      "grad_norm": 0.6303038001060486,
+      "learning_rate": 0.0002,
+      "loss": 1.6805,
+      "step": 800
+    },
+    {
+      "epoch": 1.7532467532467533,
+      "grad_norm": 0.6866182684898376,
+      "learning_rate": 0.0002,
+      "loss": 1.7321,
+      "step": 810
+    },
+    {
+      "epoch": 1.774891774891775,
+      "grad_norm": 0.7522535920143127,
+      "learning_rate": 0.0002,
+      "loss": 1.6847,
+      "step": 820
+    },
+    {
+      "epoch": 1.7965367965367967,
+      "grad_norm": 0.7703698873519897,
+      "learning_rate": 0.0002,
+      "loss": 1.679,
+      "step": 830
+    },
+    {
+      "epoch": 1.8181818181818183,
+      "grad_norm": 0.5955503582954407,
+      "learning_rate": 0.0002,
+      "loss": 1.6817,
+      "step": 840
+    },
+    {
+      "epoch": 1.8398268398268398,
+      "grad_norm": 0.707340657711029,
+      "learning_rate": 0.0002,
+      "loss": 1.7159,
+      "step": 850
+    },
+    {
+      "epoch": 1.8614718614718615,
+      "grad_norm": 0.7305465936660767,
+      "learning_rate": 0.0002,
+      "loss": 1.709,
+      "step": 860
+    },
+    {
+      "epoch": 1.883116883116883,
+      "grad_norm": 0.667972207069397,
+      "learning_rate": 0.0002,
+      "loss": 1.71,
+      "step": 870
+    },
+    {
+      "epoch": 1.9047619047619047,
+      "grad_norm": 0.654872477054596,
+      "learning_rate": 0.0002,
+      "loss": 1.7051,
+      "step": 880
+    },
+    {
+      "epoch": 1.9264069264069263,
+      "grad_norm": 0.6718705296516418,
+      "learning_rate": 0.0002,
+      "loss": 1.6316,
+      "step": 890
+    },
+    {
+      "epoch": 1.948051948051948,
+      "grad_norm": 0.6363692879676819,
+      "learning_rate": 0.0002,
+      "loss": 1.623,
+      "step": 900
+    },
+    {
+      "epoch": 1.9696969696969697,
+      "grad_norm": 0.6861362457275391,
+      "learning_rate": 0.0002,
+      "loss": 1.6725,
+      "step": 910
+    },
+    {
+      "epoch": 1.9913419913419914,
+      "grad_norm": 0.6531493067741394,
+      "learning_rate": 0.0002,
+      "loss": 1.6833,
+      "step": 920
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.7908068895339966,
+      "eval_runtime": 144.0281,
+      "eval_samples_per_second": 3.701,
+      "eval_steps_per_second": 0.465,
+      "step": 924
+    },
+    {
+      "epoch": 2.012987012987013,
+      "grad_norm": 0.6030914187431335,
+      "learning_rate": 0.0002,
+      "loss": 1.5922,
+      "step": 930
+    },
+    {
+      "epoch": 2.034632034632035,
+      "grad_norm": 0.7416430711746216,
+      "learning_rate": 0.0002,
+      "loss": 1.5215,
+      "step": 940
+    },
+    {
+      "epoch": 2.0562770562770565,
+      "grad_norm": 0.7020093202590942,
+      "learning_rate": 0.0002,
+      "loss": 1.5759,
+      "step": 950
+    },
+    {
+      "epoch": 2.0779220779220777,
+      "grad_norm": 0.8007868528366089,
+      "learning_rate": 0.0002,
+      "loss": 1.4751,
+      "step": 960
+    },
+    {
+      "epoch": 2.0995670995670994,
+      "grad_norm": 0.7111671566963196,
+      "learning_rate": 0.0002,
+      "loss": 1.4808,
+      "step": 970
+    },
+    {
+      "epoch": 2.121212121212121,
+      "grad_norm": 0.7257682085037231,
+      "learning_rate": 0.0002,
+      "loss": 1.53,
+      "step": 980
+    },
+    {
+      "epoch": 2.142857142857143,
+      "grad_norm": 0.8737282156944275,
+      "learning_rate": 0.0002,
+      "loss": 1.5097,
+      "step": 990
+    },
+    {
+      "epoch": 2.1645021645021645,
+      "grad_norm": 0.9281378984451294,
+      "learning_rate": 0.0002,
+      "loss": 1.537,
+      "step": 1000
+    },
+    {
+      "epoch": 2.186147186147186,
+      "grad_norm": 1.0217959880828857,
+      "learning_rate": 0.0002,
+      "loss": 1.5223,
+      "step": 1010
+    },
+    {
+      "epoch": 2.207792207792208,
+      "grad_norm": 0.8430958986282349,
+      "learning_rate": 0.0002,
+      "loss": 1.4253,
+      "step": 1020
+    },
+    {
+      "epoch": 2.2294372294372296,
+      "grad_norm": 0.8123440742492676,
+      "learning_rate": 0.0002,
+      "loss": 1.5294,
+      "step": 1030
+    },
+    {
+      "epoch": 2.2510822510822512,
+      "grad_norm": 0.9429558515548706,
+      "learning_rate": 0.0002,
+      "loss": 1.5167,
+      "step": 1040
+    },
+    {
+      "epoch": 2.2727272727272725,
+      "grad_norm": 0.811696469783783,
+      "learning_rate": 0.0002,
+      "loss": 1.4711,
+      "step": 1050
+    },
+    {
+      "epoch": 2.2943722943722946,
+      "grad_norm": 0.8424768447875977,
+      "learning_rate": 0.0002,
+      "loss": 1.4656,
+      "step": 1060
+    },
+    {
+      "epoch": 2.316017316017316,
+      "grad_norm": 0.8870340585708618,
+      "learning_rate": 0.0002,
+      "loss": 1.5618,
+      "step": 1070
+    },
+    {
+      "epoch": 2.3376623376623376,
+      "grad_norm": 0.8600393533706665,
+      "learning_rate": 0.0002,
+      "loss": 1.5368,
+      "step": 1080
+    },
+    {
+      "epoch": 2.3593073593073592,
+      "grad_norm": 0.8447834253311157,
+      "learning_rate": 0.0002,
+      "loss": 1.5028,
+      "step": 1090
+    },
+    {
+      "epoch": 2.380952380952381,
+      "grad_norm": 0.9303842186927795,
+      "learning_rate": 0.0002,
+      "loss": 1.4885,
+      "step": 1100
+    },
+    {
+      "epoch": 2.4025974025974026,
+      "grad_norm": 0.8144819140434265,
+      "learning_rate": 0.0002,
+      "loss": 1.5163,
+      "step": 1110
+    },
+    {
+      "epoch": 2.4242424242424243,
+      "grad_norm": 0.92924964427948,
+      "learning_rate": 0.0002,
+      "loss": 1.4805,
+      "step": 1120
+    },
+    {
+      "epoch": 2.445887445887446,
+      "grad_norm": 0.8560649156570435,
+      "learning_rate": 0.0002,
+      "loss": 1.4608,
+      "step": 1130
+    },
+    {
+      "epoch": 2.4675324675324677,
+      "grad_norm": 0.8532574772834778,
+      "learning_rate": 0.0002,
+      "loss": 1.5541,
+      "step": 1140
+    },
+    {
+      "epoch": 2.4891774891774894,
+      "grad_norm": 0.8702793717384338,
+      "learning_rate": 0.0002,
+      "loss": 1.5607,
+      "step": 1150
+    },
+    {
+      "epoch": 2.5108225108225106,
+      "grad_norm": 0.9125854969024658,
+      "learning_rate": 0.0002,
+      "loss": 1.5194,
+      "step": 1160
+    },
+    {
+      "epoch": 2.5324675324675323,
+      "grad_norm": 0.9579735398292542,
+      "learning_rate": 0.0002,
+      "loss": 1.5953,
+      "step": 1170
+    },
+    {
+      "epoch": 2.554112554112554,
+      "grad_norm": 0.8561005592346191,
+      "learning_rate": 0.0002,
+      "loss": 1.5088,
+      "step": 1180
+    },
+    {
+      "epoch": 2.5757575757575757,
+      "grad_norm": 0.9103630185127258,
+      "learning_rate": 0.0002,
+      "loss": 1.5636,
+      "step": 1190
+    },
+    {
+      "epoch": 2.5974025974025974,
+      "grad_norm": 0.8527248501777649,
+      "learning_rate": 0.0002,
+      "loss": 1.5497,
+      "step": 1200
+    },
+    {
+      "epoch": 2.619047619047619,
+      "grad_norm": 0.8368656039237976,
+      "learning_rate": 0.0002,
+      "loss": 1.5845,
+      "step": 1210
+    },
+    {
+      "epoch": 2.6406926406926408,
+      "grad_norm": 0.9644360542297363,
+      "learning_rate": 0.0002,
+      "loss": 1.574,
+      "step": 1220
+    },
+    {
+      "epoch": 2.6623376623376624,
+      "grad_norm": 0.9691457748413086,
+      "learning_rate": 0.0002,
+      "loss": 1.5623,
+      "step": 1230
+    },
+    {
+      "epoch": 2.683982683982684,
+      "grad_norm": 0.8851862549781799,
+      "learning_rate": 0.0002,
+      "loss": 1.5894,
+      "step": 1240
+    },
+    {
+      "epoch": 2.7056277056277054,
+      "grad_norm": 1.0715088844299316,
+      "learning_rate": 0.0002,
+      "loss": 1.5251,
+      "step": 1250
+    },
+    {
+      "epoch": 2.7272727272727275,
+      "grad_norm": 0.8532006740570068,
+      "learning_rate": 0.0002,
+      "loss": 1.5903,
+      "step": 1260
+    },
+    {
+      "epoch": 2.7489177489177488,
+      "grad_norm": 0.9172760248184204,
+      "learning_rate": 0.0002,
+      "loss": 1.5261,
+      "step": 1270
+    },
+    {
+      "epoch": 2.7705627705627704,
+      "grad_norm": 0.8991577625274658,
+      "learning_rate": 0.0002,
+      "loss": 1.5029,
+      "step": 1280
+    },
+    {
+      "epoch": 2.792207792207792,
+      "grad_norm": 0.8205381631851196,
+      "learning_rate": 0.0002,
+      "loss": 1.5207,
+      "step": 1290
+    },
+    {
+      "epoch": 2.813852813852814,
+      "grad_norm": 0.9733313918113708,
+      "learning_rate": 0.0002,
+      "loss": 1.5328,
+      "step": 1300
+    },
+    {
+      "epoch": 2.8354978354978355,
+      "grad_norm": 1.0313537120819092,
+      "learning_rate": 0.0002,
+      "loss": 1.5373,
+      "step": 1310
+    },
+    {
+      "epoch": 2.857142857142857,
+      "grad_norm": 0.8865208625793457,
+      "learning_rate": 0.0002,
+      "loss": 1.4832,
+      "step": 1320
+    },
+    {
+      "epoch": 2.878787878787879,
+      "grad_norm": 1.1407958269119263,
+      "learning_rate": 0.0002,
+      "loss": 1.5297,
+      "step": 1330
+    },
+    {
+      "epoch": 2.9004329004329006,
+      "grad_norm": 0.879891574382782,
+      "learning_rate": 0.0002,
+      "loss": 1.5435,
+      "step": 1340
+    },
+    {
+      "epoch": 2.9220779220779223,
+      "grad_norm": 0.9538708925247192,
+      "learning_rate": 0.0002,
+      "loss": 1.5523,
+      "step": 1350
+    },
+    {
+      "epoch": 2.9437229437229435,
+      "grad_norm": 0.7732896208763123,
+      "learning_rate": 0.0002,
+      "loss": 1.4881,
+      "step": 1360
+    },
+    {
+      "epoch": 2.965367965367965,
+      "grad_norm": 0.9062705636024475,
+      "learning_rate": 0.0002,
+      "loss": 1.4959,
+      "step": 1370
+    },
+    {
+      "epoch": 2.987012987012987,
+      "grad_norm": 0.9082673192024231,
+      "learning_rate": 0.0002,
+      "loss": 1.5508,
+      "step": 1380
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 1.842921257019043,
+      "eval_runtime": 138.5715,
+      "eval_samples_per_second": 3.846,
+      "eval_steps_per_second": 0.484,
+      "step": 1386
+    },
+    {
+      "epoch": 3.0086580086580086,
+      "grad_norm": 0.8586050868034363,
+      "learning_rate": 0.0002,
+      "loss": 1.4376,
+      "step": 1390
+    },
+    {
+      "epoch": 3.0303030303030303,
+      "grad_norm": 1.127321720123291,
+      "learning_rate": 0.0002,
+      "loss": 1.2973,
+      "step": 1400
+    },
+    {
+      "epoch": 3.051948051948052,
+      "grad_norm": 1.3029290437698364,
+      "learning_rate": 0.0002,
+      "loss": 1.2804,
+      "step": 1410
+    },
+    {
+      "epoch": 3.0735930735930737,
+      "grad_norm": 1.4397313594818115,
+      "learning_rate": 0.0002,
+      "loss": 1.3353,
+      "step": 1420
+    },
+    {
+      "epoch": 3.0952380952380953,
+      "grad_norm": 1.5687700510025024,
+      "learning_rate": 0.0002,
+      "loss": 1.2783,
+      "step": 1430
+    },
+    {
+      "epoch": 3.116883116883117,
+      "grad_norm": 1.0821301937103271,
+      "learning_rate": 0.0002,
+      "loss": 1.2991,
+      "step": 1440
+    },
+    {
+      "epoch": 3.1385281385281387,
+      "grad_norm": 1.1222467422485352,
+      "learning_rate": 0.0002,
+      "loss": 1.2772,
+      "step": 1450
+    },
+    {
+      "epoch": 3.16017316017316,
+      "grad_norm": 1.196321964263916,
+      "learning_rate": 0.0002,
+      "loss": 1.3571,
+      "step": 1460
+    },
+    {
+      "epoch": 3.1818181818181817,
+      "grad_norm": 1.1099780797958374,
+      "learning_rate": 0.0002,
+      "loss": 1.2597,
+      "step": 1470
+    },
+    {
+      "epoch": 3.2034632034632033,
+      "grad_norm": 1.1216720342636108,
+      "learning_rate": 0.0002,
+      "loss": 1.3297,
+      "step": 1480
+    },
+    {
+      "epoch": 3.225108225108225,
+      "grad_norm": 1.2393304109573364,
+      "learning_rate": 0.0002,
+      "loss": 1.3066,
+      "step": 1490
+    },
+    {
+      "epoch": 3.2467532467532467,
+      "grad_norm": 1.2331798076629639,
+      "learning_rate": 0.0002,
+      "loss": 1.2445,
+      "step": 1500
+    },
+    {
+      "epoch": 3.2683982683982684,
+      "grad_norm": 1.1466370820999146,
+      "learning_rate": 0.0002,
+      "loss": 1.292,
+      "step": 1510
+    },
+    {
+      "epoch": 3.29004329004329,
+      "grad_norm": 1.6869697570800781,
+      "learning_rate": 0.0002,
+      "loss": 1.338,
+      "step": 1520
+    },
+    {
+      "epoch": 3.311688311688312,
+      "grad_norm": 1.2315126657485962,
+      "learning_rate": 0.0002,
+      "loss": 1.3152,
+      "step": 1530
+    },
+    {
+      "epoch": 3.3333333333333335,
+      "grad_norm": 1.2909607887268066,
+      "learning_rate": 0.0002,
+      "loss": 1.3555,
+      "step": 1540
+    },
+    {
+      "epoch": 3.354978354978355,
+      "grad_norm": 1.2874510288238525,
+      "learning_rate": 0.0002,
+      "loss": 1.2782,
+      "step": 1550
+    },
+    {
+      "epoch": 3.3766233766233764,
+      "grad_norm": 1.5269776582717896,
+      "learning_rate": 0.0002,
+      "loss": 1.308,
+      "step": 1560
+    },
+    {
+      "epoch": 3.398268398268398,
+      "grad_norm": 1.2578439712524414,
+      "learning_rate": 0.0002,
+      "loss": 1.3256,
+      "step": 1570
+    },
+    {
+      "epoch": 3.41991341991342,
+      "grad_norm": 1.1697931289672852,
+      "learning_rate": 0.0002,
+      "loss": 1.2783,
+      "step": 1580
+    },
+    {
+      "epoch": 3.4415584415584415,
+      "grad_norm": 1.314573049545288,
+      "learning_rate": 0.0002,
+      "loss": 1.3834,
+      "step": 1590
+    },
+    {
+      "epoch": 3.463203463203463,
+      "grad_norm": 1.2375879287719727,
+      "learning_rate": 0.0002,
+      "loss": 1.2516,
+      "step": 1600
+    },
+    {
+      "epoch": 3.484848484848485,
+      "grad_norm": 1.0980405807495117,
+      "learning_rate": 0.0002,
+      "loss": 1.2872,
+      "step": 1610
+    },
+    {
+      "epoch": 3.5064935064935066,
+      "grad_norm": 1.5183982849121094,
+      "learning_rate": 0.0002,
+      "loss": 1.2586,
+      "step": 1620
+    },
+    {
+      "epoch": 3.5281385281385282,
+      "grad_norm": 1.7712465524673462,
+      "learning_rate": 0.0002,
+      "loss": 1.3149,
+      "step": 1630
+    },
+    {
+      "epoch": 3.54978354978355,
+      "grad_norm": 1.4033244848251343,
+      "learning_rate": 0.0002,
+      "loss": 1.3097,
+      "step": 1640
+    },
+    {
+      "epoch": 3.571428571428571,
+      "grad_norm": 1.3502216339111328,
+      "learning_rate": 0.0002,
+      "loss": 1.3614,
+      "step": 1650
+    },
+    {
+      "epoch": 3.5930735930735933,
+      "grad_norm": 1.2922712564468384,
+      "learning_rate": 0.0002,
+      "loss": 1.3743,
+      "step": 1660
+    },
+    {
+      "epoch": 3.6147186147186146,
+      "grad_norm": 1.4703474044799805,
+      "learning_rate": 0.0002,
+      "loss": 1.3313,
+      "step": 1670
+    },
+    {
+      "epoch": 3.6363636363636362,
+      "grad_norm": 1.2576347589492798,
+      "learning_rate": 0.0002,
+      "loss": 1.3053,
+      "step": 1680
+    },
+    {
+      "epoch": 3.658008658008658,
+      "grad_norm": 1.361474633216858,
+      "learning_rate": 0.0002,
+      "loss": 1.3733,
+      "step": 1690
+    },
+    {
+      "epoch": 3.6796536796536796,
+      "grad_norm": 1.3686575889587402,
+      "learning_rate": 0.0002,
+      "loss": 1.4326,
+      "step": 1700
+    },
+    {
+      "epoch": 3.7012987012987013,
+      "grad_norm": 1.480577826499939,
+      "learning_rate": 0.0002,
+      "loss": 1.3832,
+      "step": 1710
+    },
+    {
+      "epoch": 3.722943722943723,
+      "grad_norm": 1.1896449327468872,
+      "learning_rate": 0.0002,
+      "loss": 1.3488,
+      "step": 1720
+    },
+    {
+      "epoch": 3.7445887445887447,
+      "grad_norm": 1.1765750646591187,
+      "learning_rate": 0.0002,
+      "loss": 1.2901,
+      "step": 1730
+    },
+    {
+      "epoch": 3.7662337662337664,
+      "grad_norm": 1.1575956344604492,
+      "learning_rate": 0.0002,
+      "loss": 1.3259,
+      "step": 1740
+    },
+    {
+      "epoch": 3.787878787878788,
+      "grad_norm": 1.1376453638076782,
+      "learning_rate": 0.0002,
+      "loss": 1.3073,
+      "step": 1750
+    },
+    {
+      "epoch": 3.8095238095238093,
+      "grad_norm": 1.1058441400527954,
+      "learning_rate": 0.0002,
+      "loss": 1.2997,
+      "step": 1760
+    },
+    {
+      "epoch": 3.8311688311688314,
+      "grad_norm": 1.3807097673416138,
+      "learning_rate": 0.0002,
+      "loss": 1.3549,
+      "step": 1770
+    },
+    {
+      "epoch": 3.8528138528138527,
+      "grad_norm": 1.1583185195922852,
+      "learning_rate": 0.0002,
+      "loss": 1.3589,
+      "step": 1780
+    },
+    {
+      "epoch": 3.8744588744588744,
+      "grad_norm": 1.0412019491195679,
+      "learning_rate": 0.0002,
+      "loss": 1.3855,
+      "step": 1790
+    },
+    {
+      "epoch": 3.896103896103896,
+      "grad_norm": 1.2590245008468628,
+      "learning_rate": 0.0002,
+      "loss": 1.3263,
+      "step": 1800
+    },
+    {
+      "epoch": 3.9177489177489178,
+      "grad_norm": 1.1784659624099731,
+      "learning_rate": 0.0002,
+      "loss": 1.333,
+      "step": 1810
+    },
+    {
+      "epoch": 3.9393939393939394,
+      "grad_norm": 1.2848402261734009,
+      "learning_rate": 0.0002,
+      "loss": 1.3326,
+      "step": 1820
+    },
+    {
+      "epoch": 3.961038961038961,
+      "grad_norm": 1.2152059078216553,
+      "learning_rate": 0.0002,
+      "loss": 1.3734,
+      "step": 1830
+    },
+    {
+      "epoch": 3.982683982683983,
+      "grad_norm": 1.3694654703140259,
+      "learning_rate": 0.0002,
+      "loss": 1.3563,
+      "step": 1840
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 1.9379768371582031,
+      "eval_runtime": 138.0181,
+      "eval_samples_per_second": 3.862,
+      "eval_steps_per_second": 0.485,
+      "step": 1848
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 3696,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 8.10804949549056e+16,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d349b8fe4d8871a8479d9bb3b1cba8d39e96d113c8c86cdb28a7fc6969f53ba6
+size 5560

Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-2310/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3409ee7371681afe09dede9dc9270429f58fe1b94ef9e951ba0091e3a58b0d1f
+size 109069176

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:24fd38c322977fe2a8a77055a1a4b6fab5e45b708064b8cb41e98b9b5fe0604c
+size 55532666

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3ea3496e20fbdb98594e1ecf075be3338c434c9afa2ad29244c8d1e62c6b8baa
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:431b0fd475a4105d53ed604f0820e31e6c6e3e1a227ef81e9ad538a706e56df2
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,1690 @@

+{
+  "best_metric": 1.7847579717636108,
+  "best_model_checkpoint": "outputs-001/Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-462",
+  "epoch": 5.0,
+  "eval_steps": 10,
+  "global_step": 2310,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.021645021645021644,
+      "grad_norm": 1.200374722480774,
+      "learning_rate": 0.0002,
+      "loss": 2.5092,
+      "step": 10
+    },
+    {
+      "epoch": 0.04329004329004329,
+      "grad_norm": 0.974091112613678,
+      "learning_rate": 0.0002,
+      "loss": 2.2672,
+      "step": 20
+    },
+    {
+      "epoch": 0.06493506493506493,
+      "grad_norm": 0.9070103168487549,
+      "learning_rate": 0.0002,
+      "loss": 2.1445,
+      "step": 30
+    },
+    {
+      "epoch": 0.08658008658008658,
+      "grad_norm": 0.6892510056495667,
+      "learning_rate": 0.0002,
+      "loss": 2.0634,
+      "step": 40
+    },
+    {
+      "epoch": 0.10822510822510822,
+      "grad_norm": 0.7840355038642883,
+      "learning_rate": 0.0002,
+      "loss": 2.039,
+      "step": 50
+    },
+    {
+      "epoch": 0.12987012987012986,
+      "grad_norm": 0.8381665349006653,
+      "learning_rate": 0.0002,
+      "loss": 1.9527,
+      "step": 60
+    },
+    {
+      "epoch": 0.15151515151515152,
+      "grad_norm": 0.6969044804573059,
+      "learning_rate": 0.0002,
+      "loss": 1.8852,
+      "step": 70
+    },
+    {
+      "epoch": 0.17316017316017315,
+      "grad_norm": 0.6608849763870239,
+      "learning_rate": 0.0002,
+      "loss": 1.8263,
+      "step": 80
+    },
+    {
+      "epoch": 0.19480519480519481,
+      "grad_norm": 0.6329185962677002,
+      "learning_rate": 0.0002,
+      "loss": 1.825,
+      "step": 90
+    },
+    {
+      "epoch": 0.21645021645021645,
+      "grad_norm": 0.723852276802063,
+      "learning_rate": 0.0002,
+      "loss": 1.8256,
+      "step": 100
+    },
+    {
+      "epoch": 0.23809523809523808,
+      "grad_norm": 0.8358765840530396,
+      "learning_rate": 0.0002,
+      "loss": 1.8758,
+      "step": 110
+    },
+    {
+      "epoch": 0.2597402597402597,
+      "grad_norm": 0.6025514006614685,
+      "learning_rate": 0.0002,
+      "loss": 1.8468,
+      "step": 120
+    },
+    {
+      "epoch": 0.2813852813852814,
+      "grad_norm": 0.5782386064529419,
+      "learning_rate": 0.0002,
+      "loss": 1.7487,
+      "step": 130
+    },
+    {
+      "epoch": 0.30303030303030304,
+      "grad_norm": 0.8589595556259155,
+      "learning_rate": 0.0002,
+      "loss": 1.7717,
+      "step": 140
+    },
+    {
+      "epoch": 0.3246753246753247,
+      "grad_norm": 0.5718036890029907,
+      "learning_rate": 0.0002,
+      "loss": 1.7726,
+      "step": 150
+    },
+    {
+      "epoch": 0.3463203463203463,
+      "grad_norm": 0.632756769657135,
+      "learning_rate": 0.0002,
+      "loss": 1.784,
+      "step": 160
+    },
+    {
+      "epoch": 0.36796536796536794,
+      "grad_norm": 0.5307920575141907,
+      "learning_rate": 0.0002,
+      "loss": 1.8176,
+      "step": 170
+    },
+    {
+      "epoch": 0.38961038961038963,
+      "grad_norm": 0.5692276358604431,
+      "learning_rate": 0.0002,
+      "loss": 1.7744,
+      "step": 180
+    },
+    {
+      "epoch": 0.41125541125541126,
+      "grad_norm": 0.6083813309669495,
+      "learning_rate": 0.0002,
+      "loss": 1.8075,
+      "step": 190
+    },
+    {
+      "epoch": 0.4329004329004329,
+      "grad_norm": 0.7849981188774109,
+      "learning_rate": 0.0002,
+      "loss": 1.8718,
+      "step": 200
+    },
+    {
+      "epoch": 0.45454545454545453,
+      "grad_norm": 0.6536546945571899,
+      "learning_rate": 0.0002,
+      "loss": 1.7946,
+      "step": 210
+    },
+    {
+      "epoch": 0.47619047619047616,
+      "grad_norm": 0.5180730223655701,
+      "learning_rate": 0.0002,
+      "loss": 1.8174,
+      "step": 220
+    },
+    {
+      "epoch": 0.49783549783549785,
+      "grad_norm": 0.5796821713447571,
+      "learning_rate": 0.0002,
+      "loss": 1.7857,
+      "step": 230
+    },
+    {
+      "epoch": 0.5194805194805194,
+      "grad_norm": 0.6185894012451172,
+      "learning_rate": 0.0002,
+      "loss": 1.8062,
+      "step": 240
+    },
+    {
+      "epoch": 0.5411255411255411,
+      "grad_norm": 0.6040953397750854,
+      "learning_rate": 0.0002,
+      "loss": 1.825,
+      "step": 250
+    },
+    {
+      "epoch": 0.5627705627705628,
+      "grad_norm": 0.6005431413650513,
+      "learning_rate": 0.0002,
+      "loss": 1.7785,
+      "step": 260
+    },
+    {
+      "epoch": 0.5844155844155844,
+      "grad_norm": 0.6693951487541199,
+      "learning_rate": 0.0002,
+      "loss": 1.8444,
+      "step": 270
+    },
+    {
+      "epoch": 0.6060606060606061,
+      "grad_norm": 0.5105443596839905,
+      "learning_rate": 0.0002,
+      "loss": 1.8471,
+      "step": 280
+    },
+    {
+      "epoch": 0.6277056277056277,
+      "grad_norm": 0.5175243616104126,
+      "learning_rate": 0.0002,
+      "loss": 1.8274,
+      "step": 290
+    },
+    {
+      "epoch": 0.6493506493506493,
+      "grad_norm": 0.4775221049785614,
+      "learning_rate": 0.0002,
+      "loss": 1.81,
+      "step": 300
+    },
+    {
+      "epoch": 0.670995670995671,
+      "grad_norm": 0.9106342792510986,
+      "learning_rate": 0.0002,
+      "loss": 1.7816,
+      "step": 310
+    },
+    {
+      "epoch": 0.6926406926406926,
+      "grad_norm": 1.9134571552276611,
+      "learning_rate": 0.0002,
+      "loss": 1.7764,
+      "step": 320
+    },
+    {
+      "epoch": 0.7142857142857143,
+      "grad_norm": 0.6287537217140198,
+      "learning_rate": 0.0002,
+      "loss": 1.7877,
+      "step": 330
+    },
+    {
+      "epoch": 0.7359307359307359,
+      "grad_norm": 0.5587132573127747,
+      "learning_rate": 0.0002,
+      "loss": 1.8499,
+      "step": 340
+    },
+    {
+      "epoch": 0.7575757575757576,
+      "grad_norm": 0.5827193260192871,
+      "learning_rate": 0.0002,
+      "loss": 1.7328,
+      "step": 350
+    },
+    {
+      "epoch": 0.7792207792207793,
+      "grad_norm": 0.572600781917572,
+      "learning_rate": 0.0002,
+      "loss": 1.8022,
+      "step": 360
+    },
+    {
+      "epoch": 0.8008658008658008,
+      "grad_norm": 0.6280586123466492,
+      "learning_rate": 0.0002,
+      "loss": 1.88,
+      "step": 370
+    },
+    {
+      "epoch": 0.8225108225108225,
+      "grad_norm": 0.6878819465637207,
+      "learning_rate": 0.0002,
+      "loss": 1.8116,
+      "step": 380
+    },
+    {
+      "epoch": 0.8441558441558441,
+      "grad_norm": 0.5876027345657349,
+      "learning_rate": 0.0002,
+      "loss": 1.8042,
+      "step": 390
+    },
+    {
+      "epoch": 0.8658008658008658,
+      "grad_norm": 0.5249695777893066,
+      "learning_rate": 0.0002,
+      "loss": 1.7501,
+      "step": 400
+    },
+    {
+      "epoch": 0.8874458874458875,
+      "grad_norm": 0.5510677695274353,
+      "learning_rate": 0.0002,
+      "loss": 1.7599,
+      "step": 410
+    },
+    {
+      "epoch": 0.9090909090909091,
+      "grad_norm": 0.6817089915275574,
+      "learning_rate": 0.0002,
+      "loss": 1.7737,
+      "step": 420
+    },
+    {
+      "epoch": 0.9307359307359307,
+      "grad_norm": 0.5116859078407288,
+      "learning_rate": 0.0002,
+      "loss": 1.7857,
+      "step": 430
+    },
+    {
+      "epoch": 0.9523809523809523,
+      "grad_norm": 0.5427846312522888,
+      "learning_rate": 0.0002,
+      "loss": 1.7361,
+      "step": 440
+    },
+    {
+      "epoch": 0.974025974025974,
+      "grad_norm": 0.5605915784835815,
+      "learning_rate": 0.0002,
+      "loss": 1.7812,
+      "step": 450
+    },
+    {
+      "epoch": 0.9956709956709957,
+      "grad_norm": 0.5166691541671753,
+      "learning_rate": 0.0002,
+      "loss": 1.7699,
+      "step": 460
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 1.7847579717636108,
+      "eval_runtime": 144.877,
+      "eval_samples_per_second": 3.679,
+      "eval_steps_per_second": 0.462,
+      "step": 462
+    },
+    {
+      "epoch": 1.0173160173160174,
+      "grad_norm": 0.5665210485458374,
+      "learning_rate": 0.0002,
+      "loss": 1.6725,
+      "step": 470
+    },
+    {
+      "epoch": 1.0389610389610389,
+      "grad_norm": 1.0514339208602905,
+      "learning_rate": 0.0002,
+      "loss": 1.6996,
+      "step": 480
+    },
+    {
+      "epoch": 1.0606060606060606,
+      "grad_norm": 0.5494309663772583,
+      "learning_rate": 0.0002,
+      "loss": 1.6573,
+      "step": 490
+    },
+    {
+      "epoch": 1.0822510822510822,
+      "grad_norm": 0.557016909122467,
+      "learning_rate": 0.0002,
+      "loss": 1.7314,
+      "step": 500
+    },
+    {
+      "epoch": 1.103896103896104,
+      "grad_norm": 0.6585943102836609,
+      "learning_rate": 0.0002,
+      "loss": 1.7284,
+      "step": 510
+    },
+    {
+      "epoch": 1.1255411255411256,
+      "grad_norm": 0.6703357696533203,
+      "learning_rate": 0.0002,
+      "loss": 1.7575,
+      "step": 520
+    },
+    {
+      "epoch": 1.1471861471861473,
+      "grad_norm": 1.9358264207839966,
+      "learning_rate": 0.0002,
+      "loss": 1.7013,
+      "step": 530
+    },
+    {
+      "epoch": 1.1688311688311688,
+      "grad_norm": 0.6128601431846619,
+      "learning_rate": 0.0002,
+      "loss": 1.6914,
+      "step": 540
+    },
+    {
+      "epoch": 1.1904761904761905,
+      "grad_norm": 0.6610239744186401,
+      "learning_rate": 0.0002,
+      "loss": 1.6358,
+      "step": 550
+    },
+    {
+      "epoch": 1.2121212121212122,
+      "grad_norm": 0.6083669662475586,
+      "learning_rate": 0.0002,
+      "loss": 1.7122,
+      "step": 560
+    },
+    {
+      "epoch": 1.2337662337662338,
+      "grad_norm": 0.7784225940704346,
+      "learning_rate": 0.0002,
+      "loss": 1.6771,
+      "step": 570
+    },
+    {
+      "epoch": 1.2554112554112553,
+      "grad_norm": 0.6141694784164429,
+      "learning_rate": 0.0002,
+      "loss": 1.6372,
+      "step": 580
+    },
+    {
+      "epoch": 1.277056277056277,
+      "grad_norm": 0.6129311323165894,
+      "learning_rate": 0.0002,
+      "loss": 1.6795,
+      "step": 590
+    },
+    {
+      "epoch": 1.2987012987012987,
+      "grad_norm": 0.6802751421928406,
+      "learning_rate": 0.0002,
+      "loss": 1.6664,
+      "step": 600
+    },
+    {
+      "epoch": 1.3203463203463204,
+      "grad_norm": 0.6065750122070312,
+      "learning_rate": 0.0002,
+      "loss": 1.6555,
+      "step": 610
+    },
+    {
+      "epoch": 1.341991341991342,
+      "grad_norm": 0.6713075637817383,
+      "learning_rate": 0.0002,
+      "loss": 1.679,
+      "step": 620
+    },
+    {
+      "epoch": 1.3636363636363638,
+      "grad_norm": 0.627552330493927,
+      "learning_rate": 0.0002,
+      "loss": 1.7412,
+      "step": 630
+    },
+    {
+      "epoch": 1.3852813852813852,
+      "grad_norm": 0.6579778790473938,
+      "learning_rate": 0.0002,
+      "loss": 1.6477,
+      "step": 640
+    },
+    {
+      "epoch": 1.406926406926407,
+      "grad_norm": 0.6381745934486389,
+      "learning_rate": 0.0002,
+      "loss": 1.7282,
+      "step": 650
+    },
+    {
+      "epoch": 1.4285714285714286,
+      "grad_norm": 0.7358919382095337,
+      "learning_rate": 0.0002,
+      "loss": 1.7218,
+      "step": 660
+    },
+    {
+      "epoch": 1.4502164502164503,
+      "grad_norm": 0.6294736266136169,
+      "learning_rate": 0.0002,
+      "loss": 1.7046,
+      "step": 670
+    },
+    {
+      "epoch": 1.4718614718614718,
+      "grad_norm": 0.6542870998382568,
+      "learning_rate": 0.0002,
+      "loss": 1.7143,
+      "step": 680
+    },
+    {
+      "epoch": 1.4935064935064934,
+      "grad_norm": 0.6003480553627014,
+      "learning_rate": 0.0002,
+      "loss": 1.7417,
+      "step": 690
+    },
+    {
+      "epoch": 1.5151515151515151,
+      "grad_norm": 0.8322144150733948,
+      "learning_rate": 0.0002,
+      "loss": 1.6487,
+      "step": 700
+    },
+    {
+      "epoch": 1.5367965367965368,
+      "grad_norm": 0.6853126287460327,
+      "learning_rate": 0.0002,
+      "loss": 1.7217,
+      "step": 710
+    },
+    {
+      "epoch": 1.5584415584415585,
+      "grad_norm": 0.6571378707885742,
+      "learning_rate": 0.0002,
+      "loss": 1.6888,
+      "step": 720
+    },
+    {
+      "epoch": 1.5800865800865802,
+      "grad_norm": 0.6957149505615234,
+      "learning_rate": 0.0002,
+      "loss": 1.7111,
+      "step": 730
+    },
+    {
+      "epoch": 1.601731601731602,
+      "grad_norm": 0.6495681405067444,
+      "learning_rate": 0.0002,
+      "loss": 1.7361,
+      "step": 740
+    },
+    {
+      "epoch": 1.6233766233766234,
+      "grad_norm": 0.6954384446144104,
+      "learning_rate": 0.0002,
+      "loss": 1.5709,
+      "step": 750
+    },
+    {
+      "epoch": 1.645021645021645,
+      "grad_norm": 0.7402207851409912,
+      "learning_rate": 0.0002,
+      "loss": 1.6851,
+      "step": 760
+    },
+    {
+      "epoch": 1.6666666666666665,
+      "grad_norm": 0.6827481985092163,
+      "learning_rate": 0.0002,
+      "loss": 1.6651,
+      "step": 770
+    },
+    {
+      "epoch": 1.6883116883116882,
+      "grad_norm": 0.6176769733428955,
+      "learning_rate": 0.0002,
+      "loss": 1.6827,
+      "step": 780
+    },
+    {
+      "epoch": 1.70995670995671,
+      "grad_norm": 0.6565108299255371,
+      "learning_rate": 0.0002,
+      "loss": 1.6291,
+      "step": 790
+    },
+    {
+      "epoch": 1.7316017316017316,
+      "grad_norm": 0.6303038001060486,
+      "learning_rate": 0.0002,
+      "loss": 1.6805,
+      "step": 800
+    },
+    {
+      "epoch": 1.7532467532467533,
+      "grad_norm": 0.6866182684898376,
+      "learning_rate": 0.0002,
+      "loss": 1.7321,
+      "step": 810
+    },
+    {
+      "epoch": 1.774891774891775,
+      "grad_norm": 0.7522535920143127,
+      "learning_rate": 0.0002,
+      "loss": 1.6847,
+      "step": 820
+    },
+    {
+      "epoch": 1.7965367965367967,
+      "grad_norm": 0.7703698873519897,
+      "learning_rate": 0.0002,
+      "loss": 1.679,
+      "step": 830
+    },
+    {
+      "epoch": 1.8181818181818183,
+      "grad_norm": 0.5955503582954407,
+      "learning_rate": 0.0002,
+      "loss": 1.6817,
+      "step": 840
+    },
+    {
+      "epoch": 1.8398268398268398,
+      "grad_norm": 0.707340657711029,
+      "learning_rate": 0.0002,
+      "loss": 1.7159,
+      "step": 850
+    },
+    {
+      "epoch": 1.8614718614718615,
+      "grad_norm": 0.7305465936660767,
+      "learning_rate": 0.0002,
+      "loss": 1.709,
+      "step": 860
+    },
+    {
+      "epoch": 1.883116883116883,
+      "grad_norm": 0.667972207069397,
+      "learning_rate": 0.0002,
+      "loss": 1.71,
+      "step": 870
+    },
+    {
+      "epoch": 1.9047619047619047,
+      "grad_norm": 0.654872477054596,
+      "learning_rate": 0.0002,
+      "loss": 1.7051,
+      "step": 880
+    },
+    {
+      "epoch": 1.9264069264069263,
+      "grad_norm": 0.6718705296516418,
+      "learning_rate": 0.0002,
+      "loss": 1.6316,
+      "step": 890
+    },
+    {
+      "epoch": 1.948051948051948,
+      "grad_norm": 0.6363692879676819,
+      "learning_rate": 0.0002,
+      "loss": 1.623,
+      "step": 900
+    },
+    {
+      "epoch": 1.9696969696969697,
+      "grad_norm": 0.6861362457275391,
+      "learning_rate": 0.0002,
+      "loss": 1.6725,
+      "step": 910
+    },
+    {
+      "epoch": 1.9913419913419914,
+      "grad_norm": 0.6531493067741394,
+      "learning_rate": 0.0002,
+      "loss": 1.6833,
+      "step": 920
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.7908068895339966,
+      "eval_runtime": 144.0281,
+      "eval_samples_per_second": 3.701,
+      "eval_steps_per_second": 0.465,
+      "step": 924
+    },
+    {
+      "epoch": 2.012987012987013,
+      "grad_norm": 0.6030914187431335,
+      "learning_rate": 0.0002,
+      "loss": 1.5922,
+      "step": 930
+    },
+    {
+      "epoch": 2.034632034632035,
+      "grad_norm": 0.7416430711746216,
+      "learning_rate": 0.0002,
+      "loss": 1.5215,
+      "step": 940
+    },
+    {
+      "epoch": 2.0562770562770565,
+      "grad_norm": 0.7020093202590942,
+      "learning_rate": 0.0002,
+      "loss": 1.5759,
+      "step": 950
+    },
+    {
+      "epoch": 2.0779220779220777,
+      "grad_norm": 0.8007868528366089,
+      "learning_rate": 0.0002,
+      "loss": 1.4751,
+      "step": 960
+    },
+    {
+      "epoch": 2.0995670995670994,
+      "grad_norm": 0.7111671566963196,
+      "learning_rate": 0.0002,
+      "loss": 1.4808,
+      "step": 970
+    },
+    {
+      "epoch": 2.121212121212121,
+      "grad_norm": 0.7257682085037231,
+      "learning_rate": 0.0002,
+      "loss": 1.53,
+      "step": 980
+    },
+    {
+      "epoch": 2.142857142857143,
+      "grad_norm": 0.8737282156944275,
+      "learning_rate": 0.0002,
+      "loss": 1.5097,
+      "step": 990
+    },
+    {
+      "epoch": 2.1645021645021645,
+      "grad_norm": 0.9281378984451294,
+      "learning_rate": 0.0002,
+      "loss": 1.537,
+      "step": 1000
+    },
+    {
+      "epoch": 2.186147186147186,
+      "grad_norm": 1.0217959880828857,
+      "learning_rate": 0.0002,
+      "loss": 1.5223,
+      "step": 1010
+    },
+    {
+      "epoch": 2.207792207792208,
+      "grad_norm": 0.8430958986282349,
+      "learning_rate": 0.0002,
+      "loss": 1.4253,
+      "step": 1020
+    },
+    {
+      "epoch": 2.2294372294372296,
+      "grad_norm": 0.8123440742492676,
+      "learning_rate": 0.0002,
+      "loss": 1.5294,
+      "step": 1030
+    },
+    {
+      "epoch": 2.2510822510822512,
+      "grad_norm": 0.9429558515548706,
+      "learning_rate": 0.0002,
+      "loss": 1.5167,
+      "step": 1040
+    },
+    {
+      "epoch": 2.2727272727272725,
+      "grad_norm": 0.811696469783783,
+      "learning_rate": 0.0002,
+      "loss": 1.4711,
+      "step": 1050
+    },
+    {
+      "epoch": 2.2943722943722946,
+      "grad_norm": 0.8424768447875977,
+      "learning_rate": 0.0002,
+      "loss": 1.4656,
+      "step": 1060
+    },
+    {
+      "epoch": 2.316017316017316,
+      "grad_norm": 0.8870340585708618,
+      "learning_rate": 0.0002,
+      "loss": 1.5618,
+      "step": 1070
+    },
+    {
+      "epoch": 2.3376623376623376,
+      "grad_norm": 0.8600393533706665,
+      "learning_rate": 0.0002,
+      "loss": 1.5368,
+      "step": 1080
+    },
+    {
+      "epoch": 2.3593073593073592,
+      "grad_norm": 0.8447834253311157,
+      "learning_rate": 0.0002,
+      "loss": 1.5028,
+      "step": 1090
+    },
+    {
+      "epoch": 2.380952380952381,
+      "grad_norm": 0.9303842186927795,
+      "learning_rate": 0.0002,
+      "loss": 1.4885,
+      "step": 1100
+    },
+    {
+      "epoch": 2.4025974025974026,
+      "grad_norm": 0.8144819140434265,
+      "learning_rate": 0.0002,
+      "loss": 1.5163,
+      "step": 1110
+    },
+    {
+      "epoch": 2.4242424242424243,
+      "grad_norm": 0.92924964427948,
+      "learning_rate": 0.0002,
+      "loss": 1.4805,
+      "step": 1120
+    },
+    {
+      "epoch": 2.445887445887446,
+      "grad_norm": 0.8560649156570435,
+      "learning_rate": 0.0002,
+      "loss": 1.4608,
+      "step": 1130
+    },
+    {
+      "epoch": 2.4675324675324677,
+      "grad_norm": 0.8532574772834778,
+      "learning_rate": 0.0002,
+      "loss": 1.5541,
+      "step": 1140
+    },
+    {
+      "epoch": 2.4891774891774894,
+      "grad_norm": 0.8702793717384338,
+      "learning_rate": 0.0002,
+      "loss": 1.5607,
+      "step": 1150
+    },
+    {
+      "epoch": 2.5108225108225106,
+      "grad_norm": 0.9125854969024658,
+      "learning_rate": 0.0002,
+      "loss": 1.5194,
+      "step": 1160
+    },
+    {
+      "epoch": 2.5324675324675323,
+      "grad_norm": 0.9579735398292542,
+      "learning_rate": 0.0002,
+      "loss": 1.5953,
+      "step": 1170
+    },
+    {
+      "epoch": 2.554112554112554,
+      "grad_norm": 0.8561005592346191,
+      "learning_rate": 0.0002,
+      "loss": 1.5088,
+      "step": 1180
+    },
+    {
+      "epoch": 2.5757575757575757,
+      "grad_norm": 0.9103630185127258,
+      "learning_rate": 0.0002,
+      "loss": 1.5636,
+      "step": 1190
+    },
+    {
+      "epoch": 2.5974025974025974,
+      "grad_norm": 0.8527248501777649,
+      "learning_rate": 0.0002,
+      "loss": 1.5497,
+      "step": 1200
+    },
+    {
+      "epoch": 2.619047619047619,
+      "grad_norm": 0.8368656039237976,
+      "learning_rate": 0.0002,
+      "loss": 1.5845,
+      "step": 1210
+    },
+    {
+      "epoch": 2.6406926406926408,
+      "grad_norm": 0.9644360542297363,
+      "learning_rate": 0.0002,
+      "loss": 1.574,
+      "step": 1220
+    },
+    {
+      "epoch": 2.6623376623376624,
+      "grad_norm": 0.9691457748413086,
+      "learning_rate": 0.0002,
+      "loss": 1.5623,
+      "step": 1230
+    },
+    {
+      "epoch": 2.683982683982684,
+      "grad_norm": 0.8851862549781799,
+      "learning_rate": 0.0002,
+      "loss": 1.5894,
+      "step": 1240
+    },
+    {
+      "epoch": 2.7056277056277054,
+      "grad_norm": 1.0715088844299316,
+      "learning_rate": 0.0002,
+      "loss": 1.5251,
+      "step": 1250
+    },
+    {
+      "epoch": 2.7272727272727275,
+      "grad_norm": 0.8532006740570068,
+      "learning_rate": 0.0002,
+      "loss": 1.5903,
+      "step": 1260
+    },
+    {
+      "epoch": 2.7489177489177488,
+      "grad_norm": 0.9172760248184204,
+      "learning_rate": 0.0002,
+      "loss": 1.5261,
+      "step": 1270
+    },
+    {
+      "epoch": 2.7705627705627704,
+      "grad_norm": 0.8991577625274658,
+      "learning_rate": 0.0002,
+      "loss": 1.5029,
+      "step": 1280
+    },
+    {
+      "epoch": 2.792207792207792,
+      "grad_norm": 0.8205381631851196,
+      "learning_rate": 0.0002,
+      "loss": 1.5207,
+      "step": 1290
+    },
+    {
+      "epoch": 2.813852813852814,
+      "grad_norm": 0.9733313918113708,
+      "learning_rate": 0.0002,
+      "loss": 1.5328,
+      "step": 1300
+    },
+    {
+      "epoch": 2.8354978354978355,
+      "grad_norm": 1.0313537120819092,
+      "learning_rate": 0.0002,
+      "loss": 1.5373,
+      "step": 1310
+    },
+    {
+      "epoch": 2.857142857142857,
+      "grad_norm": 0.8865208625793457,
+      "learning_rate": 0.0002,
+      "loss": 1.4832,
+      "step": 1320
+    },
+    {
+      "epoch": 2.878787878787879,
+      "grad_norm": 1.1407958269119263,
+      "learning_rate": 0.0002,
+      "loss": 1.5297,
+      "step": 1330
+    },
+    {
+      "epoch": 2.9004329004329006,
+      "grad_norm": 0.879891574382782,
+      "learning_rate": 0.0002,
+      "loss": 1.5435,
+      "step": 1340
+    },
+    {
+      "epoch": 2.9220779220779223,
+      "grad_norm": 0.9538708925247192,
+      "learning_rate": 0.0002,
+      "loss": 1.5523,
+      "step": 1350
+    },
+    {
+      "epoch": 2.9437229437229435,
+      "grad_norm": 0.7732896208763123,
+      "learning_rate": 0.0002,
+      "loss": 1.4881,
+      "step": 1360
+    },
+    {
+      "epoch": 2.965367965367965,
+      "grad_norm": 0.9062705636024475,
+      "learning_rate": 0.0002,
+      "loss": 1.4959,
+      "step": 1370
+    },
+    {
+      "epoch": 2.987012987012987,
+      "grad_norm": 0.9082673192024231,
+      "learning_rate": 0.0002,
+      "loss": 1.5508,
+      "step": 1380
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 1.842921257019043,
+      "eval_runtime": 138.5715,
+      "eval_samples_per_second": 3.846,
+      "eval_steps_per_second": 0.484,
+      "step": 1386
+    },
+    {
+      "epoch": 3.0086580086580086,
+      "grad_norm": 0.8586050868034363,
+      "learning_rate": 0.0002,
+      "loss": 1.4376,
+      "step": 1390
+    },
+    {
+      "epoch": 3.0303030303030303,
+      "grad_norm": 1.127321720123291,
+      "learning_rate": 0.0002,
+      "loss": 1.2973,
+      "step": 1400
+    },
+    {
+      "epoch": 3.051948051948052,
+      "grad_norm": 1.3029290437698364,
+      "learning_rate": 0.0002,
+      "loss": 1.2804,
+      "step": 1410
+    },
+    {
+      "epoch": 3.0735930735930737,
+      "grad_norm": 1.4397313594818115,
+      "learning_rate": 0.0002,
+      "loss": 1.3353,
+      "step": 1420
+    },
+    {
+      "epoch": 3.0952380952380953,
+      "grad_norm": 1.5687700510025024,
+      "learning_rate": 0.0002,
+      "loss": 1.2783,
+      "step": 1430
+    },
+    {
+      "epoch": 3.116883116883117,
+      "grad_norm": 1.0821301937103271,
+      "learning_rate": 0.0002,
+      "loss": 1.2991,
+      "step": 1440
+    },
+    {
+      "epoch": 3.1385281385281387,
+      "grad_norm": 1.1222467422485352,
+      "learning_rate": 0.0002,
+      "loss": 1.2772,
+      "step": 1450
+    },
+    {
+      "epoch": 3.16017316017316,
+      "grad_norm": 1.196321964263916,
+      "learning_rate": 0.0002,
+      "loss": 1.3571,
+      "step": 1460
+    },
+    {
+      "epoch": 3.1818181818181817,
+      "grad_norm": 1.1099780797958374,
+      "learning_rate": 0.0002,
+      "loss": 1.2597,
+      "step": 1470
+    },
+    {
+      "epoch": 3.2034632034632033,
+      "grad_norm": 1.1216720342636108,
+      "learning_rate": 0.0002,
+      "loss": 1.3297,
+      "step": 1480
+    },
+    {
+      "epoch": 3.225108225108225,
+      "grad_norm": 1.2393304109573364,
+      "learning_rate": 0.0002,
+      "loss": 1.3066,
+      "step": 1490
+    },
+    {
+      "epoch": 3.2467532467532467,
+      "grad_norm": 1.2331798076629639,
+      "learning_rate": 0.0002,
+      "loss": 1.2445,
+      "step": 1500
+    },
+    {
+      "epoch": 3.2683982683982684,
+      "grad_norm": 1.1466370820999146,
+      "learning_rate": 0.0002,
+      "loss": 1.292,
+      "step": 1510
+    },
+    {
+      "epoch": 3.29004329004329,
+      "grad_norm": 1.6869697570800781,
+      "learning_rate": 0.0002,
+      "loss": 1.338,
+      "step": 1520
+    },
+    {
+      "epoch": 3.311688311688312,
+      "grad_norm": 1.2315126657485962,
+      "learning_rate": 0.0002,
+      "loss": 1.3152,
+      "step": 1530
+    },
+    {
+      "epoch": 3.3333333333333335,
+      "grad_norm": 1.2909607887268066,
+      "learning_rate": 0.0002,
+      "loss": 1.3555,
+      "step": 1540
+    },
+    {
+      "epoch": 3.354978354978355,
+      "grad_norm": 1.2874510288238525,
+      "learning_rate": 0.0002,
+      "loss": 1.2782,
+      "step": 1550
+    },
+    {
+      "epoch": 3.3766233766233764,
+      "grad_norm": 1.5269776582717896,
+      "learning_rate": 0.0002,
+      "loss": 1.308,
+      "step": 1560
+    },
+    {
+      "epoch": 3.398268398268398,
+      "grad_norm": 1.2578439712524414,
+      "learning_rate": 0.0002,
+      "loss": 1.3256,
+      "step": 1570
+    },
+    {
+      "epoch": 3.41991341991342,
+      "grad_norm": 1.1697931289672852,
+      "learning_rate": 0.0002,
+      "loss": 1.2783,
+      "step": 1580
+    },
+    {
+      "epoch": 3.4415584415584415,
+      "grad_norm": 1.314573049545288,
+      "learning_rate": 0.0002,
+      "loss": 1.3834,
+      "step": 1590
+    },
+    {
+      "epoch": 3.463203463203463,
+      "grad_norm": 1.2375879287719727,
+      "learning_rate": 0.0002,
+      "loss": 1.2516,
+      "step": 1600
+    },
+    {
+      "epoch": 3.484848484848485,
+      "grad_norm": 1.0980405807495117,
+      "learning_rate": 0.0002,
+      "loss": 1.2872,
+      "step": 1610
+    },
+    {
+      "epoch": 3.5064935064935066,
+      "grad_norm": 1.5183982849121094,
+      "learning_rate": 0.0002,
+      "loss": 1.2586,
+      "step": 1620
+    },
+    {
+      "epoch": 3.5281385281385282,
+      "grad_norm": 1.7712465524673462,
+      "learning_rate": 0.0002,
+      "loss": 1.3149,
+      "step": 1630
+    },
+    {
+      "epoch": 3.54978354978355,
+      "grad_norm": 1.4033244848251343,
+      "learning_rate": 0.0002,
+      "loss": 1.3097,
+      "step": 1640
+    },
+    {
+      "epoch": 3.571428571428571,
+      "grad_norm": 1.3502216339111328,
+      "learning_rate": 0.0002,
+      "loss": 1.3614,
+      "step": 1650
+    },
+    {
+      "epoch": 3.5930735930735933,
+      "grad_norm": 1.2922712564468384,
+      "learning_rate": 0.0002,
+      "loss": 1.3743,
+      "step": 1660
+    },
+    {
+      "epoch": 3.6147186147186146,
+      "grad_norm": 1.4703474044799805,
+      "learning_rate": 0.0002,
+      "loss": 1.3313,
+      "step": 1670
+    },
+    {
+      "epoch": 3.6363636363636362,
+      "grad_norm": 1.2576347589492798,
+      "learning_rate": 0.0002,
+      "loss": 1.3053,
+      "step": 1680
+    },
+    {
+      "epoch": 3.658008658008658,
+      "grad_norm": 1.361474633216858,
+      "learning_rate": 0.0002,
+      "loss": 1.3733,
+      "step": 1690
+    },
+    {
+      "epoch": 3.6796536796536796,
+      "grad_norm": 1.3686575889587402,
+      "learning_rate": 0.0002,
+      "loss": 1.4326,
+      "step": 1700
+    },
+    {
+      "epoch": 3.7012987012987013,
+      "grad_norm": 1.480577826499939,
+      "learning_rate": 0.0002,
+      "loss": 1.3832,
+      "step": 1710
+    },
+    {
+      "epoch": 3.722943722943723,
+      "grad_norm": 1.1896449327468872,
+      "learning_rate": 0.0002,
+      "loss": 1.3488,
+      "step": 1720
+    },
+    {
+      "epoch": 3.7445887445887447,
+      "grad_norm": 1.1765750646591187,
+      "learning_rate": 0.0002,
+      "loss": 1.2901,
+      "step": 1730
+    },
+    {
+      "epoch": 3.7662337662337664,
+      "grad_norm": 1.1575956344604492,
+      "learning_rate": 0.0002,
+      "loss": 1.3259,
+      "step": 1740
+    },
+    {
+      "epoch": 3.787878787878788,
+      "grad_norm": 1.1376453638076782,
+      "learning_rate": 0.0002,
+      "loss": 1.3073,
+      "step": 1750
+    },
+    {
+      "epoch": 3.8095238095238093,
+      "grad_norm": 1.1058441400527954,
+      "learning_rate": 0.0002,
+      "loss": 1.2997,
+      "step": 1760
+    },
+    {
+      "epoch": 3.8311688311688314,
+      "grad_norm": 1.3807097673416138,
+      "learning_rate": 0.0002,
+      "loss": 1.3549,
+      "step": 1770
+    },
+    {
+      "epoch": 3.8528138528138527,
+      "grad_norm": 1.1583185195922852,
+      "learning_rate": 0.0002,
+      "loss": 1.3589,
+      "step": 1780
+    },
+    {
+      "epoch": 3.8744588744588744,
+      "grad_norm": 1.0412019491195679,
+      "learning_rate": 0.0002,
+      "loss": 1.3855,
+      "step": 1790
+    },
+    {
+      "epoch": 3.896103896103896,
+      "grad_norm": 1.2590245008468628,
+      "learning_rate": 0.0002,
+      "loss": 1.3263,
+      "step": 1800
+    },
+    {
+      "epoch": 3.9177489177489178,
+      "grad_norm": 1.1784659624099731,
+      "learning_rate": 0.0002,
+      "loss": 1.333,
+      "step": 1810
+    },
+    {
+      "epoch": 3.9393939393939394,
+      "grad_norm": 1.2848402261734009,
+      "learning_rate": 0.0002,
+      "loss": 1.3326,
+      "step": 1820
+    },
+    {
+      "epoch": 3.961038961038961,
+      "grad_norm": 1.2152059078216553,
+      "learning_rate": 0.0002,
+      "loss": 1.3734,
+      "step": 1830
+    },
+    {
+      "epoch": 3.982683982683983,
+      "grad_norm": 1.3694654703140259,
+      "learning_rate": 0.0002,
+      "loss": 1.3563,
+      "step": 1840
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 1.9379768371582031,
+      "eval_runtime": 138.0181,
+      "eval_samples_per_second": 3.862,
+      "eval_steps_per_second": 0.485,
+      "step": 1848
+    },
+    {
+      "epoch": 4.004329004329004,
+      "grad_norm": 1.1592340469360352,
+      "learning_rate": 0.0002,
+      "loss": 1.2715,
+      "step": 1850
+    },
+    {
+      "epoch": 4.025974025974026,
+      "grad_norm": 1.4811842441558838,
+      "learning_rate": 0.0002,
+      "loss": 1.0985,
+      "step": 1860
+    },
+    {
+      "epoch": 4.0476190476190474,
+      "grad_norm": 1.4762481451034546,
+      "learning_rate": 0.0002,
+      "loss": 1.0392,
+      "step": 1870
+    },
+    {
+      "epoch": 4.06926406926407,
+      "grad_norm": 1.1761656999588013,
+      "learning_rate": 0.0002,
+      "loss": 1.1076,
+      "step": 1880
+    },
+    {
+      "epoch": 4.090909090909091,
+      "grad_norm": 1.621068000793457,
+      "learning_rate": 0.0002,
+      "loss": 1.0813,
+      "step": 1890
+    },
+    {
+      "epoch": 4.112554112554113,
+      "grad_norm": 1.7963402271270752,
+      "learning_rate": 0.0002,
+      "loss": 1.0849,
+      "step": 1900
+    },
+    {
+      "epoch": 4.134199134199134,
+      "grad_norm": 1.682166337966919,
+      "learning_rate": 0.0002,
+      "loss": 1.115,
+      "step": 1910
+    },
+    {
+      "epoch": 4.1558441558441555,
+      "grad_norm": 1.765175700187683,
+      "learning_rate": 0.0002,
+      "loss": 1.0142,
+      "step": 1920
+    },
+    {
+      "epoch": 4.177489177489178,
+      "grad_norm": 1.7437595129013062,
+      "learning_rate": 0.0002,
+      "loss": 1.0237,
+      "step": 1930
+    },
+    {
+      "epoch": 4.199134199134199,
+      "grad_norm": 1.487619400024414,
+      "learning_rate": 0.0002,
+      "loss": 1.1269,
+      "step": 1940
+    },
+    {
+      "epoch": 4.220779220779221,
+      "grad_norm": 1.5726702213287354,
+      "learning_rate": 0.0002,
+      "loss": 1.1398,
+      "step": 1950
+    },
+    {
+      "epoch": 4.242424242424242,
+      "grad_norm": 1.675681471824646,
+      "learning_rate": 0.0002,
+      "loss": 1.0203,
+      "step": 1960
+    },
+    {
+      "epoch": 4.264069264069264,
+      "grad_norm": 1.5381293296813965,
+      "learning_rate": 0.0002,
+      "loss": 1.0001,
+      "step": 1970
+    },
+    {
+      "epoch": 4.285714285714286,
+      "grad_norm": 1.6634043455123901,
+      "learning_rate": 0.0002,
+      "loss": 1.1608,
+      "step": 1980
+    },
+    {
+      "epoch": 4.307359307359308,
+      "grad_norm": 1.4991868734359741,
+      "learning_rate": 0.0002,
+      "loss": 1.0914,
+      "step": 1990
+    },
+    {
+      "epoch": 4.329004329004329,
+      "grad_norm": 1.7046575546264648,
+      "learning_rate": 0.0002,
+      "loss": 1.0208,
+      "step": 2000
+    },
+    {
+      "epoch": 4.35064935064935,
+      "grad_norm": 1.8189613819122314,
+      "learning_rate": 0.0002,
+      "loss": 1.0671,
+      "step": 2010
+    },
+    {
+      "epoch": 4.372294372294372,
+      "grad_norm": 1.7232930660247803,
+      "learning_rate": 0.0002,
+      "loss": 1.0771,
+      "step": 2020
+    },
+    {
+      "epoch": 4.393939393939394,
+      "grad_norm": 2.037747859954834,
+      "learning_rate": 0.0002,
+      "loss": 1.0849,
+      "step": 2030
+    },
+    {
+      "epoch": 4.415584415584416,
+      "grad_norm": 1.6157771348953247,
+      "learning_rate": 0.0002,
+      "loss": 1.0984,
+      "step": 2040
+    },
+    {
+      "epoch": 4.437229437229437,
+      "grad_norm": 1.6834640502929688,
+      "learning_rate": 0.0002,
+      "loss": 1.0542,
+      "step": 2050
+    },
+    {
+      "epoch": 4.458874458874459,
+      "grad_norm": 1.5155940055847168,
+      "learning_rate": 0.0002,
+      "loss": 1.1582,
+      "step": 2060
+    },
+    {
+      "epoch": 4.48051948051948,
+      "grad_norm": 1.9364410638809204,
+      "learning_rate": 0.0002,
+      "loss": 1.1593,
+      "step": 2070
+    },
+    {
+      "epoch": 4.5021645021645025,
+      "grad_norm": 1.512215256690979,
+      "learning_rate": 0.0002,
+      "loss": 1.1484,
+      "step": 2080
+    },
+    {
+      "epoch": 4.523809523809524,
+      "grad_norm": 1.7659000158309937,
+      "learning_rate": 0.0002,
+      "loss": 1.0858,
+      "step": 2090
+    },
+    {
+      "epoch": 4.545454545454545,
+      "grad_norm": 1.8038681745529175,
+      "learning_rate": 0.0002,
+      "loss": 1.1309,
+      "step": 2100
+    },
+    {
+      "epoch": 4.567099567099567,
+      "grad_norm": 1.6234548091888428,
+      "learning_rate": 0.0002,
+      "loss": 1.1466,
+      "step": 2110
+    },
+    {
+      "epoch": 4.588744588744589,
+      "grad_norm": 1.7181912660598755,
+      "learning_rate": 0.0002,
+      "loss": 1.1237,
+      "step": 2120
+    },
+    {
+      "epoch": 4.6103896103896105,
+      "grad_norm": 1.5204529762268066,
+      "learning_rate": 0.0002,
+      "loss": 1.129,
+      "step": 2130
+    },
+    {
+      "epoch": 4.632034632034632,
+      "grad_norm": 1.6626766920089722,
+      "learning_rate": 0.0002,
+      "loss": 1.1338,
+      "step": 2140
+    },
+    {
+      "epoch": 4.653679653679654,
+      "grad_norm": 1.6722981929779053,
+      "learning_rate": 0.0002,
+      "loss": 1.1135,
+      "step": 2150
+    },
+    {
+      "epoch": 4.675324675324675,
+      "grad_norm": 1.5929896831512451,
+      "learning_rate": 0.0002,
+      "loss": 1.1243,
+      "step": 2160
+    },
+    {
+      "epoch": 4.696969696969697,
+      "grad_norm": 1.8637045621871948,
+      "learning_rate": 0.0002,
+      "loss": 1.1511,
+      "step": 2170
+    },
+    {
+      "epoch": 4.7186147186147185,
+      "grad_norm": 1.7406965494155884,
+      "learning_rate": 0.0002,
+      "loss": 1.0816,
+      "step": 2180
+    },
+    {
+      "epoch": 4.740259740259741,
+      "grad_norm": 1.9259464740753174,
+      "learning_rate": 0.0002,
+      "loss": 1.0913,
+      "step": 2190
+    },
+    {
+      "epoch": 4.761904761904762,
+      "grad_norm": 1.5640064477920532,
+      "learning_rate": 0.0002,
+      "loss": 1.1273,
+      "step": 2200
+    },
+    {
+      "epoch": 4.783549783549784,
+      "grad_norm": 1.5039080381393433,
+      "learning_rate": 0.0002,
+      "loss": 1.095,
+      "step": 2210
+    },
+    {
+      "epoch": 4.805194805194805,
+      "grad_norm": 2.086487293243408,
+      "learning_rate": 0.0002,
+      "loss": 1.1082,
+      "step": 2220
+    },
+    {
+      "epoch": 4.8268398268398265,
+      "grad_norm": 1.8213051557540894,
+      "learning_rate": 0.0002,
+      "loss": 1.1299,
+      "step": 2230
+    },
+    {
+      "epoch": 4.848484848484849,
+      "grad_norm": 1.6772842407226562,
+      "learning_rate": 0.0002,
+      "loss": 1.1339,
+      "step": 2240
+    },
+    {
+      "epoch": 4.87012987012987,
+      "grad_norm": 1.855952262878418,
+      "learning_rate": 0.0002,
+      "loss": 1.0688,
+      "step": 2250
+    },
+    {
+      "epoch": 4.891774891774892,
+      "grad_norm": 1.703018069267273,
+      "learning_rate": 0.0002,
+      "loss": 1.1242,
+      "step": 2260
+    },
+    {
+      "epoch": 4.913419913419913,
+      "grad_norm": 1.5779869556427002,
+      "learning_rate": 0.0002,
+      "loss": 1.1745,
+      "step": 2270
+    },
+    {
+      "epoch": 4.935064935064935,
+      "grad_norm": 1.873153567314148,
+      "learning_rate": 0.0002,
+      "loss": 1.1367,
+      "step": 2280
+    },
+    {
+      "epoch": 4.956709956709957,
+      "grad_norm": 1.845137119293213,
+      "learning_rate": 0.0002,
+      "loss": 1.1469,
+      "step": 2290
+    },
+    {
+      "epoch": 4.978354978354979,
+      "grad_norm": 1.5848972797393799,
+      "learning_rate": 0.0002,
+      "loss": 1.176,
+      "step": 2300
+    },
+    {
+      "epoch": 5.0,
+      "grad_norm": 1.7801740169525146,
+      "learning_rate": 0.0002,
+      "loss": 1.1427,
+      "step": 2310
+    },
+    {
+      "epoch": 5.0,
+      "eval_loss": 2.1164023876190186,
+      "eval_runtime": 138.8921,
+      "eval_samples_per_second": 3.838,
+      "eval_steps_per_second": 0.482,
+      "step": 2310
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 3696,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.01350618693632e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d349b8fe4d8871a8479d9bb3b1cba8d39e96d113c8c86cdb28a7fc6969f53ba6
+size 5560

Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-2772/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c02bd8f9cbf6c31e6614a5974b6e5ce60c0d6410e340dda226b60e2cbe1a215a
+size 109069176

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:58c16e7eabbb413a68f670fe00cc27b12eab63ec8e22687cd91ee3c8c648f0a7
+size 55532666

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bbf48f2f778bc43abea17e7d1a1b0d37289965aac2075f6d73208cec0b0aa8c8
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:496b2f188f4b585c8b55ecd150fb699fd83971e2fa6797790d5b503c86f79829
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,2020 @@

+{
+  "best_metric": 1.7847579717636108,
+  "best_model_checkpoint": "outputs-001/Mistral-7B-Instruct-v0.3_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.3-num-2108-sd-1/checkpoint-462",
+  "epoch": 6.0,
+  "eval_steps": 10,
+  "global_step": 2772,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.021645021645021644,
+      "grad_norm": 1.200374722480774,
+      "learning_rate": 0.0002,
+      "loss": 2.5092,
+      "step": 10
+    },
+    {
+      "epoch": 0.04329004329004329,
+      "grad_norm": 0.974091112613678,
+      "learning_rate": 0.0002,
+      "loss": 2.2672,
+      "step": 20
+    },
+    {
+      "epoch": 0.06493506493506493,
+      "grad_norm": 0.9070103168487549,
+      "learning_rate": 0.0002,
+      "loss": 2.1445,
+      "step": 30
+    },
+    {
+      "epoch": 0.08658008658008658,
+      "grad_norm": 0.6892510056495667,
+      "learning_rate": 0.0002,
+      "loss": 2.0634,
+      "step": 40
+    },
+    {
+      "epoch": 0.10822510822510822,
+      "grad_norm": 0.7840355038642883,
+      "learning_rate": 0.0002,
+      "loss": 2.039,
+      "step": 50
+    },
+    {
+      "epoch": 0.12987012987012986,
+      "grad_norm": 0.8381665349006653,
+      "learning_rate": 0.0002,
+      "loss": 1.9527,
+      "step": 60
+    },
+    {
+      "epoch": 0.15151515151515152,
+      "grad_norm": 0.6969044804573059,
+      "learning_rate": 0.0002,
+      "loss": 1.8852,
+      "step": 70
+    },
+    {
+      "epoch": 0.17316017316017315,
+      "grad_norm": 0.6608849763870239,
+      "learning_rate": 0.0002,
+      "loss": 1.8263,
+      "step": 80
+    },
+    {
+      "epoch": 0.19480519480519481,
+      "grad_norm": 0.6329185962677002,
+      "learning_rate": 0.0002,
+      "loss": 1.825,
+      "step": 90
+    },
+    {
+      "epoch": 0.21645021645021645,
+      "grad_norm": 0.723852276802063,
+      "learning_rate": 0.0002,
+      "loss": 1.8256,
+      "step": 100
+    },
+    {
+      "epoch": 0.23809523809523808,
+      "grad_norm": 0.8358765840530396,
+      "learning_rate": 0.0002,
+      "loss": 1.8758,
+      "step": 110
+    },
+    {
+      "epoch": 0.2597402597402597,
+      "grad_norm": 0.6025514006614685,
+      "learning_rate": 0.0002,
+      "loss": 1.8468,
+      "step": 120
+    },
+    {
+      "epoch": 0.2813852813852814,
+      "grad_norm": 0.5782386064529419,
+      "learning_rate": 0.0002,
+      "loss": 1.7487,
+      "step": 130
+    },
+    {
+      "epoch": 0.30303030303030304,
+      "grad_norm": 0.8589595556259155,
+      "learning_rate": 0.0002,
+      "loss": 1.7717,
+      "step": 140
+    },
+    {
+      "epoch": 0.3246753246753247,
+      "grad_norm": 0.5718036890029907,
+      "learning_rate": 0.0002,
+      "loss": 1.7726,
+      "step": 150
+    },
+    {
+      "epoch": 0.3463203463203463,
+      "grad_norm": 0.632756769657135,
+      "learning_rate": 0.0002,
+      "loss": 1.784,
+      "step": 160
+    },
+    {
+      "epoch": 0.36796536796536794,
+      "grad_norm": 0.5307920575141907,
+      "learning_rate": 0.0002,
+      "loss": 1.8176,
+      "step": 170
+    },
+    {
+      "epoch": 0.38961038961038963,
+      "grad_norm": 0.5692276358604431,
+      "learning_rate": 0.0002,
+      "loss": 1.7744,
+      "step": 180
+    },
+    {
+      "epoch": 0.41125541125541126,
+      "grad_norm": 0.6083813309669495,
+      "learning_rate": 0.0002,
+      "loss": 1.8075,
+      "step": 190
+    },
+    {
+      "epoch": 0.4329004329004329,
+      "grad_norm": 0.7849981188774109,
+      "learning_rate": 0.0002,
+      "loss": 1.8718,
+      "step": 200
+    },
+    {
+      "epoch": 0.45454545454545453,
+      "grad_norm": 0.6536546945571899,
+      "learning_rate": 0.0002,
+      "loss": 1.7946,
+      "step": 210
+    },
+    {
+      "epoch": 0.47619047619047616,
+      "grad_norm": 0.5180730223655701,
+      "learning_rate": 0.0002,
+      "loss": 1.8174,
+      "step": 220
+    },
+    {
+      "epoch": 0.49783549783549785,
+      "grad_norm": 0.5796821713447571,
+      "learning_rate": 0.0002,
+      "loss": 1.7857,
+      "step": 230
+    },
+    {
+      "epoch": 0.5194805194805194,
+      "grad_norm": 0.6185894012451172,
+      "learning_rate": 0.0002,
+      "loss": 1.8062,
+      "step": 240
+    },
+    {
+      "epoch": 0.5411255411255411,
+      "grad_norm": 0.6040953397750854,
+      "learning_rate": 0.0002,
+      "loss": 1.825,
+      "step": 250
+    },
+    {
+      "epoch": 0.5627705627705628,
+      "grad_norm": 0.6005431413650513,
+      "learning_rate": 0.0002,
+      "loss": 1.7785,
+      "step": 260
+    },
+    {
+      "epoch": 0.5844155844155844,
+      "grad_norm": 0.6693951487541199,
+      "learning_rate": 0.0002,
+      "loss": 1.8444,
+      "step": 270
+    },
+    {
+      "epoch": 0.6060606060606061,
+      "grad_norm": 0.5105443596839905,
+      "learning_rate": 0.0002,
+      "loss": 1.8471,
+      "step": 280
+    },
+    {
+      "epoch": 0.6277056277056277,
+      "grad_norm": 0.5175243616104126,
+      "learning_rate": 0.0002,
+      "loss": 1.8274,
+      "step": 290
+    },
+    {
+      "epoch": 0.6493506493506493,
+      "grad_norm": 0.4775221049785614,
+      "learning_rate": 0.0002,
+      "loss": 1.81,
+      "step": 300
+    },
+    {
+      "epoch": 0.670995670995671,
+      "grad_norm": 0.9106342792510986,
+      "learning_rate": 0.0002,
+      "loss": 1.7816,
+      "step": 310
+    },
+    {
+      "epoch": 0.6926406926406926,
+      "grad_norm": 1.9134571552276611,
+      "learning_rate": 0.0002,
+      "loss": 1.7764,
+      "step": 320
+    },
+    {
+      "epoch": 0.7142857142857143,
+      "grad_norm": 0.6287537217140198,
+      "learning_rate": 0.0002,
+      "loss": 1.7877,
+      "step": 330
+    },
+    {
+      "epoch": 0.7359307359307359,
+      "grad_norm": 0.5587132573127747,
+      "learning_rate": 0.0002,
+      "loss": 1.8499,
+      "step": 340
+    },
+    {
+      "epoch": 0.7575757575757576,
+      "grad_norm": 0.5827193260192871,
+      "learning_rate": 0.0002,
+      "loss": 1.7328,
+      "step": 350
+    },
+    {
+      "epoch": 0.7792207792207793,
+      "grad_norm": 0.572600781917572,
+      "learning_rate": 0.0002,
+      "loss": 1.8022,
+      "step": 360
+    },
+    {
+      "epoch": 0.8008658008658008,
+      "grad_norm": 0.6280586123466492,
+      "learning_rate": 0.0002,
+      "loss": 1.88,
+      "step": 370
+    },
+    {
+      "epoch": 0.8225108225108225,
+      "grad_norm": 0.6878819465637207,
+      "learning_rate": 0.0002,
+      "loss": 1.8116,
+      "step": 380
+    },
+    {
+      "epoch": 0.8441558441558441,
+      "grad_norm": 0.5876027345657349,
+      "learning_rate": 0.0002,
+      "loss": 1.8042,
+      "step": 390
+    },
+    {
+      "epoch": 0.8658008658008658,
+      "grad_norm": 0.5249695777893066,
+      "learning_rate": 0.0002,
+      "loss": 1.7501,
+      "step": 400
+    },
+    {
+      "epoch": 0.8874458874458875,
+      "grad_norm": 0.5510677695274353,
+      "learning_rate": 0.0002,
+      "loss": 1.7599,
+      "step": 410
+    },
+    {
+      "epoch": 0.9090909090909091,
+      "grad_norm": 0.6817089915275574,
+      "learning_rate": 0.0002,
+      "loss": 1.7737,
+      "step": 420
+    },
+    {
+      "epoch": 0.9307359307359307,
+      "grad_norm": 0.5116859078407288,
+      "learning_rate": 0.0002,
+      "loss": 1.7857,
+      "step": 430
+    },
+    {
+      "epoch": 0.9523809523809523,
+      "grad_norm": 0.5427846312522888,
+      "learning_rate": 0.0002,
+      "loss": 1.7361,
+      "step": 440
+    },
+    {
+      "epoch": 0.974025974025974,
+      "grad_norm": 0.5605915784835815,
+      "learning_rate": 0.0002,
+      "loss": 1.7812,
+      "step": 450
+    },
+    {
+      "epoch": 0.9956709956709957,
+      "grad_norm": 0.5166691541671753,
+      "learning_rate": 0.0002,
+      "loss": 1.7699,
+      "step": 460
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 1.7847579717636108,
+      "eval_runtime": 144.877,
+      "eval_samples_per_second": 3.679,
+      "eval_steps_per_second": 0.462,
+      "step": 462
+    },
+    {
+      "epoch": 1.0173160173160174,
+      "grad_norm": 0.5665210485458374,
+      "learning_rate": 0.0002,
+      "loss": 1.6725,
+      "step": 470
+    },
+    {
+      "epoch": 1.0389610389610389,
+      "grad_norm": 1.0514339208602905,
+      "learning_rate": 0.0002,
+      "loss": 1.6996,
+      "step": 480
+    },
+    {
+      "epoch": 1.0606060606060606,
+      "grad_norm": 0.5494309663772583,
+      "learning_rate": 0.0002,
+      "loss": 1.6573,
+      "step": 490
+    },
+    {
+      "epoch": 1.0822510822510822,
+      "grad_norm": 0.557016909122467,
+      "learning_rate": 0.0002,
+      "loss": 1.7314,
+      "step": 500
+    },
+    {
+      "epoch": 1.103896103896104,
+      "grad_norm": 0.6585943102836609,
+      "learning_rate": 0.0002,
+      "loss": 1.7284,
+      "step": 510
+    },
+    {
+      "epoch": 1.1255411255411256,
+      "grad_norm": 0.6703357696533203,
+      "learning_rate": 0.0002,
+      "loss": 1.7575,
+      "step": 520
+    },
+    {
+      "epoch": 1.1471861471861473,
+      "grad_norm": 1.9358264207839966,
+      "learning_rate": 0.0002,
+      "loss": 1.7013,
+      "step": 530
+    },
+    {
+      "epoch": 1.1688311688311688,
+      "grad_norm": 0.6128601431846619,
+      "learning_rate": 0.0002,
+      "loss": 1.6914,
+      "step": 540
+    },
+    {
+      "epoch": 1.1904761904761905,
+      "grad_norm": 0.6610239744186401,
+      "learning_rate": 0.0002,
+      "loss": 1.6358,
+      "step": 550
+    },
+    {
+      "epoch": 1.2121212121212122,
+      "grad_norm": 0.6083669662475586,
+      "learning_rate": 0.0002,
+      "loss": 1.7122,
+      "step": 560
+    },
+    {
+      "epoch": 1.2337662337662338,
+      "grad_norm": 0.7784225940704346,
+      "learning_rate": 0.0002,
+      "loss": 1.6771,
+      "step": 570
+    },
+    {
+      "epoch": 1.2554112554112553,
+      "grad_norm": 0.6141694784164429,
+      "learning_rate": 0.0002,
+      "loss": 1.6372,
+      "step": 580
+    },
+    {
+      "epoch": 1.277056277056277,
+      "grad_norm": 0.6129311323165894,
+      "learning_rate": 0.0002,
+      "loss": 1.6795,
+      "step": 590
+    },
+    {
+      "epoch": 1.2987012987012987,
+      "grad_norm": 0.6802751421928406,
+      "learning_rate": 0.0002,
+      "loss": 1.6664,
+      "step": 600
+    },
+    {
+      "epoch": 1.3203463203463204,
+      "grad_norm": 0.6065750122070312,
+      "learning_rate": 0.0002,
+      "loss": 1.6555,
+      "step": 610
+    },
+    {
+      "epoch": 1.341991341991342,
+      "grad_norm": 0.6713075637817383,
+      "learning_rate": 0.0002,
+      "loss": 1.679,
+      "step": 620
+    },
+    {
+      "epoch": 1.3636363636363638,
+      "grad_norm": 0.627552330493927,
+      "learning_rate": 0.0002,
+      "loss": 1.7412,
+      "step": 630
+    },
+    {
+      "epoch": 1.3852813852813852,
+      "grad_norm": 0.6579778790473938,
+      "learning_rate": 0.0002,
+      "loss": 1.6477,
+      "step": 640
+    },
+    {
+      "epoch": 1.406926406926407,
+      "grad_norm": 0.6381745934486389,
+      "learning_rate": 0.0002,
+      "loss": 1.7282,
+      "step": 650
+    },
+    {
+      "epoch": 1.4285714285714286,
+      "grad_norm": 0.7358919382095337,
+      "learning_rate": 0.0002,
+      "loss": 1.7218,
+      "step": 660
+    },
+    {
+      "epoch": 1.4502164502164503,
+      "grad_norm": 0.6294736266136169,
+      "learning_rate": 0.0002,
+      "loss": 1.7046,
+      "step": 670
+    },
+    {
+      "epoch": 1.4718614718614718,
+      "grad_norm": 0.6542870998382568,
+      "learning_rate": 0.0002,
+      "loss": 1.7143,
+      "step": 680
+    },
+    {
+      "epoch": 1.4935064935064934,
+      "grad_norm": 0.6003480553627014,
+      "learning_rate": 0.0002,
+      "loss": 1.7417,
+      "step": 690
+    },
+    {
+      "epoch": 1.5151515151515151,
+      "grad_norm": 0.8322144150733948,
+      "learning_rate": 0.0002,
+      "loss": 1.6487,
+      "step": 700
+    },
+    {
+      "epoch": 1.5367965367965368,
+      "grad_norm": 0.6853126287460327,
+      "learning_rate": 0.0002,
+      "loss": 1.7217,
+      "step": 710
+    },
+    {
+      "epoch": 1.5584415584415585,
+      "grad_norm": 0.6571378707885742,
+      "learning_rate": 0.0002,
+      "loss": 1.6888,
+      "step": 720
+    },
+    {
+      "epoch": 1.5800865800865802,
+      "grad_norm": 0.6957149505615234,
+      "learning_rate": 0.0002,
+      "loss": 1.7111,
+      "step": 730
+    },
+    {
+      "epoch": 1.601731601731602,
+      "grad_norm": 0.6495681405067444,
+      "learning_rate": 0.0002,
+      "loss": 1.7361,
+      "step": 740
+    },
+    {
+      "epoch": 1.6233766233766234,
+      "grad_norm": 0.6954384446144104,
+      "learning_rate": 0.0002,
+      "loss": 1.5709,
+      "step": 750
+    },
+    {
+      "epoch": 1.645021645021645,
+      "grad_norm": 0.7402207851409912,
+      "learning_rate": 0.0002,
+      "loss": 1.6851,
+      "step": 760
+    },
+    {
+      "epoch": 1.6666666666666665,
+      "grad_norm": 0.6827481985092163,
+      "learning_rate": 0.0002,
+      "loss": 1.6651,
+      "step": 770
+    },
+    {
+      "epoch": 1.6883116883116882,
+      "grad_norm": 0.6176769733428955,
+      "learning_rate": 0.0002,
+      "loss": 1.6827,
+      "step": 780
+    },
+    {
+      "epoch": 1.70995670995671,
+      "grad_norm": 0.6565108299255371,
+      "learning_rate": 0.0002,
+      "loss": 1.6291,
+      "step": 790
+    },
+    {
+      "epoch": 1.7316017316017316,
+      "grad_norm": 0.6303038001060486,
+      "learning_rate": 0.0002,
+      "loss": 1.6805,
+      "step": 800
+    },
+    {
+      "epoch": 1.7532467532467533,
+      "grad_norm": 0.6866182684898376,
+      "learning_rate": 0.0002,
+      "loss": 1.7321,
+      "step": 810
+    },
+    {
+      "epoch": 1.774891774891775,
+      "grad_norm": 0.7522535920143127,
+      "learning_rate": 0.0002,
+      "loss": 1.6847,
+      "step": 820
+    },
+    {
+      "epoch": 1.7965367965367967,
+      "grad_norm": 0.7703698873519897,
+      "learning_rate": 0.0002,
+      "loss": 1.679,
+      "step": 830
+    },
+    {
+      "epoch": 1.8181818181818183,
+      "grad_norm": 0.5955503582954407,
+      "learning_rate": 0.0002,
+      "loss": 1.6817,
+      "step": 840
+    },
+    {
+      "epoch": 1.8398268398268398,
+      "grad_norm": 0.707340657711029,
+      "learning_rate": 0.0002,
+      "loss": 1.7159,
+      "step": 850
+    },
+    {
+      "epoch": 1.8614718614718615,
+      "grad_norm": 0.7305465936660767,
+      "learning_rate": 0.0002,
+      "loss": 1.709,
+      "step": 860
+    },
+    {
+      "epoch": 1.883116883116883,
+      "grad_norm": 0.667972207069397,
+      "learning_rate": 0.0002,
+      "loss": 1.71,
+      "step": 870
+    },
+    {
+      "epoch": 1.9047619047619047,
+      "grad_norm": 0.654872477054596,
+      "learning_rate": 0.0002,
+      "loss": 1.7051,
+      "step": 880
+    },
+    {
+      "epoch": 1.9264069264069263,
+      "grad_norm": 0.6718705296516418,
+      "learning_rate": 0.0002,
+      "loss": 1.6316,
+      "step": 890
+    },
+    {
+      "epoch": 1.948051948051948,
+      "grad_norm": 0.6363692879676819,
+      "learning_rate": 0.0002,
+      "loss": 1.623,
+      "step": 900
+    },
+    {
+      "epoch": 1.9696969696969697,
+      "grad_norm": 0.6861362457275391,
+      "learning_rate": 0.0002,
+      "loss": 1.6725,
+      "step": 910
+    },
+    {
+      "epoch": 1.9913419913419914,
+      "grad_norm": 0.6531493067741394,
+      "learning_rate": 0.0002,
+      "loss": 1.6833,
+      "step": 920
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.7908068895339966,
+      "eval_runtime": 144.0281,
+      "eval_samples_per_second": 3.701,
+      "eval_steps_per_second": 0.465,
+      "step": 924
+    },
+    {
+      "epoch": 2.012987012987013,
+      "grad_norm": 0.6030914187431335,
+      "learning_rate": 0.0002,
+      "loss": 1.5922,
+      "step": 930
+    },
+    {
+      "epoch": 2.034632034632035,
+      "grad_norm": 0.7416430711746216,
+      "learning_rate": 0.0002,
+      "loss": 1.5215,
+      "step": 940
+    },
+    {
+      "epoch": 2.0562770562770565,
+      "grad_norm": 0.7020093202590942,
+      "learning_rate": 0.0002,
+      "loss": 1.5759,
+      "step": 950
+    },
+    {
+      "epoch": 2.0779220779220777,
+      "grad_norm": 0.8007868528366089,
+      "learning_rate": 0.0002,
+      "loss": 1.4751,
+      "step": 960
+    },
+    {
+      "epoch": 2.0995670995670994,
+      "grad_norm": 0.7111671566963196,
+      "learning_rate": 0.0002,
+      "loss": 1.4808,
+      "step": 970
+    },
+    {
+      "epoch": 2.121212121212121,
+      "grad_norm": 0.7257682085037231,
+      "learning_rate": 0.0002,
+      "loss": 1.53,
+      "step": 980
+    },
+    {
+      "epoch": 2.142857142857143,
+      "grad_norm": 0.8737282156944275,
+      "learning_rate": 0.0002,
+      "loss": 1.5097,
+      "step": 990
+    },
+    {
+      "epoch": 2.1645021645021645,
+      "grad_norm": 0.9281378984451294,
+      "learning_rate": 0.0002,
+      "loss": 1.537,
+      "step": 1000
+    },
+    {
+      "epoch": 2.186147186147186,
+      "grad_norm": 1.0217959880828857,
+      "learning_rate": 0.0002,
+      "loss": 1.5223,
+      "step": 1010
+    },
+    {
+      "epoch": 2.207792207792208,
+      "grad_norm": 0.8430958986282349,
+      "learning_rate": 0.0002,
+      "loss": 1.4253,
+      "step": 1020
+    },
+    {
+      "epoch": 2.2294372294372296,
+      "grad_norm": 0.8123440742492676,
+      "learning_rate": 0.0002,
+      "loss": 1.5294,
+      "step": 1030
+    },
+    {
+      "epoch": 2.2510822510822512,
+      "grad_norm": 0.9429558515548706,
+      "learning_rate": 0.0002,
+      "loss": 1.5167,
+      "step": 1040
+    },
+    {
+      "epoch": 2.2727272727272725,
+      "grad_norm": 0.811696469783783,
+      "learning_rate": 0.0002,
+      "loss": 1.4711,
+      "step": 1050
+    },
+    {
+      "epoch": 2.2943722943722946,
+      "grad_norm": 0.8424768447875977,
+      "learning_rate": 0.0002,
+      "loss": 1.4656,
+      "step": 1060
+    },
+    {
+      "epoch": 2.316017316017316,
+      "grad_norm": 0.8870340585708618,
+      "learning_rate": 0.0002,
+      "loss": 1.5618,
+      "step": 1070
+    },
+    {
+      "epoch": 2.3376623376623376,
+      "grad_norm": 0.8600393533706665,
+      "learning_rate": 0.0002,
+      "loss": 1.5368,
+      "step": 1080
+    },
+    {
+      "epoch": 2.3593073593073592,
+      "grad_norm": 0.8447834253311157,
+      "learning_rate": 0.0002,
+      "loss": 1.5028,
+      "step": 1090
+    },
+    {
+      "epoch": 2.380952380952381,
+      "grad_norm": 0.9303842186927795,
+      "learning_rate": 0.0002,
+      "loss": 1.4885,
+      "step": 1100
+    },
+    {
+      "epoch": 2.4025974025974026,
+      "grad_norm": 0.8144819140434265,
+      "learning_rate": 0.0002,
+      "loss": 1.5163,
+      "step": 1110
+    },
+    {
+      "epoch": 2.4242424242424243,
+      "grad_norm": 0.92924964427948,
+      "learning_rate": 0.0002,
+      "loss": 1.4805,
+      "step": 1120
+    },
+    {
+      "epoch": 2.445887445887446,
+      "grad_norm": 0.8560649156570435,
+      "learning_rate": 0.0002,
+      "loss": 1.4608,
+      "step": 1130
+    },
+    {
+      "epoch": 2.4675324675324677,
+      "grad_norm": 0.8532574772834778,
+      "learning_rate": 0.0002,
+      "loss": 1.5541,
+      "step": 1140
+    },
+    {
+      "epoch": 2.4891774891774894,
+      "grad_norm": 0.8702793717384338,
+      "learning_rate": 0.0002,
+      "loss": 1.5607,
+      "step": 1150
+    },
+    {
+      "epoch": 2.5108225108225106,
+      "grad_norm": 0.9125854969024658,
+      "learning_rate": 0.0002,
+      "loss": 1.5194,
+      "step": 1160
+    },
+    {
+      "epoch": 2.5324675324675323,
+      "grad_norm": 0.9579735398292542,
+      "learning_rate": 0.0002,
+      "loss": 1.5953,
+      "step": 1170
+    },
+    {
+      "epoch": 2.554112554112554,
+      "grad_norm": 0.8561005592346191,
+      "learning_rate": 0.0002,
+      "loss": 1.5088,
+      "step": 1180
+    },
+    {
+      "epoch": 2.5757575757575757,
+      "grad_norm": 0.9103630185127258,
+      "learning_rate": 0.0002,
+      "loss": 1.5636,
+      "step": 1190
+    },
+    {
+      "epoch": 2.5974025974025974,
+      "grad_norm": 0.8527248501777649,
+      "learning_rate": 0.0002,
+      "loss": 1.5497,
+      "step": 1200
+    },
+    {
+      "epoch": 2.619047619047619,
+      "grad_norm": 0.8368656039237976,
+      "learning_rate": 0.0002,
+      "loss": 1.5845,
+      "step": 1210
+    },
+    {
+      "epoch": 2.6406926406926408,
+      "grad_norm": 0.9644360542297363,
+      "learning_rate": 0.0002,
+      "loss": 1.574,
+      "step": 1220
+    },
+    {
+      "epoch": 2.6623376623376624,
+      "grad_norm": 0.9691457748413086,
+      "learning_rate": 0.0002,
+      "loss": 1.5623,
+      "step": 1230
+    },
+    {
+      "epoch": 2.683982683982684,
+      "grad_norm": 0.8851862549781799,
+      "learning_rate": 0.0002,
+      "loss": 1.5894,
+      "step": 1240
+    },
+    {
+      "epoch": 2.7056277056277054,
+      "grad_norm": 1.0715088844299316,
+      "learning_rate": 0.0002,
+      "loss": 1.5251,
+      "step": 1250
+    },
+    {
+      "epoch": 2.7272727272727275,
+      "grad_norm": 0.8532006740570068,
+      "learning_rate": 0.0002,
+      "loss": 1.5903,
+      "step": 1260
+    },
+    {
+      "epoch": 2.7489177489177488,
+      "grad_norm": 0.9172760248184204,
+      "learning_rate": 0.0002,
+      "loss": 1.5261,
+      "step": 1270
+    },
+    {
+      "epoch": 2.7705627705627704,
+      "grad_norm": 0.8991577625274658,
+      "learning_rate": 0.0002,
+      "loss": 1.5029,
+      "step": 1280
+    },
+    {
+      "epoch": 2.792207792207792,
+      "grad_norm": 0.8205381631851196,
+      "learning_rate": 0.0002,
+      "loss": 1.5207,
+      "step": 1290
+    },
+    {
+      "epoch": 2.813852813852814,
+      "grad_norm": 0.9733313918113708,
+      "learning_rate": 0.0002,
+      "loss": 1.5328,
+      "step": 1300
+    },
+    {
+      "epoch": 2.8354978354978355,
+      "grad_norm": 1.0313537120819092,
+      "learning_rate": 0.0002,
+      "loss": 1.5373,
+      "step": 1310
+    },
+    {
+      "epoch": 2.857142857142857,
+      "grad_norm": 0.8865208625793457,
+      "learning_rate": 0.0002,
+      "loss": 1.4832,
+      "step": 1320
+    },
+    {
+      "epoch": 2.878787878787879,
+      "grad_norm": 1.1407958269119263,
+      "learning_rate": 0.0002,
+      "loss": 1.5297,
+      "step": 1330
+    },
+    {
+      "epoch": 2.9004329004329006,
+      "grad_norm": 0.879891574382782,
+      "learning_rate": 0.0002,
+      "loss": 1.5435,
+      "step": 1340
+    },
+    {
+      "epoch": 2.9220779220779223,
+      "grad_norm": 0.9538708925247192,
+      "learning_rate": 0.0002,
+      "loss": 1.5523,
+      "step": 1350
+    },
+    {
+      "epoch": 2.9437229437229435,
+      "grad_norm": 0.7732896208763123,
+      "learning_rate": 0.0002,
+      "loss": 1.4881,
+      "step": 1360
+    },
+    {
+      "epoch": 2.965367965367965,
+      "grad_norm": 0.9062705636024475,
+      "learning_rate": 0.0002,
+      "loss": 1.4959,
+      "step": 1370
+    },
+    {
+      "epoch": 2.987012987012987,
+      "grad_norm": 0.9082673192024231,
+      "learning_rate": 0.0002,
+      "loss": 1.5508,
+      "step": 1380
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 1.842921257019043,
+      "eval_runtime": 138.5715,
+      "eval_samples_per_second": 3.846,
+      "eval_steps_per_second": 0.484,
+      "step": 1386
+    },
+    {
+      "epoch": 3.0086580086580086,
+      "grad_norm": 0.8586050868034363,
+      "learning_rate": 0.0002,
+      "loss": 1.4376,
+      "step": 1390
+    },
+    {
+      "epoch": 3.0303030303030303,
+      "grad_norm": 1.127321720123291,
+      "learning_rate": 0.0002,
+      "loss": 1.2973,
+      "step": 1400
+    },
+    {
+      "epoch": 3.051948051948052,
+      "grad_norm": 1.3029290437698364,
+      "learning_rate": 0.0002,
+      "loss": 1.2804,
+      "step": 1410
+    },
+    {
+      "epoch": 3.0735930735930737,
+      "grad_norm": 1.4397313594818115,
+      "learning_rate": 0.0002,
+      "loss": 1.3353,
+      "step": 1420
+    },
+    {
+      "epoch": 3.0952380952380953,
+      "grad_norm": 1.5687700510025024,
+      "learning_rate": 0.0002,
+      "loss": 1.2783,
+      "step": 1430
+    },
+    {
+      "epoch": 3.116883116883117,
+      "grad_norm": 1.0821301937103271,
+      "learning_rate": 0.0002,
+      "loss": 1.2991,
+      "step": 1440
+    },
+    {
+      "epoch": 3.1385281385281387,
+      "grad_norm": 1.1222467422485352,
+      "learning_rate": 0.0002,
+      "loss": 1.2772,
+      "step": 1450
+    },
+    {
+      "epoch": 3.16017316017316,
+      "grad_norm": 1.196321964263916,
+      "learning_rate": 0.0002,
+      "loss": 1.3571,
+      "step": 1460
+    },
+    {
+      "epoch": 3.1818181818181817,
+      "grad_norm": 1.1099780797958374,
+      "learning_rate": 0.0002,
+      "loss": 1.2597,
+      "step": 1470
+    },
+    {
+      "epoch": 3.2034632034632033,
+      "grad_norm": 1.1216720342636108,
+      "learning_rate": 0.0002,
+      "loss": 1.3297,
+      "step": 1480
+    },
+    {
+      "epoch": 3.225108225108225,
+      "grad_norm": 1.2393304109573364,
+      "learning_rate": 0.0002,
+      "loss": 1.3066,
+      "step": 1490
+    },
+    {
+      "epoch": 3.2467532467532467,
+      "grad_norm": 1.2331798076629639,
+      "learning_rate": 0.0002,
+      "loss": 1.2445,
+      "step": 1500
+    },
+    {
+      "epoch": 3.2683982683982684,
+      "grad_norm": 1.1466370820999146,
+      "learning_rate": 0.0002,
+      "loss": 1.292,
+      "step": 1510
+    },
+    {
+      "epoch": 3.29004329004329,
+      "grad_norm": 1.6869697570800781,
+      "learning_rate": 0.0002,
+      "loss": 1.338,
+      "step": 1520
+    },
+    {
+      "epoch": 3.311688311688312,
+      "grad_norm": 1.2315126657485962,
+      "learning_rate": 0.0002,
+      "loss": 1.3152,
+      "step": 1530
+    },
+    {
+      "epoch": 3.3333333333333335,
+      "grad_norm": 1.2909607887268066,
+      "learning_rate": 0.0002,
+      "loss": 1.3555,
+      "step": 1540
+    },
+    {
+      "epoch": 3.354978354978355,
+      "grad_norm": 1.2874510288238525,
+      "learning_rate": 0.0002,
+      "loss": 1.2782,
+      "step": 1550
+    },
+    {
+      "epoch": 3.3766233766233764,
+      "grad_norm": 1.5269776582717896,
+      "learning_rate": 0.0002,
+      "loss": 1.308,
+      "step": 1560
+    },
+    {
+      "epoch": 3.398268398268398,
+      "grad_norm": 1.2578439712524414,
+      "learning_rate": 0.0002,
+      "loss": 1.3256,
+      "step": 1570
+    },
+    {
+      "epoch": 3.41991341991342,
+      "grad_norm": 1.1697931289672852,
+      "learning_rate": 0.0002,
+      "loss": 1.2783,
+      "step": 1580
+    },
+    {
+      "epoch": 3.4415584415584415,
+      "grad_norm": 1.314573049545288,
+      "learning_rate": 0.0002,
+      "loss": 1.3834,
+      "step": 1590
+    },
+    {
+      "epoch": 3.463203463203463,
+      "grad_norm": 1.2375879287719727,
+      "learning_rate": 0.0002,
+      "loss": 1.2516,
+      "step": 1600
+    },
+    {
+      "epoch": 3.484848484848485,
+      "grad_norm": 1.0980405807495117,
+      "learning_rate": 0.0002,
+      "loss": 1.2872,
+      "step": 1610
+    },
+    {
+      "epoch": 3.5064935064935066,
+      "grad_norm": 1.5183982849121094,
+      "learning_rate": 0.0002,
+      "loss": 1.2586,
+      "step": 1620
+    },
+    {
+      "epoch": 3.5281385281385282,
+      "grad_norm": 1.7712465524673462,
+      "learning_rate": 0.0002,
+      "loss": 1.3149,
+      "step": 1630
+    },
+    {
+      "epoch": 3.54978354978355,
+      "grad_norm": 1.4033244848251343,
+      "learning_rate": 0.0002,
+      "loss": 1.3097,
+      "step": 1640
+    },
+    {
+      "epoch": 3.571428571428571,
+      "grad_norm": 1.3502216339111328,
+      "learning_rate": 0.0002,
+      "loss": 1.3614,
+      "step": 1650
+    },
+    {
+      "epoch": 3.5930735930735933,
+      "grad_norm": 1.2922712564468384,
+      "learning_rate": 0.0002,
+      "loss": 1.3743,
+      "step": 1660
+    },
+    {
+      "epoch": 3.6147186147186146,
+      "grad_norm": 1.4703474044799805,
+      "learning_rate": 0.0002,
+      "loss": 1.3313,
+      "step": 1670
+    },
+    {
+      "epoch": 3.6363636363636362,
+      "grad_norm": 1.2576347589492798,
+      "learning_rate": 0.0002,
+      "loss": 1.3053,
+      "step": 1680
+    },
+    {
+      "epoch": 3.658008658008658,
+      "grad_norm": 1.361474633216858,
+      "learning_rate": 0.0002,
+      "loss": 1.3733,
+      "step": 1690
+    },
+    {
+      "epoch": 3.6796536796536796,
+      "grad_norm": 1.3686575889587402,
+      "learning_rate": 0.0002,
+      "loss": 1.4326,
+      "step": 1700
+    },
+    {
+      "epoch": 3.7012987012987013,
+      "grad_norm": 1.480577826499939,
+      "learning_rate": 0.0002,
+      "loss": 1.3832,
+      "step": 1710
+    },
+    {
+      "epoch": 3.722943722943723,
+      "grad_norm": 1.1896449327468872,
+      "learning_rate": 0.0002,
+      "loss": 1.3488,
+      "step": 1720
+    },
+    {
+      "epoch": 3.7445887445887447,
+      "grad_norm": 1.1765750646591187,
+      "learning_rate": 0.0002,
+      "loss": 1.2901,
+      "step": 1730
+    },
+    {
+      "epoch": 3.7662337662337664,
+      "grad_norm": 1.1575956344604492,
+      "learning_rate": 0.0002,
+      "loss": 1.3259,
+      "step": 1740
+    },
+    {
+      "epoch": 3.787878787878788,
+      "grad_norm": 1.1376453638076782,
+      "learning_rate": 0.0002,
+      "loss": 1.3073,
+      "step": 1750
+    },
+    {
+      "epoch": 3.8095238095238093,
+      "grad_norm": 1.1058441400527954,
+      "learning_rate": 0.0002,
+      "loss": 1.2997,
+      "step": 1760
+    },
+    {
+      "epoch": 3.8311688311688314,
+      "grad_norm": 1.3807097673416138,
+      "learning_rate": 0.0002,
+      "loss": 1.3549,
+      "step": 1770
+    },
+    {
+      "epoch": 3.8528138528138527,
+      "grad_norm": 1.1583185195922852,
+      "learning_rate": 0.0002,
+      "loss": 1.3589,
+      "step": 1780
+    },
+    {
+      "epoch": 3.8744588744588744,
+      "grad_norm": 1.0412019491195679,
+      "learning_rate": 0.0002,
+      "loss": 1.3855,
+      "step": 1790
+    },
+    {
+      "epoch": 3.896103896103896,
+      "grad_norm": 1.2590245008468628,
+      "learning_rate": 0.0002,
+      "loss": 1.3263,
+      "step": 1800
+    },
+    {
+      "epoch": 3.9177489177489178,
+      "grad_norm": 1.1784659624099731,
+      "learning_rate": 0.0002,
+      "loss": 1.333,
+      "step": 1810
+    },
+    {
+      "epoch": 3.9393939393939394,
+      "grad_norm": 1.2848402261734009,
+      "learning_rate": 0.0002,
+      "loss": 1.3326,
+      "step": 1820
+    },
+    {
+      "epoch": 3.961038961038961,
+      "grad_norm": 1.2152059078216553,
+      "learning_rate": 0.0002,
+      "loss": 1.3734,
+      "step": 1830
+    },
+    {
+      "epoch": 3.982683982683983,
+      "grad_norm": 1.3694654703140259,
+      "learning_rate": 0.0002,
+      "loss": 1.3563,
+      "step": 1840
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 1.9379768371582031,
+      "eval_runtime": 138.0181,
+      "eval_samples_per_second": 3.862,
+      "eval_steps_per_second": 0.485,
+      "step": 1848
+    },
+    {
+      "epoch": 4.004329004329004,
+      "grad_norm": 1.1592340469360352,
+      "learning_rate": 0.0002,
+      "loss": 1.2715,
+      "step": 1850
+    },
+    {
+      "epoch": 4.025974025974026,
+      "grad_norm": 1.4811842441558838,
+      "learning_rate": 0.0002,
+      "loss": 1.0985,
+      "step": 1860
+    },
+    {
+      "epoch": 4.0476190476190474,
+      "grad_norm": 1.4762481451034546,
+      "learning_rate": 0.0002,
+      "loss": 1.0392,
+      "step": 1870
+    },
+    {
+      "epoch": 4.06926406926407,
+      "grad_norm": 1.1761656999588013,
+      "learning_rate": 0.0002,
+      "loss": 1.1076,
+      "step": 1880
+    },
+    {
+      "epoch": 4.090909090909091,
+      "grad_norm": 1.621068000793457,
+      "learning_rate": 0.0002,
+      "loss": 1.0813,
+      "step": 1890
+    },
+    {
+      "epoch": 4.112554112554113,
+      "grad_norm": 1.7963402271270752,
+      "learning_rate": 0.0002,
+      "loss": 1.0849,
+      "step": 1900
+    },
+    {
+      "epoch": 4.134199134199134,
+      "grad_norm": 1.682166337966919,
+      "learning_rate": 0.0002,
+      "loss": 1.115,
+      "step": 1910
+    },
+    {
+      "epoch": 4.1558441558441555,
+      "grad_norm": 1.765175700187683,
+      "learning_rate": 0.0002,
+      "loss": 1.0142,
+      "step": 1920
+    },
+    {
+      "epoch": 4.177489177489178,
+      "grad_norm": 1.7437595129013062,
+      "learning_rate": 0.0002,
+      "loss": 1.0237,
+      "step": 1930
+    },
+    {
+      "epoch": 4.199134199134199,
+      "grad_norm": 1.487619400024414,
+      "learning_rate": 0.0002,
+      "loss": 1.1269,
+      "step": 1940
+    },
+    {
+      "epoch": 4.220779220779221,
+      "grad_norm": 1.5726702213287354,
+      "learning_rate": 0.0002,
+      "loss": 1.1398,
+      "step": 1950
+    },
+    {
+      "epoch": 4.242424242424242,
+      "grad_norm": 1.675681471824646,
+      "learning_rate": 0.0002,
+      "loss": 1.0203,
+      "step": 1960
+    },
+    {
+      "epoch": 4.264069264069264,
+      "grad_norm": 1.5381293296813965,
+      "learning_rate": 0.0002,
+      "loss": 1.0001,
+      "step": 1970
+    },
+    {
+      "epoch": 4.285714285714286,
+      "grad_norm": 1.6634043455123901,
+      "learning_rate": 0.0002,
+      "loss": 1.1608,
+      "step": 1980
+    },
+    {
+      "epoch": 4.307359307359308,
+      "grad_norm": 1.4991868734359741,
+      "learning_rate": 0.0002,
+      "loss": 1.0914,
+      "step": 1990
+    },
+    {
+      "epoch": 4.329004329004329,
+      "grad_norm": 1.7046575546264648,
+      "learning_rate": 0.0002,
+      "loss": 1.0208,
+      "step": 2000
+    },
+    {
+      "epoch": 4.35064935064935,
+      "grad_norm": 1.8189613819122314,
+      "learning_rate": 0.0002,
+      "loss": 1.0671,
+      "step": 2010
+    },
+    {
+      "epoch": 4.372294372294372,
+      "grad_norm": 1.7232930660247803,
+      "learning_rate": 0.0002,
+      "loss": 1.0771,
+      "step": 2020
+    },
+    {
+      "epoch": 4.393939393939394,
+      "grad_norm": 2.037747859954834,
+      "learning_rate": 0.0002,
+      "loss": 1.0849,
+      "step": 2030
+    },
+    {
+      "epoch": 4.415584415584416,
+      "grad_norm": 1.6157771348953247,
+      "learning_rate": 0.0002,
+      "loss": 1.0984,
+      "step": 2040
+    },
+    {
+      "epoch": 4.437229437229437,
+      "grad_norm": 1.6834640502929688,
+      "learning_rate": 0.0002,
+      "loss": 1.0542,
+      "step": 2050
+    },
+    {
+      "epoch": 4.458874458874459,
+      "grad_norm": 1.5155940055847168,
+      "learning_rate": 0.0002,
+      "loss": 1.1582,
+      "step": 2060
+    },
+    {
+      "epoch": 4.48051948051948,
+      "grad_norm": 1.9364410638809204,
+      "learning_rate": 0.0002,
+      "loss": 1.1593,
+      "step": 2070
+    },
+    {
+      "epoch": 4.5021645021645025,
+      "grad_norm": 1.512215256690979,
+      "learning_rate": 0.0002,
+      "loss": 1.1484,
+      "step": 2080
+    },
+    {
+      "epoch": 4.523809523809524,
+      "grad_norm": 1.7659000158309937,
+      "learning_rate": 0.0002,
+      "loss": 1.0858,
+      "step": 2090
+    },
+    {
+      "epoch": 4.545454545454545,
+      "grad_norm": 1.8038681745529175,
+      "learning_rate": 0.0002,
+      "loss": 1.1309,
+      "step": 2100
+    },
+    {
+      "epoch": 4.567099567099567,
+      "grad_norm": 1.6234548091888428,
+      "learning_rate": 0.0002,
+      "loss": 1.1466,
+      "step": 2110
+    },
+    {
+      "epoch": 4.588744588744589,
+      "grad_norm": 1.7181912660598755,
+      "learning_rate": 0.0002,
+      "loss": 1.1237,
+      "step": 2120
+    },
+    {
+      "epoch": 4.6103896103896105,
+      "grad_norm": 1.5204529762268066,
+      "learning_rate": 0.0002,
+      "loss": 1.129,
+      "step": 2130
+    },
+    {
+      "epoch": 4.632034632034632,
+      "grad_norm": 1.6626766920089722,
+      "learning_rate": 0.0002,
+      "loss": 1.1338,
+      "step": 2140
+    },
+    {
+      "epoch": 4.653679653679654,
+      "grad_norm": 1.6722981929779053,
+      "learning_rate": 0.0002,
+      "loss": 1.1135,
+      "step": 2150
+    },
+    {
+      "epoch": 4.675324675324675,
+      "grad_norm": 1.5929896831512451,
+      "learning_rate": 0.0002,
+      "loss": 1.1243,
+      "step": 2160
+    },
+    {
+      "epoch": 4.696969696969697,
+      "grad_norm": 1.8637045621871948,
+      "learning_rate": 0.0002,
+      "loss": 1.1511,
+      "step": 2170
+    },
+    {
+      "epoch": 4.7186147186147185,
+      "grad_norm": 1.7406965494155884,
+      "learning_rate": 0.0002,
+      "loss": 1.0816,
+      "step": 2180
+    },
+    {
+      "epoch": 4.740259740259741,
+      "grad_norm": 1.9259464740753174,
+      "learning_rate": 0.0002,
+      "loss": 1.0913,
+      "step": 2190
+    },
+    {
+      "epoch": 4.761904761904762,
+      "grad_norm": 1.5640064477920532,
+      "learning_rate": 0.0002,
+      "loss": 1.1273,
+      "step": 2200
+    },
+    {
+      "epoch": 4.783549783549784,
+      "grad_norm": 1.5039080381393433,
+      "learning_rate": 0.0002,
+      "loss": 1.095,
+      "step": 2210
+    },
+    {
+      "epoch": 4.805194805194805,
+      "grad_norm": 2.086487293243408,
+      "learning_rate": 0.0002,
+      "loss": 1.1082,
+      "step": 2220
+    },
+    {
+      "epoch": 4.8268398268398265,
+      "grad_norm": 1.8213051557540894,
+      "learning_rate": 0.0002,
+      "loss": 1.1299,
+      "step": 2230
+    },
+    {
+      "epoch": 4.848484848484849,
+      "grad_norm": 1.6772842407226562,
+      "learning_rate": 0.0002,
+      "loss": 1.1339,
+      "step": 2240
+    },
+    {
+      "epoch": 4.87012987012987,
+      "grad_norm": 1.855952262878418,
+      "learning_rate": 0.0002,
+      "loss": 1.0688,
+      "step": 2250
+    },
+    {
+      "epoch": 4.891774891774892,
+      "grad_norm": 1.703018069267273,
+      "learning_rate": 0.0002,
+      "loss": 1.1242,
+      "step": 2260
+    },
+    {
+      "epoch": 4.913419913419913,
+      "grad_norm": 1.5779869556427002,
+      "learning_rate": 0.0002,
+      "loss": 1.1745,
+      "step": 2270
+    },
+    {
+      "epoch": 4.935064935064935,
+      "grad_norm": 1.873153567314148,
+      "learning_rate": 0.0002,
+      "loss": 1.1367,
+      "step": 2280
+    },
+    {
+      "epoch": 4.956709956709957,
+      "grad_norm": 1.845137119293213,
+      "learning_rate": 0.0002,
+      "loss": 1.1469,
+      "step": 2290
+    },
+    {
+      "epoch": 4.978354978354979,
+      "grad_norm": 1.5848972797393799,
+      "learning_rate": 0.0002,
+      "loss": 1.176,
+      "step": 2300
+    },
+    {
+      "epoch": 5.0,
+      "grad_norm": 1.7801740169525146,
+      "learning_rate": 0.0002,
+      "loss": 1.1427,
+      "step": 2310
+    },
+    {
+      "epoch": 5.0,
+      "eval_loss": 2.1164023876190186,
+      "eval_runtime": 138.8921,
+      "eval_samples_per_second": 3.838,
+      "eval_steps_per_second": 0.482,
+      "step": 2310
+    },
+    {
+      "epoch": 5.021645021645021,
+      "grad_norm": 2.066721200942993,
+      "learning_rate": 0.0002,
+      "loss": 0.8797,
+      "step": 2320
+    },
+    {
+      "epoch": 5.043290043290043,
+      "grad_norm": 2.151554822921753,
+      "learning_rate": 0.0002,
+      "loss": 0.8525,
+      "step": 2330
+    },
+    {
+      "epoch": 5.064935064935065,
+      "grad_norm": 2.2179667949676514,
+      "learning_rate": 0.0002,
+      "loss": 0.8484,
+      "step": 2340
+    },
+    {
+      "epoch": 5.086580086580087,
+      "grad_norm": 2.125246047973633,
+      "learning_rate": 0.0002,
+      "loss": 0.8321,
+      "step": 2350
+    },
+    {
+      "epoch": 5.108225108225108,
+      "grad_norm": 2.4084312915802,
+      "learning_rate": 0.0002,
+      "loss": 0.7672,
+      "step": 2360
+    },
+    {
+      "epoch": 5.12987012987013,
+      "grad_norm": 2.2142670154571533,
+      "learning_rate": 0.0002,
+      "loss": 0.8809,
+      "step": 2370
+    },
+    {
+      "epoch": 5.151515151515151,
+      "grad_norm": 2.1560709476470947,
+      "learning_rate": 0.0002,
+      "loss": 0.9011,
+      "step": 2380
+    },
+    {
+      "epoch": 5.1731601731601735,
+      "grad_norm": 1.8703010082244873,
+      "learning_rate": 0.0002,
+      "loss": 0.8028,
+      "step": 2390
+    },
+    {
+      "epoch": 5.194805194805195,
+      "grad_norm": 1.8351138830184937,
+      "learning_rate": 0.0002,
+      "loss": 0.8215,
+      "step": 2400
+    },
+    {
+      "epoch": 5.216450216450216,
+      "grad_norm": 2.173983573913574,
+      "learning_rate": 0.0002,
+      "loss": 0.8257,
+      "step": 2410
+    },
+    {
+      "epoch": 5.238095238095238,
+      "grad_norm": 2.6962268352508545,
+      "learning_rate": 0.0002,
+      "loss": 0.8822,
+      "step": 2420
+    },
+    {
+      "epoch": 5.259740259740259,
+      "grad_norm": 1.917742133140564,
+      "learning_rate": 0.0002,
+      "loss": 0.8879,
+      "step": 2430
+    },
+    {
+      "epoch": 5.2813852813852815,
+      "grad_norm": 2.2485885620117188,
+      "learning_rate": 0.0002,
+      "loss": 0.8619,
+      "step": 2440
+    },
+    {
+      "epoch": 5.303030303030303,
+      "grad_norm": 2.158888816833496,
+      "learning_rate": 0.0002,
+      "loss": 0.8841,
+      "step": 2450
+    },
+    {
+      "epoch": 5.324675324675325,
+      "grad_norm": 2.187177896499634,
+      "learning_rate": 0.0002,
+      "loss": 0.8522,
+      "step": 2460
+    },
+    {
+      "epoch": 5.346320346320346,
+      "grad_norm": 2.137540340423584,
+      "learning_rate": 0.0002,
+      "loss": 0.839,
+      "step": 2470
+    },
+    {
+      "epoch": 5.367965367965368,
+      "grad_norm": 2.399334669113159,
+      "learning_rate": 0.0002,
+      "loss": 0.904,
+      "step": 2480
+    },
+    {
+      "epoch": 5.3896103896103895,
+      "grad_norm": 2.04976749420166,
+      "learning_rate": 0.0002,
+      "loss": 0.8378,
+      "step": 2490
+    },
+    {
+      "epoch": 5.411255411255412,
+      "grad_norm": 2.226628541946411,
+      "learning_rate": 0.0002,
+      "loss": 0.9134,
+      "step": 2500
+    },
+    {
+      "epoch": 5.432900432900433,
+      "grad_norm": 1.9237712621688843,
+      "learning_rate": 0.0002,
+      "loss": 0.8879,
+      "step": 2510
+    },
+    {
+      "epoch": 5.454545454545454,
+      "grad_norm": 2.0607564449310303,
+      "learning_rate": 0.0002,
+      "loss": 0.9233,
+      "step": 2520
+    },
+    {
+      "epoch": 5.476190476190476,
+      "grad_norm": 2.0660619735717773,
+      "learning_rate": 0.0002,
+      "loss": 0.9363,
+      "step": 2530
+    },
+    {
+      "epoch": 5.4978354978354975,
+      "grad_norm": 2.276259183883667,
+      "learning_rate": 0.0002,
+      "loss": 0.9041,
+      "step": 2540
+    },
+    {
+      "epoch": 5.51948051948052,
+      "grad_norm": 2.2037975788116455,
+      "learning_rate": 0.0002,
+      "loss": 0.9227,
+      "step": 2550
+    },
+    {
+      "epoch": 5.541125541125541,
+      "grad_norm": 2.0093777179718018,
+      "learning_rate": 0.0002,
+      "loss": 0.9183,
+      "step": 2560
+    },
+    {
+      "epoch": 5.562770562770563,
+      "grad_norm": 1.7906461954116821,
+      "learning_rate": 0.0002,
+      "loss": 0.9106,
+      "step": 2570
+    },
+    {
+      "epoch": 5.584415584415584,
+      "grad_norm": 2.3503541946411133,
+      "learning_rate": 0.0002,
+      "loss": 0.9136,
+      "step": 2580
+    },
+    {
+      "epoch": 5.606060606060606,
+      "grad_norm": 1.9468884468078613,
+      "learning_rate": 0.0002,
+      "loss": 0.9194,
+      "step": 2590
+    },
+    {
+      "epoch": 5.627705627705628,
+      "grad_norm": 2.4006402492523193,
+      "learning_rate": 0.0002,
+      "loss": 0.9215,
+      "step": 2600
+    },
+    {
+      "epoch": 5.64935064935065,
+      "grad_norm": 2.1397976875305176,
+      "learning_rate": 0.0002,
+      "loss": 0.9575,
+      "step": 2610
+    },
+    {
+      "epoch": 5.670995670995671,
+      "grad_norm": 2.331625461578369,
+      "learning_rate": 0.0002,
+      "loss": 0.9327,
+      "step": 2620
+    },
+    {
+      "epoch": 5.692640692640692,
+      "grad_norm": 2.3320906162261963,
+      "learning_rate": 0.0002,
+      "loss": 0.9834,
+      "step": 2630
+    },
+    {
+      "epoch": 5.714285714285714,
+      "grad_norm": 2.0336201190948486,
+      "learning_rate": 0.0002,
+      "loss": 0.8753,
+      "step": 2640
+    },
+    {
+      "epoch": 5.735930735930736,
+      "grad_norm": 2.29776930809021,
+      "learning_rate": 0.0002,
+      "loss": 0.9356,
+      "step": 2650
+    },
+    {
+      "epoch": 5.757575757575758,
+      "grad_norm": 2.0348799228668213,
+      "learning_rate": 0.0002,
+      "loss": 0.8807,
+      "step": 2660
+    },
+    {
+      "epoch": 5.779220779220779,
+      "grad_norm": 2.993572235107422,
+      "learning_rate": 0.0002,
+      "loss": 0.9054,
+      "step": 2670
+    },
+    {
+      "epoch": 5.800865800865801,
+      "grad_norm": 1.9768412113189697,
+      "learning_rate": 0.0002,
+      "loss": 0.9626,
+      "step": 2680
+    },
+    {
+      "epoch": 5.822510822510822,
+      "grad_norm": 2.4398624897003174,
+      "learning_rate": 0.0002,
+      "loss": 0.9142,
+      "step": 2690
+    },
+    {
+      "epoch": 5.8441558441558445,
+      "grad_norm": 1.9682047367095947,
+      "learning_rate": 0.0002,
+      "loss": 0.9422,
+      "step": 2700
+    },
+    {
+      "epoch": 5.865800865800866,
+      "grad_norm": 2.1124305725097656,
+      "learning_rate": 0.0002,
+      "loss": 0.9564,
+      "step": 2710
+    },
+    {
+      "epoch": 5.887445887445887,
+      "grad_norm": 2.2763118743896484,
+      "learning_rate": 0.0002,
+      "loss": 0.95,
+      "step": 2720
+    },
+    {
+      "epoch": 5.909090909090909,
+      "grad_norm": 1.8851757049560547,
+      "learning_rate": 0.0002,
+      "loss": 0.9262,
+      "step": 2730
+    },
+    {
+      "epoch": 5.93073593073593,
+      "grad_norm": 1.7605366706848145,
+      "learning_rate": 0.0002,
+      "loss": 0.8785,
+      "step": 2740
+    },
+    {
+      "epoch": 5.9523809523809526,
+      "grad_norm": 2.063319683074951,
+      "learning_rate": 0.0002,
+      "loss": 0.9263,
+      "step": 2750
+    },
+    {
+      "epoch": 5.974025974025974,
+      "grad_norm": 1.972980260848999,
+      "learning_rate": 0.0002,
+      "loss": 0.9285,
+      "step": 2760
+    },
+    {
+      "epoch": 5.995670995670996,
+      "grad_norm": 2.29889178276062,
+      "learning_rate": 0.0002,
+      "loss": 0.9176,
+      "step": 2770
+    },
+    {
+      "epoch": 6.0,
+      "eval_loss": 2.265646457672119,
+      "eval_runtime": 137.3578,
+      "eval_samples_per_second": 3.88,
+      "eval_steps_per_second": 0.488,
+      "step": 2772
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 3696,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.216207424323584e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}