MilaWang commited on Mar 28

Commit

2853246

verified ·

1 Parent(s): 477da4b

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-1613/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-1613/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-1613/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-1613/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-1613/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-1613/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-1613/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-1613/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-1613/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-1613/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-1613/trainer_state.json +1176 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-1613/training_args.bin +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-2419/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-2419/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-2419/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-2419/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-2419/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-2419/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-2419/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-2419/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-2419/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-2419/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-2419/trainer_state.json +1744 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-2419/training_args.bin +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-3226/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-3226/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-3226/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-3226/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-3226/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-3226/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-3226/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-3226/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-3226/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-3226/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-3226/trainer_state.json +2319 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-3226/training_args.bin +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-4032/README.md +202 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-4032/adapter_config.json +29 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-4032/adapter_model.safetensors +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-4032/optimizer.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-4032/rng_state.pth +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-4032/scheduler.pt +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-4032/special_tokens_map.json +24 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-4032/tokenizer.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-4032/tokenizer.model +3 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-4032/tokenizer_config.json +0 -0
Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-4032/trainer_state.json +2894 -0

Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/README.md ADDED Viewed

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/adapter_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e874f2e9428c075b5468619aa828adfd7970ec89fb046cd4b04b5ff9637bab7c
+size 109069176

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e874f2e9428c075b5468619aa828adfd7970ec89fb046cd4b04b5ff9637bab7c
+size 109069176

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0cc8e51fe48593fc52b4a9c6ecc3b108e42bed72ace17e4e55824879d4555a94
+size 55532666

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:01d3f13ef50eeffd6108fd5cdcc39935e98b1fa9ec9bb18bab640108d36187de
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:92baa84cde23f9b4f7dbdf737520214bb9b1757ce8a7fb401d4ce26fc10ae684
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,1176 @@

+{
+  "best_metric": 1.2014765739440918,
+  "best_model_checkpoint": "outputs-001/Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-1613",
+  "epoch": 2.0,
+  "eval_steps": 10,
+  "global_step": 1613,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.012399256044637322,
+      "grad_norm": 1.6176791191101074,
+      "learning_rate": 0.0002,
+      "loss": 1.8616,
+      "step": 10
+    },
+    {
+      "epoch": 0.024798512089274645,
+      "grad_norm": 0.7599679827690125,
+      "learning_rate": 0.0002,
+      "loss": 1.5953,
+      "step": 20
+    },
+    {
+      "epoch": 0.037197768133911964,
+      "grad_norm": 0.8452111482620239,
+      "learning_rate": 0.0002,
+      "loss": 1.5705,
+      "step": 30
+    },
+    {
+      "epoch": 0.04959702417854929,
+      "grad_norm": 0.8393070101737976,
+      "learning_rate": 0.0002,
+      "loss": 1.5647,
+      "step": 40
+    },
+    {
+      "epoch": 0.06199628022318661,
+      "grad_norm": 1.117109775543213,
+      "learning_rate": 0.0002,
+      "loss": 1.4628,
+      "step": 50
+    },
+    {
+      "epoch": 0.07439553626782393,
+      "grad_norm": 0.8330236077308655,
+      "learning_rate": 0.0002,
+      "loss": 1.3492,
+      "step": 60
+    },
+    {
+      "epoch": 0.08679479231246125,
+      "grad_norm": 0.8670704960823059,
+      "learning_rate": 0.0002,
+      "loss": 1.367,
+      "step": 70
+    },
+    {
+      "epoch": 0.09919404835709858,
+      "grad_norm": 0.6262535452842712,
+      "learning_rate": 0.0002,
+      "loss": 1.2357,
+      "step": 80
+    },
+    {
+      "epoch": 0.1115933044017359,
+      "grad_norm": 0.753338098526001,
+      "learning_rate": 0.0002,
+      "loss": 1.3651,
+      "step": 90
+    },
+    {
+      "epoch": 0.12399256044637322,
+      "grad_norm": 0.6324933171272278,
+      "learning_rate": 0.0002,
+      "loss": 1.2789,
+      "step": 100
+    },
+    {
+      "epoch": 0.13639181649101054,
+      "grad_norm": 0.7270851135253906,
+      "learning_rate": 0.0002,
+      "loss": 1.2393,
+      "step": 110
+    },
+    {
+      "epoch": 0.14879107253564786,
+      "grad_norm": 0.7036070227622986,
+      "learning_rate": 0.0002,
+      "loss": 1.177,
+      "step": 120
+    },
+    {
+      "epoch": 0.16119032858028517,
+      "grad_norm": 0.6269583106040955,
+      "learning_rate": 0.0002,
+      "loss": 1.2808,
+      "step": 130
+    },
+    {
+      "epoch": 0.1735895846249225,
+      "grad_norm": 0.6848828792572021,
+      "learning_rate": 0.0002,
+      "loss": 1.3039,
+      "step": 140
+    },
+    {
+      "epoch": 0.1859888406695598,
+      "grad_norm": 0.5589784383773804,
+      "learning_rate": 0.0002,
+      "loss": 1.1925,
+      "step": 150
+    },
+    {
+      "epoch": 0.19838809671419716,
+      "grad_norm": 0.8350988626480103,
+      "learning_rate": 0.0002,
+      "loss": 1.2724,
+      "step": 160
+    },
+    {
+      "epoch": 0.21078735275883448,
+      "grad_norm": 1.1780346632003784,
+      "learning_rate": 0.0002,
+      "loss": 1.2093,
+      "step": 170
+    },
+    {
+      "epoch": 0.2231866088034718,
+      "grad_norm": 0.674608588218689,
+      "learning_rate": 0.0002,
+      "loss": 1.2573,
+      "step": 180
+    },
+    {
+      "epoch": 0.23558586484810912,
+      "grad_norm": 0.6972184181213379,
+      "learning_rate": 0.0002,
+      "loss": 1.2629,
+      "step": 190
+    },
+    {
+      "epoch": 0.24798512089274644,
+      "grad_norm": 0.5187845230102539,
+      "learning_rate": 0.0002,
+      "loss": 1.2618,
+      "step": 200
+    },
+    {
+      "epoch": 0.26038437693738375,
+      "grad_norm": 0.7513871192932129,
+      "learning_rate": 0.0002,
+      "loss": 1.3478,
+      "step": 210
+    },
+    {
+      "epoch": 0.2727836329820211,
+      "grad_norm": 0.5859110951423645,
+      "learning_rate": 0.0002,
+      "loss": 1.1843,
+      "step": 220
+    },
+    {
+      "epoch": 0.2851828890266584,
+      "grad_norm": 0.5547062754631042,
+      "learning_rate": 0.0002,
+      "loss": 1.1784,
+      "step": 230
+    },
+    {
+      "epoch": 0.2975821450712957,
+      "grad_norm": 3.5287671089172363,
+      "learning_rate": 0.0002,
+      "loss": 1.2564,
+      "step": 240
+    },
+    {
+      "epoch": 0.30998140111593303,
+      "grad_norm": 0.8644460439682007,
+      "learning_rate": 0.0002,
+      "loss": 1.313,
+      "step": 250
+    },
+    {
+      "epoch": 0.32238065716057035,
+      "grad_norm": 0.6270064115524292,
+      "learning_rate": 0.0002,
+      "loss": 1.2187,
+      "step": 260
+    },
+    {
+      "epoch": 0.33477991320520767,
+      "grad_norm": 1.170295000076294,
+      "learning_rate": 0.0002,
+      "loss": 1.2017,
+      "step": 270
+    },
+    {
+      "epoch": 0.347179169249845,
+      "grad_norm": 0.5701245069503784,
+      "learning_rate": 0.0002,
+      "loss": 1.1075,
+      "step": 280
+    },
+    {
+      "epoch": 0.3595784252944823,
+      "grad_norm": 0.6373095512390137,
+      "learning_rate": 0.0002,
+      "loss": 1.1185,
+      "step": 290
+    },
+    {
+      "epoch": 0.3719776813391196,
+      "grad_norm": 0.5740704536437988,
+      "learning_rate": 0.0002,
+      "loss": 1.1738,
+      "step": 300
+    },
+    {
+      "epoch": 0.384376937383757,
+      "grad_norm": 0.5516835451126099,
+      "learning_rate": 0.0002,
+      "loss": 1.2858,
+      "step": 310
+    },
+    {
+      "epoch": 0.3967761934283943,
+      "grad_norm": 0.5212382078170776,
+      "learning_rate": 0.0002,
+      "loss": 1.2315,
+      "step": 320
+    },
+    {
+      "epoch": 0.40917544947303164,
+      "grad_norm": 0.540307343006134,
+      "learning_rate": 0.0002,
+      "loss": 1.172,
+      "step": 330
+    },
+    {
+      "epoch": 0.42157470551766896,
+      "grad_norm": 0.7454301714897156,
+      "learning_rate": 0.0002,
+      "loss": 1.2736,
+      "step": 340
+    },
+    {
+      "epoch": 0.4339739615623063,
+      "grad_norm": 0.7390317916870117,
+      "learning_rate": 0.0002,
+      "loss": 1.3013,
+      "step": 350
+    },
+    {
+      "epoch": 0.4463732176069436,
+      "grad_norm": 0.5498788356781006,
+      "learning_rate": 0.0002,
+      "loss": 1.0615,
+      "step": 360
+    },
+    {
+      "epoch": 0.4587724736515809,
+      "grad_norm": 0.5776252150535583,
+      "learning_rate": 0.0002,
+      "loss": 1.2251,
+      "step": 370
+    },
+    {
+      "epoch": 0.47117172969621823,
+      "grad_norm": 0.6941552758216858,
+      "learning_rate": 0.0002,
+      "loss": 1.1932,
+      "step": 380
+    },
+    {
+      "epoch": 0.48357098574085555,
+      "grad_norm": 0.7936233282089233,
+      "learning_rate": 0.0002,
+      "loss": 1.23,
+      "step": 390
+    },
+    {
+      "epoch": 0.49597024178549287,
+      "grad_norm": 0.5257220268249512,
+      "learning_rate": 0.0002,
+      "loss": 1.1137,
+      "step": 400
+    },
+    {
+      "epoch": 0.5083694978301302,
+      "grad_norm": 0.5740510821342468,
+      "learning_rate": 0.0002,
+      "loss": 1.1867,
+      "step": 410
+    },
+    {
+      "epoch": 0.5207687538747675,
+      "grad_norm": 0.6181507110595703,
+      "learning_rate": 0.0002,
+      "loss": 1.1049,
+      "step": 420
+    },
+    {
+      "epoch": 0.5331680099194048,
+      "grad_norm": 0.6333999037742615,
+      "learning_rate": 0.0002,
+      "loss": 1.2303,
+      "step": 430
+    },
+    {
+      "epoch": 0.5455672659640421,
+      "grad_norm": 0.5667845010757446,
+      "learning_rate": 0.0002,
+      "loss": 1.2457,
+      "step": 440
+    },
+    {
+      "epoch": 0.5579665220086795,
+      "grad_norm": 0.5254231095314026,
+      "learning_rate": 0.0002,
+      "loss": 1.2547,
+      "step": 450
+    },
+    {
+      "epoch": 0.5703657780533168,
+      "grad_norm": 0.5938495993614197,
+      "learning_rate": 0.0002,
+      "loss": 1.2118,
+      "step": 460
+    },
+    {
+      "epoch": 0.5827650340979541,
+      "grad_norm": 0.7733635902404785,
+      "learning_rate": 0.0002,
+      "loss": 1.2409,
+      "step": 470
+    },
+    {
+      "epoch": 0.5951642901425914,
+      "grad_norm": 0.6114753484725952,
+      "learning_rate": 0.0002,
+      "loss": 1.2343,
+      "step": 480
+    },
+    {
+      "epoch": 0.6075635461872287,
+      "grad_norm": 0.5587155818939209,
+      "learning_rate": 0.0002,
+      "loss": 1.1779,
+      "step": 490
+    },
+    {
+      "epoch": 0.6199628022318661,
+      "grad_norm": 0.7636917233467102,
+      "learning_rate": 0.0002,
+      "loss": 1.2136,
+      "step": 500
+    },
+    {
+      "epoch": 0.6323620582765034,
+      "grad_norm": 0.5896942615509033,
+      "learning_rate": 0.0002,
+      "loss": 1.1301,
+      "step": 510
+    },
+    {
+      "epoch": 0.6447613143211407,
+      "grad_norm": 0.8594750165939331,
+      "learning_rate": 0.0002,
+      "loss": 1.2089,
+      "step": 520
+    },
+    {
+      "epoch": 0.657160570365778,
+      "grad_norm": 0.6459881067276001,
+      "learning_rate": 0.0002,
+      "loss": 1.1551,
+      "step": 530
+    },
+    {
+      "epoch": 0.6695598264104153,
+      "grad_norm": 0.650656521320343,
+      "learning_rate": 0.0002,
+      "loss": 1.175,
+      "step": 540
+    },
+    {
+      "epoch": 0.6819590824550527,
+      "grad_norm": 0.7238242626190186,
+      "learning_rate": 0.0002,
+      "loss": 1.2143,
+      "step": 550
+    },
+    {
+      "epoch": 0.69435833849969,
+      "grad_norm": 0.6289859414100647,
+      "learning_rate": 0.0002,
+      "loss": 1.0961,
+      "step": 560
+    },
+    {
+      "epoch": 0.7067575945443273,
+      "grad_norm": 0.6108142137527466,
+      "learning_rate": 0.0002,
+      "loss": 1.2316,
+      "step": 570
+    },
+    {
+      "epoch": 0.7191568505889646,
+      "grad_norm": 0.6905024647712708,
+      "learning_rate": 0.0002,
+      "loss": 1.1315,
+      "step": 580
+    },
+    {
+      "epoch": 0.7315561066336019,
+      "grad_norm": 0.5975471138954163,
+      "learning_rate": 0.0002,
+      "loss": 1.2368,
+      "step": 590
+    },
+    {
+      "epoch": 0.7439553626782393,
+      "grad_norm": 0.49540066719055176,
+      "learning_rate": 0.0002,
+      "loss": 1.1014,
+      "step": 600
+    },
+    {
+      "epoch": 0.7563546187228767,
+      "grad_norm": 0.5365461707115173,
+      "learning_rate": 0.0002,
+      "loss": 1.1359,
+      "step": 610
+    },
+    {
+      "epoch": 0.768753874767514,
+      "grad_norm": 0.6156648993492126,
+      "learning_rate": 0.0002,
+      "loss": 1.2552,
+      "step": 620
+    },
+    {
+      "epoch": 0.7811531308121513,
+      "grad_norm": 0.656879186630249,
+      "learning_rate": 0.0002,
+      "loss": 1.1929,
+      "step": 630
+    },
+    {
+      "epoch": 0.7935523868567886,
+      "grad_norm": 0.8963037729263306,
+      "learning_rate": 0.0002,
+      "loss": 1.3063,
+      "step": 640
+    },
+    {
+      "epoch": 0.805951642901426,
+      "grad_norm": 1.0569753646850586,
+      "learning_rate": 0.0002,
+      "loss": 1.219,
+      "step": 650
+    },
+    {
+      "epoch": 0.8183508989460633,
+      "grad_norm": 0.7332107424736023,
+      "learning_rate": 0.0002,
+      "loss": 1.2563,
+      "step": 660
+    },
+    {
+      "epoch": 0.8307501549907006,
+      "grad_norm": 0.589097797870636,
+      "learning_rate": 0.0002,
+      "loss": 1.1029,
+      "step": 670
+    },
+    {
+      "epoch": 0.8431494110353379,
+      "grad_norm": 0.9553480744361877,
+      "learning_rate": 0.0002,
+      "loss": 1.1705,
+      "step": 680
+    },
+    {
+      "epoch": 0.8555486670799752,
+      "grad_norm": 0.7076331973075867,
+      "learning_rate": 0.0002,
+      "loss": 1.1605,
+      "step": 690
+    },
+    {
+      "epoch": 0.8679479231246126,
+      "grad_norm": 0.597531795501709,
+      "learning_rate": 0.0002,
+      "loss": 1.2346,
+      "step": 700
+    },
+    {
+      "epoch": 0.8803471791692499,
+      "grad_norm": 0.7023149132728577,
+      "learning_rate": 0.0002,
+      "loss": 1.1637,
+      "step": 710
+    },
+    {
+      "epoch": 0.8927464352138872,
+      "grad_norm": 1.4393764734268188,
+      "learning_rate": 0.0002,
+      "loss": 1.2717,
+      "step": 720
+    },
+    {
+      "epoch": 0.9051456912585245,
+      "grad_norm": 0.5944231152534485,
+      "learning_rate": 0.0002,
+      "loss": 1.216,
+      "step": 730
+    },
+    {
+      "epoch": 0.9175449473031618,
+      "grad_norm": 0.5712162852287292,
+      "learning_rate": 0.0002,
+      "loss": 1.148,
+      "step": 740
+    },
+    {
+      "epoch": 0.9299442033477991,
+      "grad_norm": 0.5335281491279602,
+      "learning_rate": 0.0002,
+      "loss": 1.2318,
+      "step": 750
+    },
+    {
+      "epoch": 0.9423434593924365,
+      "grad_norm": 0.8050292730331421,
+      "learning_rate": 0.0002,
+      "loss": 1.149,
+      "step": 760
+    },
+    {
+      "epoch": 0.9547427154370738,
+      "grad_norm": 0.6092700958251953,
+      "learning_rate": 0.0002,
+      "loss": 1.0862,
+      "step": 770
+    },
+    {
+      "epoch": 0.9671419714817111,
+      "grad_norm": 0.7012797594070435,
+      "learning_rate": 0.0002,
+      "loss": 1.3204,
+      "step": 780
+    },
+    {
+      "epoch": 0.9795412275263484,
+      "grad_norm": 0.6228184103965759,
+      "learning_rate": 0.0002,
+      "loss": 1.1641,
+      "step": 790
+    },
+    {
+      "epoch": 0.9919404835709857,
+      "grad_norm": 0.5482686161994934,
+      "learning_rate": 0.0002,
+      "loss": 1.131,
+      "step": 800
+    },
+    {
+      "epoch": 0.9993800371977681,
+      "eval_loss": 1.2057286500930786,
+      "eval_runtime": 164.6087,
+      "eval_samples_per_second": 2.77,
+      "eval_steps_per_second": 0.346,
+      "step": 806
+    },
+    {
+      "epoch": 1.004339739615623,
+      "grad_norm": 0.6331814527511597,
+      "learning_rate": 0.0002,
+      "loss": 1.0899,
+      "step": 810
+    },
+    {
+      "epoch": 1.0167389956602604,
+      "grad_norm": 0.6160872578620911,
+      "learning_rate": 0.0002,
+      "loss": 1.0551,
+      "step": 820
+    },
+    {
+      "epoch": 1.0291382517048977,
+      "grad_norm": 0.6104072332382202,
+      "learning_rate": 0.0002,
+      "loss": 0.9934,
+      "step": 830
+    },
+    {
+      "epoch": 1.041537507749535,
+      "grad_norm": 0.7619274854660034,
+      "learning_rate": 0.0002,
+      "loss": 1.0776,
+      "step": 840
+    },
+    {
+      "epoch": 1.0539367637941723,
+      "grad_norm": 0.761172890663147,
+      "learning_rate": 0.0002,
+      "loss": 0.9929,
+      "step": 850
+    },
+    {
+      "epoch": 1.0663360198388097,
+      "grad_norm": 0.7563514113426208,
+      "learning_rate": 0.0002,
+      "loss": 1.0543,
+      "step": 860
+    },
+    {
+      "epoch": 1.078735275883447,
+      "grad_norm": 0.521998941898346,
+      "learning_rate": 0.0002,
+      "loss": 1.0812,
+      "step": 870
+    },
+    {
+      "epoch": 1.0911345319280843,
+      "grad_norm": 0.824347972869873,
+      "learning_rate": 0.0002,
+      "loss": 1.1417,
+      "step": 880
+    },
+    {
+      "epoch": 1.1035337879727216,
+      "grad_norm": 0.5645424127578735,
+      "learning_rate": 0.0002,
+      "loss": 1.1096,
+      "step": 890
+    },
+    {
+      "epoch": 1.115933044017359,
+      "grad_norm": 0.8568223714828491,
+      "learning_rate": 0.0002,
+      "loss": 1.1005,
+      "step": 900
+    },
+    {
+      "epoch": 1.1283323000619963,
+      "grad_norm": 0.68181312084198,
+      "learning_rate": 0.0002,
+      "loss": 1.088,
+      "step": 910
+    },
+    {
+      "epoch": 1.1407315561066336,
+      "grad_norm": 0.7577647566795349,
+      "learning_rate": 0.0002,
+      "loss": 1.0281,
+      "step": 920
+    },
+    {
+      "epoch": 1.153130812151271,
+      "grad_norm": 0.6968798637390137,
+      "learning_rate": 0.0002,
+      "loss": 0.9812,
+      "step": 930
+    },
+    {
+      "epoch": 1.1655300681959082,
+      "grad_norm": 0.5769661664962769,
+      "learning_rate": 0.0002,
+      "loss": 1.0539,
+      "step": 940
+    },
+    {
+      "epoch": 1.1779293242405455,
+      "grad_norm": 0.6399155259132385,
+      "learning_rate": 0.0002,
+      "loss": 1.0831,
+      "step": 950
+    },
+    {
+      "epoch": 1.1903285802851828,
+      "grad_norm": 0.9824289679527283,
+      "learning_rate": 0.0002,
+      "loss": 1.0464,
+      "step": 960
+    },
+    {
+      "epoch": 1.2027278363298202,
+      "grad_norm": 0.7485893964767456,
+      "learning_rate": 0.0002,
+      "loss": 1.1068,
+      "step": 970
+    },
+    {
+      "epoch": 1.2151270923744575,
+      "grad_norm": 0.668736457824707,
+      "learning_rate": 0.0002,
+      "loss": 1.0047,
+      "step": 980
+    },
+    {
+      "epoch": 1.2275263484190948,
+      "grad_norm": 0.7041404843330383,
+      "learning_rate": 0.0002,
+      "loss": 1.0818,
+      "step": 990
+    },
+    {
+      "epoch": 1.2399256044637321,
+      "grad_norm": 0.7070603966712952,
+      "learning_rate": 0.0002,
+      "loss": 1.0847,
+      "step": 1000
+    },
+    {
+      "epoch": 1.2523248605083694,
+      "grad_norm": 0.7828628420829773,
+      "learning_rate": 0.0002,
+      "loss": 1.047,
+      "step": 1010
+    },
+    {
+      "epoch": 1.2647241165530068,
+      "grad_norm": 0.7149654626846313,
+      "learning_rate": 0.0002,
+      "loss": 1.0658,
+      "step": 1020
+    },
+    {
+      "epoch": 1.277123372597644,
+      "grad_norm": 0.7691766619682312,
+      "learning_rate": 0.0002,
+      "loss": 0.9791,
+      "step": 1030
+    },
+    {
+      "epoch": 1.2895226286422814,
+      "grad_norm": 0.8022137880325317,
+      "learning_rate": 0.0002,
+      "loss": 1.0242,
+      "step": 1040
+    },
+    {
+      "epoch": 1.3019218846869187,
+      "grad_norm": 0.6709204316139221,
+      "learning_rate": 0.0002,
+      "loss": 1.0837,
+      "step": 1050
+    },
+    {
+      "epoch": 1.314321140731556,
+      "grad_norm": 0.7368158102035522,
+      "learning_rate": 0.0002,
+      "loss": 1.0382,
+      "step": 1060
+    },
+    {
+      "epoch": 1.3267203967761934,
+      "grad_norm": 0.8408007621765137,
+      "learning_rate": 0.0002,
+      "loss": 1.0371,
+      "step": 1070
+    },
+    {
+      "epoch": 1.3391196528208307,
+      "grad_norm": 1.2165539264678955,
+      "learning_rate": 0.0002,
+      "loss": 0.9633,
+      "step": 1080
+    },
+    {
+      "epoch": 1.351518908865468,
+      "grad_norm": 0.7284916043281555,
+      "learning_rate": 0.0002,
+      "loss": 1.0079,
+      "step": 1090
+    },
+    {
+      "epoch": 1.3639181649101053,
+      "grad_norm": 0.7994557619094849,
+      "learning_rate": 0.0002,
+      "loss": 1.0211,
+      "step": 1100
+    },
+    {
+      "epoch": 1.3763174209547429,
+      "grad_norm": 0.9658345580101013,
+      "learning_rate": 0.0002,
+      "loss": 1.0892,
+      "step": 1110
+    },
+    {
+      "epoch": 1.3887166769993802,
+      "grad_norm": 0.6312829852104187,
+      "learning_rate": 0.0002,
+      "loss": 1.2088,
+      "step": 1120
+    },
+    {
+      "epoch": 1.4011159330440175,
+      "grad_norm": 0.7263661026954651,
+      "learning_rate": 0.0002,
+      "loss": 1.1055,
+      "step": 1130
+    },
+    {
+      "epoch": 1.4135151890886548,
+      "grad_norm": 0.829082727432251,
+      "learning_rate": 0.0002,
+      "loss": 1.0232,
+      "step": 1140
+    },
+    {
+      "epoch": 1.4259144451332921,
+      "grad_norm": 0.6168127059936523,
+      "learning_rate": 0.0002,
+      "loss": 1.1413,
+      "step": 1150
+    },
+    {
+      "epoch": 1.4383137011779294,
+      "grad_norm": 0.8351425528526306,
+      "learning_rate": 0.0002,
+      "loss": 1.0283,
+      "step": 1160
+    },
+    {
+      "epoch": 1.4507129572225668,
+      "grad_norm": 0.8814472556114197,
+      "learning_rate": 0.0002,
+      "loss": 1.1146,
+      "step": 1170
+    },
+    {
+      "epoch": 1.463112213267204,
+      "grad_norm": 0.6913689970970154,
+      "learning_rate": 0.0002,
+      "loss": 1.0932,
+      "step": 1180
+    },
+    {
+      "epoch": 1.4755114693118414,
+      "grad_norm": 0.7907165884971619,
+      "learning_rate": 0.0002,
+      "loss": 1.1066,
+      "step": 1190
+    },
+    {
+      "epoch": 1.4879107253564787,
+      "grad_norm": 0.8361626267433167,
+      "learning_rate": 0.0002,
+      "loss": 1.0738,
+      "step": 1200
+    },
+    {
+      "epoch": 1.500309981401116,
+      "grad_norm": 1.073534607887268,
+      "learning_rate": 0.0002,
+      "loss": 1.0559,
+      "step": 1210
+    },
+    {
+      "epoch": 1.5127092374457534,
+      "grad_norm": 0.8416345119476318,
+      "learning_rate": 0.0002,
+      "loss": 1.0204,
+      "step": 1220
+    },
+    {
+      "epoch": 1.5251084934903907,
+      "grad_norm": 1.0225597620010376,
+      "learning_rate": 0.0002,
+      "loss": 1.0941,
+      "step": 1230
+    },
+    {
+      "epoch": 1.537507749535028,
+      "grad_norm": 0.6662965416908264,
+      "learning_rate": 0.0002,
+      "loss": 1.0854,
+      "step": 1240
+    },
+    {
+      "epoch": 1.5499070055796653,
+      "grad_norm": 0.7363991737365723,
+      "learning_rate": 0.0002,
+      "loss": 1.0816,
+      "step": 1250
+    },
+    {
+      "epoch": 1.5623062616243026,
+      "grad_norm": 0.9029574990272522,
+      "learning_rate": 0.0002,
+      "loss": 1.0271,
+      "step": 1260
+    },
+    {
+      "epoch": 1.57470551766894,
+      "grad_norm": 0.7992424368858337,
+      "learning_rate": 0.0002,
+      "loss": 1.0206,
+      "step": 1270
+    },
+    {
+      "epoch": 1.5871047737135773,
+      "grad_norm": 0.8108977675437927,
+      "learning_rate": 0.0002,
+      "loss": 1.0114,
+      "step": 1280
+    },
+    {
+      "epoch": 1.5995040297582146,
+      "grad_norm": 0.8257458806037903,
+      "learning_rate": 0.0002,
+      "loss": 1.0264,
+      "step": 1290
+    },
+    {
+      "epoch": 1.611903285802852,
+      "grad_norm": 0.8265092968940735,
+      "learning_rate": 0.0002,
+      "loss": 1.0944,
+      "step": 1300
+    },
+    {
+      "epoch": 1.6243025418474892,
+      "grad_norm": 0.6568580269813538,
+      "learning_rate": 0.0002,
+      "loss": 1.0136,
+      "step": 1310
+    },
+    {
+      "epoch": 1.6367017978921266,
+      "grad_norm": 0.7608488202095032,
+      "learning_rate": 0.0002,
+      "loss": 1.009,
+      "step": 1320
+    },
+    {
+      "epoch": 1.6491010539367639,
+      "grad_norm": 0.7511259317398071,
+      "learning_rate": 0.0002,
+      "loss": 1.1202,
+      "step": 1330
+    },
+    {
+      "epoch": 1.6615003099814012,
+      "grad_norm": 0.7942162752151489,
+      "learning_rate": 0.0002,
+      "loss": 1.0528,
+      "step": 1340
+    },
+    {
+      "epoch": 1.6738995660260385,
+      "grad_norm": 0.8253659605979919,
+      "learning_rate": 0.0002,
+      "loss": 1.0411,
+      "step": 1350
+    },
+    {
+      "epoch": 1.6862988220706758,
+      "grad_norm": 1.1318382024765015,
+      "learning_rate": 0.0002,
+      "loss": 1.001,
+      "step": 1360
+    },
+    {
+      "epoch": 1.6986980781153131,
+      "grad_norm": 0.693403959274292,
+      "learning_rate": 0.0002,
+      "loss": 1.0727,
+      "step": 1370
+    },
+    {
+      "epoch": 1.7110973341599505,
+      "grad_norm": 0.7107617259025574,
+      "learning_rate": 0.0002,
+      "loss": 1.073,
+      "step": 1380
+    },
+    {
+      "epoch": 1.7234965902045878,
+      "grad_norm": 0.8169032335281372,
+      "learning_rate": 0.0002,
+      "loss": 1.0849,
+      "step": 1390
+    },
+    {
+      "epoch": 1.735895846249225,
+      "grad_norm": 0.8940841555595398,
+      "learning_rate": 0.0002,
+      "loss": 1.0578,
+      "step": 1400
+    },
+    {
+      "epoch": 1.7482951022938624,
+      "grad_norm": 0.7862188220024109,
+      "learning_rate": 0.0002,
+      "loss": 1.0891,
+      "step": 1410
+    },
+    {
+      "epoch": 1.7606943583384997,
+      "grad_norm": 1.136338472366333,
+      "learning_rate": 0.0002,
+      "loss": 0.9962,
+      "step": 1420
+    },
+    {
+      "epoch": 1.773093614383137,
+      "grad_norm": 0.9534069895744324,
+      "learning_rate": 0.0002,
+      "loss": 1.0943,
+      "step": 1430
+    },
+    {
+      "epoch": 1.7854928704277744,
+      "grad_norm": 1.0747562646865845,
+      "learning_rate": 0.0002,
+      "loss": 1.1257,
+      "step": 1440
+    },
+    {
+      "epoch": 1.7978921264724117,
+      "grad_norm": 0.8557891249656677,
+      "learning_rate": 0.0002,
+      "loss": 1.0556,
+      "step": 1450
+    },
+    {
+      "epoch": 1.810291382517049,
+      "grad_norm": 0.6829259991645813,
+      "learning_rate": 0.0002,
+      "loss": 1.0128,
+      "step": 1460
+    },
+    {
+      "epoch": 1.8226906385616863,
+      "grad_norm": 0.8164441585540771,
+      "learning_rate": 0.0002,
+      "loss": 1.0313,
+      "step": 1470
+    },
+    {
+      "epoch": 1.8350898946063237,
+      "grad_norm": 0.9458068609237671,
+      "learning_rate": 0.0002,
+      "loss": 1.1136,
+      "step": 1480
+    },
+    {
+      "epoch": 1.847489150650961,
+      "grad_norm": 0.743009626865387,
+      "learning_rate": 0.0002,
+      "loss": 1.0457,
+      "step": 1490
+    },
+    {
+      "epoch": 1.8598884066955983,
+      "grad_norm": 0.7137694358825684,
+      "learning_rate": 0.0002,
+      "loss": 1.0107,
+      "step": 1500
+    },
+    {
+      "epoch": 1.8722876627402356,
+      "grad_norm": 0.7618028521537781,
+      "learning_rate": 0.0002,
+      "loss": 1.0633,
+      "step": 1510
+    },
+    {
+      "epoch": 1.884686918784873,
+      "grad_norm": 0.8153398633003235,
+      "learning_rate": 0.0002,
+      "loss": 1.103,
+      "step": 1520
+    },
+    {
+      "epoch": 1.8970861748295103,
+      "grad_norm": 0.9127124547958374,
+      "learning_rate": 0.0002,
+      "loss": 1.2094,
+      "step": 1530
+    },
+    {
+      "epoch": 1.9094854308741476,
+      "grad_norm": 0.7699425220489502,
+      "learning_rate": 0.0002,
+      "loss": 1.0379,
+      "step": 1540
+    },
+    {
+      "epoch": 1.921884686918785,
+      "grad_norm": 0.8807545304298401,
+      "learning_rate": 0.0002,
+      "loss": 0.9849,
+      "step": 1550
+    },
+    {
+      "epoch": 1.9342839429634222,
+      "grad_norm": 0.7340815663337708,
+      "learning_rate": 0.0002,
+      "loss": 1.033,
+      "step": 1560
+    },
+    {
+      "epoch": 1.9466831990080595,
+      "grad_norm": 1.070056676864624,
+      "learning_rate": 0.0002,
+      "loss": 1.0083,
+      "step": 1570
+    },
+    {
+      "epoch": 1.9590824550526968,
+      "grad_norm": 0.8195573687553406,
+      "learning_rate": 0.0002,
+      "loss": 1.0023,
+      "step": 1580
+    },
+    {
+      "epoch": 1.9714817110973342,
+      "grad_norm": 0.7938687205314636,
+      "learning_rate": 0.0002,
+      "loss": 1.029,
+      "step": 1590
+    },
+    {
+      "epoch": 1.9838809671419715,
+      "grad_norm": 0.7632259726524353,
+      "learning_rate": 0.0002,
+      "loss": 1.0512,
+      "step": 1600
+    },
+    {
+      "epoch": 1.9962802231866088,
+      "grad_norm": 0.7921916246414185,
+      "learning_rate": 0.0002,
+      "loss": 1.0426,
+      "step": 1610
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.2014765739440918,
+      "eval_runtime": 159.8677,
+      "eval_samples_per_second": 2.852,
+      "eval_steps_per_second": 0.357,
+      "step": 1613
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 6448,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 7.07699341787136e+16,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a0a54cf05b51cc3ca7cba649c3e96685958c9d310c181dff0c31954ec4641225
+size 5560

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d10271c618d36c826ace5e45001c7771ded5c31082f53ab7bdb92f05315fb25a
+size 109069176

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b7ba379db8ad00f7f893f770c91af779ba350dc59dd78a4ee8081924fc40a35f
+size 55532666

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:87ee99a079ce34c26aa81ce69d3f226916a2c733d8107600fbf6b0f6daebad6c
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:27f7bcd01e452f6167b74d0bec132ca6ee19005f211aee6650a726693c8fb6fc
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,1744 @@

+{
+  "best_metric": 1.2014765739440918,
+  "best_model_checkpoint": "outputs-001/Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-1613",
+  "epoch": 2.999380037197768,
+  "eval_steps": 10,
+  "global_step": 2419,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.012399256044637322,
+      "grad_norm": 1.6176791191101074,
+      "learning_rate": 0.0002,
+      "loss": 1.8616,
+      "step": 10
+    },
+    {
+      "epoch": 0.024798512089274645,
+      "grad_norm": 0.7599679827690125,
+      "learning_rate": 0.0002,
+      "loss": 1.5953,
+      "step": 20
+    },
+    {
+      "epoch": 0.037197768133911964,
+      "grad_norm": 0.8452111482620239,
+      "learning_rate": 0.0002,
+      "loss": 1.5705,
+      "step": 30
+    },
+    {
+      "epoch": 0.04959702417854929,
+      "grad_norm": 0.8393070101737976,
+      "learning_rate": 0.0002,
+      "loss": 1.5647,
+      "step": 40
+    },
+    {
+      "epoch": 0.06199628022318661,
+      "grad_norm": 1.117109775543213,
+      "learning_rate": 0.0002,
+      "loss": 1.4628,
+      "step": 50
+    },
+    {
+      "epoch": 0.07439553626782393,
+      "grad_norm": 0.8330236077308655,
+      "learning_rate": 0.0002,
+      "loss": 1.3492,
+      "step": 60
+    },
+    {
+      "epoch": 0.08679479231246125,
+      "grad_norm": 0.8670704960823059,
+      "learning_rate": 0.0002,
+      "loss": 1.367,
+      "step": 70
+    },
+    {
+      "epoch": 0.09919404835709858,
+      "grad_norm": 0.6262535452842712,
+      "learning_rate": 0.0002,
+      "loss": 1.2357,
+      "step": 80
+    },
+    {
+      "epoch": 0.1115933044017359,
+      "grad_norm": 0.753338098526001,
+      "learning_rate": 0.0002,
+      "loss": 1.3651,
+      "step": 90
+    },
+    {
+      "epoch": 0.12399256044637322,
+      "grad_norm": 0.6324933171272278,
+      "learning_rate": 0.0002,
+      "loss": 1.2789,
+      "step": 100
+    },
+    {
+      "epoch": 0.13639181649101054,
+      "grad_norm": 0.7270851135253906,
+      "learning_rate": 0.0002,
+      "loss": 1.2393,
+      "step": 110
+    },
+    {
+      "epoch": 0.14879107253564786,
+      "grad_norm": 0.7036070227622986,
+      "learning_rate": 0.0002,
+      "loss": 1.177,
+      "step": 120
+    },
+    {
+      "epoch": 0.16119032858028517,
+      "grad_norm": 0.6269583106040955,
+      "learning_rate": 0.0002,
+      "loss": 1.2808,
+      "step": 130
+    },
+    {
+      "epoch": 0.1735895846249225,
+      "grad_norm": 0.6848828792572021,
+      "learning_rate": 0.0002,
+      "loss": 1.3039,
+      "step": 140
+    },
+    {
+      "epoch": 0.1859888406695598,
+      "grad_norm": 0.5589784383773804,
+      "learning_rate": 0.0002,
+      "loss": 1.1925,
+      "step": 150
+    },
+    {
+      "epoch": 0.19838809671419716,
+      "grad_norm": 0.8350988626480103,
+      "learning_rate": 0.0002,
+      "loss": 1.2724,
+      "step": 160
+    },
+    {
+      "epoch": 0.21078735275883448,
+      "grad_norm": 1.1780346632003784,
+      "learning_rate": 0.0002,
+      "loss": 1.2093,
+      "step": 170
+    },
+    {
+      "epoch": 0.2231866088034718,
+      "grad_norm": 0.674608588218689,
+      "learning_rate": 0.0002,
+      "loss": 1.2573,
+      "step": 180
+    },
+    {
+      "epoch": 0.23558586484810912,
+      "grad_norm": 0.6972184181213379,
+      "learning_rate": 0.0002,
+      "loss": 1.2629,
+      "step": 190
+    },
+    {
+      "epoch": 0.24798512089274644,
+      "grad_norm": 0.5187845230102539,
+      "learning_rate": 0.0002,
+      "loss": 1.2618,
+      "step": 200
+    },
+    {
+      "epoch": 0.26038437693738375,
+      "grad_norm": 0.7513871192932129,
+      "learning_rate": 0.0002,
+      "loss": 1.3478,
+      "step": 210
+    },
+    {
+      "epoch": 0.2727836329820211,
+      "grad_norm": 0.5859110951423645,
+      "learning_rate": 0.0002,
+      "loss": 1.1843,
+      "step": 220
+    },
+    {
+      "epoch": 0.2851828890266584,
+      "grad_norm": 0.5547062754631042,
+      "learning_rate": 0.0002,
+      "loss": 1.1784,
+      "step": 230
+    },
+    {
+      "epoch": 0.2975821450712957,
+      "grad_norm": 3.5287671089172363,
+      "learning_rate": 0.0002,
+      "loss": 1.2564,
+      "step": 240
+    },
+    {
+      "epoch": 0.30998140111593303,
+      "grad_norm": 0.8644460439682007,
+      "learning_rate": 0.0002,
+      "loss": 1.313,
+      "step": 250
+    },
+    {
+      "epoch": 0.32238065716057035,
+      "grad_norm": 0.6270064115524292,
+      "learning_rate": 0.0002,
+      "loss": 1.2187,
+      "step": 260
+    },
+    {
+      "epoch": 0.33477991320520767,
+      "grad_norm": 1.170295000076294,
+      "learning_rate": 0.0002,
+      "loss": 1.2017,
+      "step": 270
+    },
+    {
+      "epoch": 0.347179169249845,
+      "grad_norm": 0.5701245069503784,
+      "learning_rate": 0.0002,
+      "loss": 1.1075,
+      "step": 280
+    },
+    {
+      "epoch": 0.3595784252944823,
+      "grad_norm": 0.6373095512390137,
+      "learning_rate": 0.0002,
+      "loss": 1.1185,
+      "step": 290
+    },
+    {
+      "epoch": 0.3719776813391196,
+      "grad_norm": 0.5740704536437988,
+      "learning_rate": 0.0002,
+      "loss": 1.1738,
+      "step": 300
+    },
+    {
+      "epoch": 0.384376937383757,
+      "grad_norm": 0.5516835451126099,
+      "learning_rate": 0.0002,
+      "loss": 1.2858,
+      "step": 310
+    },
+    {
+      "epoch": 0.3967761934283943,
+      "grad_norm": 0.5212382078170776,
+      "learning_rate": 0.0002,
+      "loss": 1.2315,
+      "step": 320
+    },
+    {
+      "epoch": 0.40917544947303164,
+      "grad_norm": 0.540307343006134,
+      "learning_rate": 0.0002,
+      "loss": 1.172,
+      "step": 330
+    },
+    {
+      "epoch": 0.42157470551766896,
+      "grad_norm": 0.7454301714897156,
+      "learning_rate": 0.0002,
+      "loss": 1.2736,
+      "step": 340
+    },
+    {
+      "epoch": 0.4339739615623063,
+      "grad_norm": 0.7390317916870117,
+      "learning_rate": 0.0002,
+      "loss": 1.3013,
+      "step": 350
+    },
+    {
+      "epoch": 0.4463732176069436,
+      "grad_norm": 0.5498788356781006,
+      "learning_rate": 0.0002,
+      "loss": 1.0615,
+      "step": 360
+    },
+    {
+      "epoch": 0.4587724736515809,
+      "grad_norm": 0.5776252150535583,
+      "learning_rate": 0.0002,
+      "loss": 1.2251,
+      "step": 370
+    },
+    {
+      "epoch": 0.47117172969621823,
+      "grad_norm": 0.6941552758216858,
+      "learning_rate": 0.0002,
+      "loss": 1.1932,
+      "step": 380
+    },
+    {
+      "epoch": 0.48357098574085555,
+      "grad_norm": 0.7936233282089233,
+      "learning_rate": 0.0002,
+      "loss": 1.23,
+      "step": 390
+    },
+    {
+      "epoch": 0.49597024178549287,
+      "grad_norm": 0.5257220268249512,
+      "learning_rate": 0.0002,
+      "loss": 1.1137,
+      "step": 400
+    },
+    {
+      "epoch": 0.5083694978301302,
+      "grad_norm": 0.5740510821342468,
+      "learning_rate": 0.0002,
+      "loss": 1.1867,
+      "step": 410
+    },
+    {
+      "epoch": 0.5207687538747675,
+      "grad_norm": 0.6181507110595703,
+      "learning_rate": 0.0002,
+      "loss": 1.1049,
+      "step": 420
+    },
+    {
+      "epoch": 0.5331680099194048,
+      "grad_norm": 0.6333999037742615,
+      "learning_rate": 0.0002,
+      "loss": 1.2303,
+      "step": 430
+    },
+    {
+      "epoch": 0.5455672659640421,
+      "grad_norm": 0.5667845010757446,
+      "learning_rate": 0.0002,
+      "loss": 1.2457,
+      "step": 440
+    },
+    {
+      "epoch": 0.5579665220086795,
+      "grad_norm": 0.5254231095314026,
+      "learning_rate": 0.0002,
+      "loss": 1.2547,
+      "step": 450
+    },
+    {
+      "epoch": 0.5703657780533168,
+      "grad_norm": 0.5938495993614197,
+      "learning_rate": 0.0002,
+      "loss": 1.2118,
+      "step": 460
+    },
+    {
+      "epoch": 0.5827650340979541,
+      "grad_norm": 0.7733635902404785,
+      "learning_rate": 0.0002,
+      "loss": 1.2409,
+      "step": 470
+    },
+    {
+      "epoch": 0.5951642901425914,
+      "grad_norm": 0.6114753484725952,
+      "learning_rate": 0.0002,
+      "loss": 1.2343,
+      "step": 480
+    },
+    {
+      "epoch": 0.6075635461872287,
+      "grad_norm": 0.5587155818939209,
+      "learning_rate": 0.0002,
+      "loss": 1.1779,
+      "step": 490
+    },
+    {
+      "epoch": 0.6199628022318661,
+      "grad_norm": 0.7636917233467102,
+      "learning_rate": 0.0002,
+      "loss": 1.2136,
+      "step": 500
+    },
+    {
+      "epoch": 0.6323620582765034,
+      "grad_norm": 0.5896942615509033,
+      "learning_rate": 0.0002,
+      "loss": 1.1301,
+      "step": 510
+    },
+    {
+      "epoch": 0.6447613143211407,
+      "grad_norm": 0.8594750165939331,
+      "learning_rate": 0.0002,
+      "loss": 1.2089,
+      "step": 520
+    },
+    {
+      "epoch": 0.657160570365778,
+      "grad_norm": 0.6459881067276001,
+      "learning_rate": 0.0002,
+      "loss": 1.1551,
+      "step": 530
+    },
+    {
+      "epoch": 0.6695598264104153,
+      "grad_norm": 0.650656521320343,
+      "learning_rate": 0.0002,
+      "loss": 1.175,
+      "step": 540
+    },
+    {
+      "epoch": 0.6819590824550527,
+      "grad_norm": 0.7238242626190186,
+      "learning_rate": 0.0002,
+      "loss": 1.2143,
+      "step": 550
+    },
+    {
+      "epoch": 0.69435833849969,
+      "grad_norm": 0.6289859414100647,
+      "learning_rate": 0.0002,
+      "loss": 1.0961,
+      "step": 560
+    },
+    {
+      "epoch": 0.7067575945443273,
+      "grad_norm": 0.6108142137527466,
+      "learning_rate": 0.0002,
+      "loss": 1.2316,
+      "step": 570
+    },
+    {
+      "epoch": 0.7191568505889646,
+      "grad_norm": 0.6905024647712708,
+      "learning_rate": 0.0002,
+      "loss": 1.1315,
+      "step": 580
+    },
+    {
+      "epoch": 0.7315561066336019,
+      "grad_norm": 0.5975471138954163,
+      "learning_rate": 0.0002,
+      "loss": 1.2368,
+      "step": 590
+    },
+    {
+      "epoch": 0.7439553626782393,
+      "grad_norm": 0.49540066719055176,
+      "learning_rate": 0.0002,
+      "loss": 1.1014,
+      "step": 600
+    },
+    {
+      "epoch": 0.7563546187228767,
+      "grad_norm": 0.5365461707115173,
+      "learning_rate": 0.0002,
+      "loss": 1.1359,
+      "step": 610
+    },
+    {
+      "epoch": 0.768753874767514,
+      "grad_norm": 0.6156648993492126,
+      "learning_rate": 0.0002,
+      "loss": 1.2552,
+      "step": 620
+    },
+    {
+      "epoch": 0.7811531308121513,
+      "grad_norm": 0.656879186630249,
+      "learning_rate": 0.0002,
+      "loss": 1.1929,
+      "step": 630
+    },
+    {
+      "epoch": 0.7935523868567886,
+      "grad_norm": 0.8963037729263306,
+      "learning_rate": 0.0002,
+      "loss": 1.3063,
+      "step": 640
+    },
+    {
+      "epoch": 0.805951642901426,
+      "grad_norm": 1.0569753646850586,
+      "learning_rate": 0.0002,
+      "loss": 1.219,
+      "step": 650
+    },
+    {
+      "epoch": 0.8183508989460633,
+      "grad_norm": 0.7332107424736023,
+      "learning_rate": 0.0002,
+      "loss": 1.2563,
+      "step": 660
+    },
+    {
+      "epoch": 0.8307501549907006,
+      "grad_norm": 0.589097797870636,
+      "learning_rate": 0.0002,
+      "loss": 1.1029,
+      "step": 670
+    },
+    {
+      "epoch": 0.8431494110353379,
+      "grad_norm": 0.9553480744361877,
+      "learning_rate": 0.0002,
+      "loss": 1.1705,
+      "step": 680
+    },
+    {
+      "epoch": 0.8555486670799752,
+      "grad_norm": 0.7076331973075867,
+      "learning_rate": 0.0002,
+      "loss": 1.1605,
+      "step": 690
+    },
+    {
+      "epoch": 0.8679479231246126,
+      "grad_norm": 0.597531795501709,
+      "learning_rate": 0.0002,
+      "loss": 1.2346,
+      "step": 700
+    },
+    {
+      "epoch": 0.8803471791692499,
+      "grad_norm": 0.7023149132728577,
+      "learning_rate": 0.0002,
+      "loss": 1.1637,
+      "step": 710
+    },
+    {
+      "epoch": 0.8927464352138872,
+      "grad_norm": 1.4393764734268188,
+      "learning_rate": 0.0002,
+      "loss": 1.2717,
+      "step": 720
+    },
+    {
+      "epoch": 0.9051456912585245,
+      "grad_norm": 0.5944231152534485,
+      "learning_rate": 0.0002,
+      "loss": 1.216,
+      "step": 730
+    },
+    {
+      "epoch": 0.9175449473031618,
+      "grad_norm": 0.5712162852287292,
+      "learning_rate": 0.0002,
+      "loss": 1.148,
+      "step": 740
+    },
+    {
+      "epoch": 0.9299442033477991,
+      "grad_norm": 0.5335281491279602,
+      "learning_rate": 0.0002,
+      "loss": 1.2318,
+      "step": 750
+    },
+    {
+      "epoch": 0.9423434593924365,
+      "grad_norm": 0.8050292730331421,
+      "learning_rate": 0.0002,
+      "loss": 1.149,
+      "step": 760
+    },
+    {
+      "epoch": 0.9547427154370738,
+      "grad_norm": 0.6092700958251953,
+      "learning_rate": 0.0002,
+      "loss": 1.0862,
+      "step": 770
+    },
+    {
+      "epoch": 0.9671419714817111,
+      "grad_norm": 0.7012797594070435,
+      "learning_rate": 0.0002,
+      "loss": 1.3204,
+      "step": 780
+    },
+    {
+      "epoch": 0.9795412275263484,
+      "grad_norm": 0.6228184103965759,
+      "learning_rate": 0.0002,
+      "loss": 1.1641,
+      "step": 790
+    },
+    {
+      "epoch": 0.9919404835709857,
+      "grad_norm": 0.5482686161994934,
+      "learning_rate": 0.0002,
+      "loss": 1.131,
+      "step": 800
+    },
+    {
+      "epoch": 0.9993800371977681,
+      "eval_loss": 1.2057286500930786,
+      "eval_runtime": 164.6087,
+      "eval_samples_per_second": 2.77,
+      "eval_steps_per_second": 0.346,
+      "step": 806
+    },
+    {
+      "epoch": 1.004339739615623,
+      "grad_norm": 0.6331814527511597,
+      "learning_rate": 0.0002,
+      "loss": 1.0899,
+      "step": 810
+    },
+    {
+      "epoch": 1.0167389956602604,
+      "grad_norm": 0.6160872578620911,
+      "learning_rate": 0.0002,
+      "loss": 1.0551,
+      "step": 820
+    },
+    {
+      "epoch": 1.0291382517048977,
+      "grad_norm": 0.6104072332382202,
+      "learning_rate": 0.0002,
+      "loss": 0.9934,
+      "step": 830
+    },
+    {
+      "epoch": 1.041537507749535,
+      "grad_norm": 0.7619274854660034,
+      "learning_rate": 0.0002,
+      "loss": 1.0776,
+      "step": 840
+    },
+    {
+      "epoch": 1.0539367637941723,
+      "grad_norm": 0.761172890663147,
+      "learning_rate": 0.0002,
+      "loss": 0.9929,
+      "step": 850
+    },
+    {
+      "epoch": 1.0663360198388097,
+      "grad_norm": 0.7563514113426208,
+      "learning_rate": 0.0002,
+      "loss": 1.0543,
+      "step": 860
+    },
+    {
+      "epoch": 1.078735275883447,
+      "grad_norm": 0.521998941898346,
+      "learning_rate": 0.0002,
+      "loss": 1.0812,
+      "step": 870
+    },
+    {
+      "epoch": 1.0911345319280843,
+      "grad_norm": 0.824347972869873,
+      "learning_rate": 0.0002,
+      "loss": 1.1417,
+      "step": 880
+    },
+    {
+      "epoch": 1.1035337879727216,
+      "grad_norm": 0.5645424127578735,
+      "learning_rate": 0.0002,
+      "loss": 1.1096,
+      "step": 890
+    },
+    {
+      "epoch": 1.115933044017359,
+      "grad_norm": 0.8568223714828491,
+      "learning_rate": 0.0002,
+      "loss": 1.1005,
+      "step": 900
+    },
+    {
+      "epoch": 1.1283323000619963,
+      "grad_norm": 0.68181312084198,
+      "learning_rate": 0.0002,
+      "loss": 1.088,
+      "step": 910
+    },
+    {
+      "epoch": 1.1407315561066336,
+      "grad_norm": 0.7577647566795349,
+      "learning_rate": 0.0002,
+      "loss": 1.0281,
+      "step": 920
+    },
+    {
+      "epoch": 1.153130812151271,
+      "grad_norm": 0.6968798637390137,
+      "learning_rate": 0.0002,
+      "loss": 0.9812,
+      "step": 930
+    },
+    {
+      "epoch": 1.1655300681959082,
+      "grad_norm": 0.5769661664962769,
+      "learning_rate": 0.0002,
+      "loss": 1.0539,
+      "step": 940
+    },
+    {
+      "epoch": 1.1779293242405455,
+      "grad_norm": 0.6399155259132385,
+      "learning_rate": 0.0002,
+      "loss": 1.0831,
+      "step": 950
+    },
+    {
+      "epoch": 1.1903285802851828,
+      "grad_norm": 0.9824289679527283,
+      "learning_rate": 0.0002,
+      "loss": 1.0464,
+      "step": 960
+    },
+    {
+      "epoch": 1.2027278363298202,
+      "grad_norm": 0.7485893964767456,
+      "learning_rate": 0.0002,
+      "loss": 1.1068,
+      "step": 970
+    },
+    {
+      "epoch": 1.2151270923744575,
+      "grad_norm": 0.668736457824707,
+      "learning_rate": 0.0002,
+      "loss": 1.0047,
+      "step": 980
+    },
+    {
+      "epoch": 1.2275263484190948,
+      "grad_norm": 0.7041404843330383,
+      "learning_rate": 0.0002,
+      "loss": 1.0818,
+      "step": 990
+    },
+    {
+      "epoch": 1.2399256044637321,
+      "grad_norm": 0.7070603966712952,
+      "learning_rate": 0.0002,
+      "loss": 1.0847,
+      "step": 1000
+    },
+    {
+      "epoch": 1.2523248605083694,
+      "grad_norm": 0.7828628420829773,
+      "learning_rate": 0.0002,
+      "loss": 1.047,
+      "step": 1010
+    },
+    {
+      "epoch": 1.2647241165530068,
+      "grad_norm": 0.7149654626846313,
+      "learning_rate": 0.0002,
+      "loss": 1.0658,
+      "step": 1020
+    },
+    {
+      "epoch": 1.277123372597644,
+      "grad_norm": 0.7691766619682312,
+      "learning_rate": 0.0002,
+      "loss": 0.9791,
+      "step": 1030
+    },
+    {
+      "epoch": 1.2895226286422814,
+      "grad_norm": 0.8022137880325317,
+      "learning_rate": 0.0002,
+      "loss": 1.0242,
+      "step": 1040
+    },
+    {
+      "epoch": 1.3019218846869187,
+      "grad_norm": 0.6709204316139221,
+      "learning_rate": 0.0002,
+      "loss": 1.0837,
+      "step": 1050
+    },
+    {
+      "epoch": 1.314321140731556,
+      "grad_norm": 0.7368158102035522,
+      "learning_rate": 0.0002,
+      "loss": 1.0382,
+      "step": 1060
+    },
+    {
+      "epoch": 1.3267203967761934,
+      "grad_norm": 0.8408007621765137,
+      "learning_rate": 0.0002,
+      "loss": 1.0371,
+      "step": 1070
+    },
+    {
+      "epoch": 1.3391196528208307,
+      "grad_norm": 1.2165539264678955,
+      "learning_rate": 0.0002,
+      "loss": 0.9633,
+      "step": 1080
+    },
+    {
+      "epoch": 1.351518908865468,
+      "grad_norm": 0.7284916043281555,
+      "learning_rate": 0.0002,
+      "loss": 1.0079,
+      "step": 1090
+    },
+    {
+      "epoch": 1.3639181649101053,
+      "grad_norm": 0.7994557619094849,
+      "learning_rate": 0.0002,
+      "loss": 1.0211,
+      "step": 1100
+    },
+    {
+      "epoch": 1.3763174209547429,
+      "grad_norm": 0.9658345580101013,
+      "learning_rate": 0.0002,
+      "loss": 1.0892,
+      "step": 1110
+    },
+    {
+      "epoch": 1.3887166769993802,
+      "grad_norm": 0.6312829852104187,
+      "learning_rate": 0.0002,
+      "loss": 1.2088,
+      "step": 1120
+    },
+    {
+      "epoch": 1.4011159330440175,
+      "grad_norm": 0.7263661026954651,
+      "learning_rate": 0.0002,
+      "loss": 1.1055,
+      "step": 1130
+    },
+    {
+      "epoch": 1.4135151890886548,
+      "grad_norm": 0.829082727432251,
+      "learning_rate": 0.0002,
+      "loss": 1.0232,
+      "step": 1140
+    },
+    {
+      "epoch": 1.4259144451332921,
+      "grad_norm": 0.6168127059936523,
+      "learning_rate": 0.0002,
+      "loss": 1.1413,
+      "step": 1150
+    },
+    {
+      "epoch": 1.4383137011779294,
+      "grad_norm": 0.8351425528526306,
+      "learning_rate": 0.0002,
+      "loss": 1.0283,
+      "step": 1160
+    },
+    {
+      "epoch": 1.4507129572225668,
+      "grad_norm": 0.8814472556114197,
+      "learning_rate": 0.0002,
+      "loss": 1.1146,
+      "step": 1170
+    },
+    {
+      "epoch": 1.463112213267204,
+      "grad_norm": 0.6913689970970154,
+      "learning_rate": 0.0002,
+      "loss": 1.0932,
+      "step": 1180
+    },
+    {
+      "epoch": 1.4755114693118414,
+      "grad_norm": 0.7907165884971619,
+      "learning_rate": 0.0002,
+      "loss": 1.1066,
+      "step": 1190
+    },
+    {
+      "epoch": 1.4879107253564787,
+      "grad_norm": 0.8361626267433167,
+      "learning_rate": 0.0002,
+      "loss": 1.0738,
+      "step": 1200
+    },
+    {
+      "epoch": 1.500309981401116,
+      "grad_norm": 1.073534607887268,
+      "learning_rate": 0.0002,
+      "loss": 1.0559,
+      "step": 1210
+    },
+    {
+      "epoch": 1.5127092374457534,
+      "grad_norm": 0.8416345119476318,
+      "learning_rate": 0.0002,
+      "loss": 1.0204,
+      "step": 1220
+    },
+    {
+      "epoch": 1.5251084934903907,
+      "grad_norm": 1.0225597620010376,
+      "learning_rate": 0.0002,
+      "loss": 1.0941,
+      "step": 1230
+    },
+    {
+      "epoch": 1.537507749535028,
+      "grad_norm": 0.6662965416908264,
+      "learning_rate": 0.0002,
+      "loss": 1.0854,
+      "step": 1240
+    },
+    {
+      "epoch": 1.5499070055796653,
+      "grad_norm": 0.7363991737365723,
+      "learning_rate": 0.0002,
+      "loss": 1.0816,
+      "step": 1250
+    },
+    {
+      "epoch": 1.5623062616243026,
+      "grad_norm": 0.9029574990272522,
+      "learning_rate": 0.0002,
+      "loss": 1.0271,
+      "step": 1260
+    },
+    {
+      "epoch": 1.57470551766894,
+      "grad_norm": 0.7992424368858337,
+      "learning_rate": 0.0002,
+      "loss": 1.0206,
+      "step": 1270
+    },
+    {
+      "epoch": 1.5871047737135773,
+      "grad_norm": 0.8108977675437927,
+      "learning_rate": 0.0002,
+      "loss": 1.0114,
+      "step": 1280
+    },
+    {
+      "epoch": 1.5995040297582146,
+      "grad_norm": 0.8257458806037903,
+      "learning_rate": 0.0002,
+      "loss": 1.0264,
+      "step": 1290
+    },
+    {
+      "epoch": 1.611903285802852,
+      "grad_norm": 0.8265092968940735,
+      "learning_rate": 0.0002,
+      "loss": 1.0944,
+      "step": 1300
+    },
+    {
+      "epoch": 1.6243025418474892,
+      "grad_norm": 0.6568580269813538,
+      "learning_rate": 0.0002,
+      "loss": 1.0136,
+      "step": 1310
+    },
+    {
+      "epoch": 1.6367017978921266,
+      "grad_norm": 0.7608488202095032,
+      "learning_rate": 0.0002,
+      "loss": 1.009,
+      "step": 1320
+    },
+    {
+      "epoch": 1.6491010539367639,
+      "grad_norm": 0.7511259317398071,
+      "learning_rate": 0.0002,
+      "loss": 1.1202,
+      "step": 1330
+    },
+    {
+      "epoch": 1.6615003099814012,
+      "grad_norm": 0.7942162752151489,
+      "learning_rate": 0.0002,
+      "loss": 1.0528,
+      "step": 1340
+    },
+    {
+      "epoch": 1.6738995660260385,
+      "grad_norm": 0.8253659605979919,
+      "learning_rate": 0.0002,
+      "loss": 1.0411,
+      "step": 1350
+    },
+    {
+      "epoch": 1.6862988220706758,
+      "grad_norm": 1.1318382024765015,
+      "learning_rate": 0.0002,
+      "loss": 1.001,
+      "step": 1360
+    },
+    {
+      "epoch": 1.6986980781153131,
+      "grad_norm": 0.693403959274292,
+      "learning_rate": 0.0002,
+      "loss": 1.0727,
+      "step": 1370
+    },
+    {
+      "epoch": 1.7110973341599505,
+      "grad_norm": 0.7107617259025574,
+      "learning_rate": 0.0002,
+      "loss": 1.073,
+      "step": 1380
+    },
+    {
+      "epoch": 1.7234965902045878,
+      "grad_norm": 0.8169032335281372,
+      "learning_rate": 0.0002,
+      "loss": 1.0849,
+      "step": 1390
+    },
+    {
+      "epoch": 1.735895846249225,
+      "grad_norm": 0.8940841555595398,
+      "learning_rate": 0.0002,
+      "loss": 1.0578,
+      "step": 1400
+    },
+    {
+      "epoch": 1.7482951022938624,
+      "grad_norm": 0.7862188220024109,
+      "learning_rate": 0.0002,
+      "loss": 1.0891,
+      "step": 1410
+    },
+    {
+      "epoch": 1.7606943583384997,
+      "grad_norm": 1.136338472366333,
+      "learning_rate": 0.0002,
+      "loss": 0.9962,
+      "step": 1420
+    },
+    {
+      "epoch": 1.773093614383137,
+      "grad_norm": 0.9534069895744324,
+      "learning_rate": 0.0002,
+      "loss": 1.0943,
+      "step": 1430
+    },
+    {
+      "epoch": 1.7854928704277744,
+      "grad_norm": 1.0747562646865845,
+      "learning_rate": 0.0002,
+      "loss": 1.1257,
+      "step": 1440
+    },
+    {
+      "epoch": 1.7978921264724117,
+      "grad_norm": 0.8557891249656677,
+      "learning_rate": 0.0002,
+      "loss": 1.0556,
+      "step": 1450
+    },
+    {
+      "epoch": 1.810291382517049,
+      "grad_norm": 0.6829259991645813,
+      "learning_rate": 0.0002,
+      "loss": 1.0128,
+      "step": 1460
+    },
+    {
+      "epoch": 1.8226906385616863,
+      "grad_norm": 0.8164441585540771,
+      "learning_rate": 0.0002,
+      "loss": 1.0313,
+      "step": 1470
+    },
+    {
+      "epoch": 1.8350898946063237,
+      "grad_norm": 0.9458068609237671,
+      "learning_rate": 0.0002,
+      "loss": 1.1136,
+      "step": 1480
+    },
+    {
+      "epoch": 1.847489150650961,
+      "grad_norm": 0.743009626865387,
+      "learning_rate": 0.0002,
+      "loss": 1.0457,
+      "step": 1490
+    },
+    {
+      "epoch": 1.8598884066955983,
+      "grad_norm": 0.7137694358825684,
+      "learning_rate": 0.0002,
+      "loss": 1.0107,
+      "step": 1500
+    },
+    {
+      "epoch": 1.8722876627402356,
+      "grad_norm": 0.7618028521537781,
+      "learning_rate": 0.0002,
+      "loss": 1.0633,
+      "step": 1510
+    },
+    {
+      "epoch": 1.884686918784873,
+      "grad_norm": 0.8153398633003235,
+      "learning_rate": 0.0002,
+      "loss": 1.103,
+      "step": 1520
+    },
+    {
+      "epoch": 1.8970861748295103,
+      "grad_norm": 0.9127124547958374,
+      "learning_rate": 0.0002,
+      "loss": 1.2094,
+      "step": 1530
+    },
+    {
+      "epoch": 1.9094854308741476,
+      "grad_norm": 0.7699425220489502,
+      "learning_rate": 0.0002,
+      "loss": 1.0379,
+      "step": 1540
+    },
+    {
+      "epoch": 1.921884686918785,
+      "grad_norm": 0.8807545304298401,
+      "learning_rate": 0.0002,
+      "loss": 0.9849,
+      "step": 1550
+    },
+    {
+      "epoch": 1.9342839429634222,
+      "grad_norm": 0.7340815663337708,
+      "learning_rate": 0.0002,
+      "loss": 1.033,
+      "step": 1560
+    },
+    {
+      "epoch": 1.9466831990080595,
+      "grad_norm": 1.070056676864624,
+      "learning_rate": 0.0002,
+      "loss": 1.0083,
+      "step": 1570
+    },
+    {
+      "epoch": 1.9590824550526968,
+      "grad_norm": 0.8195573687553406,
+      "learning_rate": 0.0002,
+      "loss": 1.0023,
+      "step": 1580
+    },
+    {
+      "epoch": 1.9714817110973342,
+      "grad_norm": 0.7938687205314636,
+      "learning_rate": 0.0002,
+      "loss": 1.029,
+      "step": 1590
+    },
+    {
+      "epoch": 1.9838809671419715,
+      "grad_norm": 0.7632259726524353,
+      "learning_rate": 0.0002,
+      "loss": 1.0512,
+      "step": 1600
+    },
+    {
+      "epoch": 1.9962802231866088,
+      "grad_norm": 0.7921916246414185,
+      "learning_rate": 0.0002,
+      "loss": 1.0426,
+      "step": 1610
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.2014765739440918,
+      "eval_runtime": 159.8677,
+      "eval_samples_per_second": 2.852,
+      "eval_steps_per_second": 0.357,
+      "step": 1613
+    },
+    {
+      "epoch": 2.008679479231246,
+      "grad_norm": 1.1764529943466187,
+      "learning_rate": 0.0002,
+      "loss": 0.9239,
+      "step": 1620
+    },
+    {
+      "epoch": 2.0210787352758834,
+      "grad_norm": 1.0271947383880615,
+      "learning_rate": 0.0002,
+      "loss": 0.7995,
+      "step": 1630
+    },
+    {
+      "epoch": 2.0334779913205208,
+      "grad_norm": 0.7138071656227112,
+      "learning_rate": 0.0002,
+      "loss": 0.8592,
+      "step": 1640
+    },
+    {
+      "epoch": 2.045877247365158,
+      "grad_norm": 0.8644373416900635,
+      "learning_rate": 0.0002,
+      "loss": 0.8106,
+      "step": 1650
+    },
+    {
+      "epoch": 2.0582765034097954,
+      "grad_norm": 1.2262420654296875,
+      "learning_rate": 0.0002,
+      "loss": 0.8578,
+      "step": 1660
+    },
+    {
+      "epoch": 2.0706757594544327,
+      "grad_norm": 0.9718686938285828,
+      "learning_rate": 0.0002,
+      "loss": 0.8009,
+      "step": 1670
+    },
+    {
+      "epoch": 2.08307501549907,
+      "grad_norm": 1.0075122117996216,
+      "learning_rate": 0.0002,
+      "loss": 0.831,
+      "step": 1680
+    },
+    {
+      "epoch": 2.0954742715437074,
+      "grad_norm": 1.2113722562789917,
+      "learning_rate": 0.0002,
+      "loss": 0.8177,
+      "step": 1690
+    },
+    {
+      "epoch": 2.1078735275883447,
+      "grad_norm": 0.7911604642868042,
+      "learning_rate": 0.0002,
+      "loss": 0.8377,
+      "step": 1700
+    },
+    {
+      "epoch": 2.120272783632982,
+      "grad_norm": 0.8578933477401733,
+      "learning_rate": 0.0002,
+      "loss": 0.8405,
+      "step": 1710
+    },
+    {
+      "epoch": 2.1326720396776193,
+      "grad_norm": 1.1782084703445435,
+      "learning_rate": 0.0002,
+      "loss": 0.8784,
+      "step": 1720
+    },
+    {
+      "epoch": 2.1450712957222566,
+      "grad_norm": 1.3627573251724243,
+      "learning_rate": 0.0002,
+      "loss": 0.8543,
+      "step": 1730
+    },
+    {
+      "epoch": 2.157470551766894,
+      "grad_norm": 1.2948803901672363,
+      "learning_rate": 0.0002,
+      "loss": 0.8404,
+      "step": 1740
+    },
+    {
+      "epoch": 2.1698698078115313,
+      "grad_norm": 0.9353442788124084,
+      "learning_rate": 0.0002,
+      "loss": 0.8719,
+      "step": 1750
+    },
+    {
+      "epoch": 2.1822690638561686,
+      "grad_norm": 0.9063374400138855,
+      "learning_rate": 0.0002,
+      "loss": 0.8112,
+      "step": 1760
+    },
+    {
+      "epoch": 2.194668319900806,
+      "grad_norm": 1.3354851007461548,
+      "learning_rate": 0.0002,
+      "loss": 0.9441,
+      "step": 1770
+    },
+    {
+      "epoch": 2.2070675759454432,
+      "grad_norm": 0.8388507962226868,
+      "learning_rate": 0.0002,
+      "loss": 0.877,
+      "step": 1780
+    },
+    {
+      "epoch": 2.2194668319900805,
+      "grad_norm": 0.9509401321411133,
+      "learning_rate": 0.0002,
+      "loss": 0.8709,
+      "step": 1790
+    },
+    {
+      "epoch": 2.231866088034718,
+      "grad_norm": 1.0458593368530273,
+      "learning_rate": 0.0002,
+      "loss": 0.8212,
+      "step": 1800
+    },
+    {
+      "epoch": 2.244265344079355,
+      "grad_norm": 0.890088677406311,
+      "learning_rate": 0.0002,
+      "loss": 0.7667,
+      "step": 1810
+    },
+    {
+      "epoch": 2.2566646001239925,
+      "grad_norm": 1.1933976411819458,
+      "learning_rate": 0.0002,
+      "loss": 0.8431,
+      "step": 1820
+    },
+    {
+      "epoch": 2.26906385616863,
+      "grad_norm": 0.961398184299469,
+      "learning_rate": 0.0002,
+      "loss": 0.8697,
+      "step": 1830
+    },
+    {
+      "epoch": 2.281463112213267,
+      "grad_norm": 1.124961495399475,
+      "learning_rate": 0.0002,
+      "loss": 0.8403,
+      "step": 1840
+    },
+    {
+      "epoch": 2.2938623682579045,
+      "grad_norm": 0.9042379260063171,
+      "learning_rate": 0.0002,
+      "loss": 0.8431,
+      "step": 1850
+    },
+    {
+      "epoch": 2.306261624302542,
+      "grad_norm": 1.2250864505767822,
+      "learning_rate": 0.0002,
+      "loss": 0.8866,
+      "step": 1860
+    },
+    {
+      "epoch": 2.318660880347179,
+      "grad_norm": 1.1758817434310913,
+      "learning_rate": 0.0002,
+      "loss": 0.8514,
+      "step": 1870
+    },
+    {
+      "epoch": 2.3310601363918164,
+      "grad_norm": 0.9863199591636658,
+      "learning_rate": 0.0002,
+      "loss": 0.9316,
+      "step": 1880
+    },
+    {
+      "epoch": 2.3434593924364537,
+      "grad_norm": 1.1759305000305176,
+      "learning_rate": 0.0002,
+      "loss": 0.8854,
+      "step": 1890
+    },
+    {
+      "epoch": 2.355858648481091,
+      "grad_norm": 0.995716392993927,
+      "learning_rate": 0.0002,
+      "loss": 0.866,
+      "step": 1900
+    },
+    {
+      "epoch": 2.3682579045257284,
+      "grad_norm": 1.1816585063934326,
+      "learning_rate": 0.0002,
+      "loss": 0.8439,
+      "step": 1910
+    },
+    {
+      "epoch": 2.3806571605703657,
+      "grad_norm": 0.7498432397842407,
+      "learning_rate": 0.0002,
+      "loss": 0.9284,
+      "step": 1920
+    },
+    {
+      "epoch": 2.393056416615003,
+      "grad_norm": 0.9481443762779236,
+      "learning_rate": 0.0002,
+      "loss": 0.8243,
+      "step": 1930
+    },
+    {
+      "epoch": 2.4054556726596403,
+      "grad_norm": 1.1264584064483643,
+      "learning_rate": 0.0002,
+      "loss": 0.8083,
+      "step": 1940
+    },
+    {
+      "epoch": 2.4178549287042777,
+      "grad_norm": 0.8826232552528381,
+      "learning_rate": 0.0002,
+      "loss": 0.9122,
+      "step": 1950
+    },
+    {
+      "epoch": 2.430254184748915,
+      "grad_norm": 0.9702113270759583,
+      "learning_rate": 0.0002,
+      "loss": 0.8764,
+      "step": 1960
+    },
+    {
+      "epoch": 2.4426534407935523,
+      "grad_norm": 1.0663695335388184,
+      "learning_rate": 0.0002,
+      "loss": 0.8498,
+      "step": 1970
+    },
+    {
+      "epoch": 2.4550526968381896,
+      "grad_norm": 1.1186119318008423,
+      "learning_rate": 0.0002,
+      "loss": 0.888,
+      "step": 1980
+    },
+    {
+      "epoch": 2.467451952882827,
+      "grad_norm": 1.428774118423462,
+      "learning_rate": 0.0002,
+      "loss": 0.9327,
+      "step": 1990
+    },
+    {
+      "epoch": 2.4798512089274642,
+      "grad_norm": 1.3054901361465454,
+      "learning_rate": 0.0002,
+      "loss": 0.9423,
+      "step": 2000
+    },
+    {
+      "epoch": 2.4922504649721016,
+      "grad_norm": 0.9893805384635925,
+      "learning_rate": 0.0002,
+      "loss": 0.8494,
+      "step": 2010
+    },
+    {
+      "epoch": 2.504649721016739,
+      "grad_norm": 1.149538516998291,
+      "learning_rate": 0.0002,
+      "loss": 0.9982,
+      "step": 2020
+    },
+    {
+      "epoch": 2.517048977061376,
+      "grad_norm": 0.8716336488723755,
+      "learning_rate": 0.0002,
+      "loss": 0.881,
+      "step": 2030
+    },
+    {
+      "epoch": 2.5294482331060135,
+      "grad_norm": 1.0464730262756348,
+      "learning_rate": 0.0002,
+      "loss": 0.8483,
+      "step": 2040
+    },
+    {
+      "epoch": 2.541847489150651,
+      "grad_norm": 1.1451894044876099,
+      "learning_rate": 0.0002,
+      "loss": 0.9475,
+      "step": 2050
+    },
+    {
+      "epoch": 2.554246745195288,
+      "grad_norm": 1.3266205787658691,
+      "learning_rate": 0.0002,
+      "loss": 0.8238,
+      "step": 2060
+    },
+    {
+      "epoch": 2.5666460012399255,
+      "grad_norm": 1.2838176488876343,
+      "learning_rate": 0.0002,
+      "loss": 0.8457,
+      "step": 2070
+    },
+    {
+      "epoch": 2.579045257284563,
+      "grad_norm": 1.0352915525436401,
+      "learning_rate": 0.0002,
+      "loss": 0.7813,
+      "step": 2080
+    },
+    {
+      "epoch": 2.5914445133292,
+      "grad_norm": 1.181416392326355,
+      "learning_rate": 0.0002,
+      "loss": 0.895,
+      "step": 2090
+    },
+    {
+      "epoch": 2.6038437693738374,
+      "grad_norm": 1.2425765991210938,
+      "learning_rate": 0.0002,
+      "loss": 0.8537,
+      "step": 2100
+    },
+    {
+      "epoch": 2.6162430254184748,
+      "grad_norm": 1.2885762453079224,
+      "learning_rate": 0.0002,
+      "loss": 0.8561,
+      "step": 2110
+    },
+    {
+      "epoch": 2.628642281463112,
+      "grad_norm": 1.0179181098937988,
+      "learning_rate": 0.0002,
+      "loss": 0.8024,
+      "step": 2120
+    },
+    {
+      "epoch": 2.6410415375077494,
+      "grad_norm": 1.4908100366592407,
+      "learning_rate": 0.0002,
+      "loss": 0.8747,
+      "step": 2130
+    },
+    {
+      "epoch": 2.6534407935523867,
+      "grad_norm": 1.4854460954666138,
+      "learning_rate": 0.0002,
+      "loss": 0.8475,
+      "step": 2140
+    },
+    {
+      "epoch": 2.665840049597024,
+      "grad_norm": 0.994413435459137,
+      "learning_rate": 0.0002,
+      "loss": 0.8579,
+      "step": 2150
+    },
+    {
+      "epoch": 2.6782393056416613,
+      "grad_norm": 1.177201271057129,
+      "learning_rate": 0.0002,
+      "loss": 0.8606,
+      "step": 2160
+    },
+    {
+      "epoch": 2.6906385616862987,
+      "grad_norm": 1.2680933475494385,
+      "learning_rate": 0.0002,
+      "loss": 0.9233,
+      "step": 2170
+    },
+    {
+      "epoch": 2.703037817730936,
+      "grad_norm": 1.2201054096221924,
+      "learning_rate": 0.0002,
+      "loss": 0.8443,
+      "step": 2180
+    },
+    {
+      "epoch": 2.7154370737755733,
+      "grad_norm": 1.2058831453323364,
+      "learning_rate": 0.0002,
+      "loss": 0.8437,
+      "step": 2190
+    },
+    {
+      "epoch": 2.7278363298202106,
+      "grad_norm": 1.1667239665985107,
+      "learning_rate": 0.0002,
+      "loss": 0.9894,
+      "step": 2200
+    },
+    {
+      "epoch": 2.740235585864848,
+      "grad_norm": 1.1243321895599365,
+      "learning_rate": 0.0002,
+      "loss": 0.8501,
+      "step": 2210
+    },
+    {
+      "epoch": 2.7526348419094857,
+      "grad_norm": 1.0543156862258911,
+      "learning_rate": 0.0002,
+      "loss": 0.9387,
+      "step": 2220
+    },
+    {
+      "epoch": 2.765034097954123,
+      "grad_norm": 1.1922553777694702,
+      "learning_rate": 0.0002,
+      "loss": 0.9488,
+      "step": 2230
+    },
+    {
+      "epoch": 2.7774333539987603,
+      "grad_norm": 1.1266813278198242,
+      "learning_rate": 0.0002,
+      "loss": 0.8558,
+      "step": 2240
+    },
+    {
+      "epoch": 2.7898326100433977,
+      "grad_norm": 0.9645159840583801,
+      "learning_rate": 0.0002,
+      "loss": 0.8459,
+      "step": 2250
+    },
+    {
+      "epoch": 2.802231866088035,
+      "grad_norm": 1.0672235488891602,
+      "learning_rate": 0.0002,
+      "loss": 0.8862,
+      "step": 2260
+    },
+    {
+      "epoch": 2.8146311221326723,
+      "grad_norm": 1.5650453567504883,
+      "learning_rate": 0.0002,
+      "loss": 0.869,
+      "step": 2270
+    },
+    {
+      "epoch": 2.8270303781773096,
+      "grad_norm": 1.0414438247680664,
+      "learning_rate": 0.0002,
+      "loss": 0.8,
+      "step": 2280
+    },
+    {
+      "epoch": 2.839429634221947,
+      "grad_norm": 0.8878290057182312,
+      "learning_rate": 0.0002,
+      "loss": 0.8419,
+      "step": 2290
+    },
+    {
+      "epoch": 2.8518288902665843,
+      "grad_norm": 1.0500553846359253,
+      "learning_rate": 0.0002,
+      "loss": 0.9148,
+      "step": 2300
+    },
+    {
+      "epoch": 2.8642281463112216,
+      "grad_norm": 0.9779142737388611,
+      "learning_rate": 0.0002,
+      "loss": 0.8706,
+      "step": 2310
+    },
+    {
+      "epoch": 2.876627402355859,
+      "grad_norm": 0.8904196619987488,
+      "learning_rate": 0.0002,
+      "loss": 0.8385,
+      "step": 2320
+    },
+    {
+      "epoch": 2.889026658400496,
+      "grad_norm": 1.103608250617981,
+      "learning_rate": 0.0002,
+      "loss": 0.8768,
+      "step": 2330
+    },
+    {
+      "epoch": 2.9014259144451335,
+      "grad_norm": 1.2064822912216187,
+      "learning_rate": 0.0002,
+      "loss": 0.8659,
+      "step": 2340
+    },
+    {
+      "epoch": 2.913825170489771,
+      "grad_norm": 1.3073748350143433,
+      "learning_rate": 0.0002,
+      "loss": 0.9299,
+      "step": 2350
+    },
+    {
+      "epoch": 2.926224426534408,
+      "grad_norm": 1.4792760610580444,
+      "learning_rate": 0.0002,
+      "loss": 0.778,
+      "step": 2360
+    },
+    {
+      "epoch": 2.9386236825790455,
+      "grad_norm": 1.1670116186141968,
+      "learning_rate": 0.0002,
+      "loss": 0.9773,
+      "step": 2370
+    },
+    {
+      "epoch": 2.951022938623683,
+      "grad_norm": 1.235465168952942,
+      "learning_rate": 0.0002,
+      "loss": 0.8973,
+      "step": 2380
+    },
+    {
+      "epoch": 2.96342219466832,
+      "grad_norm": 1.7734158039093018,
+      "learning_rate": 0.0002,
+      "loss": 0.8646,
+      "step": 2390
+    },
+    {
+      "epoch": 2.9758214507129574,
+      "grad_norm": 1.3497414588928223,
+      "learning_rate": 0.0002,
+      "loss": 0.8784,
+      "step": 2400
+    },
+    {
+      "epoch": 2.9882207067575948,
+      "grad_norm": 1.1425493955612183,
+      "learning_rate": 0.0002,
+      "loss": 0.9116,
+      "step": 2410
+    },
+    {
+      "epoch": 2.999380037197768,
+      "eval_loss": 1.2303974628448486,
+      "eval_runtime": 126.4856,
+      "eval_samples_per_second": 3.605,
+      "eval_steps_per_second": 0.451,
+      "step": 2419
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 6448,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.061549012680704e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a0a54cf05b51cc3ca7cba649c3e96685958c9d310c181dff0c31954ec4641225
+size 5560

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5272faaeb909365f53df3d81564ee9751b14467acb7f059b654643e4ea7e4016
+size 109069176

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6fa080a88bb511d21d0560f2d8bca5e012619588ab01c58c25e32826c2e229ff
+size 55532666

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:15ac5ea13d2e8ce3c4c210e69db0f4ab9a202cfd737fb121deabf8e012216b46
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3f37fd11b98c56b2bb9423a13020156309a19c186f1b079cca5de9c3a565ad2b
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,2319 @@

+{
+  "best_metric": 1.2014765739440918,
+  "best_model_checkpoint": "outputs-001/Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-1613",
+  "epoch": 4.0,
+  "eval_steps": 10,
+  "global_step": 3226,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.012399256044637322,
+      "grad_norm": 1.6176791191101074,
+      "learning_rate": 0.0002,
+      "loss": 1.8616,
+      "step": 10
+    },
+    {
+      "epoch": 0.024798512089274645,
+      "grad_norm": 0.7599679827690125,
+      "learning_rate": 0.0002,
+      "loss": 1.5953,
+      "step": 20
+    },
+    {
+      "epoch": 0.037197768133911964,
+      "grad_norm": 0.8452111482620239,
+      "learning_rate": 0.0002,
+      "loss": 1.5705,
+      "step": 30
+    },
+    {
+      "epoch": 0.04959702417854929,
+      "grad_norm": 0.8393070101737976,
+      "learning_rate": 0.0002,
+      "loss": 1.5647,
+      "step": 40
+    },
+    {
+      "epoch": 0.06199628022318661,
+      "grad_norm": 1.117109775543213,
+      "learning_rate": 0.0002,
+      "loss": 1.4628,
+      "step": 50
+    },
+    {
+      "epoch": 0.07439553626782393,
+      "grad_norm": 0.8330236077308655,
+      "learning_rate": 0.0002,
+      "loss": 1.3492,
+      "step": 60
+    },
+    {
+      "epoch": 0.08679479231246125,
+      "grad_norm": 0.8670704960823059,
+      "learning_rate": 0.0002,
+      "loss": 1.367,
+      "step": 70
+    },
+    {
+      "epoch": 0.09919404835709858,
+      "grad_norm": 0.6262535452842712,
+      "learning_rate": 0.0002,
+      "loss": 1.2357,
+      "step": 80
+    },
+    {
+      "epoch": 0.1115933044017359,
+      "grad_norm": 0.753338098526001,
+      "learning_rate": 0.0002,
+      "loss": 1.3651,
+      "step": 90
+    },
+    {
+      "epoch": 0.12399256044637322,
+      "grad_norm": 0.6324933171272278,
+      "learning_rate": 0.0002,
+      "loss": 1.2789,
+      "step": 100
+    },
+    {
+      "epoch": 0.13639181649101054,
+      "grad_norm": 0.7270851135253906,
+      "learning_rate": 0.0002,
+      "loss": 1.2393,
+      "step": 110
+    },
+    {
+      "epoch": 0.14879107253564786,
+      "grad_norm": 0.7036070227622986,
+      "learning_rate": 0.0002,
+      "loss": 1.177,
+      "step": 120
+    },
+    {
+      "epoch": 0.16119032858028517,
+      "grad_norm": 0.6269583106040955,
+      "learning_rate": 0.0002,
+      "loss": 1.2808,
+      "step": 130
+    },
+    {
+      "epoch": 0.1735895846249225,
+      "grad_norm": 0.6848828792572021,
+      "learning_rate": 0.0002,
+      "loss": 1.3039,
+      "step": 140
+    },
+    {
+      "epoch": 0.1859888406695598,
+      "grad_norm": 0.5589784383773804,
+      "learning_rate": 0.0002,
+      "loss": 1.1925,
+      "step": 150
+    },
+    {
+      "epoch": 0.19838809671419716,
+      "grad_norm": 0.8350988626480103,
+      "learning_rate": 0.0002,
+      "loss": 1.2724,
+      "step": 160
+    },
+    {
+      "epoch": 0.21078735275883448,
+      "grad_norm": 1.1780346632003784,
+      "learning_rate": 0.0002,
+      "loss": 1.2093,
+      "step": 170
+    },
+    {
+      "epoch": 0.2231866088034718,
+      "grad_norm": 0.674608588218689,
+      "learning_rate": 0.0002,
+      "loss": 1.2573,
+      "step": 180
+    },
+    {
+      "epoch": 0.23558586484810912,
+      "grad_norm": 0.6972184181213379,
+      "learning_rate": 0.0002,
+      "loss": 1.2629,
+      "step": 190
+    },
+    {
+      "epoch": 0.24798512089274644,
+      "grad_norm": 0.5187845230102539,
+      "learning_rate": 0.0002,
+      "loss": 1.2618,
+      "step": 200
+    },
+    {
+      "epoch": 0.26038437693738375,
+      "grad_norm": 0.7513871192932129,
+      "learning_rate": 0.0002,
+      "loss": 1.3478,
+      "step": 210
+    },
+    {
+      "epoch": 0.2727836329820211,
+      "grad_norm": 0.5859110951423645,
+      "learning_rate": 0.0002,
+      "loss": 1.1843,
+      "step": 220
+    },
+    {
+      "epoch": 0.2851828890266584,
+      "grad_norm": 0.5547062754631042,
+      "learning_rate": 0.0002,
+      "loss": 1.1784,
+      "step": 230
+    },
+    {
+      "epoch": 0.2975821450712957,
+      "grad_norm": 3.5287671089172363,
+      "learning_rate": 0.0002,
+      "loss": 1.2564,
+      "step": 240
+    },
+    {
+      "epoch": 0.30998140111593303,
+      "grad_norm": 0.8644460439682007,
+      "learning_rate": 0.0002,
+      "loss": 1.313,
+      "step": 250
+    },
+    {
+      "epoch": 0.32238065716057035,
+      "grad_norm": 0.6270064115524292,
+      "learning_rate": 0.0002,
+      "loss": 1.2187,
+      "step": 260
+    },
+    {
+      "epoch": 0.33477991320520767,
+      "grad_norm": 1.170295000076294,
+      "learning_rate": 0.0002,
+      "loss": 1.2017,
+      "step": 270
+    },
+    {
+      "epoch": 0.347179169249845,
+      "grad_norm": 0.5701245069503784,
+      "learning_rate": 0.0002,
+      "loss": 1.1075,
+      "step": 280
+    },
+    {
+      "epoch": 0.3595784252944823,
+      "grad_norm": 0.6373095512390137,
+      "learning_rate": 0.0002,
+      "loss": 1.1185,
+      "step": 290
+    },
+    {
+      "epoch": 0.3719776813391196,
+      "grad_norm": 0.5740704536437988,
+      "learning_rate": 0.0002,
+      "loss": 1.1738,
+      "step": 300
+    },
+    {
+      "epoch": 0.384376937383757,
+      "grad_norm": 0.5516835451126099,
+      "learning_rate": 0.0002,
+      "loss": 1.2858,
+      "step": 310
+    },
+    {
+      "epoch": 0.3967761934283943,
+      "grad_norm": 0.5212382078170776,
+      "learning_rate": 0.0002,
+      "loss": 1.2315,
+      "step": 320
+    },
+    {
+      "epoch": 0.40917544947303164,
+      "grad_norm": 0.540307343006134,
+      "learning_rate": 0.0002,
+      "loss": 1.172,
+      "step": 330
+    },
+    {
+      "epoch": 0.42157470551766896,
+      "grad_norm": 0.7454301714897156,
+      "learning_rate": 0.0002,
+      "loss": 1.2736,
+      "step": 340
+    },
+    {
+      "epoch": 0.4339739615623063,
+      "grad_norm": 0.7390317916870117,
+      "learning_rate": 0.0002,
+      "loss": 1.3013,
+      "step": 350
+    },
+    {
+      "epoch": 0.4463732176069436,
+      "grad_norm": 0.5498788356781006,
+      "learning_rate": 0.0002,
+      "loss": 1.0615,
+      "step": 360
+    },
+    {
+      "epoch": 0.4587724736515809,
+      "grad_norm": 0.5776252150535583,
+      "learning_rate": 0.0002,
+      "loss": 1.2251,
+      "step": 370
+    },
+    {
+      "epoch": 0.47117172969621823,
+      "grad_norm": 0.6941552758216858,
+      "learning_rate": 0.0002,
+      "loss": 1.1932,
+      "step": 380
+    },
+    {
+      "epoch": 0.48357098574085555,
+      "grad_norm": 0.7936233282089233,
+      "learning_rate": 0.0002,
+      "loss": 1.23,
+      "step": 390
+    },
+    {
+      "epoch": 0.49597024178549287,
+      "grad_norm": 0.5257220268249512,
+      "learning_rate": 0.0002,
+      "loss": 1.1137,
+      "step": 400
+    },
+    {
+      "epoch": 0.5083694978301302,
+      "grad_norm": 0.5740510821342468,
+      "learning_rate": 0.0002,
+      "loss": 1.1867,
+      "step": 410
+    },
+    {
+      "epoch": 0.5207687538747675,
+      "grad_norm": 0.6181507110595703,
+      "learning_rate": 0.0002,
+      "loss": 1.1049,
+      "step": 420
+    },
+    {
+      "epoch": 0.5331680099194048,
+      "grad_norm": 0.6333999037742615,
+      "learning_rate": 0.0002,
+      "loss": 1.2303,
+      "step": 430
+    },
+    {
+      "epoch": 0.5455672659640421,
+      "grad_norm": 0.5667845010757446,
+      "learning_rate": 0.0002,
+      "loss": 1.2457,
+      "step": 440
+    },
+    {
+      "epoch": 0.5579665220086795,
+      "grad_norm": 0.5254231095314026,
+      "learning_rate": 0.0002,
+      "loss": 1.2547,
+      "step": 450
+    },
+    {
+      "epoch": 0.5703657780533168,
+      "grad_norm": 0.5938495993614197,
+      "learning_rate": 0.0002,
+      "loss": 1.2118,
+      "step": 460
+    },
+    {
+      "epoch": 0.5827650340979541,
+      "grad_norm": 0.7733635902404785,
+      "learning_rate": 0.0002,
+      "loss": 1.2409,
+      "step": 470
+    },
+    {
+      "epoch": 0.5951642901425914,
+      "grad_norm": 0.6114753484725952,
+      "learning_rate": 0.0002,
+      "loss": 1.2343,
+      "step": 480
+    },
+    {
+      "epoch": 0.6075635461872287,
+      "grad_norm": 0.5587155818939209,
+      "learning_rate": 0.0002,
+      "loss": 1.1779,
+      "step": 490
+    },
+    {
+      "epoch": 0.6199628022318661,
+      "grad_norm": 0.7636917233467102,
+      "learning_rate": 0.0002,
+      "loss": 1.2136,
+      "step": 500
+    },
+    {
+      "epoch": 0.6323620582765034,
+      "grad_norm": 0.5896942615509033,
+      "learning_rate": 0.0002,
+      "loss": 1.1301,
+      "step": 510
+    },
+    {
+      "epoch": 0.6447613143211407,
+      "grad_norm": 0.8594750165939331,
+      "learning_rate": 0.0002,
+      "loss": 1.2089,
+      "step": 520
+    },
+    {
+      "epoch": 0.657160570365778,
+      "grad_norm": 0.6459881067276001,
+      "learning_rate": 0.0002,
+      "loss": 1.1551,
+      "step": 530
+    },
+    {
+      "epoch": 0.6695598264104153,
+      "grad_norm": 0.650656521320343,
+      "learning_rate": 0.0002,
+      "loss": 1.175,
+      "step": 540
+    },
+    {
+      "epoch": 0.6819590824550527,
+      "grad_norm": 0.7238242626190186,
+      "learning_rate": 0.0002,
+      "loss": 1.2143,
+      "step": 550
+    },
+    {
+      "epoch": 0.69435833849969,
+      "grad_norm": 0.6289859414100647,
+      "learning_rate": 0.0002,
+      "loss": 1.0961,
+      "step": 560
+    },
+    {
+      "epoch": 0.7067575945443273,
+      "grad_norm": 0.6108142137527466,
+      "learning_rate": 0.0002,
+      "loss": 1.2316,
+      "step": 570
+    },
+    {
+      "epoch": 0.7191568505889646,
+      "grad_norm": 0.6905024647712708,
+      "learning_rate": 0.0002,
+      "loss": 1.1315,
+      "step": 580
+    },
+    {
+      "epoch": 0.7315561066336019,
+      "grad_norm": 0.5975471138954163,
+      "learning_rate": 0.0002,
+      "loss": 1.2368,
+      "step": 590
+    },
+    {
+      "epoch": 0.7439553626782393,
+      "grad_norm": 0.49540066719055176,
+      "learning_rate": 0.0002,
+      "loss": 1.1014,
+      "step": 600
+    },
+    {
+      "epoch": 0.7563546187228767,
+      "grad_norm": 0.5365461707115173,
+      "learning_rate": 0.0002,
+      "loss": 1.1359,
+      "step": 610
+    },
+    {
+      "epoch": 0.768753874767514,
+      "grad_norm": 0.6156648993492126,
+      "learning_rate": 0.0002,
+      "loss": 1.2552,
+      "step": 620
+    },
+    {
+      "epoch": 0.7811531308121513,
+      "grad_norm": 0.656879186630249,
+      "learning_rate": 0.0002,
+      "loss": 1.1929,
+      "step": 630
+    },
+    {
+      "epoch": 0.7935523868567886,
+      "grad_norm": 0.8963037729263306,
+      "learning_rate": 0.0002,
+      "loss": 1.3063,
+      "step": 640
+    },
+    {
+      "epoch": 0.805951642901426,
+      "grad_norm": 1.0569753646850586,
+      "learning_rate": 0.0002,
+      "loss": 1.219,
+      "step": 650
+    },
+    {
+      "epoch": 0.8183508989460633,
+      "grad_norm": 0.7332107424736023,
+      "learning_rate": 0.0002,
+      "loss": 1.2563,
+      "step": 660
+    },
+    {
+      "epoch": 0.8307501549907006,
+      "grad_norm": 0.589097797870636,
+      "learning_rate": 0.0002,
+      "loss": 1.1029,
+      "step": 670
+    },
+    {
+      "epoch": 0.8431494110353379,
+      "grad_norm": 0.9553480744361877,
+      "learning_rate": 0.0002,
+      "loss": 1.1705,
+      "step": 680
+    },
+    {
+      "epoch": 0.8555486670799752,
+      "grad_norm": 0.7076331973075867,
+      "learning_rate": 0.0002,
+      "loss": 1.1605,
+      "step": 690
+    },
+    {
+      "epoch": 0.8679479231246126,
+      "grad_norm": 0.597531795501709,
+      "learning_rate": 0.0002,
+      "loss": 1.2346,
+      "step": 700
+    },
+    {
+      "epoch": 0.8803471791692499,
+      "grad_norm": 0.7023149132728577,
+      "learning_rate": 0.0002,
+      "loss": 1.1637,
+      "step": 710
+    },
+    {
+      "epoch": 0.8927464352138872,
+      "grad_norm": 1.4393764734268188,
+      "learning_rate": 0.0002,
+      "loss": 1.2717,
+      "step": 720
+    },
+    {
+      "epoch": 0.9051456912585245,
+      "grad_norm": 0.5944231152534485,
+      "learning_rate": 0.0002,
+      "loss": 1.216,
+      "step": 730
+    },
+    {
+      "epoch": 0.9175449473031618,
+      "grad_norm": 0.5712162852287292,
+      "learning_rate": 0.0002,
+      "loss": 1.148,
+      "step": 740
+    },
+    {
+      "epoch": 0.9299442033477991,
+      "grad_norm": 0.5335281491279602,
+      "learning_rate": 0.0002,
+      "loss": 1.2318,
+      "step": 750
+    },
+    {
+      "epoch": 0.9423434593924365,
+      "grad_norm": 0.8050292730331421,
+      "learning_rate": 0.0002,
+      "loss": 1.149,
+      "step": 760
+    },
+    {
+      "epoch": 0.9547427154370738,
+      "grad_norm": 0.6092700958251953,
+      "learning_rate": 0.0002,
+      "loss": 1.0862,
+      "step": 770
+    },
+    {
+      "epoch": 0.9671419714817111,
+      "grad_norm": 0.7012797594070435,
+      "learning_rate": 0.0002,
+      "loss": 1.3204,
+      "step": 780
+    },
+    {
+      "epoch": 0.9795412275263484,
+      "grad_norm": 0.6228184103965759,
+      "learning_rate": 0.0002,
+      "loss": 1.1641,
+      "step": 790
+    },
+    {
+      "epoch": 0.9919404835709857,
+      "grad_norm": 0.5482686161994934,
+      "learning_rate": 0.0002,
+      "loss": 1.131,
+      "step": 800
+    },
+    {
+      "epoch": 0.9993800371977681,
+      "eval_loss": 1.2057286500930786,
+      "eval_runtime": 164.6087,
+      "eval_samples_per_second": 2.77,
+      "eval_steps_per_second": 0.346,
+      "step": 806
+    },
+    {
+      "epoch": 1.004339739615623,
+      "grad_norm": 0.6331814527511597,
+      "learning_rate": 0.0002,
+      "loss": 1.0899,
+      "step": 810
+    },
+    {
+      "epoch": 1.0167389956602604,
+      "grad_norm": 0.6160872578620911,
+      "learning_rate": 0.0002,
+      "loss": 1.0551,
+      "step": 820
+    },
+    {
+      "epoch": 1.0291382517048977,
+      "grad_norm": 0.6104072332382202,
+      "learning_rate": 0.0002,
+      "loss": 0.9934,
+      "step": 830
+    },
+    {
+      "epoch": 1.041537507749535,
+      "grad_norm": 0.7619274854660034,
+      "learning_rate": 0.0002,
+      "loss": 1.0776,
+      "step": 840
+    },
+    {
+      "epoch": 1.0539367637941723,
+      "grad_norm": 0.761172890663147,
+      "learning_rate": 0.0002,
+      "loss": 0.9929,
+      "step": 850
+    },
+    {
+      "epoch": 1.0663360198388097,
+      "grad_norm": 0.7563514113426208,
+      "learning_rate": 0.0002,
+      "loss": 1.0543,
+      "step": 860
+    },
+    {
+      "epoch": 1.078735275883447,
+      "grad_norm": 0.521998941898346,
+      "learning_rate": 0.0002,
+      "loss": 1.0812,
+      "step": 870
+    },
+    {
+      "epoch": 1.0911345319280843,
+      "grad_norm": 0.824347972869873,
+      "learning_rate": 0.0002,
+      "loss": 1.1417,
+      "step": 880
+    },
+    {
+      "epoch": 1.1035337879727216,
+      "grad_norm": 0.5645424127578735,
+      "learning_rate": 0.0002,
+      "loss": 1.1096,
+      "step": 890
+    },
+    {
+      "epoch": 1.115933044017359,
+      "grad_norm": 0.8568223714828491,
+      "learning_rate": 0.0002,
+      "loss": 1.1005,
+      "step": 900
+    },
+    {
+      "epoch": 1.1283323000619963,
+      "grad_norm": 0.68181312084198,
+      "learning_rate": 0.0002,
+      "loss": 1.088,
+      "step": 910
+    },
+    {
+      "epoch": 1.1407315561066336,
+      "grad_norm": 0.7577647566795349,
+      "learning_rate": 0.0002,
+      "loss": 1.0281,
+      "step": 920
+    },
+    {
+      "epoch": 1.153130812151271,
+      "grad_norm": 0.6968798637390137,
+      "learning_rate": 0.0002,
+      "loss": 0.9812,
+      "step": 930
+    },
+    {
+      "epoch": 1.1655300681959082,
+      "grad_norm": 0.5769661664962769,
+      "learning_rate": 0.0002,
+      "loss": 1.0539,
+      "step": 940
+    },
+    {
+      "epoch": 1.1779293242405455,
+      "grad_norm": 0.6399155259132385,
+      "learning_rate": 0.0002,
+      "loss": 1.0831,
+      "step": 950
+    },
+    {
+      "epoch": 1.1903285802851828,
+      "grad_norm": 0.9824289679527283,
+      "learning_rate": 0.0002,
+      "loss": 1.0464,
+      "step": 960
+    },
+    {
+      "epoch": 1.2027278363298202,
+      "grad_norm": 0.7485893964767456,
+      "learning_rate": 0.0002,
+      "loss": 1.1068,
+      "step": 970
+    },
+    {
+      "epoch": 1.2151270923744575,
+      "grad_norm": 0.668736457824707,
+      "learning_rate": 0.0002,
+      "loss": 1.0047,
+      "step": 980
+    },
+    {
+      "epoch": 1.2275263484190948,
+      "grad_norm": 0.7041404843330383,
+      "learning_rate": 0.0002,
+      "loss": 1.0818,
+      "step": 990
+    },
+    {
+      "epoch": 1.2399256044637321,
+      "grad_norm": 0.7070603966712952,
+      "learning_rate": 0.0002,
+      "loss": 1.0847,
+      "step": 1000
+    },
+    {
+      "epoch": 1.2523248605083694,
+      "grad_norm": 0.7828628420829773,
+      "learning_rate": 0.0002,
+      "loss": 1.047,
+      "step": 1010
+    },
+    {
+      "epoch": 1.2647241165530068,
+      "grad_norm": 0.7149654626846313,
+      "learning_rate": 0.0002,
+      "loss": 1.0658,
+      "step": 1020
+    },
+    {
+      "epoch": 1.277123372597644,
+      "grad_norm": 0.7691766619682312,
+      "learning_rate": 0.0002,
+      "loss": 0.9791,
+      "step": 1030
+    },
+    {
+      "epoch": 1.2895226286422814,
+      "grad_norm": 0.8022137880325317,
+      "learning_rate": 0.0002,
+      "loss": 1.0242,
+      "step": 1040
+    },
+    {
+      "epoch": 1.3019218846869187,
+      "grad_norm": 0.6709204316139221,
+      "learning_rate": 0.0002,
+      "loss": 1.0837,
+      "step": 1050
+    },
+    {
+      "epoch": 1.314321140731556,
+      "grad_norm": 0.7368158102035522,
+      "learning_rate": 0.0002,
+      "loss": 1.0382,
+      "step": 1060
+    },
+    {
+      "epoch": 1.3267203967761934,
+      "grad_norm": 0.8408007621765137,
+      "learning_rate": 0.0002,
+      "loss": 1.0371,
+      "step": 1070
+    },
+    {
+      "epoch": 1.3391196528208307,
+      "grad_norm": 1.2165539264678955,
+      "learning_rate": 0.0002,
+      "loss": 0.9633,
+      "step": 1080
+    },
+    {
+      "epoch": 1.351518908865468,
+      "grad_norm": 0.7284916043281555,
+      "learning_rate": 0.0002,
+      "loss": 1.0079,
+      "step": 1090
+    },
+    {
+      "epoch": 1.3639181649101053,
+      "grad_norm": 0.7994557619094849,
+      "learning_rate": 0.0002,
+      "loss": 1.0211,
+      "step": 1100
+    },
+    {
+      "epoch": 1.3763174209547429,
+      "grad_norm": 0.9658345580101013,
+      "learning_rate": 0.0002,
+      "loss": 1.0892,
+      "step": 1110
+    },
+    {
+      "epoch": 1.3887166769993802,
+      "grad_norm": 0.6312829852104187,
+      "learning_rate": 0.0002,
+      "loss": 1.2088,
+      "step": 1120
+    },
+    {
+      "epoch": 1.4011159330440175,
+      "grad_norm": 0.7263661026954651,
+      "learning_rate": 0.0002,
+      "loss": 1.1055,
+      "step": 1130
+    },
+    {
+      "epoch": 1.4135151890886548,
+      "grad_norm": 0.829082727432251,
+      "learning_rate": 0.0002,
+      "loss": 1.0232,
+      "step": 1140
+    },
+    {
+      "epoch": 1.4259144451332921,
+      "grad_norm": 0.6168127059936523,
+      "learning_rate": 0.0002,
+      "loss": 1.1413,
+      "step": 1150
+    },
+    {
+      "epoch": 1.4383137011779294,
+      "grad_norm": 0.8351425528526306,
+      "learning_rate": 0.0002,
+      "loss": 1.0283,
+      "step": 1160
+    },
+    {
+      "epoch": 1.4507129572225668,
+      "grad_norm": 0.8814472556114197,
+      "learning_rate": 0.0002,
+      "loss": 1.1146,
+      "step": 1170
+    },
+    {
+      "epoch": 1.463112213267204,
+      "grad_norm": 0.6913689970970154,
+      "learning_rate": 0.0002,
+      "loss": 1.0932,
+      "step": 1180
+    },
+    {
+      "epoch": 1.4755114693118414,
+      "grad_norm": 0.7907165884971619,
+      "learning_rate": 0.0002,
+      "loss": 1.1066,
+      "step": 1190
+    },
+    {
+      "epoch": 1.4879107253564787,
+      "grad_norm": 0.8361626267433167,
+      "learning_rate": 0.0002,
+      "loss": 1.0738,
+      "step": 1200
+    },
+    {
+      "epoch": 1.500309981401116,
+      "grad_norm": 1.073534607887268,
+      "learning_rate": 0.0002,
+      "loss": 1.0559,
+      "step": 1210
+    },
+    {
+      "epoch": 1.5127092374457534,
+      "grad_norm": 0.8416345119476318,
+      "learning_rate": 0.0002,
+      "loss": 1.0204,
+      "step": 1220
+    },
+    {
+      "epoch": 1.5251084934903907,
+      "grad_norm": 1.0225597620010376,
+      "learning_rate": 0.0002,
+      "loss": 1.0941,
+      "step": 1230
+    },
+    {
+      "epoch": 1.537507749535028,
+      "grad_norm": 0.6662965416908264,
+      "learning_rate": 0.0002,
+      "loss": 1.0854,
+      "step": 1240
+    },
+    {
+      "epoch": 1.5499070055796653,
+      "grad_norm": 0.7363991737365723,
+      "learning_rate": 0.0002,
+      "loss": 1.0816,
+      "step": 1250
+    },
+    {
+      "epoch": 1.5623062616243026,
+      "grad_norm": 0.9029574990272522,
+      "learning_rate": 0.0002,
+      "loss": 1.0271,
+      "step": 1260
+    },
+    {
+      "epoch": 1.57470551766894,
+      "grad_norm": 0.7992424368858337,
+      "learning_rate": 0.0002,
+      "loss": 1.0206,
+      "step": 1270
+    },
+    {
+      "epoch": 1.5871047737135773,
+      "grad_norm": 0.8108977675437927,
+      "learning_rate": 0.0002,
+      "loss": 1.0114,
+      "step": 1280
+    },
+    {
+      "epoch": 1.5995040297582146,
+      "grad_norm": 0.8257458806037903,
+      "learning_rate": 0.0002,
+      "loss": 1.0264,
+      "step": 1290
+    },
+    {
+      "epoch": 1.611903285802852,
+      "grad_norm": 0.8265092968940735,
+      "learning_rate": 0.0002,
+      "loss": 1.0944,
+      "step": 1300
+    },
+    {
+      "epoch": 1.6243025418474892,
+      "grad_norm": 0.6568580269813538,
+      "learning_rate": 0.0002,
+      "loss": 1.0136,
+      "step": 1310
+    },
+    {
+      "epoch": 1.6367017978921266,
+      "grad_norm": 0.7608488202095032,
+      "learning_rate": 0.0002,
+      "loss": 1.009,
+      "step": 1320
+    },
+    {
+      "epoch": 1.6491010539367639,
+      "grad_norm": 0.7511259317398071,
+      "learning_rate": 0.0002,
+      "loss": 1.1202,
+      "step": 1330
+    },
+    {
+      "epoch": 1.6615003099814012,
+      "grad_norm": 0.7942162752151489,
+      "learning_rate": 0.0002,
+      "loss": 1.0528,
+      "step": 1340
+    },
+    {
+      "epoch": 1.6738995660260385,
+      "grad_norm": 0.8253659605979919,
+      "learning_rate": 0.0002,
+      "loss": 1.0411,
+      "step": 1350
+    },
+    {
+      "epoch": 1.6862988220706758,
+      "grad_norm": 1.1318382024765015,
+      "learning_rate": 0.0002,
+      "loss": 1.001,
+      "step": 1360
+    },
+    {
+      "epoch": 1.6986980781153131,
+      "grad_norm": 0.693403959274292,
+      "learning_rate": 0.0002,
+      "loss": 1.0727,
+      "step": 1370
+    },
+    {
+      "epoch": 1.7110973341599505,
+      "grad_norm": 0.7107617259025574,
+      "learning_rate": 0.0002,
+      "loss": 1.073,
+      "step": 1380
+    },
+    {
+      "epoch": 1.7234965902045878,
+      "grad_norm": 0.8169032335281372,
+      "learning_rate": 0.0002,
+      "loss": 1.0849,
+      "step": 1390
+    },
+    {
+      "epoch": 1.735895846249225,
+      "grad_norm": 0.8940841555595398,
+      "learning_rate": 0.0002,
+      "loss": 1.0578,
+      "step": 1400
+    },
+    {
+      "epoch": 1.7482951022938624,
+      "grad_norm": 0.7862188220024109,
+      "learning_rate": 0.0002,
+      "loss": 1.0891,
+      "step": 1410
+    },
+    {
+      "epoch": 1.7606943583384997,
+      "grad_norm": 1.136338472366333,
+      "learning_rate": 0.0002,
+      "loss": 0.9962,
+      "step": 1420
+    },
+    {
+      "epoch": 1.773093614383137,
+      "grad_norm": 0.9534069895744324,
+      "learning_rate": 0.0002,
+      "loss": 1.0943,
+      "step": 1430
+    },
+    {
+      "epoch": 1.7854928704277744,
+      "grad_norm": 1.0747562646865845,
+      "learning_rate": 0.0002,
+      "loss": 1.1257,
+      "step": 1440
+    },
+    {
+      "epoch": 1.7978921264724117,
+      "grad_norm": 0.8557891249656677,
+      "learning_rate": 0.0002,
+      "loss": 1.0556,
+      "step": 1450
+    },
+    {
+      "epoch": 1.810291382517049,
+      "grad_norm": 0.6829259991645813,
+      "learning_rate": 0.0002,
+      "loss": 1.0128,
+      "step": 1460
+    },
+    {
+      "epoch": 1.8226906385616863,
+      "grad_norm": 0.8164441585540771,
+      "learning_rate": 0.0002,
+      "loss": 1.0313,
+      "step": 1470
+    },
+    {
+      "epoch": 1.8350898946063237,
+      "grad_norm": 0.9458068609237671,
+      "learning_rate": 0.0002,
+      "loss": 1.1136,
+      "step": 1480
+    },
+    {
+      "epoch": 1.847489150650961,
+      "grad_norm": 0.743009626865387,
+      "learning_rate": 0.0002,
+      "loss": 1.0457,
+      "step": 1490
+    },
+    {
+      "epoch": 1.8598884066955983,
+      "grad_norm": 0.7137694358825684,
+      "learning_rate": 0.0002,
+      "loss": 1.0107,
+      "step": 1500
+    },
+    {
+      "epoch": 1.8722876627402356,
+      "grad_norm": 0.7618028521537781,
+      "learning_rate": 0.0002,
+      "loss": 1.0633,
+      "step": 1510
+    },
+    {
+      "epoch": 1.884686918784873,
+      "grad_norm": 0.8153398633003235,
+      "learning_rate": 0.0002,
+      "loss": 1.103,
+      "step": 1520
+    },
+    {
+      "epoch": 1.8970861748295103,
+      "grad_norm": 0.9127124547958374,
+      "learning_rate": 0.0002,
+      "loss": 1.2094,
+      "step": 1530
+    },
+    {
+      "epoch": 1.9094854308741476,
+      "grad_norm": 0.7699425220489502,
+      "learning_rate": 0.0002,
+      "loss": 1.0379,
+      "step": 1540
+    },
+    {
+      "epoch": 1.921884686918785,
+      "grad_norm": 0.8807545304298401,
+      "learning_rate": 0.0002,
+      "loss": 0.9849,
+      "step": 1550
+    },
+    {
+      "epoch": 1.9342839429634222,
+      "grad_norm": 0.7340815663337708,
+      "learning_rate": 0.0002,
+      "loss": 1.033,
+      "step": 1560
+    },
+    {
+      "epoch": 1.9466831990080595,
+      "grad_norm": 1.070056676864624,
+      "learning_rate": 0.0002,
+      "loss": 1.0083,
+      "step": 1570
+    },
+    {
+      "epoch": 1.9590824550526968,
+      "grad_norm": 0.8195573687553406,
+      "learning_rate": 0.0002,
+      "loss": 1.0023,
+      "step": 1580
+    },
+    {
+      "epoch": 1.9714817110973342,
+      "grad_norm": 0.7938687205314636,
+      "learning_rate": 0.0002,
+      "loss": 1.029,
+      "step": 1590
+    },
+    {
+      "epoch": 1.9838809671419715,
+      "grad_norm": 0.7632259726524353,
+      "learning_rate": 0.0002,
+      "loss": 1.0512,
+      "step": 1600
+    },
+    {
+      "epoch": 1.9962802231866088,
+      "grad_norm": 0.7921916246414185,
+      "learning_rate": 0.0002,
+      "loss": 1.0426,
+      "step": 1610
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.2014765739440918,
+      "eval_runtime": 159.8677,
+      "eval_samples_per_second": 2.852,
+      "eval_steps_per_second": 0.357,
+      "step": 1613
+    },
+    {
+      "epoch": 2.008679479231246,
+      "grad_norm": 1.1764529943466187,
+      "learning_rate": 0.0002,
+      "loss": 0.9239,
+      "step": 1620
+    },
+    {
+      "epoch": 2.0210787352758834,
+      "grad_norm": 1.0271947383880615,
+      "learning_rate": 0.0002,
+      "loss": 0.7995,
+      "step": 1630
+    },
+    {
+      "epoch": 2.0334779913205208,
+      "grad_norm": 0.7138071656227112,
+      "learning_rate": 0.0002,
+      "loss": 0.8592,
+      "step": 1640
+    },
+    {
+      "epoch": 2.045877247365158,
+      "grad_norm": 0.8644373416900635,
+      "learning_rate": 0.0002,
+      "loss": 0.8106,
+      "step": 1650
+    },
+    {
+      "epoch": 2.0582765034097954,
+      "grad_norm": 1.2262420654296875,
+      "learning_rate": 0.0002,
+      "loss": 0.8578,
+      "step": 1660
+    },
+    {
+      "epoch": 2.0706757594544327,
+      "grad_norm": 0.9718686938285828,
+      "learning_rate": 0.0002,
+      "loss": 0.8009,
+      "step": 1670
+    },
+    {
+      "epoch": 2.08307501549907,
+      "grad_norm": 1.0075122117996216,
+      "learning_rate": 0.0002,
+      "loss": 0.831,
+      "step": 1680
+    },
+    {
+      "epoch": 2.0954742715437074,
+      "grad_norm": 1.2113722562789917,
+      "learning_rate": 0.0002,
+      "loss": 0.8177,
+      "step": 1690
+    },
+    {
+      "epoch": 2.1078735275883447,
+      "grad_norm": 0.7911604642868042,
+      "learning_rate": 0.0002,
+      "loss": 0.8377,
+      "step": 1700
+    },
+    {
+      "epoch": 2.120272783632982,
+      "grad_norm": 0.8578933477401733,
+      "learning_rate": 0.0002,
+      "loss": 0.8405,
+      "step": 1710
+    },
+    {
+      "epoch": 2.1326720396776193,
+      "grad_norm": 1.1782084703445435,
+      "learning_rate": 0.0002,
+      "loss": 0.8784,
+      "step": 1720
+    },
+    {
+      "epoch": 2.1450712957222566,
+      "grad_norm": 1.3627573251724243,
+      "learning_rate": 0.0002,
+      "loss": 0.8543,
+      "step": 1730
+    },
+    {
+      "epoch": 2.157470551766894,
+      "grad_norm": 1.2948803901672363,
+      "learning_rate": 0.0002,
+      "loss": 0.8404,
+      "step": 1740
+    },
+    {
+      "epoch": 2.1698698078115313,
+      "grad_norm": 0.9353442788124084,
+      "learning_rate": 0.0002,
+      "loss": 0.8719,
+      "step": 1750
+    },
+    {
+      "epoch": 2.1822690638561686,
+      "grad_norm": 0.9063374400138855,
+      "learning_rate": 0.0002,
+      "loss": 0.8112,
+      "step": 1760
+    },
+    {
+      "epoch": 2.194668319900806,
+      "grad_norm": 1.3354851007461548,
+      "learning_rate": 0.0002,
+      "loss": 0.9441,
+      "step": 1770
+    },
+    {
+      "epoch": 2.2070675759454432,
+      "grad_norm": 0.8388507962226868,
+      "learning_rate": 0.0002,
+      "loss": 0.877,
+      "step": 1780
+    },
+    {
+      "epoch": 2.2194668319900805,
+      "grad_norm": 0.9509401321411133,
+      "learning_rate": 0.0002,
+      "loss": 0.8709,
+      "step": 1790
+    },
+    {
+      "epoch": 2.231866088034718,
+      "grad_norm": 1.0458593368530273,
+      "learning_rate": 0.0002,
+      "loss": 0.8212,
+      "step": 1800
+    },
+    {
+      "epoch": 2.244265344079355,
+      "grad_norm": 0.890088677406311,
+      "learning_rate": 0.0002,
+      "loss": 0.7667,
+      "step": 1810
+    },
+    {
+      "epoch": 2.2566646001239925,
+      "grad_norm": 1.1933976411819458,
+      "learning_rate": 0.0002,
+      "loss": 0.8431,
+      "step": 1820
+    },
+    {
+      "epoch": 2.26906385616863,
+      "grad_norm": 0.961398184299469,
+      "learning_rate": 0.0002,
+      "loss": 0.8697,
+      "step": 1830
+    },
+    {
+      "epoch": 2.281463112213267,
+      "grad_norm": 1.124961495399475,
+      "learning_rate": 0.0002,
+      "loss": 0.8403,
+      "step": 1840
+    },
+    {
+      "epoch": 2.2938623682579045,
+      "grad_norm": 0.9042379260063171,
+      "learning_rate": 0.0002,
+      "loss": 0.8431,
+      "step": 1850
+    },
+    {
+      "epoch": 2.306261624302542,
+      "grad_norm": 1.2250864505767822,
+      "learning_rate": 0.0002,
+      "loss": 0.8866,
+      "step": 1860
+    },
+    {
+      "epoch": 2.318660880347179,
+      "grad_norm": 1.1758817434310913,
+      "learning_rate": 0.0002,
+      "loss": 0.8514,
+      "step": 1870
+    },
+    {
+      "epoch": 2.3310601363918164,
+      "grad_norm": 0.9863199591636658,
+      "learning_rate": 0.0002,
+      "loss": 0.9316,
+      "step": 1880
+    },
+    {
+      "epoch": 2.3434593924364537,
+      "grad_norm": 1.1759305000305176,
+      "learning_rate": 0.0002,
+      "loss": 0.8854,
+      "step": 1890
+    },
+    {
+      "epoch": 2.355858648481091,
+      "grad_norm": 0.995716392993927,
+      "learning_rate": 0.0002,
+      "loss": 0.866,
+      "step": 1900
+    },
+    {
+      "epoch": 2.3682579045257284,
+      "grad_norm": 1.1816585063934326,
+      "learning_rate": 0.0002,
+      "loss": 0.8439,
+      "step": 1910
+    },
+    {
+      "epoch": 2.3806571605703657,
+      "grad_norm": 0.7498432397842407,
+      "learning_rate": 0.0002,
+      "loss": 0.9284,
+      "step": 1920
+    },
+    {
+      "epoch": 2.393056416615003,
+      "grad_norm": 0.9481443762779236,
+      "learning_rate": 0.0002,
+      "loss": 0.8243,
+      "step": 1930
+    },
+    {
+      "epoch": 2.4054556726596403,
+      "grad_norm": 1.1264584064483643,
+      "learning_rate": 0.0002,
+      "loss": 0.8083,
+      "step": 1940
+    },
+    {
+      "epoch": 2.4178549287042777,
+      "grad_norm": 0.8826232552528381,
+      "learning_rate": 0.0002,
+      "loss": 0.9122,
+      "step": 1950
+    },
+    {
+      "epoch": 2.430254184748915,
+      "grad_norm": 0.9702113270759583,
+      "learning_rate": 0.0002,
+      "loss": 0.8764,
+      "step": 1960
+    },
+    {
+      "epoch": 2.4426534407935523,
+      "grad_norm": 1.0663695335388184,
+      "learning_rate": 0.0002,
+      "loss": 0.8498,
+      "step": 1970
+    },
+    {
+      "epoch": 2.4550526968381896,
+      "grad_norm": 1.1186119318008423,
+      "learning_rate": 0.0002,
+      "loss": 0.888,
+      "step": 1980
+    },
+    {
+      "epoch": 2.467451952882827,
+      "grad_norm": 1.428774118423462,
+      "learning_rate": 0.0002,
+      "loss": 0.9327,
+      "step": 1990
+    },
+    {
+      "epoch": 2.4798512089274642,
+      "grad_norm": 1.3054901361465454,
+      "learning_rate": 0.0002,
+      "loss": 0.9423,
+      "step": 2000
+    },
+    {
+      "epoch": 2.4922504649721016,
+      "grad_norm": 0.9893805384635925,
+      "learning_rate": 0.0002,
+      "loss": 0.8494,
+      "step": 2010
+    },
+    {
+      "epoch": 2.504649721016739,
+      "grad_norm": 1.149538516998291,
+      "learning_rate": 0.0002,
+      "loss": 0.9982,
+      "step": 2020
+    },
+    {
+      "epoch": 2.517048977061376,
+      "grad_norm": 0.8716336488723755,
+      "learning_rate": 0.0002,
+      "loss": 0.881,
+      "step": 2030
+    },
+    {
+      "epoch": 2.5294482331060135,
+      "grad_norm": 1.0464730262756348,
+      "learning_rate": 0.0002,
+      "loss": 0.8483,
+      "step": 2040
+    },
+    {
+      "epoch": 2.541847489150651,
+      "grad_norm": 1.1451894044876099,
+      "learning_rate": 0.0002,
+      "loss": 0.9475,
+      "step": 2050
+    },
+    {
+      "epoch": 2.554246745195288,
+      "grad_norm": 1.3266205787658691,
+      "learning_rate": 0.0002,
+      "loss": 0.8238,
+      "step": 2060
+    },
+    {
+      "epoch": 2.5666460012399255,
+      "grad_norm": 1.2838176488876343,
+      "learning_rate": 0.0002,
+      "loss": 0.8457,
+      "step": 2070
+    },
+    {
+      "epoch": 2.579045257284563,
+      "grad_norm": 1.0352915525436401,
+      "learning_rate": 0.0002,
+      "loss": 0.7813,
+      "step": 2080
+    },
+    {
+      "epoch": 2.5914445133292,
+      "grad_norm": 1.181416392326355,
+      "learning_rate": 0.0002,
+      "loss": 0.895,
+      "step": 2090
+    },
+    {
+      "epoch": 2.6038437693738374,
+      "grad_norm": 1.2425765991210938,
+      "learning_rate": 0.0002,
+      "loss": 0.8537,
+      "step": 2100
+    },
+    {
+      "epoch": 2.6162430254184748,
+      "grad_norm": 1.2885762453079224,
+      "learning_rate": 0.0002,
+      "loss": 0.8561,
+      "step": 2110
+    },
+    {
+      "epoch": 2.628642281463112,
+      "grad_norm": 1.0179181098937988,
+      "learning_rate": 0.0002,
+      "loss": 0.8024,
+      "step": 2120
+    },
+    {
+      "epoch": 2.6410415375077494,
+      "grad_norm": 1.4908100366592407,
+      "learning_rate": 0.0002,
+      "loss": 0.8747,
+      "step": 2130
+    },
+    {
+      "epoch": 2.6534407935523867,
+      "grad_norm": 1.4854460954666138,
+      "learning_rate": 0.0002,
+      "loss": 0.8475,
+      "step": 2140
+    },
+    {
+      "epoch": 2.665840049597024,
+      "grad_norm": 0.994413435459137,
+      "learning_rate": 0.0002,
+      "loss": 0.8579,
+      "step": 2150
+    },
+    {
+      "epoch": 2.6782393056416613,
+      "grad_norm": 1.177201271057129,
+      "learning_rate": 0.0002,
+      "loss": 0.8606,
+      "step": 2160
+    },
+    {
+      "epoch": 2.6906385616862987,
+      "grad_norm": 1.2680933475494385,
+      "learning_rate": 0.0002,
+      "loss": 0.9233,
+      "step": 2170
+    },
+    {
+      "epoch": 2.703037817730936,
+      "grad_norm": 1.2201054096221924,
+      "learning_rate": 0.0002,
+      "loss": 0.8443,
+      "step": 2180
+    },
+    {
+      "epoch": 2.7154370737755733,
+      "grad_norm": 1.2058831453323364,
+      "learning_rate": 0.0002,
+      "loss": 0.8437,
+      "step": 2190
+    },
+    {
+      "epoch": 2.7278363298202106,
+      "grad_norm": 1.1667239665985107,
+      "learning_rate": 0.0002,
+      "loss": 0.9894,
+      "step": 2200
+    },
+    {
+      "epoch": 2.740235585864848,
+      "grad_norm": 1.1243321895599365,
+      "learning_rate": 0.0002,
+      "loss": 0.8501,
+      "step": 2210
+    },
+    {
+      "epoch": 2.7526348419094857,
+      "grad_norm": 1.0543156862258911,
+      "learning_rate": 0.0002,
+      "loss": 0.9387,
+      "step": 2220
+    },
+    {
+      "epoch": 2.765034097954123,
+      "grad_norm": 1.1922553777694702,
+      "learning_rate": 0.0002,
+      "loss": 0.9488,
+      "step": 2230
+    },
+    {
+      "epoch": 2.7774333539987603,
+      "grad_norm": 1.1266813278198242,
+      "learning_rate": 0.0002,
+      "loss": 0.8558,
+      "step": 2240
+    },
+    {
+      "epoch": 2.7898326100433977,
+      "grad_norm": 0.9645159840583801,
+      "learning_rate": 0.0002,
+      "loss": 0.8459,
+      "step": 2250
+    },
+    {
+      "epoch": 2.802231866088035,
+      "grad_norm": 1.0672235488891602,
+      "learning_rate": 0.0002,
+      "loss": 0.8862,
+      "step": 2260
+    },
+    {
+      "epoch": 2.8146311221326723,
+      "grad_norm": 1.5650453567504883,
+      "learning_rate": 0.0002,
+      "loss": 0.869,
+      "step": 2270
+    },
+    {
+      "epoch": 2.8270303781773096,
+      "grad_norm": 1.0414438247680664,
+      "learning_rate": 0.0002,
+      "loss": 0.8,
+      "step": 2280
+    },
+    {
+      "epoch": 2.839429634221947,
+      "grad_norm": 0.8878290057182312,
+      "learning_rate": 0.0002,
+      "loss": 0.8419,
+      "step": 2290
+    },
+    {
+      "epoch": 2.8518288902665843,
+      "grad_norm": 1.0500553846359253,
+      "learning_rate": 0.0002,
+      "loss": 0.9148,
+      "step": 2300
+    },
+    {
+      "epoch": 2.8642281463112216,
+      "grad_norm": 0.9779142737388611,
+      "learning_rate": 0.0002,
+      "loss": 0.8706,
+      "step": 2310
+    },
+    {
+      "epoch": 2.876627402355859,
+      "grad_norm": 0.8904196619987488,
+      "learning_rate": 0.0002,
+      "loss": 0.8385,
+      "step": 2320
+    },
+    {
+      "epoch": 2.889026658400496,
+      "grad_norm": 1.103608250617981,
+      "learning_rate": 0.0002,
+      "loss": 0.8768,
+      "step": 2330
+    },
+    {
+      "epoch": 2.9014259144451335,
+      "grad_norm": 1.2064822912216187,
+      "learning_rate": 0.0002,
+      "loss": 0.8659,
+      "step": 2340
+    },
+    {
+      "epoch": 2.913825170489771,
+      "grad_norm": 1.3073748350143433,
+      "learning_rate": 0.0002,
+      "loss": 0.9299,
+      "step": 2350
+    },
+    {
+      "epoch": 2.926224426534408,
+      "grad_norm": 1.4792760610580444,
+      "learning_rate": 0.0002,
+      "loss": 0.778,
+      "step": 2360
+    },
+    {
+      "epoch": 2.9386236825790455,
+      "grad_norm": 1.1670116186141968,
+      "learning_rate": 0.0002,
+      "loss": 0.9773,
+      "step": 2370
+    },
+    {
+      "epoch": 2.951022938623683,
+      "grad_norm": 1.235465168952942,
+      "learning_rate": 0.0002,
+      "loss": 0.8973,
+      "step": 2380
+    },
+    {
+      "epoch": 2.96342219466832,
+      "grad_norm": 1.7734158039093018,
+      "learning_rate": 0.0002,
+      "loss": 0.8646,
+      "step": 2390
+    },
+    {
+      "epoch": 2.9758214507129574,
+      "grad_norm": 1.3497414588928223,
+      "learning_rate": 0.0002,
+      "loss": 0.8784,
+      "step": 2400
+    },
+    {
+      "epoch": 2.9882207067575948,
+      "grad_norm": 1.1425493955612183,
+      "learning_rate": 0.0002,
+      "loss": 0.9116,
+      "step": 2410
+    },
+    {
+      "epoch": 2.999380037197768,
+      "eval_loss": 1.2303974628448486,
+      "eval_runtime": 126.4856,
+      "eval_samples_per_second": 3.605,
+      "eval_steps_per_second": 0.451,
+      "step": 2419
+    },
+    {
+      "epoch": 3.000619962802232,
+      "grad_norm": 1.4001394510269165,
+      "learning_rate": 0.0002,
+      "loss": 0.9395,
+      "step": 2420
+    },
+    {
+      "epoch": 3.0130192188468694,
+      "grad_norm": 2.4510438442230225,
+      "learning_rate": 0.0002,
+      "loss": 0.6538,
+      "step": 2430
+    },
+    {
+      "epoch": 3.0254184748915067,
+      "grad_norm": 1.5374444723129272,
+      "learning_rate": 0.0002,
+      "loss": 0.6732,
+      "step": 2440
+    },
+    {
+      "epoch": 3.037817730936144,
+      "grad_norm": 1.632250428199768,
+      "learning_rate": 0.0002,
+      "loss": 0.6934,
+      "step": 2450
+    },
+    {
+      "epoch": 3.0502169869807814,
+      "grad_norm": 1.5456780195236206,
+      "learning_rate": 0.0002,
+      "loss": 0.6266,
+      "step": 2460
+    },
+    {
+      "epoch": 3.0626162430254187,
+      "grad_norm": 1.3664451837539673,
+      "learning_rate": 0.0002,
+      "loss": 0.6467,
+      "step": 2470
+    },
+    {
+      "epoch": 3.075015499070056,
+      "grad_norm": 1.344169020652771,
+      "learning_rate": 0.0002,
+      "loss": 0.6351,
+      "step": 2480
+    },
+    {
+      "epoch": 3.0874147551146933,
+      "grad_norm": 0.9710949659347534,
+      "learning_rate": 0.0002,
+      "loss": 0.664,
+      "step": 2490
+    },
+    {
+      "epoch": 3.0998140111593306,
+      "grad_norm": 2.324171304702759,
+      "learning_rate": 0.0002,
+      "loss": 0.6232,
+      "step": 2500
+    },
+    {
+      "epoch": 3.112213267203968,
+      "grad_norm": 1.2885396480560303,
+      "learning_rate": 0.0002,
+      "loss": 0.7336,
+      "step": 2510
+    },
+    {
+      "epoch": 3.1246125232486053,
+      "grad_norm": 1.224718451499939,
+      "learning_rate": 0.0002,
+      "loss": 0.7095,
+      "step": 2520
+    },
+    {
+      "epoch": 3.1370117792932426,
+      "grad_norm": 1.1158969402313232,
+      "learning_rate": 0.0002,
+      "loss": 0.6451,
+      "step": 2530
+    },
+    {
+      "epoch": 3.14941103533788,
+      "grad_norm": 1.189963698387146,
+      "learning_rate": 0.0002,
+      "loss": 0.6024,
+      "step": 2540
+    },
+    {
+      "epoch": 3.1618102913825172,
+      "grad_norm": 1.2543222904205322,
+      "learning_rate": 0.0002,
+      "loss": 0.6996,
+      "step": 2550
+    },
+    {
+      "epoch": 3.1742095474271546,
+      "grad_norm": 1.4986658096313477,
+      "learning_rate": 0.0002,
+      "loss": 0.6854,
+      "step": 2560
+    },
+    {
+      "epoch": 3.186608803471792,
+      "grad_norm": 1.5848976373672485,
+      "learning_rate": 0.0002,
+      "loss": 0.5936,
+      "step": 2570
+    },
+    {
+      "epoch": 3.199008059516429,
+      "grad_norm": 1.2306287288665771,
+      "learning_rate": 0.0002,
+      "loss": 0.64,
+      "step": 2580
+    },
+    {
+      "epoch": 3.2114073155610665,
+      "grad_norm": 1.6327801942825317,
+      "learning_rate": 0.0002,
+      "loss": 0.6381,
+      "step": 2590
+    },
+    {
+      "epoch": 3.223806571605704,
+      "grad_norm": 1.191624402999878,
+      "learning_rate": 0.0002,
+      "loss": 0.6614,
+      "step": 2600
+    },
+    {
+      "epoch": 3.236205827650341,
+      "grad_norm": 1.546857476234436,
+      "learning_rate": 0.0002,
+      "loss": 0.5862,
+      "step": 2610
+    },
+    {
+      "epoch": 3.2486050836949785,
+      "grad_norm": 1.7683172225952148,
+      "learning_rate": 0.0002,
+      "loss": 0.697,
+      "step": 2620
+    },
+    {
+      "epoch": 3.261004339739616,
+      "grad_norm": 1.3910621404647827,
+      "learning_rate": 0.0002,
+      "loss": 0.6909,
+      "step": 2630
+    },
+    {
+      "epoch": 3.273403595784253,
+      "grad_norm": 1.205353021621704,
+      "learning_rate": 0.0002,
+      "loss": 0.6322,
+      "step": 2640
+    },
+    {
+      "epoch": 3.2858028518288904,
+      "grad_norm": 1.1997911930084229,
+      "learning_rate": 0.0002,
+      "loss": 0.6923,
+      "step": 2650
+    },
+    {
+      "epoch": 3.2982021078735277,
+      "grad_norm": 1.6746608018875122,
+      "learning_rate": 0.0002,
+      "loss": 0.6291,
+      "step": 2660
+    },
+    {
+      "epoch": 3.310601363918165,
+      "grad_norm": 1.0251612663269043,
+      "learning_rate": 0.0002,
+      "loss": 0.7021,
+      "step": 2670
+    },
+    {
+      "epoch": 3.3230006199628024,
+      "grad_norm": 1.3690581321716309,
+      "learning_rate": 0.0002,
+      "loss": 0.6958,
+      "step": 2680
+    },
+    {
+      "epoch": 3.3353998760074397,
+      "grad_norm": 1.5537537336349487,
+      "learning_rate": 0.0002,
+      "loss": 0.7439,
+      "step": 2690
+    },
+    {
+      "epoch": 3.347799132052077,
+      "grad_norm": 1.5438767671585083,
+      "learning_rate": 0.0002,
+      "loss": 0.692,
+      "step": 2700
+    },
+    {
+      "epoch": 3.3601983880967143,
+      "grad_norm": 1.2430849075317383,
+      "learning_rate": 0.0002,
+      "loss": 0.6698,
+      "step": 2710
+    },
+    {
+      "epoch": 3.3725976441413517,
+      "grad_norm": 1.1905370950698853,
+      "learning_rate": 0.0002,
+      "loss": 0.7447,
+      "step": 2720
+    },
+    {
+      "epoch": 3.384996900185989,
+      "grad_norm": 1.5106539726257324,
+      "learning_rate": 0.0002,
+      "loss": 0.6583,
+      "step": 2730
+    },
+    {
+      "epoch": 3.3973961562306263,
+      "grad_norm": 1.8480169773101807,
+      "learning_rate": 0.0002,
+      "loss": 0.6812,
+      "step": 2740
+    },
+    {
+      "epoch": 3.4097954122752636,
+      "grad_norm": 1.0991253852844238,
+      "learning_rate": 0.0002,
+      "loss": 0.6523,
+      "step": 2750
+    },
+    {
+      "epoch": 3.422194668319901,
+      "grad_norm": 1.5110164880752563,
+      "learning_rate": 0.0002,
+      "loss": 0.7371,
+      "step": 2760
+    },
+    {
+      "epoch": 3.4345939243645383,
+      "grad_norm": 1.7006158828735352,
+      "learning_rate": 0.0002,
+      "loss": 0.6632,
+      "step": 2770
+    },
+    {
+      "epoch": 3.4469931804091756,
+      "grad_norm": 1.3995729684829712,
+      "learning_rate": 0.0002,
+      "loss": 0.6938,
+      "step": 2780
+    },
+    {
+      "epoch": 3.459392436453813,
+      "grad_norm": 1.5709624290466309,
+      "learning_rate": 0.0002,
+      "loss": 0.704,
+      "step": 2790
+    },
+    {
+      "epoch": 3.47179169249845,
+      "grad_norm": 1.2154548168182373,
+      "learning_rate": 0.0002,
+      "loss": 0.629,
+      "step": 2800
+    },
+    {
+      "epoch": 3.4841909485430875,
+      "grad_norm": 1.5075860023498535,
+      "learning_rate": 0.0002,
+      "loss": 0.709,
+      "step": 2810
+    },
+    {
+      "epoch": 3.496590204587725,
+      "grad_norm": 2.296370029449463,
+      "learning_rate": 0.0002,
+      "loss": 0.6838,
+      "step": 2820
+    },
+    {
+      "epoch": 3.508989460632362,
+      "grad_norm": 1.5329245328903198,
+      "learning_rate": 0.0002,
+      "loss": 0.7216,
+      "step": 2830
+    },
+    {
+      "epoch": 3.5213887166769995,
+      "grad_norm": 2.391974925994873,
+      "learning_rate": 0.0002,
+      "loss": 0.702,
+      "step": 2840
+    },
+    {
+      "epoch": 3.533787972721637,
+      "grad_norm": 1.7627687454223633,
+      "learning_rate": 0.0002,
+      "loss": 0.6122,
+      "step": 2850
+    },
+    {
+      "epoch": 3.546187228766274,
+      "grad_norm": 1.8143539428710938,
+      "learning_rate": 0.0002,
+      "loss": 0.6612,
+      "step": 2860
+    },
+    {
+      "epoch": 3.5585864848109114,
+      "grad_norm": 1.8639698028564453,
+      "learning_rate": 0.0002,
+      "loss": 0.6875,
+      "step": 2870
+    },
+    {
+      "epoch": 3.5709857408555488,
+      "grad_norm": 1.9081439971923828,
+      "learning_rate": 0.0002,
+      "loss": 0.7133,
+      "step": 2880
+    },
+    {
+      "epoch": 3.583384996900186,
+      "grad_norm": 1.707095742225647,
+      "learning_rate": 0.0002,
+      "loss": 0.6669,
+      "step": 2890
+    },
+    {
+      "epoch": 3.5957842529448234,
+      "grad_norm": 1.561742901802063,
+      "learning_rate": 0.0002,
+      "loss": 0.6834,
+      "step": 2900
+    },
+    {
+      "epoch": 3.6081835089894607,
+      "grad_norm": 1.6129803657531738,
+      "learning_rate": 0.0002,
+      "loss": 0.7545,
+      "step": 2910
+    },
+    {
+      "epoch": 3.620582765034098,
+      "grad_norm": 1.1192500591278076,
+      "learning_rate": 0.0002,
+      "loss": 0.7182,
+      "step": 2920
+    },
+    {
+      "epoch": 3.6329820210787354,
+      "grad_norm": 1.420279622077942,
+      "learning_rate": 0.0002,
+      "loss": 0.6339,
+      "step": 2930
+    },
+    {
+      "epoch": 3.6453812771233727,
+      "grad_norm": 1.5851093530654907,
+      "learning_rate": 0.0002,
+      "loss": 0.7365,
+      "step": 2940
+    },
+    {
+      "epoch": 3.65778053316801,
+      "grad_norm": 1.4390369653701782,
+      "learning_rate": 0.0002,
+      "loss": 0.661,
+      "step": 2950
+    },
+    {
+      "epoch": 3.6701797892126473,
+      "grad_norm": 1.4419100284576416,
+      "learning_rate": 0.0002,
+      "loss": 0.7262,
+      "step": 2960
+    },
+    {
+      "epoch": 3.6825790452572846,
+      "grad_norm": 0.9472342133522034,
+      "learning_rate": 0.0002,
+      "loss": 0.7449,
+      "step": 2970
+    },
+    {
+      "epoch": 3.694978301301922,
+      "grad_norm": 1.194284200668335,
+      "learning_rate": 0.0002,
+      "loss": 0.696,
+      "step": 2980
+    },
+    {
+      "epoch": 3.7073775573465593,
+      "grad_norm": 1.233306884765625,
+      "learning_rate": 0.0002,
+      "loss": 0.6603,
+      "step": 2990
+    },
+    {
+      "epoch": 3.7197768133911966,
+      "grad_norm": 1.703479528427124,
+      "learning_rate": 0.0002,
+      "loss": 0.7155,
+      "step": 3000
+    },
+    {
+      "epoch": 3.732176069435834,
+      "grad_norm": 1.3840128183364868,
+      "learning_rate": 0.0002,
+      "loss": 0.6779,
+      "step": 3010
+    },
+    {
+      "epoch": 3.7445753254804712,
+      "grad_norm": 1.042277455329895,
+      "learning_rate": 0.0002,
+      "loss": 0.7428,
+      "step": 3020
+    },
+    {
+      "epoch": 3.7569745815251085,
+      "grad_norm": 1.3294179439544678,
+      "learning_rate": 0.0002,
+      "loss": 0.6937,
+      "step": 3030
+    },
+    {
+      "epoch": 3.769373837569746,
+      "grad_norm": 1.327108383178711,
+      "learning_rate": 0.0002,
+      "loss": 0.7233,
+      "step": 3040
+    },
+    {
+      "epoch": 3.781773093614383,
+      "grad_norm": 1.2039794921875,
+      "learning_rate": 0.0002,
+      "loss": 0.6109,
+      "step": 3050
+    },
+    {
+      "epoch": 3.7941723496590205,
+      "grad_norm": 1.2900311946868896,
+      "learning_rate": 0.0002,
+      "loss": 0.7614,
+      "step": 3060
+    },
+    {
+      "epoch": 3.806571605703658,
+      "grad_norm": 1.2003637552261353,
+      "learning_rate": 0.0002,
+      "loss": 0.7134,
+      "step": 3070
+    },
+    {
+      "epoch": 3.818970861748295,
+      "grad_norm": 1.2668299674987793,
+      "learning_rate": 0.0002,
+      "loss": 0.7145,
+      "step": 3080
+    },
+    {
+      "epoch": 3.8313701177929325,
+      "grad_norm": 1.5786389112472534,
+      "learning_rate": 0.0002,
+      "loss": 0.7429,
+      "step": 3090
+    },
+    {
+      "epoch": 3.84376937383757,
+      "grad_norm": 1.283626675605774,
+      "learning_rate": 0.0002,
+      "loss": 0.7045,
+      "step": 3100
+    },
+    {
+      "epoch": 3.856168629882207,
+      "grad_norm": 1.5252535343170166,
+      "learning_rate": 0.0002,
+      "loss": 0.6966,
+      "step": 3110
+    },
+    {
+      "epoch": 3.8685678859268444,
+      "grad_norm": 1.152452826499939,
+      "learning_rate": 0.0002,
+      "loss": 0.6737,
+      "step": 3120
+    },
+    {
+      "epoch": 3.8809671419714817,
+      "grad_norm": 1.3349536657333374,
+      "learning_rate": 0.0002,
+      "loss": 0.793,
+      "step": 3130
+    },
+    {
+      "epoch": 3.893366398016119,
+      "grad_norm": 1.3839694261550903,
+      "learning_rate": 0.0002,
+      "loss": 0.7107,
+      "step": 3140
+    },
+    {
+      "epoch": 3.9057656540607564,
+      "grad_norm": 1.668792724609375,
+      "learning_rate": 0.0002,
+      "loss": 0.8068,
+      "step": 3150
+    },
+    {
+      "epoch": 3.9181649101053937,
+      "grad_norm": 1.598772644996643,
+      "learning_rate": 0.0002,
+      "loss": 0.6843,
+      "step": 3160
+    },
+    {
+      "epoch": 3.930564166150031,
+      "grad_norm": 1.6434032917022705,
+      "learning_rate": 0.0002,
+      "loss": 0.6564,
+      "step": 3170
+    },
+    {
+      "epoch": 3.9429634221946683,
+      "grad_norm": 1.5382963418960571,
+      "learning_rate": 0.0002,
+      "loss": 0.7559,
+      "step": 3180
+    },
+    {
+      "epoch": 3.9553626782393057,
+      "grad_norm": 1.6733973026275635,
+      "learning_rate": 0.0002,
+      "loss": 0.7089,
+      "step": 3190
+    },
+    {
+      "epoch": 3.967761934283943,
+      "grad_norm": 1.5769109725952148,
+      "learning_rate": 0.0002,
+      "loss": 0.7051,
+      "step": 3200
+    },
+    {
+      "epoch": 3.9801611903285803,
+      "grad_norm": 1.5158107280731201,
+      "learning_rate": 0.0002,
+      "loss": 0.7548,
+      "step": 3210
+    },
+    {
+      "epoch": 3.9925604463732176,
+      "grad_norm": 2.034385919570923,
+      "learning_rate": 0.0002,
+      "loss": 0.6742,
+      "step": 3220
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 1.3322206735610962,
+      "eval_runtime": 127.309,
+      "eval_samples_per_second": 3.582,
+      "eval_steps_per_second": 0.448,
+      "step": 3226
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 6448,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.415398683574272e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a0a54cf05b51cc3ca7cba649c3e96685958c9d310c181dff0c31954ec4641225
+size 5560

	@@ -0,0 +1,202 @@

+---
+base_model: mistralai/Mistral-7B-Instruct-v0.3
+library_name: peft
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.13.1

	@@ -0,0 +1,29 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "mistralai/Mistral-7B-Instruct-v0.3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "v_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a1aa44b577cdc47963b0c98990ce3bf9e021eb1a602311326175685b3ff19c72
+size 109069176

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9faff594bc1fb7c3c19d5a2dd8c8bbae732045a22a48bf3179d8ca89e8f3f923
+size 55532666

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2ff3a1f2938602107fc8a6c87063e374ddcc67f7de4cffb03659d643be751c1a
+size 14244

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4f4b25a929ec0550000ba9a4c4c598bf8752e2d277877020980e873b4c9e03e5
+size 1064

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37f00374dea48658ee8f5d0f21895b9bc55cb0103939607c8185bfd1c6ca1f89
+size 587404

The diff for this file is too large to render. See raw diff

	@@ -0,0 +1,2894 @@

+{
+  "best_metric": 1.2014765739440918,
+  "best_model_checkpoint": "outputs-001/Mistral-7B-Instruct-v0.3_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.5-num-4904-sd-1/checkpoint-1613",
+  "epoch": 4.999380037197768,
+  "eval_steps": 10,
+  "global_step": 4032,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.012399256044637322,
+      "grad_norm": 1.6176791191101074,
+      "learning_rate": 0.0002,
+      "loss": 1.8616,
+      "step": 10
+    },
+    {
+      "epoch": 0.024798512089274645,
+      "grad_norm": 0.7599679827690125,
+      "learning_rate": 0.0002,
+      "loss": 1.5953,
+      "step": 20
+    },
+    {
+      "epoch": 0.037197768133911964,
+      "grad_norm": 0.8452111482620239,
+      "learning_rate": 0.0002,
+      "loss": 1.5705,
+      "step": 30
+    },
+    {
+      "epoch": 0.04959702417854929,
+      "grad_norm": 0.8393070101737976,
+      "learning_rate": 0.0002,
+      "loss": 1.5647,
+      "step": 40
+    },
+    {
+      "epoch": 0.06199628022318661,
+      "grad_norm": 1.117109775543213,
+      "learning_rate": 0.0002,
+      "loss": 1.4628,
+      "step": 50
+    },
+    {
+      "epoch": 0.07439553626782393,
+      "grad_norm": 0.8330236077308655,
+      "learning_rate": 0.0002,
+      "loss": 1.3492,
+      "step": 60
+    },
+    {
+      "epoch": 0.08679479231246125,
+      "grad_norm": 0.8670704960823059,
+      "learning_rate": 0.0002,
+      "loss": 1.367,
+      "step": 70
+    },
+    {
+      "epoch": 0.09919404835709858,
+      "grad_norm": 0.6262535452842712,
+      "learning_rate": 0.0002,
+      "loss": 1.2357,
+      "step": 80
+    },
+    {
+      "epoch": 0.1115933044017359,
+      "grad_norm": 0.753338098526001,
+      "learning_rate": 0.0002,
+      "loss": 1.3651,
+      "step": 90
+    },
+    {
+      "epoch": 0.12399256044637322,
+      "grad_norm": 0.6324933171272278,
+      "learning_rate": 0.0002,
+      "loss": 1.2789,
+      "step": 100
+    },
+    {
+      "epoch": 0.13639181649101054,
+      "grad_norm": 0.7270851135253906,
+      "learning_rate": 0.0002,
+      "loss": 1.2393,
+      "step": 110
+    },
+    {
+      "epoch": 0.14879107253564786,
+      "grad_norm": 0.7036070227622986,
+      "learning_rate": 0.0002,
+      "loss": 1.177,
+      "step": 120
+    },
+    {
+      "epoch": 0.16119032858028517,
+      "grad_norm": 0.6269583106040955,
+      "learning_rate": 0.0002,
+      "loss": 1.2808,
+      "step": 130
+    },
+    {
+      "epoch": 0.1735895846249225,
+      "grad_norm": 0.6848828792572021,
+      "learning_rate": 0.0002,
+      "loss": 1.3039,
+      "step": 140
+    },
+    {
+      "epoch": 0.1859888406695598,
+      "grad_norm": 0.5589784383773804,
+      "learning_rate": 0.0002,
+      "loss": 1.1925,
+      "step": 150
+    },
+    {
+      "epoch": 0.19838809671419716,
+      "grad_norm": 0.8350988626480103,
+      "learning_rate": 0.0002,
+      "loss": 1.2724,
+      "step": 160
+    },
+    {
+      "epoch": 0.21078735275883448,
+      "grad_norm": 1.1780346632003784,
+      "learning_rate": 0.0002,
+      "loss": 1.2093,
+      "step": 170
+    },
+    {
+      "epoch": 0.2231866088034718,
+      "grad_norm": 0.674608588218689,
+      "learning_rate": 0.0002,
+      "loss": 1.2573,
+      "step": 180
+    },
+    {
+      "epoch": 0.23558586484810912,
+      "grad_norm": 0.6972184181213379,
+      "learning_rate": 0.0002,
+      "loss": 1.2629,
+      "step": 190
+    },
+    {
+      "epoch": 0.24798512089274644,
+      "grad_norm": 0.5187845230102539,
+      "learning_rate": 0.0002,
+      "loss": 1.2618,
+      "step": 200
+    },
+    {
+      "epoch": 0.26038437693738375,
+      "grad_norm": 0.7513871192932129,
+      "learning_rate": 0.0002,
+      "loss": 1.3478,
+      "step": 210
+    },
+    {
+      "epoch": 0.2727836329820211,
+      "grad_norm": 0.5859110951423645,
+      "learning_rate": 0.0002,
+      "loss": 1.1843,
+      "step": 220
+    },
+    {
+      "epoch": 0.2851828890266584,
+      "grad_norm": 0.5547062754631042,
+      "learning_rate": 0.0002,
+      "loss": 1.1784,
+      "step": 230
+    },
+    {
+      "epoch": 0.2975821450712957,
+      "grad_norm": 3.5287671089172363,
+      "learning_rate": 0.0002,
+      "loss": 1.2564,
+      "step": 240
+    },
+    {
+      "epoch": 0.30998140111593303,
+      "grad_norm": 0.8644460439682007,
+      "learning_rate": 0.0002,
+      "loss": 1.313,
+      "step": 250
+    },
+    {
+      "epoch": 0.32238065716057035,
+      "grad_norm": 0.6270064115524292,
+      "learning_rate": 0.0002,
+      "loss": 1.2187,
+      "step": 260
+    },
+    {
+      "epoch": 0.33477991320520767,
+      "grad_norm": 1.170295000076294,
+      "learning_rate": 0.0002,
+      "loss": 1.2017,
+      "step": 270
+    },
+    {
+      "epoch": 0.347179169249845,
+      "grad_norm": 0.5701245069503784,
+      "learning_rate": 0.0002,
+      "loss": 1.1075,
+      "step": 280
+    },
+    {
+      "epoch": 0.3595784252944823,
+      "grad_norm": 0.6373095512390137,
+      "learning_rate": 0.0002,
+      "loss": 1.1185,
+      "step": 290
+    },
+    {
+      "epoch": 0.3719776813391196,
+      "grad_norm": 0.5740704536437988,
+      "learning_rate": 0.0002,
+      "loss": 1.1738,
+      "step": 300
+    },
+    {
+      "epoch": 0.384376937383757,
+      "grad_norm": 0.5516835451126099,
+      "learning_rate": 0.0002,
+      "loss": 1.2858,
+      "step": 310
+    },
+    {
+      "epoch": 0.3967761934283943,
+      "grad_norm": 0.5212382078170776,
+      "learning_rate": 0.0002,
+      "loss": 1.2315,
+      "step": 320
+    },
+    {
+      "epoch": 0.40917544947303164,
+      "grad_norm": 0.540307343006134,
+      "learning_rate": 0.0002,
+      "loss": 1.172,
+      "step": 330
+    },
+    {
+      "epoch": 0.42157470551766896,
+      "grad_norm": 0.7454301714897156,
+      "learning_rate": 0.0002,
+      "loss": 1.2736,
+      "step": 340
+    },
+    {
+      "epoch": 0.4339739615623063,
+      "grad_norm": 0.7390317916870117,
+      "learning_rate": 0.0002,
+      "loss": 1.3013,
+      "step": 350
+    },
+    {
+      "epoch": 0.4463732176069436,
+      "grad_norm": 0.5498788356781006,
+      "learning_rate": 0.0002,
+      "loss": 1.0615,
+      "step": 360
+    },
+    {
+      "epoch": 0.4587724736515809,
+      "grad_norm": 0.5776252150535583,
+      "learning_rate": 0.0002,
+      "loss": 1.2251,
+      "step": 370
+    },
+    {
+      "epoch": 0.47117172969621823,
+      "grad_norm": 0.6941552758216858,
+      "learning_rate": 0.0002,
+      "loss": 1.1932,
+      "step": 380
+    },
+    {
+      "epoch": 0.48357098574085555,
+      "grad_norm": 0.7936233282089233,
+      "learning_rate": 0.0002,
+      "loss": 1.23,
+      "step": 390
+    },
+    {
+      "epoch": 0.49597024178549287,
+      "grad_norm": 0.5257220268249512,
+      "learning_rate": 0.0002,
+      "loss": 1.1137,
+      "step": 400
+    },
+    {
+      "epoch": 0.5083694978301302,
+      "grad_norm": 0.5740510821342468,
+      "learning_rate": 0.0002,
+      "loss": 1.1867,
+      "step": 410
+    },
+    {
+      "epoch": 0.5207687538747675,
+      "grad_norm": 0.6181507110595703,
+      "learning_rate": 0.0002,
+      "loss": 1.1049,
+      "step": 420
+    },
+    {
+      "epoch": 0.5331680099194048,
+      "grad_norm": 0.6333999037742615,
+      "learning_rate": 0.0002,
+      "loss": 1.2303,
+      "step": 430
+    },
+    {
+      "epoch": 0.5455672659640421,
+      "grad_norm": 0.5667845010757446,
+      "learning_rate": 0.0002,
+      "loss": 1.2457,
+      "step": 440
+    },
+    {
+      "epoch": 0.5579665220086795,
+      "grad_norm": 0.5254231095314026,
+      "learning_rate": 0.0002,
+      "loss": 1.2547,
+      "step": 450
+    },
+    {
+      "epoch": 0.5703657780533168,
+      "grad_norm": 0.5938495993614197,
+      "learning_rate": 0.0002,
+      "loss": 1.2118,
+      "step": 460
+    },
+    {
+      "epoch": 0.5827650340979541,
+      "grad_norm": 0.7733635902404785,
+      "learning_rate": 0.0002,
+      "loss": 1.2409,
+      "step": 470
+    },
+    {
+      "epoch": 0.5951642901425914,
+      "grad_norm": 0.6114753484725952,
+      "learning_rate": 0.0002,
+      "loss": 1.2343,
+      "step": 480
+    },
+    {
+      "epoch": 0.6075635461872287,
+      "grad_norm": 0.5587155818939209,
+      "learning_rate": 0.0002,
+      "loss": 1.1779,
+      "step": 490
+    },
+    {
+      "epoch": 0.6199628022318661,
+      "grad_norm": 0.7636917233467102,
+      "learning_rate": 0.0002,
+      "loss": 1.2136,
+      "step": 500
+    },
+    {
+      "epoch": 0.6323620582765034,
+      "grad_norm": 0.5896942615509033,
+      "learning_rate": 0.0002,
+      "loss": 1.1301,
+      "step": 510
+    },
+    {
+      "epoch": 0.6447613143211407,
+      "grad_norm": 0.8594750165939331,
+      "learning_rate": 0.0002,
+      "loss": 1.2089,
+      "step": 520
+    },
+    {
+      "epoch": 0.657160570365778,
+      "grad_norm": 0.6459881067276001,
+      "learning_rate": 0.0002,
+      "loss": 1.1551,
+      "step": 530
+    },
+    {
+      "epoch": 0.6695598264104153,
+      "grad_norm": 0.650656521320343,
+      "learning_rate": 0.0002,
+      "loss": 1.175,
+      "step": 540
+    },
+    {
+      "epoch": 0.6819590824550527,
+      "grad_norm": 0.7238242626190186,
+      "learning_rate": 0.0002,
+      "loss": 1.2143,
+      "step": 550
+    },
+    {
+      "epoch": 0.69435833849969,
+      "grad_norm": 0.6289859414100647,
+      "learning_rate": 0.0002,
+      "loss": 1.0961,
+      "step": 560
+    },
+    {
+      "epoch": 0.7067575945443273,
+      "grad_norm": 0.6108142137527466,
+      "learning_rate": 0.0002,
+      "loss": 1.2316,
+      "step": 570
+    },
+    {
+      "epoch": 0.7191568505889646,
+      "grad_norm": 0.6905024647712708,
+      "learning_rate": 0.0002,
+      "loss": 1.1315,
+      "step": 580
+    },
+    {
+      "epoch": 0.7315561066336019,
+      "grad_norm": 0.5975471138954163,
+      "learning_rate": 0.0002,
+      "loss": 1.2368,
+      "step": 590
+    },
+    {
+      "epoch": 0.7439553626782393,
+      "grad_norm": 0.49540066719055176,
+      "learning_rate": 0.0002,
+      "loss": 1.1014,
+      "step": 600
+    },
+    {
+      "epoch": 0.7563546187228767,
+      "grad_norm": 0.5365461707115173,
+      "learning_rate": 0.0002,
+      "loss": 1.1359,
+      "step": 610
+    },
+    {
+      "epoch": 0.768753874767514,
+      "grad_norm": 0.6156648993492126,
+      "learning_rate": 0.0002,
+      "loss": 1.2552,
+      "step": 620
+    },
+    {
+      "epoch": 0.7811531308121513,
+      "grad_norm": 0.656879186630249,
+      "learning_rate": 0.0002,
+      "loss": 1.1929,
+      "step": 630
+    },
+    {
+      "epoch": 0.7935523868567886,
+      "grad_norm": 0.8963037729263306,
+      "learning_rate": 0.0002,
+      "loss": 1.3063,
+      "step": 640
+    },
+    {
+      "epoch": 0.805951642901426,
+      "grad_norm": 1.0569753646850586,
+      "learning_rate": 0.0002,
+      "loss": 1.219,
+      "step": 650
+    },
+    {
+      "epoch": 0.8183508989460633,
+      "grad_norm": 0.7332107424736023,
+      "learning_rate": 0.0002,
+      "loss": 1.2563,
+      "step": 660
+    },
+    {
+      "epoch": 0.8307501549907006,
+      "grad_norm": 0.589097797870636,
+      "learning_rate": 0.0002,
+      "loss": 1.1029,
+      "step": 670
+    },
+    {
+      "epoch": 0.8431494110353379,
+      "grad_norm": 0.9553480744361877,
+      "learning_rate": 0.0002,
+      "loss": 1.1705,
+      "step": 680
+    },
+    {
+      "epoch": 0.8555486670799752,
+      "grad_norm": 0.7076331973075867,
+      "learning_rate": 0.0002,
+      "loss": 1.1605,
+      "step": 690
+    },
+    {
+      "epoch": 0.8679479231246126,
+      "grad_norm": 0.597531795501709,
+      "learning_rate": 0.0002,
+      "loss": 1.2346,
+      "step": 700
+    },
+    {
+      "epoch": 0.8803471791692499,
+      "grad_norm": 0.7023149132728577,
+      "learning_rate": 0.0002,
+      "loss": 1.1637,
+      "step": 710
+    },
+    {
+      "epoch": 0.8927464352138872,
+      "grad_norm": 1.4393764734268188,
+      "learning_rate": 0.0002,
+      "loss": 1.2717,
+      "step": 720
+    },
+    {
+      "epoch": 0.9051456912585245,
+      "grad_norm": 0.5944231152534485,
+      "learning_rate": 0.0002,
+      "loss": 1.216,
+      "step": 730
+    },
+    {
+      "epoch": 0.9175449473031618,
+      "grad_norm": 0.5712162852287292,
+      "learning_rate": 0.0002,
+      "loss": 1.148,
+      "step": 740
+    },
+    {
+      "epoch": 0.9299442033477991,
+      "grad_norm": 0.5335281491279602,
+      "learning_rate": 0.0002,
+      "loss": 1.2318,
+      "step": 750
+    },
+    {
+      "epoch": 0.9423434593924365,
+      "grad_norm": 0.8050292730331421,
+      "learning_rate": 0.0002,
+      "loss": 1.149,
+      "step": 760
+    },
+    {
+      "epoch": 0.9547427154370738,
+      "grad_norm": 0.6092700958251953,
+      "learning_rate": 0.0002,
+      "loss": 1.0862,
+      "step": 770
+    },
+    {
+      "epoch": 0.9671419714817111,
+      "grad_norm": 0.7012797594070435,
+      "learning_rate": 0.0002,
+      "loss": 1.3204,
+      "step": 780
+    },
+    {
+      "epoch": 0.9795412275263484,
+      "grad_norm": 0.6228184103965759,
+      "learning_rate": 0.0002,
+      "loss": 1.1641,
+      "step": 790
+    },
+    {
+      "epoch": 0.9919404835709857,
+      "grad_norm": 0.5482686161994934,
+      "learning_rate": 0.0002,
+      "loss": 1.131,
+      "step": 800
+    },
+    {
+      "epoch": 0.9993800371977681,
+      "eval_loss": 1.2057286500930786,
+      "eval_runtime": 164.6087,
+      "eval_samples_per_second": 2.77,
+      "eval_steps_per_second": 0.346,
+      "step": 806
+    },
+    {
+      "epoch": 1.004339739615623,
+      "grad_norm": 0.6331814527511597,
+      "learning_rate": 0.0002,
+      "loss": 1.0899,
+      "step": 810
+    },
+    {
+      "epoch": 1.0167389956602604,
+      "grad_norm": 0.6160872578620911,
+      "learning_rate": 0.0002,
+      "loss": 1.0551,
+      "step": 820
+    },
+    {
+      "epoch": 1.0291382517048977,
+      "grad_norm": 0.6104072332382202,
+      "learning_rate": 0.0002,
+      "loss": 0.9934,
+      "step": 830
+    },
+    {
+      "epoch": 1.041537507749535,
+      "grad_norm": 0.7619274854660034,
+      "learning_rate": 0.0002,
+      "loss": 1.0776,
+      "step": 840
+    },
+    {
+      "epoch": 1.0539367637941723,
+      "grad_norm": 0.761172890663147,
+      "learning_rate": 0.0002,
+      "loss": 0.9929,
+      "step": 850
+    },
+    {
+      "epoch": 1.0663360198388097,
+      "grad_norm": 0.7563514113426208,
+      "learning_rate": 0.0002,
+      "loss": 1.0543,
+      "step": 860
+    },
+    {
+      "epoch": 1.078735275883447,
+      "grad_norm": 0.521998941898346,
+      "learning_rate": 0.0002,
+      "loss": 1.0812,
+      "step": 870
+    },
+    {
+      "epoch": 1.0911345319280843,
+      "grad_norm": 0.824347972869873,
+      "learning_rate": 0.0002,
+      "loss": 1.1417,
+      "step": 880
+    },
+    {
+      "epoch": 1.1035337879727216,
+      "grad_norm": 0.5645424127578735,
+      "learning_rate": 0.0002,
+      "loss": 1.1096,
+      "step": 890
+    },
+    {
+      "epoch": 1.115933044017359,
+      "grad_norm": 0.8568223714828491,
+      "learning_rate": 0.0002,
+      "loss": 1.1005,
+      "step": 900
+    },
+    {
+      "epoch": 1.1283323000619963,
+      "grad_norm": 0.68181312084198,
+      "learning_rate": 0.0002,
+      "loss": 1.088,
+      "step": 910
+    },
+    {
+      "epoch": 1.1407315561066336,
+      "grad_norm": 0.7577647566795349,
+      "learning_rate": 0.0002,
+      "loss": 1.0281,
+      "step": 920
+    },
+    {
+      "epoch": 1.153130812151271,
+      "grad_norm": 0.6968798637390137,
+      "learning_rate": 0.0002,
+      "loss": 0.9812,
+      "step": 930
+    },
+    {
+      "epoch": 1.1655300681959082,
+      "grad_norm": 0.5769661664962769,
+      "learning_rate": 0.0002,
+      "loss": 1.0539,
+      "step": 940
+    },
+    {
+      "epoch": 1.1779293242405455,
+      "grad_norm": 0.6399155259132385,
+      "learning_rate": 0.0002,
+      "loss": 1.0831,
+      "step": 950
+    },
+    {
+      "epoch": 1.1903285802851828,
+      "grad_norm": 0.9824289679527283,
+      "learning_rate": 0.0002,
+      "loss": 1.0464,
+      "step": 960
+    },
+    {
+      "epoch": 1.2027278363298202,
+      "grad_norm": 0.7485893964767456,
+      "learning_rate": 0.0002,
+      "loss": 1.1068,
+      "step": 970
+    },
+    {
+      "epoch": 1.2151270923744575,
+      "grad_norm": 0.668736457824707,
+      "learning_rate": 0.0002,
+      "loss": 1.0047,
+      "step": 980
+    },
+    {
+      "epoch": 1.2275263484190948,
+      "grad_norm": 0.7041404843330383,
+      "learning_rate": 0.0002,
+      "loss": 1.0818,
+      "step": 990
+    },
+    {
+      "epoch": 1.2399256044637321,
+      "grad_norm": 0.7070603966712952,
+      "learning_rate": 0.0002,
+      "loss": 1.0847,
+      "step": 1000
+    },
+    {
+      "epoch": 1.2523248605083694,
+      "grad_norm": 0.7828628420829773,
+      "learning_rate": 0.0002,
+      "loss": 1.047,
+      "step": 1010
+    },
+    {
+      "epoch": 1.2647241165530068,
+      "grad_norm": 0.7149654626846313,
+      "learning_rate": 0.0002,
+      "loss": 1.0658,
+      "step": 1020
+    },
+    {
+      "epoch": 1.277123372597644,
+      "grad_norm": 0.7691766619682312,
+      "learning_rate": 0.0002,
+      "loss": 0.9791,
+      "step": 1030
+    },
+    {
+      "epoch": 1.2895226286422814,
+      "grad_norm": 0.8022137880325317,
+      "learning_rate": 0.0002,
+      "loss": 1.0242,
+      "step": 1040
+    },
+    {
+      "epoch": 1.3019218846869187,
+      "grad_norm": 0.6709204316139221,
+      "learning_rate": 0.0002,
+      "loss": 1.0837,
+      "step": 1050
+    },
+    {
+      "epoch": 1.314321140731556,
+      "grad_norm": 0.7368158102035522,
+      "learning_rate": 0.0002,
+      "loss": 1.0382,
+      "step": 1060
+    },
+    {
+      "epoch": 1.3267203967761934,
+      "grad_norm": 0.8408007621765137,
+      "learning_rate": 0.0002,
+      "loss": 1.0371,
+      "step": 1070
+    },
+    {
+      "epoch": 1.3391196528208307,
+      "grad_norm": 1.2165539264678955,
+      "learning_rate": 0.0002,
+      "loss": 0.9633,
+      "step": 1080
+    },
+    {
+      "epoch": 1.351518908865468,
+      "grad_norm": 0.7284916043281555,
+      "learning_rate": 0.0002,
+      "loss": 1.0079,
+      "step": 1090
+    },
+    {
+      "epoch": 1.3639181649101053,
+      "grad_norm": 0.7994557619094849,
+      "learning_rate": 0.0002,
+      "loss": 1.0211,
+      "step": 1100
+    },
+    {
+      "epoch": 1.3763174209547429,
+      "grad_norm": 0.9658345580101013,
+      "learning_rate": 0.0002,
+      "loss": 1.0892,
+      "step": 1110
+    },
+    {
+      "epoch": 1.3887166769993802,
+      "grad_norm": 0.6312829852104187,
+      "learning_rate": 0.0002,
+      "loss": 1.2088,
+      "step": 1120
+    },
+    {
+      "epoch": 1.4011159330440175,
+      "grad_norm": 0.7263661026954651,
+      "learning_rate": 0.0002,
+      "loss": 1.1055,
+      "step": 1130
+    },
+    {
+      "epoch": 1.4135151890886548,
+      "grad_norm": 0.829082727432251,
+      "learning_rate": 0.0002,
+      "loss": 1.0232,
+      "step": 1140
+    },
+    {
+      "epoch": 1.4259144451332921,
+      "grad_norm": 0.6168127059936523,
+      "learning_rate": 0.0002,
+      "loss": 1.1413,
+      "step": 1150
+    },
+    {
+      "epoch": 1.4383137011779294,
+      "grad_norm": 0.8351425528526306,
+      "learning_rate": 0.0002,
+      "loss": 1.0283,
+      "step": 1160
+    },
+    {
+      "epoch": 1.4507129572225668,
+      "grad_norm": 0.8814472556114197,
+      "learning_rate": 0.0002,
+      "loss": 1.1146,
+      "step": 1170
+    },
+    {
+      "epoch": 1.463112213267204,
+      "grad_norm": 0.6913689970970154,
+      "learning_rate": 0.0002,
+      "loss": 1.0932,
+      "step": 1180
+    },
+    {
+      "epoch": 1.4755114693118414,
+      "grad_norm": 0.7907165884971619,
+      "learning_rate": 0.0002,
+      "loss": 1.1066,
+      "step": 1190
+    },
+    {
+      "epoch": 1.4879107253564787,
+      "grad_norm": 0.8361626267433167,
+      "learning_rate": 0.0002,
+      "loss": 1.0738,
+      "step": 1200
+    },
+    {
+      "epoch": 1.500309981401116,
+      "grad_norm": 1.073534607887268,
+      "learning_rate": 0.0002,
+      "loss": 1.0559,
+      "step": 1210
+    },
+    {
+      "epoch": 1.5127092374457534,
+      "grad_norm": 0.8416345119476318,
+      "learning_rate": 0.0002,
+      "loss": 1.0204,
+      "step": 1220
+    },
+    {
+      "epoch": 1.5251084934903907,
+      "grad_norm": 1.0225597620010376,
+      "learning_rate": 0.0002,
+      "loss": 1.0941,
+      "step": 1230
+    },
+    {
+      "epoch": 1.537507749535028,
+      "grad_norm": 0.6662965416908264,
+      "learning_rate": 0.0002,
+      "loss": 1.0854,
+      "step": 1240
+    },
+    {
+      "epoch": 1.5499070055796653,
+      "grad_norm": 0.7363991737365723,
+      "learning_rate": 0.0002,
+      "loss": 1.0816,
+      "step": 1250
+    },
+    {
+      "epoch": 1.5623062616243026,
+      "grad_norm": 0.9029574990272522,
+      "learning_rate": 0.0002,
+      "loss": 1.0271,
+      "step": 1260
+    },
+    {
+      "epoch": 1.57470551766894,
+      "grad_norm": 0.7992424368858337,
+      "learning_rate": 0.0002,
+      "loss": 1.0206,
+      "step": 1270
+    },
+    {
+      "epoch": 1.5871047737135773,
+      "grad_norm": 0.8108977675437927,
+      "learning_rate": 0.0002,
+      "loss": 1.0114,
+      "step": 1280
+    },
+    {
+      "epoch": 1.5995040297582146,
+      "grad_norm": 0.8257458806037903,
+      "learning_rate": 0.0002,
+      "loss": 1.0264,
+      "step": 1290
+    },
+    {
+      "epoch": 1.611903285802852,
+      "grad_norm": 0.8265092968940735,
+      "learning_rate": 0.0002,
+      "loss": 1.0944,
+      "step": 1300
+    },
+    {
+      "epoch": 1.6243025418474892,
+      "grad_norm": 0.6568580269813538,
+      "learning_rate": 0.0002,
+      "loss": 1.0136,
+      "step": 1310
+    },
+    {
+      "epoch": 1.6367017978921266,
+      "grad_norm": 0.7608488202095032,
+      "learning_rate": 0.0002,
+      "loss": 1.009,
+      "step": 1320
+    },
+    {
+      "epoch": 1.6491010539367639,
+      "grad_norm": 0.7511259317398071,
+      "learning_rate": 0.0002,
+      "loss": 1.1202,
+      "step": 1330
+    },
+    {
+      "epoch": 1.6615003099814012,
+      "grad_norm": 0.7942162752151489,
+      "learning_rate": 0.0002,
+      "loss": 1.0528,
+      "step": 1340
+    },
+    {
+      "epoch": 1.6738995660260385,
+      "grad_norm": 0.8253659605979919,
+      "learning_rate": 0.0002,
+      "loss": 1.0411,
+      "step": 1350
+    },
+    {
+      "epoch": 1.6862988220706758,
+      "grad_norm": 1.1318382024765015,
+      "learning_rate": 0.0002,
+      "loss": 1.001,
+      "step": 1360
+    },
+    {
+      "epoch": 1.6986980781153131,
+      "grad_norm": 0.693403959274292,
+      "learning_rate": 0.0002,
+      "loss": 1.0727,
+      "step": 1370
+    },
+    {
+      "epoch": 1.7110973341599505,
+      "grad_norm": 0.7107617259025574,
+      "learning_rate": 0.0002,
+      "loss": 1.073,
+      "step": 1380
+    },
+    {
+      "epoch": 1.7234965902045878,
+      "grad_norm": 0.8169032335281372,
+      "learning_rate": 0.0002,
+      "loss": 1.0849,
+      "step": 1390
+    },
+    {
+      "epoch": 1.735895846249225,
+      "grad_norm": 0.8940841555595398,
+      "learning_rate": 0.0002,
+      "loss": 1.0578,
+      "step": 1400
+    },
+    {
+      "epoch": 1.7482951022938624,
+      "grad_norm": 0.7862188220024109,
+      "learning_rate": 0.0002,
+      "loss": 1.0891,
+      "step": 1410
+    },
+    {
+      "epoch": 1.7606943583384997,
+      "grad_norm": 1.136338472366333,
+      "learning_rate": 0.0002,
+      "loss": 0.9962,
+      "step": 1420
+    },
+    {
+      "epoch": 1.773093614383137,
+      "grad_norm": 0.9534069895744324,
+      "learning_rate": 0.0002,
+      "loss": 1.0943,
+      "step": 1430
+    },
+    {
+      "epoch": 1.7854928704277744,
+      "grad_norm": 1.0747562646865845,
+      "learning_rate": 0.0002,
+      "loss": 1.1257,
+      "step": 1440
+    },
+    {
+      "epoch": 1.7978921264724117,
+      "grad_norm": 0.8557891249656677,
+      "learning_rate": 0.0002,
+      "loss": 1.0556,
+      "step": 1450
+    },
+    {
+      "epoch": 1.810291382517049,
+      "grad_norm": 0.6829259991645813,
+      "learning_rate": 0.0002,
+      "loss": 1.0128,
+      "step": 1460
+    },
+    {
+      "epoch": 1.8226906385616863,
+      "grad_norm": 0.8164441585540771,
+      "learning_rate": 0.0002,
+      "loss": 1.0313,
+      "step": 1470
+    },
+    {
+      "epoch": 1.8350898946063237,
+      "grad_norm": 0.9458068609237671,
+      "learning_rate": 0.0002,
+      "loss": 1.1136,
+      "step": 1480
+    },
+    {
+      "epoch": 1.847489150650961,
+      "grad_norm": 0.743009626865387,
+      "learning_rate": 0.0002,
+      "loss": 1.0457,
+      "step": 1490
+    },
+    {
+      "epoch": 1.8598884066955983,
+      "grad_norm": 0.7137694358825684,
+      "learning_rate": 0.0002,
+      "loss": 1.0107,
+      "step": 1500
+    },
+    {
+      "epoch": 1.8722876627402356,
+      "grad_norm": 0.7618028521537781,
+      "learning_rate": 0.0002,
+      "loss": 1.0633,
+      "step": 1510
+    },
+    {
+      "epoch": 1.884686918784873,
+      "grad_norm": 0.8153398633003235,
+      "learning_rate": 0.0002,
+      "loss": 1.103,
+      "step": 1520
+    },
+    {
+      "epoch": 1.8970861748295103,
+      "grad_norm": 0.9127124547958374,
+      "learning_rate": 0.0002,
+      "loss": 1.2094,
+      "step": 1530
+    },
+    {
+      "epoch": 1.9094854308741476,
+      "grad_norm": 0.7699425220489502,
+      "learning_rate": 0.0002,
+      "loss": 1.0379,
+      "step": 1540
+    },
+    {
+      "epoch": 1.921884686918785,
+      "grad_norm": 0.8807545304298401,
+      "learning_rate": 0.0002,
+      "loss": 0.9849,
+      "step": 1550
+    },
+    {
+      "epoch": 1.9342839429634222,
+      "grad_norm": 0.7340815663337708,
+      "learning_rate": 0.0002,
+      "loss": 1.033,
+      "step": 1560
+    },
+    {
+      "epoch": 1.9466831990080595,
+      "grad_norm": 1.070056676864624,
+      "learning_rate": 0.0002,
+      "loss": 1.0083,
+      "step": 1570
+    },
+    {
+      "epoch": 1.9590824550526968,
+      "grad_norm": 0.8195573687553406,
+      "learning_rate": 0.0002,
+      "loss": 1.0023,
+      "step": 1580
+    },
+    {
+      "epoch": 1.9714817110973342,
+      "grad_norm": 0.7938687205314636,
+      "learning_rate": 0.0002,
+      "loss": 1.029,
+      "step": 1590
+    },
+    {
+      "epoch": 1.9838809671419715,
+      "grad_norm": 0.7632259726524353,
+      "learning_rate": 0.0002,
+      "loss": 1.0512,
+      "step": 1600
+    },
+    {
+      "epoch": 1.9962802231866088,
+      "grad_norm": 0.7921916246414185,
+      "learning_rate": 0.0002,
+      "loss": 1.0426,
+      "step": 1610
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.2014765739440918,
+      "eval_runtime": 159.8677,
+      "eval_samples_per_second": 2.852,
+      "eval_steps_per_second": 0.357,
+      "step": 1613
+    },
+    {
+      "epoch": 2.008679479231246,
+      "grad_norm": 1.1764529943466187,
+      "learning_rate": 0.0002,
+      "loss": 0.9239,
+      "step": 1620
+    },
+    {
+      "epoch": 2.0210787352758834,
+      "grad_norm": 1.0271947383880615,
+      "learning_rate": 0.0002,
+      "loss": 0.7995,
+      "step": 1630
+    },
+    {
+      "epoch": 2.0334779913205208,
+      "grad_norm": 0.7138071656227112,
+      "learning_rate": 0.0002,
+      "loss": 0.8592,
+      "step": 1640
+    },
+    {
+      "epoch": 2.045877247365158,
+      "grad_norm": 0.8644373416900635,
+      "learning_rate": 0.0002,
+      "loss": 0.8106,
+      "step": 1650
+    },
+    {
+      "epoch": 2.0582765034097954,
+      "grad_norm": 1.2262420654296875,
+      "learning_rate": 0.0002,
+      "loss": 0.8578,
+      "step": 1660
+    },
+    {
+      "epoch": 2.0706757594544327,
+      "grad_norm": 0.9718686938285828,
+      "learning_rate": 0.0002,
+      "loss": 0.8009,
+      "step": 1670
+    },
+    {
+      "epoch": 2.08307501549907,
+      "grad_norm": 1.0075122117996216,
+      "learning_rate": 0.0002,
+      "loss": 0.831,
+      "step": 1680
+    },
+    {
+      "epoch": 2.0954742715437074,
+      "grad_norm": 1.2113722562789917,
+      "learning_rate": 0.0002,
+      "loss": 0.8177,
+      "step": 1690
+    },
+    {
+      "epoch": 2.1078735275883447,
+      "grad_norm": 0.7911604642868042,
+      "learning_rate": 0.0002,
+      "loss": 0.8377,
+      "step": 1700
+    },
+    {
+      "epoch": 2.120272783632982,
+      "grad_norm": 0.8578933477401733,
+      "learning_rate": 0.0002,
+      "loss": 0.8405,
+      "step": 1710
+    },
+    {
+      "epoch": 2.1326720396776193,
+      "grad_norm": 1.1782084703445435,
+      "learning_rate": 0.0002,
+      "loss": 0.8784,
+      "step": 1720
+    },
+    {
+      "epoch": 2.1450712957222566,
+      "grad_norm": 1.3627573251724243,
+      "learning_rate": 0.0002,
+      "loss": 0.8543,
+      "step": 1730
+    },
+    {
+      "epoch": 2.157470551766894,
+      "grad_norm": 1.2948803901672363,
+      "learning_rate": 0.0002,
+      "loss": 0.8404,
+      "step": 1740
+    },
+    {
+      "epoch": 2.1698698078115313,
+      "grad_norm": 0.9353442788124084,
+      "learning_rate": 0.0002,
+      "loss": 0.8719,
+      "step": 1750
+    },
+    {
+      "epoch": 2.1822690638561686,
+      "grad_norm": 0.9063374400138855,
+      "learning_rate": 0.0002,
+      "loss": 0.8112,
+      "step": 1760
+    },
+    {
+      "epoch": 2.194668319900806,
+      "grad_norm": 1.3354851007461548,
+      "learning_rate": 0.0002,
+      "loss": 0.9441,
+      "step": 1770
+    },
+    {
+      "epoch": 2.2070675759454432,
+      "grad_norm": 0.8388507962226868,
+      "learning_rate": 0.0002,
+      "loss": 0.877,
+      "step": 1780
+    },
+    {
+      "epoch": 2.2194668319900805,
+      "grad_norm": 0.9509401321411133,
+      "learning_rate": 0.0002,
+      "loss": 0.8709,
+      "step": 1790
+    },
+    {
+      "epoch": 2.231866088034718,
+      "grad_norm": 1.0458593368530273,
+      "learning_rate": 0.0002,
+      "loss": 0.8212,
+      "step": 1800
+    },
+    {
+      "epoch": 2.244265344079355,
+      "grad_norm": 0.890088677406311,
+      "learning_rate": 0.0002,
+      "loss": 0.7667,
+      "step": 1810
+    },
+    {
+      "epoch": 2.2566646001239925,
+      "grad_norm": 1.1933976411819458,
+      "learning_rate": 0.0002,
+      "loss": 0.8431,
+      "step": 1820
+    },
+    {
+      "epoch": 2.26906385616863,
+      "grad_norm": 0.961398184299469,
+      "learning_rate": 0.0002,
+      "loss": 0.8697,
+      "step": 1830
+    },
+    {
+      "epoch": 2.281463112213267,
+      "grad_norm": 1.124961495399475,
+      "learning_rate": 0.0002,
+      "loss": 0.8403,
+      "step": 1840
+    },
+    {
+      "epoch": 2.2938623682579045,
+      "grad_norm": 0.9042379260063171,
+      "learning_rate": 0.0002,
+      "loss": 0.8431,
+      "step": 1850
+    },
+    {
+      "epoch": 2.306261624302542,
+      "grad_norm": 1.2250864505767822,
+      "learning_rate": 0.0002,
+      "loss": 0.8866,
+      "step": 1860
+    },
+    {
+      "epoch": 2.318660880347179,
+      "grad_norm": 1.1758817434310913,
+      "learning_rate": 0.0002,
+      "loss": 0.8514,
+      "step": 1870
+    },
+    {
+      "epoch": 2.3310601363918164,
+      "grad_norm": 0.9863199591636658,
+      "learning_rate": 0.0002,
+      "loss": 0.9316,
+      "step": 1880
+    },
+    {
+      "epoch": 2.3434593924364537,
+      "grad_norm": 1.1759305000305176,
+      "learning_rate": 0.0002,
+      "loss": 0.8854,
+      "step": 1890
+    },
+    {
+      "epoch": 2.355858648481091,
+      "grad_norm": 0.995716392993927,
+      "learning_rate": 0.0002,
+      "loss": 0.866,
+      "step": 1900
+    },
+    {
+      "epoch": 2.3682579045257284,
+      "grad_norm": 1.1816585063934326,
+      "learning_rate": 0.0002,
+      "loss": 0.8439,
+      "step": 1910
+    },
+    {
+      "epoch": 2.3806571605703657,
+      "grad_norm": 0.7498432397842407,
+      "learning_rate": 0.0002,
+      "loss": 0.9284,
+      "step": 1920
+    },
+    {
+      "epoch": 2.393056416615003,
+      "grad_norm": 0.9481443762779236,
+      "learning_rate": 0.0002,
+      "loss": 0.8243,
+      "step": 1930
+    },
+    {
+      "epoch": 2.4054556726596403,
+      "grad_norm": 1.1264584064483643,
+      "learning_rate": 0.0002,
+      "loss": 0.8083,
+      "step": 1940
+    },
+    {
+      "epoch": 2.4178549287042777,
+      "grad_norm": 0.8826232552528381,
+      "learning_rate": 0.0002,
+      "loss": 0.9122,
+      "step": 1950
+    },
+    {
+      "epoch": 2.430254184748915,
+      "grad_norm": 0.9702113270759583,
+      "learning_rate": 0.0002,
+      "loss": 0.8764,
+      "step": 1960
+    },
+    {
+      "epoch": 2.4426534407935523,
+      "grad_norm": 1.0663695335388184,
+      "learning_rate": 0.0002,
+      "loss": 0.8498,
+      "step": 1970
+    },
+    {
+      "epoch": 2.4550526968381896,
+      "grad_norm": 1.1186119318008423,
+      "learning_rate": 0.0002,
+      "loss": 0.888,
+      "step": 1980
+    },
+    {
+      "epoch": 2.467451952882827,
+      "grad_norm": 1.428774118423462,
+      "learning_rate": 0.0002,
+      "loss": 0.9327,
+      "step": 1990
+    },
+    {
+      "epoch": 2.4798512089274642,
+      "grad_norm": 1.3054901361465454,
+      "learning_rate": 0.0002,
+      "loss": 0.9423,
+      "step": 2000
+    },
+    {
+      "epoch": 2.4922504649721016,
+      "grad_norm": 0.9893805384635925,
+      "learning_rate": 0.0002,
+      "loss": 0.8494,
+      "step": 2010
+    },
+    {
+      "epoch": 2.504649721016739,
+      "grad_norm": 1.149538516998291,
+      "learning_rate": 0.0002,
+      "loss": 0.9982,
+      "step": 2020
+    },
+    {
+      "epoch": 2.517048977061376,
+      "grad_norm": 0.8716336488723755,
+      "learning_rate": 0.0002,
+      "loss": 0.881,
+      "step": 2030
+    },
+    {
+      "epoch": 2.5294482331060135,
+      "grad_norm": 1.0464730262756348,
+      "learning_rate": 0.0002,
+      "loss": 0.8483,
+      "step": 2040
+    },
+    {
+      "epoch": 2.541847489150651,
+      "grad_norm": 1.1451894044876099,
+      "learning_rate": 0.0002,
+      "loss": 0.9475,
+      "step": 2050
+    },
+    {
+      "epoch": 2.554246745195288,
+      "grad_norm": 1.3266205787658691,
+      "learning_rate": 0.0002,
+      "loss": 0.8238,
+      "step": 2060
+    },
+    {
+      "epoch": 2.5666460012399255,
+      "grad_norm": 1.2838176488876343,
+      "learning_rate": 0.0002,
+      "loss": 0.8457,
+      "step": 2070
+    },
+    {
+      "epoch": 2.579045257284563,
+      "grad_norm": 1.0352915525436401,
+      "learning_rate": 0.0002,
+      "loss": 0.7813,
+      "step": 2080
+    },
+    {
+      "epoch": 2.5914445133292,
+      "grad_norm": 1.181416392326355,
+      "learning_rate": 0.0002,
+      "loss": 0.895,
+      "step": 2090
+    },
+    {
+      "epoch": 2.6038437693738374,
+      "grad_norm": 1.2425765991210938,
+      "learning_rate": 0.0002,
+      "loss": 0.8537,
+      "step": 2100
+    },
+    {
+      "epoch": 2.6162430254184748,
+      "grad_norm": 1.2885762453079224,
+      "learning_rate": 0.0002,
+      "loss": 0.8561,
+      "step": 2110
+    },
+    {
+      "epoch": 2.628642281463112,
+      "grad_norm": 1.0179181098937988,
+      "learning_rate": 0.0002,
+      "loss": 0.8024,
+      "step": 2120
+    },
+    {
+      "epoch": 2.6410415375077494,
+      "grad_norm": 1.4908100366592407,
+      "learning_rate": 0.0002,
+      "loss": 0.8747,
+      "step": 2130
+    },
+    {
+      "epoch": 2.6534407935523867,
+      "grad_norm": 1.4854460954666138,
+      "learning_rate": 0.0002,
+      "loss": 0.8475,
+      "step": 2140
+    },
+    {
+      "epoch": 2.665840049597024,
+      "grad_norm": 0.994413435459137,
+      "learning_rate": 0.0002,
+      "loss": 0.8579,
+      "step": 2150
+    },
+    {
+      "epoch": 2.6782393056416613,
+      "grad_norm": 1.177201271057129,
+      "learning_rate": 0.0002,
+      "loss": 0.8606,
+      "step": 2160
+    },
+    {
+      "epoch": 2.6906385616862987,
+      "grad_norm": 1.2680933475494385,
+      "learning_rate": 0.0002,
+      "loss": 0.9233,
+      "step": 2170
+    },
+    {
+      "epoch": 2.703037817730936,
+      "grad_norm": 1.2201054096221924,
+      "learning_rate": 0.0002,
+      "loss": 0.8443,
+      "step": 2180
+    },
+    {
+      "epoch": 2.7154370737755733,
+      "grad_norm": 1.2058831453323364,
+      "learning_rate": 0.0002,
+      "loss": 0.8437,
+      "step": 2190
+    },
+    {
+      "epoch": 2.7278363298202106,
+      "grad_norm": 1.1667239665985107,
+      "learning_rate": 0.0002,
+      "loss": 0.9894,
+      "step": 2200
+    },
+    {
+      "epoch": 2.740235585864848,
+      "grad_norm": 1.1243321895599365,
+      "learning_rate": 0.0002,
+      "loss": 0.8501,
+      "step": 2210
+    },
+    {
+      "epoch": 2.7526348419094857,
+      "grad_norm": 1.0543156862258911,
+      "learning_rate": 0.0002,
+      "loss": 0.9387,
+      "step": 2220
+    },
+    {
+      "epoch": 2.765034097954123,
+      "grad_norm": 1.1922553777694702,
+      "learning_rate": 0.0002,
+      "loss": 0.9488,
+      "step": 2230
+    },
+    {
+      "epoch": 2.7774333539987603,
+      "grad_norm": 1.1266813278198242,
+      "learning_rate": 0.0002,
+      "loss": 0.8558,
+      "step": 2240
+    },
+    {
+      "epoch": 2.7898326100433977,
+      "grad_norm": 0.9645159840583801,
+      "learning_rate": 0.0002,
+      "loss": 0.8459,
+      "step": 2250
+    },
+    {
+      "epoch": 2.802231866088035,
+      "grad_norm": 1.0672235488891602,
+      "learning_rate": 0.0002,
+      "loss": 0.8862,
+      "step": 2260
+    },
+    {
+      "epoch": 2.8146311221326723,
+      "grad_norm": 1.5650453567504883,
+      "learning_rate": 0.0002,
+      "loss": 0.869,
+      "step": 2270
+    },
+    {
+      "epoch": 2.8270303781773096,
+      "grad_norm": 1.0414438247680664,
+      "learning_rate": 0.0002,
+      "loss": 0.8,
+      "step": 2280
+    },
+    {
+      "epoch": 2.839429634221947,
+      "grad_norm": 0.8878290057182312,
+      "learning_rate": 0.0002,
+      "loss": 0.8419,
+      "step": 2290
+    },
+    {
+      "epoch": 2.8518288902665843,
+      "grad_norm": 1.0500553846359253,
+      "learning_rate": 0.0002,
+      "loss": 0.9148,
+      "step": 2300
+    },
+    {
+      "epoch": 2.8642281463112216,
+      "grad_norm": 0.9779142737388611,
+      "learning_rate": 0.0002,
+      "loss": 0.8706,
+      "step": 2310
+    },
+    {
+      "epoch": 2.876627402355859,
+      "grad_norm": 0.8904196619987488,
+      "learning_rate": 0.0002,
+      "loss": 0.8385,
+      "step": 2320
+    },
+    {
+      "epoch": 2.889026658400496,
+      "grad_norm": 1.103608250617981,
+      "learning_rate": 0.0002,
+      "loss": 0.8768,
+      "step": 2330
+    },
+    {
+      "epoch": 2.9014259144451335,
+      "grad_norm": 1.2064822912216187,
+      "learning_rate": 0.0002,
+      "loss": 0.8659,
+      "step": 2340
+    },
+    {
+      "epoch": 2.913825170489771,
+      "grad_norm": 1.3073748350143433,
+      "learning_rate": 0.0002,
+      "loss": 0.9299,
+      "step": 2350
+    },
+    {
+      "epoch": 2.926224426534408,
+      "grad_norm": 1.4792760610580444,
+      "learning_rate": 0.0002,
+      "loss": 0.778,
+      "step": 2360
+    },
+    {
+      "epoch": 2.9386236825790455,
+      "grad_norm": 1.1670116186141968,
+      "learning_rate": 0.0002,
+      "loss": 0.9773,
+      "step": 2370
+    },
+    {
+      "epoch": 2.951022938623683,
+      "grad_norm": 1.235465168952942,
+      "learning_rate": 0.0002,
+      "loss": 0.8973,
+      "step": 2380
+    },
+    {
+      "epoch": 2.96342219466832,
+      "grad_norm": 1.7734158039093018,
+      "learning_rate": 0.0002,
+      "loss": 0.8646,
+      "step": 2390
+    },
+    {
+      "epoch": 2.9758214507129574,
+      "grad_norm": 1.3497414588928223,
+      "learning_rate": 0.0002,
+      "loss": 0.8784,
+      "step": 2400
+    },
+    {
+      "epoch": 2.9882207067575948,
+      "grad_norm": 1.1425493955612183,
+      "learning_rate": 0.0002,
+      "loss": 0.9116,
+      "step": 2410
+    },
+    {
+      "epoch": 2.999380037197768,
+      "eval_loss": 1.2303974628448486,
+      "eval_runtime": 126.4856,
+      "eval_samples_per_second": 3.605,
+      "eval_steps_per_second": 0.451,
+      "step": 2419
+    },
+    {
+      "epoch": 3.000619962802232,
+      "grad_norm": 1.4001394510269165,
+      "learning_rate": 0.0002,
+      "loss": 0.9395,
+      "step": 2420
+    },
+    {
+      "epoch": 3.0130192188468694,
+      "grad_norm": 2.4510438442230225,
+      "learning_rate": 0.0002,
+      "loss": 0.6538,
+      "step": 2430
+    },
+    {
+      "epoch": 3.0254184748915067,
+      "grad_norm": 1.5374444723129272,
+      "learning_rate": 0.0002,
+      "loss": 0.6732,
+      "step": 2440
+    },
+    {
+      "epoch": 3.037817730936144,
+      "grad_norm": 1.632250428199768,
+      "learning_rate": 0.0002,
+      "loss": 0.6934,
+      "step": 2450
+    },
+    {
+      "epoch": 3.0502169869807814,
+      "grad_norm": 1.5456780195236206,
+      "learning_rate": 0.0002,
+      "loss": 0.6266,
+      "step": 2460
+    },
+    {
+      "epoch": 3.0626162430254187,
+      "grad_norm": 1.3664451837539673,
+      "learning_rate": 0.0002,
+      "loss": 0.6467,
+      "step": 2470
+    },
+    {
+      "epoch": 3.075015499070056,
+      "grad_norm": 1.344169020652771,
+      "learning_rate": 0.0002,
+      "loss": 0.6351,
+      "step": 2480
+    },
+    {
+      "epoch": 3.0874147551146933,
+      "grad_norm": 0.9710949659347534,
+      "learning_rate": 0.0002,
+      "loss": 0.664,
+      "step": 2490
+    },
+    {
+      "epoch": 3.0998140111593306,
+      "grad_norm": 2.324171304702759,
+      "learning_rate": 0.0002,
+      "loss": 0.6232,
+      "step": 2500
+    },
+    {
+      "epoch": 3.112213267203968,
+      "grad_norm": 1.2885396480560303,
+      "learning_rate": 0.0002,
+      "loss": 0.7336,
+      "step": 2510
+    },
+    {
+      "epoch": 3.1246125232486053,
+      "grad_norm": 1.224718451499939,
+      "learning_rate": 0.0002,
+      "loss": 0.7095,
+      "step": 2520
+    },
+    {
+      "epoch": 3.1370117792932426,
+      "grad_norm": 1.1158969402313232,
+      "learning_rate": 0.0002,
+      "loss": 0.6451,
+      "step": 2530
+    },
+    {
+      "epoch": 3.14941103533788,
+      "grad_norm": 1.189963698387146,
+      "learning_rate": 0.0002,
+      "loss": 0.6024,
+      "step": 2540
+    },
+    {
+      "epoch": 3.1618102913825172,
+      "grad_norm": 1.2543222904205322,
+      "learning_rate": 0.0002,
+      "loss": 0.6996,
+      "step": 2550
+    },
+    {
+      "epoch": 3.1742095474271546,
+      "grad_norm": 1.4986658096313477,
+      "learning_rate": 0.0002,
+      "loss": 0.6854,
+      "step": 2560
+    },
+    {
+      "epoch": 3.186608803471792,
+      "grad_norm": 1.5848976373672485,
+      "learning_rate": 0.0002,
+      "loss": 0.5936,
+      "step": 2570
+    },
+    {
+      "epoch": 3.199008059516429,
+      "grad_norm": 1.2306287288665771,
+      "learning_rate": 0.0002,
+      "loss": 0.64,
+      "step": 2580
+    },
+    {
+      "epoch": 3.2114073155610665,
+      "grad_norm": 1.6327801942825317,
+      "learning_rate": 0.0002,
+      "loss": 0.6381,
+      "step": 2590
+    },
+    {
+      "epoch": 3.223806571605704,
+      "grad_norm": 1.191624402999878,
+      "learning_rate": 0.0002,
+      "loss": 0.6614,
+      "step": 2600
+    },
+    {
+      "epoch": 3.236205827650341,
+      "grad_norm": 1.546857476234436,
+      "learning_rate": 0.0002,
+      "loss": 0.5862,
+      "step": 2610
+    },
+    {
+      "epoch": 3.2486050836949785,
+      "grad_norm": 1.7683172225952148,
+      "learning_rate": 0.0002,
+      "loss": 0.697,
+      "step": 2620
+    },
+    {
+      "epoch": 3.261004339739616,
+      "grad_norm": 1.3910621404647827,
+      "learning_rate": 0.0002,
+      "loss": 0.6909,
+      "step": 2630
+    },
+    {
+      "epoch": 3.273403595784253,
+      "grad_norm": 1.205353021621704,
+      "learning_rate": 0.0002,
+      "loss": 0.6322,
+      "step": 2640
+    },
+    {
+      "epoch": 3.2858028518288904,
+      "grad_norm": 1.1997911930084229,
+      "learning_rate": 0.0002,
+      "loss": 0.6923,
+      "step": 2650
+    },
+    {
+      "epoch": 3.2982021078735277,
+      "grad_norm": 1.6746608018875122,
+      "learning_rate": 0.0002,
+      "loss": 0.6291,
+      "step": 2660
+    },
+    {
+      "epoch": 3.310601363918165,
+      "grad_norm": 1.0251612663269043,
+      "learning_rate": 0.0002,
+      "loss": 0.7021,
+      "step": 2670
+    },
+    {
+      "epoch": 3.3230006199628024,
+      "grad_norm": 1.3690581321716309,
+      "learning_rate": 0.0002,
+      "loss": 0.6958,
+      "step": 2680
+    },
+    {
+      "epoch": 3.3353998760074397,
+      "grad_norm": 1.5537537336349487,
+      "learning_rate": 0.0002,
+      "loss": 0.7439,
+      "step": 2690
+    },
+    {
+      "epoch": 3.347799132052077,
+      "grad_norm": 1.5438767671585083,
+      "learning_rate": 0.0002,
+      "loss": 0.692,
+      "step": 2700
+    },
+    {
+      "epoch": 3.3601983880967143,
+      "grad_norm": 1.2430849075317383,
+      "learning_rate": 0.0002,
+      "loss": 0.6698,
+      "step": 2710
+    },
+    {
+      "epoch": 3.3725976441413517,
+      "grad_norm": 1.1905370950698853,
+      "learning_rate": 0.0002,
+      "loss": 0.7447,
+      "step": 2720
+    },
+    {
+      "epoch": 3.384996900185989,
+      "grad_norm": 1.5106539726257324,
+      "learning_rate": 0.0002,
+      "loss": 0.6583,
+      "step": 2730
+    },
+    {
+      "epoch": 3.3973961562306263,
+      "grad_norm": 1.8480169773101807,
+      "learning_rate": 0.0002,
+      "loss": 0.6812,
+      "step": 2740
+    },
+    {
+      "epoch": 3.4097954122752636,
+      "grad_norm": 1.0991253852844238,
+      "learning_rate": 0.0002,
+      "loss": 0.6523,
+      "step": 2750
+    },
+    {
+      "epoch": 3.422194668319901,
+      "grad_norm": 1.5110164880752563,
+      "learning_rate": 0.0002,
+      "loss": 0.7371,
+      "step": 2760
+    },
+    {
+      "epoch": 3.4345939243645383,
+      "grad_norm": 1.7006158828735352,
+      "learning_rate": 0.0002,
+      "loss": 0.6632,
+      "step": 2770
+    },
+    {
+      "epoch": 3.4469931804091756,
+      "grad_norm": 1.3995729684829712,
+      "learning_rate": 0.0002,
+      "loss": 0.6938,
+      "step": 2780
+    },
+    {
+      "epoch": 3.459392436453813,
+      "grad_norm": 1.5709624290466309,
+      "learning_rate": 0.0002,
+      "loss": 0.704,
+      "step": 2790
+    },
+    {
+      "epoch": 3.47179169249845,
+      "grad_norm": 1.2154548168182373,
+      "learning_rate": 0.0002,
+      "loss": 0.629,
+      "step": 2800
+    },
+    {
+      "epoch": 3.4841909485430875,
+      "grad_norm": 1.5075860023498535,
+      "learning_rate": 0.0002,
+      "loss": 0.709,
+      "step": 2810
+    },
+    {
+      "epoch": 3.496590204587725,
+      "grad_norm": 2.296370029449463,
+      "learning_rate": 0.0002,
+      "loss": 0.6838,
+      "step": 2820
+    },
+    {
+      "epoch": 3.508989460632362,
+      "grad_norm": 1.5329245328903198,
+      "learning_rate": 0.0002,
+      "loss": 0.7216,
+      "step": 2830
+    },
+    {
+      "epoch": 3.5213887166769995,
+      "grad_norm": 2.391974925994873,
+      "learning_rate": 0.0002,
+      "loss": 0.702,
+      "step": 2840
+    },
+    {
+      "epoch": 3.533787972721637,
+      "grad_norm": 1.7627687454223633,
+      "learning_rate": 0.0002,
+      "loss": 0.6122,
+      "step": 2850
+    },
+    {
+      "epoch": 3.546187228766274,
+      "grad_norm": 1.8143539428710938,
+      "learning_rate": 0.0002,
+      "loss": 0.6612,
+      "step": 2860
+    },
+    {
+      "epoch": 3.5585864848109114,
+      "grad_norm": 1.8639698028564453,
+      "learning_rate": 0.0002,
+      "loss": 0.6875,
+      "step": 2870
+    },
+    {
+      "epoch": 3.5709857408555488,
+      "grad_norm": 1.9081439971923828,
+      "learning_rate": 0.0002,
+      "loss": 0.7133,
+      "step": 2880
+    },
+    {
+      "epoch": 3.583384996900186,
+      "grad_norm": 1.707095742225647,
+      "learning_rate": 0.0002,
+      "loss": 0.6669,
+      "step": 2890
+    },
+    {
+      "epoch": 3.5957842529448234,
+      "grad_norm": 1.561742901802063,
+      "learning_rate": 0.0002,
+      "loss": 0.6834,
+      "step": 2900
+    },
+    {
+      "epoch": 3.6081835089894607,
+      "grad_norm": 1.6129803657531738,
+      "learning_rate": 0.0002,
+      "loss": 0.7545,
+      "step": 2910
+    },
+    {
+      "epoch": 3.620582765034098,
+      "grad_norm": 1.1192500591278076,
+      "learning_rate": 0.0002,
+      "loss": 0.7182,
+      "step": 2920
+    },
+    {
+      "epoch": 3.6329820210787354,
+      "grad_norm": 1.420279622077942,
+      "learning_rate": 0.0002,
+      "loss": 0.6339,
+      "step": 2930
+    },
+    {
+      "epoch": 3.6453812771233727,
+      "grad_norm": 1.5851093530654907,
+      "learning_rate": 0.0002,
+      "loss": 0.7365,
+      "step": 2940
+    },
+    {
+      "epoch": 3.65778053316801,
+      "grad_norm": 1.4390369653701782,
+      "learning_rate": 0.0002,
+      "loss": 0.661,
+      "step": 2950
+    },
+    {
+      "epoch": 3.6701797892126473,
+      "grad_norm": 1.4419100284576416,
+      "learning_rate": 0.0002,
+      "loss": 0.7262,
+      "step": 2960
+    },
+    {
+      "epoch": 3.6825790452572846,
+      "grad_norm": 0.9472342133522034,
+      "learning_rate": 0.0002,
+      "loss": 0.7449,
+      "step": 2970
+    },
+    {
+      "epoch": 3.694978301301922,
+      "grad_norm": 1.194284200668335,
+      "learning_rate": 0.0002,
+      "loss": 0.696,
+      "step": 2980
+    },
+    {
+      "epoch": 3.7073775573465593,
+      "grad_norm": 1.233306884765625,
+      "learning_rate": 0.0002,
+      "loss": 0.6603,
+      "step": 2990
+    },
+    {
+      "epoch": 3.7197768133911966,
+      "grad_norm": 1.703479528427124,
+      "learning_rate": 0.0002,
+      "loss": 0.7155,
+      "step": 3000
+    },
+    {
+      "epoch": 3.732176069435834,
+      "grad_norm": 1.3840128183364868,
+      "learning_rate": 0.0002,
+      "loss": 0.6779,
+      "step": 3010
+    },
+    {
+      "epoch": 3.7445753254804712,
+      "grad_norm": 1.042277455329895,
+      "learning_rate": 0.0002,
+      "loss": 0.7428,
+      "step": 3020
+    },
+    {
+      "epoch": 3.7569745815251085,
+      "grad_norm": 1.3294179439544678,
+      "learning_rate": 0.0002,
+      "loss": 0.6937,
+      "step": 3030
+    },
+    {
+      "epoch": 3.769373837569746,
+      "grad_norm": 1.327108383178711,
+      "learning_rate": 0.0002,
+      "loss": 0.7233,
+      "step": 3040
+    },
+    {
+      "epoch": 3.781773093614383,
+      "grad_norm": 1.2039794921875,
+      "learning_rate": 0.0002,
+      "loss": 0.6109,
+      "step": 3050
+    },
+    {
+      "epoch": 3.7941723496590205,
+      "grad_norm": 1.2900311946868896,
+      "learning_rate": 0.0002,
+      "loss": 0.7614,
+      "step": 3060
+    },
+    {
+      "epoch": 3.806571605703658,
+      "grad_norm": 1.2003637552261353,
+      "learning_rate": 0.0002,
+      "loss": 0.7134,
+      "step": 3070
+    },
+    {
+      "epoch": 3.818970861748295,
+      "grad_norm": 1.2668299674987793,
+      "learning_rate": 0.0002,
+      "loss": 0.7145,
+      "step": 3080
+    },
+    {
+      "epoch": 3.8313701177929325,
+      "grad_norm": 1.5786389112472534,
+      "learning_rate": 0.0002,
+      "loss": 0.7429,
+      "step": 3090
+    },
+    {
+      "epoch": 3.84376937383757,
+      "grad_norm": 1.283626675605774,
+      "learning_rate": 0.0002,
+      "loss": 0.7045,
+      "step": 3100
+    },
+    {
+      "epoch": 3.856168629882207,
+      "grad_norm": 1.5252535343170166,
+      "learning_rate": 0.0002,
+      "loss": 0.6966,
+      "step": 3110
+    },
+    {
+      "epoch": 3.8685678859268444,
+      "grad_norm": 1.152452826499939,
+      "learning_rate": 0.0002,
+      "loss": 0.6737,
+      "step": 3120
+    },
+    {
+      "epoch": 3.8809671419714817,
+      "grad_norm": 1.3349536657333374,
+      "learning_rate": 0.0002,
+      "loss": 0.793,
+      "step": 3130
+    },
+    {
+      "epoch": 3.893366398016119,
+      "grad_norm": 1.3839694261550903,
+      "learning_rate": 0.0002,
+      "loss": 0.7107,
+      "step": 3140
+    },
+    {
+      "epoch": 3.9057656540607564,
+      "grad_norm": 1.668792724609375,
+      "learning_rate": 0.0002,
+      "loss": 0.8068,
+      "step": 3150
+    },
+    {
+      "epoch": 3.9181649101053937,
+      "grad_norm": 1.598772644996643,
+      "learning_rate": 0.0002,
+      "loss": 0.6843,
+      "step": 3160
+    },
+    {
+      "epoch": 3.930564166150031,
+      "grad_norm": 1.6434032917022705,
+      "learning_rate": 0.0002,
+      "loss": 0.6564,
+      "step": 3170
+    },
+    {
+      "epoch": 3.9429634221946683,
+      "grad_norm": 1.5382963418960571,
+      "learning_rate": 0.0002,
+      "loss": 0.7559,
+      "step": 3180
+    },
+    {
+      "epoch": 3.9553626782393057,
+      "grad_norm": 1.6733973026275635,
+      "learning_rate": 0.0002,
+      "loss": 0.7089,
+      "step": 3190
+    },
+    {
+      "epoch": 3.967761934283943,
+      "grad_norm": 1.5769109725952148,
+      "learning_rate": 0.0002,
+      "loss": 0.7051,
+      "step": 3200
+    },
+    {
+      "epoch": 3.9801611903285803,
+      "grad_norm": 1.5158107280731201,
+      "learning_rate": 0.0002,
+      "loss": 0.7548,
+      "step": 3210
+    },
+    {
+      "epoch": 3.9925604463732176,
+      "grad_norm": 2.034385919570923,
+      "learning_rate": 0.0002,
+      "loss": 0.6742,
+      "step": 3220
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 1.3322206735610962,
+      "eval_runtime": 127.309,
+      "eval_samples_per_second": 3.582,
+      "eval_steps_per_second": 0.448,
+      "step": 3226
+    },
+    {
+      "epoch": 4.004959702417855,
+      "grad_norm": 1.1264082193374634,
+      "learning_rate": 0.0002,
+      "loss": 0.5965,
+      "step": 3230
+    },
+    {
+      "epoch": 4.017358958462492,
+      "grad_norm": 2.171586275100708,
+      "learning_rate": 0.0002,
+      "loss": 0.5094,
+      "step": 3240
+    },
+    {
+      "epoch": 4.02975821450713,
+      "grad_norm": 1.413804292678833,
+      "learning_rate": 0.0002,
+      "loss": 0.479,
+      "step": 3250
+    },
+    {
+      "epoch": 4.042157470551767,
+      "grad_norm": 1.952918291091919,
+      "learning_rate": 0.0002,
+      "loss": 0.4871,
+      "step": 3260
+    },
+    {
+      "epoch": 4.054556726596404,
+      "grad_norm": 1.3221994638442993,
+      "learning_rate": 0.0002,
+      "loss": 0.4945,
+      "step": 3270
+    },
+    {
+      "epoch": 4.0669559826410415,
+      "grad_norm": 1.3864725828170776,
+      "learning_rate": 0.0002,
+      "loss": 0.5096,
+      "step": 3280
+    },
+    {
+      "epoch": 4.079355238685679,
+      "grad_norm": 1.9392046928405762,
+      "learning_rate": 0.0002,
+      "loss": 0.4855,
+      "step": 3290
+    },
+    {
+      "epoch": 4.091754494730316,
+      "grad_norm": 3.3523409366607666,
+      "learning_rate": 0.0002,
+      "loss": 0.4907,
+      "step": 3300
+    },
+    {
+      "epoch": 4.1041537507749535,
+      "grad_norm": 1.8055517673492432,
+      "learning_rate": 0.0002,
+      "loss": 0.5436,
+      "step": 3310
+    },
+    {
+      "epoch": 4.116553006819591,
+      "grad_norm": 1.217043399810791,
+      "learning_rate": 0.0002,
+      "loss": 0.5105,
+      "step": 3320
+    },
+    {
+      "epoch": 4.128952262864228,
+      "grad_norm": 1.752600908279419,
+      "learning_rate": 0.0002,
+      "loss": 0.542,
+      "step": 3330
+    },
+    {
+      "epoch": 4.141351518908865,
+      "grad_norm": 1.8071316480636597,
+      "learning_rate": 0.0002,
+      "loss": 0.4927,
+      "step": 3340
+    },
+    {
+      "epoch": 4.153750774953503,
+      "grad_norm": 1.793209433555603,
+      "learning_rate": 0.0002,
+      "loss": 0.4872,
+      "step": 3350
+    },
+    {
+      "epoch": 4.16615003099814,
+      "grad_norm": 1.6746844053268433,
+      "learning_rate": 0.0002,
+      "loss": 0.5381,
+      "step": 3360
+    },
+    {
+      "epoch": 4.178549287042777,
+      "grad_norm": 1.5232614278793335,
+      "learning_rate": 0.0002,
+      "loss": 0.4707,
+      "step": 3370
+    },
+    {
+      "epoch": 4.190948543087415,
+      "grad_norm": 1.7812004089355469,
+      "learning_rate": 0.0002,
+      "loss": 0.5028,
+      "step": 3380
+    },
+    {
+      "epoch": 4.203347799132052,
+      "grad_norm": 2.0417628288269043,
+      "learning_rate": 0.0002,
+      "loss": 0.5712,
+      "step": 3390
+    },
+    {
+      "epoch": 4.215747055176689,
+      "grad_norm": 1.4618799686431885,
+      "learning_rate": 0.0002,
+      "loss": 0.5198,
+      "step": 3400
+    },
+    {
+      "epoch": 4.228146311221327,
+      "grad_norm": 2.140191078186035,
+      "learning_rate": 0.0002,
+      "loss": 0.4982,
+      "step": 3410
+    },
+    {
+      "epoch": 4.240545567265964,
+      "grad_norm": 1.8133856058120728,
+      "learning_rate": 0.0002,
+      "loss": 0.5411,
+      "step": 3420
+    },
+    {
+      "epoch": 4.252944823310601,
+      "grad_norm": 1.4271091222763062,
+      "learning_rate": 0.0002,
+      "loss": 0.5225,
+      "step": 3430
+    },
+    {
+      "epoch": 4.265344079355239,
+      "grad_norm": 1.3198683261871338,
+      "learning_rate": 0.0002,
+      "loss": 0.5175,
+      "step": 3440
+    },
+    {
+      "epoch": 4.277743335399876,
+      "grad_norm": 1.88148832321167,
+      "learning_rate": 0.0002,
+      "loss": 0.5247,
+      "step": 3450
+    },
+    {
+      "epoch": 4.290142591444513,
+      "grad_norm": 1.277328372001648,
+      "learning_rate": 0.0002,
+      "loss": 0.5331,
+      "step": 3460
+    },
+    {
+      "epoch": 4.302541847489151,
+      "grad_norm": 1.8816628456115723,
+      "learning_rate": 0.0002,
+      "loss": 0.5732,
+      "step": 3470
+    },
+    {
+      "epoch": 4.314941103533788,
+      "grad_norm": 1.7252274751663208,
+      "learning_rate": 0.0002,
+      "loss": 0.5297,
+      "step": 3480
+    },
+    {
+      "epoch": 4.327340359578425,
+      "grad_norm": 1.7671009302139282,
+      "learning_rate": 0.0002,
+      "loss": 0.4707,
+      "step": 3490
+    },
+    {
+      "epoch": 4.3397396156230625,
+      "grad_norm": 1.9531593322753906,
+      "learning_rate": 0.0002,
+      "loss": 0.5508,
+      "step": 3500
+    },
+    {
+      "epoch": 4.3521388716677,
+      "grad_norm": 2.207097053527832,
+      "learning_rate": 0.0002,
+      "loss": 0.5321,
+      "step": 3510
+    },
+    {
+      "epoch": 4.364538127712337,
+      "grad_norm": 1.564458966255188,
+      "learning_rate": 0.0002,
+      "loss": 0.5312,
+      "step": 3520
+    },
+    {
+      "epoch": 4.3769373837569745,
+      "grad_norm": 1.2955191135406494,
+      "learning_rate": 0.0002,
+      "loss": 0.4301,
+      "step": 3530
+    },
+    {
+      "epoch": 4.389336639801612,
+      "grad_norm": 1.745345950126648,
+      "learning_rate": 0.0002,
+      "loss": 0.5036,
+      "step": 3540
+    },
+    {
+      "epoch": 4.401735895846249,
+      "grad_norm": 1.7884204387664795,
+      "learning_rate": 0.0002,
+      "loss": 0.5743,
+      "step": 3550
+    },
+    {
+      "epoch": 4.4141351518908865,
+      "grad_norm": 1.654018759727478,
+      "learning_rate": 0.0002,
+      "loss": 0.5793,
+      "step": 3560
+    },
+    {
+      "epoch": 4.426534407935524,
+      "grad_norm": 2.1989598274230957,
+      "learning_rate": 0.0002,
+      "loss": 0.5579,
+      "step": 3570
+    },
+    {
+      "epoch": 4.438933663980161,
+      "grad_norm": 1.7936158180236816,
+      "learning_rate": 0.0002,
+      "loss": 0.5972,
+      "step": 3580
+    },
+    {
+      "epoch": 4.451332920024798,
+      "grad_norm": 1.3981733322143555,
+      "learning_rate": 0.0002,
+      "loss": 0.5629,
+      "step": 3590
+    },
+    {
+      "epoch": 4.463732176069436,
+      "grad_norm": 2.0535473823547363,
+      "learning_rate": 0.0002,
+      "loss": 0.5179,
+      "step": 3600
+    },
+    {
+      "epoch": 4.476131432114073,
+      "grad_norm": 1.6257543563842773,
+      "learning_rate": 0.0002,
+      "loss": 0.5872,
+      "step": 3610
+    },
+    {
+      "epoch": 4.48853068815871,
+      "grad_norm": 1.5662637948989868,
+      "learning_rate": 0.0002,
+      "loss": 0.5461,
+      "step": 3620
+    },
+    {
+      "epoch": 4.500929944203348,
+      "grad_norm": 1.8960483074188232,
+      "learning_rate": 0.0002,
+      "loss": 0.5815,
+      "step": 3630
+    },
+    {
+      "epoch": 4.513329200247985,
+      "grad_norm": 1.3862426280975342,
+      "learning_rate": 0.0002,
+      "loss": 0.5599,
+      "step": 3640
+    },
+    {
+      "epoch": 4.525728456292622,
+      "grad_norm": 1.855873942375183,
+      "learning_rate": 0.0002,
+      "loss": 0.5722,
+      "step": 3650
+    },
+    {
+      "epoch": 4.53812771233726,
+      "grad_norm": 1.422516942024231,
+      "learning_rate": 0.0002,
+      "loss": 0.5686,
+      "step": 3660
+    },
+    {
+      "epoch": 4.550526968381897,
+      "grad_norm": 1.4135394096374512,
+      "learning_rate": 0.0002,
+      "loss": 0.5521,
+      "step": 3670
+    },
+    {
+      "epoch": 4.562926224426534,
+      "grad_norm": 1.860640287399292,
+      "learning_rate": 0.0002,
+      "loss": 0.5544,
+      "step": 3680
+    },
+    {
+      "epoch": 4.575325480471172,
+      "grad_norm": 2.173950672149658,
+      "learning_rate": 0.0002,
+      "loss": 0.5778,
+      "step": 3690
+    },
+    {
+      "epoch": 4.587724736515809,
+      "grad_norm": 1.5037490129470825,
+      "learning_rate": 0.0002,
+      "loss": 0.5598,
+      "step": 3700
+    },
+    {
+      "epoch": 4.600123992560446,
+      "grad_norm": 1.5990253686904907,
+      "learning_rate": 0.0002,
+      "loss": 0.5129,
+      "step": 3710
+    },
+    {
+      "epoch": 4.612523248605084,
+      "grad_norm": 1.2791721820831299,
+      "learning_rate": 0.0002,
+      "loss": 0.548,
+      "step": 3720
+    },
+    {
+      "epoch": 4.624922504649721,
+      "grad_norm": 1.767350435256958,
+      "learning_rate": 0.0002,
+      "loss": 0.553,
+      "step": 3730
+    },
+    {
+      "epoch": 4.637321760694358,
+      "grad_norm": 1.9509570598602295,
+      "learning_rate": 0.0002,
+      "loss": 0.5862,
+      "step": 3740
+    },
+    {
+      "epoch": 4.6497210167389955,
+      "grad_norm": 1.6632015705108643,
+      "learning_rate": 0.0002,
+      "loss": 0.5838,
+      "step": 3750
+    },
+    {
+      "epoch": 4.662120272783633,
+      "grad_norm": 1.7643373012542725,
+      "learning_rate": 0.0002,
+      "loss": 0.5603,
+      "step": 3760
+    },
+    {
+      "epoch": 4.67451952882827,
+      "grad_norm": 1.7088392972946167,
+      "learning_rate": 0.0002,
+      "loss": 0.5575,
+      "step": 3770
+    },
+    {
+      "epoch": 4.6869187848729075,
+      "grad_norm": 1.802544116973877,
+      "learning_rate": 0.0002,
+      "loss": 0.5595,
+      "step": 3780
+    },
+    {
+      "epoch": 4.699318040917545,
+      "grad_norm": 1.8390076160430908,
+      "learning_rate": 0.0002,
+      "loss": 0.564,
+      "step": 3790
+    },
+    {
+      "epoch": 4.711717296962182,
+      "grad_norm": 1.5216279029846191,
+      "learning_rate": 0.0002,
+      "loss": 0.6185,
+      "step": 3800
+    },
+    {
+      "epoch": 4.724116553006819,
+      "grad_norm": 1.5545401573181152,
+      "learning_rate": 0.0002,
+      "loss": 0.5452,
+      "step": 3810
+    },
+    {
+      "epoch": 4.736515809051457,
+      "grad_norm": 1.71843683719635,
+      "learning_rate": 0.0002,
+      "loss": 0.5512,
+      "step": 3820
+    },
+    {
+      "epoch": 4.748915065096094,
+      "grad_norm": 2.1453139781951904,
+      "learning_rate": 0.0002,
+      "loss": 0.5768,
+      "step": 3830
+    },
+    {
+      "epoch": 4.761314321140731,
+      "grad_norm": 2.3012070655822754,
+      "learning_rate": 0.0002,
+      "loss": 0.5805,
+      "step": 3840
+    },
+    {
+      "epoch": 4.773713577185369,
+      "grad_norm": 1.6964452266693115,
+      "learning_rate": 0.0002,
+      "loss": 0.5789,
+      "step": 3850
+    },
+    {
+      "epoch": 4.786112833230006,
+      "grad_norm": 1.7206791639328003,
+      "learning_rate": 0.0002,
+      "loss": 0.5861,
+      "step": 3860
+    },
+    {
+      "epoch": 4.798512089274643,
+      "grad_norm": 1.5777926445007324,
+      "learning_rate": 0.0002,
+      "loss": 0.5877,
+      "step": 3870
+    },
+    {
+      "epoch": 4.810911345319281,
+      "grad_norm": 1.7264010906219482,
+      "learning_rate": 0.0002,
+      "loss": 0.5583,
+      "step": 3880
+    },
+    {
+      "epoch": 4.823310601363918,
+      "grad_norm": 1.6638274192810059,
+      "learning_rate": 0.0002,
+      "loss": 0.5259,
+      "step": 3890
+    },
+    {
+      "epoch": 4.835709857408555,
+      "grad_norm": 1.8086934089660645,
+      "learning_rate": 0.0002,
+      "loss": 0.6181,
+      "step": 3900
+    },
+    {
+      "epoch": 4.848109113453193,
+      "grad_norm": 1.5027598142623901,
+      "learning_rate": 0.0002,
+      "loss": 0.5042,
+      "step": 3910
+    },
+    {
+      "epoch": 4.86050836949783,
+      "grad_norm": 1.8851488828659058,
+      "learning_rate": 0.0002,
+      "loss": 0.5322,
+      "step": 3920
+    },
+    {
+      "epoch": 4.872907625542467,
+      "grad_norm": 1.2437249422073364,
+      "learning_rate": 0.0002,
+      "loss": 0.5928,
+      "step": 3930
+    },
+    {
+      "epoch": 4.885306881587105,
+      "grad_norm": 1.7656266689300537,
+      "learning_rate": 0.0002,
+      "loss": 0.5724,
+      "step": 3940
+    },
+    {
+      "epoch": 4.897706137631742,
+      "grad_norm": 1.3089631795883179,
+      "learning_rate": 0.0002,
+      "loss": 0.5116,
+      "step": 3950
+    },
+    {
+      "epoch": 4.910105393676379,
+      "grad_norm": 1.8345088958740234,
+      "learning_rate": 0.0002,
+      "loss": 0.5473,
+      "step": 3960
+    },
+    {
+      "epoch": 4.9225046497210165,
+      "grad_norm": 1.8577536344528198,
+      "learning_rate": 0.0002,
+      "loss": 0.5865,
+      "step": 3970
+    },
+    {
+      "epoch": 4.934903905765654,
+      "grad_norm": 1.9529849290847778,
+      "learning_rate": 0.0002,
+      "loss": 0.6106,
+      "step": 3980
+    },
+    {
+      "epoch": 4.947303161810291,
+      "grad_norm": 1.7244911193847656,
+      "learning_rate": 0.0002,
+      "loss": 0.5968,
+      "step": 3990
+    },
+    {
+      "epoch": 4.9597024178549285,
+      "grad_norm": 1.7554820775985718,
+      "learning_rate": 0.0002,
+      "loss": 0.6262,
+      "step": 4000
+    },
+    {
+      "epoch": 4.972101673899566,
+      "grad_norm": 1.5834285020828247,
+      "learning_rate": 0.0002,
+      "loss": 0.6047,
+      "step": 4010
+    },
+    {
+      "epoch": 4.984500929944203,
+      "grad_norm": 2.1360528469085693,
+      "learning_rate": 0.0002,
+      "loss": 0.6312,
+      "step": 4020
+    },
+    {
+      "epoch": 4.9969001859888404,
+      "grad_norm": 1.5836342573165894,
+      "learning_rate": 0.0002,
+      "loss": 0.5715,
+      "step": 4030
+    },
+    {
+      "epoch": 4.999380037197768,
+      "eval_loss": 1.438460111618042,
+      "eval_runtime": 126.5486,
+      "eval_samples_per_second": 3.603,
+      "eval_steps_per_second": 0.45,
+      "step": 4032
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 6448,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 8,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.76924835446784e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}