diff --git a/.gitattributes b/.gitattributes index 7680d27e364f3d16e4dcfebbc72153262315be12..a6c4c904308038ff534fc2bd5f761cd699ab03f5 100644 --- a/.gitattributes +++ b/.gitattributes @@ -2020,3 +2020,12 @@ gemma-2-9b-it_int4_arc_challenge-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a- gemma-2-9b-it_int4_arc_challenge-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-102-sd-1/checkpoint-73/tokenizer.json filter=lfs diff=lfs merge=lfs -text gemma-2-9b-it_int4_arc_challenge-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-102-sd-1/checkpoint-80/tokenizer.json filter=lfs diff=lfs merge=lfs -text gemma-2-9b-it_int4_arc_challenge-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.1-num-102-sd-1/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-11016/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-4132/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-5510/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-6887/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-8265/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-9642/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/README.md b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/adapter_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/adapter_model.safetensors b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..57c0547536cf633208bd0753b6f45625ae2084ca --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:160428ee21539129b32c4f50e46b1c3c4e1b361d3d8a42304e1e889b4311be3f +size 143153376 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-11016/README.md b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-11016/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-11016/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-11016/adapter_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-11016/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-11016/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-11016/adapter_model.safetensors b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-11016/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4af5e8ac081e351427a7d80895162354afcd5c41 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-11016/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b6415221e63bd9243653ca4db881c9a24e06529573526a57c54638a354ff704 +size 143153376 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-11016/optimizer.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-11016/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f9f37f0c6f36863a476bc5d782da1dc1227d4a3a --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-11016/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c2dacc6026c12435306b5e224714e03840f94e8bcf4edaac51836364ce95581 +size 72886650 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-11016/rng_state.pth b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-11016/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..4dbb86c96c25c37e0ee412e257b05f66ec5a0653 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-11016/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e01ed117ebafd67aa3ac43f1079b6c9a2464882d1daaaf6d16e7f2436768cd91 +size 14244 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-11016/scheduler.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-11016/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a950a8c6d57271ef31241e933d75f97ce5ae95f9 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-11016/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2aa5fa807faf2a2d533b4e4cb39858015cf2b195082b515ecc420c95a95ec16 +size 1064 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-11016/special_tokens_map.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-11016/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-11016/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-11016/tokenizer.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-11016/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-11016/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-11016/tokenizer.model b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-11016/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-11016/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-11016/tokenizer_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-11016/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-11016/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-11016/trainer_state.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-11016/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..44546134701828ee7218a2fd8b705334b2e54d9e --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-11016/trainer_state.json @@ -0,0 +1,7804 @@ +{ + "best_metric": 1.1086540222167969, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755", + "epoch": 7.997096188747731, + "eval_steps": 10, + "global_step": 11016, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007259528130671506, + "grad_norm": 0.46010470390319824, + "learning_rate": 0.0002, + "loss": 2.4936, + "step": 10 + }, + { + "epoch": 0.014519056261343012, + "grad_norm": 0.5103179216384888, + "learning_rate": 0.0002, + "loss": 1.8442, + "step": 20 + }, + { + "epoch": 0.021778584392014518, + "grad_norm": 0.6282716989517212, + "learning_rate": 0.0002, + "loss": 1.559, + "step": 30 + }, + { + "epoch": 0.029038112522686024, + "grad_norm": 1.2480497360229492, + "learning_rate": 0.0002, + "loss": 1.3618, + "step": 40 + }, + { + "epoch": 0.036297640653357534, + "grad_norm": 0.4114855229854584, + "learning_rate": 0.0002, + "loss": 1.3874, + "step": 50 + }, + { + "epoch": 0.043557168784029036, + "grad_norm": 0.49482840299606323, + "learning_rate": 0.0002, + "loss": 1.2836, + "step": 60 + }, + { + "epoch": 0.050816696914700546, + "grad_norm": 0.4536272883415222, + "learning_rate": 0.0002, + "loss": 1.1728, + "step": 70 + }, + { + "epoch": 0.05807622504537205, + "grad_norm": 0.32328274846076965, + "learning_rate": 0.0002, + "loss": 1.1125, + "step": 80 + }, + { + "epoch": 0.06533575317604355, + "grad_norm": 0.40990468859672546, + "learning_rate": 0.0002, + "loss": 1.2447, + "step": 90 + }, + { + "epoch": 0.07259528130671507, + "grad_norm": 0.37273502349853516, + "learning_rate": 0.0002, + "loss": 1.2216, + "step": 100 + }, + { + "epoch": 0.07985480943738657, + "grad_norm": 0.3903871476650238, + "learning_rate": 0.0002, + "loss": 1.2259, + "step": 110 + }, + { + "epoch": 0.08711433756805807, + "grad_norm": 0.3272787928581238, + "learning_rate": 0.0002, + "loss": 1.3718, + "step": 120 + }, + { + "epoch": 0.09437386569872959, + "grad_norm": 0.3622824251651764, + "learning_rate": 0.0002, + "loss": 1.1832, + "step": 130 + }, + { + "epoch": 0.10163339382940109, + "grad_norm": 0.3503916561603546, + "learning_rate": 0.0002, + "loss": 1.2966, + "step": 140 + }, + { + "epoch": 0.1088929219600726, + "grad_norm": 0.32787832617759705, + "learning_rate": 0.0002, + "loss": 1.0913, + "step": 150 + }, + { + "epoch": 0.1161524500907441, + "grad_norm": 0.5822657942771912, + "learning_rate": 0.0002, + "loss": 1.2143, + "step": 160 + }, + { + "epoch": 0.12341197822141561, + "grad_norm": 0.28028249740600586, + "learning_rate": 0.0002, + "loss": 1.2091, + "step": 170 + }, + { + "epoch": 0.1306715063520871, + "grad_norm": 0.33602750301361084, + "learning_rate": 0.0002, + "loss": 1.1731, + "step": 180 + }, + { + "epoch": 0.13793103448275862, + "grad_norm": 0.29106274247169495, + "learning_rate": 0.0002, + "loss": 1.127, + "step": 190 + }, + { + "epoch": 0.14519056261343014, + "grad_norm": 0.38753363490104675, + "learning_rate": 0.0002, + "loss": 1.3261, + "step": 200 + }, + { + "epoch": 0.15245009074410162, + "grad_norm": 0.361009418964386, + "learning_rate": 0.0002, + "loss": 1.1536, + "step": 210 + }, + { + "epoch": 0.15970961887477314, + "grad_norm": 0.6743836402893066, + "learning_rate": 0.0002, + "loss": 1.1483, + "step": 220 + }, + { + "epoch": 0.16696914700544466, + "grad_norm": 0.3923613727092743, + "learning_rate": 0.0002, + "loss": 1.264, + "step": 230 + }, + { + "epoch": 0.17422867513611615, + "grad_norm": 0.2809699773788452, + "learning_rate": 0.0002, + "loss": 1.0437, + "step": 240 + }, + { + "epoch": 0.18148820326678766, + "grad_norm": 0.3631494641304016, + "learning_rate": 0.0002, + "loss": 1.1826, + "step": 250 + }, + { + "epoch": 0.18874773139745918, + "grad_norm": 0.24658171832561493, + "learning_rate": 0.0002, + "loss": 1.1584, + "step": 260 + }, + { + "epoch": 0.19600725952813067, + "grad_norm": 0.5780664682388306, + "learning_rate": 0.0002, + "loss": 1.0065, + "step": 270 + }, + { + "epoch": 0.20326678765880218, + "grad_norm": 0.3056720495223999, + "learning_rate": 0.0002, + "loss": 1.1251, + "step": 280 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 0.2857084572315216, + "learning_rate": 0.0002, + "loss": 1.1503, + "step": 290 + }, + { + "epoch": 0.2177858439201452, + "grad_norm": 0.6645345687866211, + "learning_rate": 0.0002, + "loss": 1.1215, + "step": 300 + }, + { + "epoch": 0.2250453720508167, + "grad_norm": 0.5966078639030457, + "learning_rate": 0.0002, + "loss": 1.127, + "step": 310 + }, + { + "epoch": 0.2323049001814882, + "grad_norm": 0.40937140583992004, + "learning_rate": 0.0002, + "loss": 1.1974, + "step": 320 + }, + { + "epoch": 0.2395644283121597, + "grad_norm": 0.5642806887626648, + "learning_rate": 0.0002, + "loss": 1.2385, + "step": 330 + }, + { + "epoch": 0.24682395644283123, + "grad_norm": 0.2750748097896576, + "learning_rate": 0.0002, + "loss": 1.1703, + "step": 340 + }, + { + "epoch": 0.2540834845735027, + "grad_norm": 0.34350234270095825, + "learning_rate": 0.0002, + "loss": 1.2598, + "step": 350 + }, + { + "epoch": 0.2613430127041742, + "grad_norm": 0.6767239570617676, + "learning_rate": 0.0002, + "loss": 1.1942, + "step": 360 + }, + { + "epoch": 0.26860254083484575, + "grad_norm": 0.31006959080696106, + "learning_rate": 0.0002, + "loss": 1.1436, + "step": 370 + }, + { + "epoch": 0.27586206896551724, + "grad_norm": 0.3825474679470062, + "learning_rate": 0.0002, + "loss": 1.2084, + "step": 380 + }, + { + "epoch": 0.2831215970961887, + "grad_norm": 0.30241551995277405, + "learning_rate": 0.0002, + "loss": 1.1523, + "step": 390 + }, + { + "epoch": 0.29038112522686027, + "grad_norm": 0.2962397336959839, + "learning_rate": 0.0002, + "loss": 1.1298, + "step": 400 + }, + { + "epoch": 0.29764065335753176, + "grad_norm": 0.2600369155406952, + "learning_rate": 0.0002, + "loss": 1.1299, + "step": 410 + }, + { + "epoch": 0.30490018148820325, + "grad_norm": 0.3675060272216797, + "learning_rate": 0.0002, + "loss": 1.1366, + "step": 420 + }, + { + "epoch": 0.3121597096188748, + "grad_norm": 0.3429498076438904, + "learning_rate": 0.0002, + "loss": 1.156, + "step": 430 + }, + { + "epoch": 0.3194192377495463, + "grad_norm": 0.34311825037002563, + "learning_rate": 0.0002, + "loss": 1.2741, + "step": 440 + }, + { + "epoch": 0.32667876588021777, + "grad_norm": 0.37872210144996643, + "learning_rate": 0.0002, + "loss": 1.3523, + "step": 450 + }, + { + "epoch": 0.3339382940108893, + "grad_norm": 0.33271121978759766, + "learning_rate": 0.0002, + "loss": 1.1365, + "step": 460 + }, + { + "epoch": 0.3411978221415608, + "grad_norm": 0.34605276584625244, + "learning_rate": 0.0002, + "loss": 1.1572, + "step": 470 + }, + { + "epoch": 0.3484573502722323, + "grad_norm": 0.41050562262535095, + "learning_rate": 0.0002, + "loss": 1.2755, + "step": 480 + }, + { + "epoch": 0.35571687840290384, + "grad_norm": 0.2066836953163147, + "learning_rate": 0.0002, + "loss": 1.075, + "step": 490 + }, + { + "epoch": 0.3629764065335753, + "grad_norm": 0.2859014868736267, + "learning_rate": 0.0002, + "loss": 1.2072, + "step": 500 + }, + { + "epoch": 0.3702359346642468, + "grad_norm": 0.28763777017593384, + "learning_rate": 0.0002, + "loss": 1.1435, + "step": 510 + }, + { + "epoch": 0.37749546279491836, + "grad_norm": 0.2730471193790436, + "learning_rate": 0.0002, + "loss": 1.1341, + "step": 520 + }, + { + "epoch": 0.38475499092558985, + "grad_norm": 0.3968936800956726, + "learning_rate": 0.0002, + "loss": 1.348, + "step": 530 + }, + { + "epoch": 0.39201451905626133, + "grad_norm": 0.3624701201915741, + "learning_rate": 0.0002, + "loss": 1.2363, + "step": 540 + }, + { + "epoch": 0.3992740471869328, + "grad_norm": 0.3303608298301697, + "learning_rate": 0.0002, + "loss": 1.1206, + "step": 550 + }, + { + "epoch": 0.40653357531760437, + "grad_norm": 0.33507466316223145, + "learning_rate": 0.0002, + "loss": 1.203, + "step": 560 + }, + { + "epoch": 0.41379310344827586, + "grad_norm": 0.3297670781612396, + "learning_rate": 0.0002, + "loss": 1.2294, + "step": 570 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 0.32334890961647034, + "learning_rate": 0.0002, + "loss": 1.2, + "step": 580 + }, + { + "epoch": 0.4283121597096189, + "grad_norm": 0.30281195044517517, + "learning_rate": 0.0002, + "loss": 1.112, + "step": 590 + }, + { + "epoch": 0.4355716878402904, + "grad_norm": 0.5900027751922607, + "learning_rate": 0.0002, + "loss": 1.1352, + "step": 600 + }, + { + "epoch": 0.44283121597096187, + "grad_norm": 0.28437477350234985, + "learning_rate": 0.0002, + "loss": 1.1575, + "step": 610 + }, + { + "epoch": 0.4500907441016334, + "grad_norm": 0.39601704478263855, + "learning_rate": 0.0002, + "loss": 1.2009, + "step": 620 + }, + { + "epoch": 0.4573502722323049, + "grad_norm": 0.41971510648727417, + "learning_rate": 0.0002, + "loss": 1.2302, + "step": 630 + }, + { + "epoch": 0.4646098003629764, + "grad_norm": 0.33814409375190735, + "learning_rate": 0.0002, + "loss": 1.236, + "step": 640 + }, + { + "epoch": 0.47186932849364793, + "grad_norm": 0.575718104839325, + "learning_rate": 0.0002, + "loss": 1.2471, + "step": 650 + }, + { + "epoch": 0.4791288566243194, + "grad_norm": 0.37927401065826416, + "learning_rate": 0.0002, + "loss": 1.1012, + "step": 660 + }, + { + "epoch": 0.4863883847549909, + "grad_norm": 0.3224332630634308, + "learning_rate": 0.0002, + "loss": 1.1552, + "step": 670 + }, + { + "epoch": 0.49364791288566245, + "grad_norm": 0.32683515548706055, + "learning_rate": 0.0002, + "loss": 1.1354, + "step": 680 + }, + { + "epoch": 0.5009074410163339, + "grad_norm": 0.4316163659095764, + "learning_rate": 0.0002, + "loss": 1.207, + "step": 690 + }, + { + "epoch": 0.5081669691470054, + "grad_norm": 0.342602401971817, + "learning_rate": 0.0002, + "loss": 1.2116, + "step": 700 + }, + { + "epoch": 0.515426497277677, + "grad_norm": 0.2794898748397827, + "learning_rate": 0.0002, + "loss": 1.0433, + "step": 710 + }, + { + "epoch": 0.5226860254083484, + "grad_norm": 0.3322339951992035, + "learning_rate": 0.0002, + "loss": 1.2395, + "step": 720 + }, + { + "epoch": 0.52994555353902, + "grad_norm": 0.3088509142398834, + "learning_rate": 0.0002, + "loss": 1.0762, + "step": 730 + }, + { + "epoch": 0.5372050816696915, + "grad_norm": 0.24444378912448883, + "learning_rate": 0.0002, + "loss": 1.1495, + "step": 740 + }, + { + "epoch": 0.5444646098003629, + "grad_norm": 0.3483171761035919, + "learning_rate": 0.0002, + "loss": 1.2176, + "step": 750 + }, + { + "epoch": 0.5517241379310345, + "grad_norm": 0.3425690531730652, + "learning_rate": 0.0002, + "loss": 1.2248, + "step": 760 + }, + { + "epoch": 0.558983666061706, + "grad_norm": 0.31841927766799927, + "learning_rate": 0.0002, + "loss": 1.1127, + "step": 770 + }, + { + "epoch": 0.5662431941923775, + "grad_norm": 0.39423868060112, + "learning_rate": 0.0002, + "loss": 1.1841, + "step": 780 + }, + { + "epoch": 0.573502722323049, + "grad_norm": 0.30328479409217834, + "learning_rate": 0.0002, + "loss": 1.0583, + "step": 790 + }, + { + "epoch": 0.5807622504537205, + "grad_norm": 0.24475938081741333, + "learning_rate": 0.0002, + "loss": 1.0718, + "step": 800 + }, + { + "epoch": 0.588021778584392, + "grad_norm": 0.37132805585861206, + "learning_rate": 0.0002, + "loss": 1.2006, + "step": 810 + }, + { + "epoch": 0.5952813067150635, + "grad_norm": 0.32195979356765747, + "learning_rate": 0.0002, + "loss": 1.1544, + "step": 820 + }, + { + "epoch": 0.6025408348457351, + "grad_norm": 0.2848738729953766, + "learning_rate": 0.0002, + "loss": 0.9937, + "step": 830 + }, + { + "epoch": 0.6098003629764065, + "grad_norm": 0.28015264868736267, + "learning_rate": 0.0002, + "loss": 1.0937, + "step": 840 + }, + { + "epoch": 0.617059891107078, + "grad_norm": 0.37796008586883545, + "learning_rate": 0.0002, + "loss": 1.1234, + "step": 850 + }, + { + "epoch": 0.6243194192377496, + "grad_norm": 0.39311841130256653, + "learning_rate": 0.0002, + "loss": 1.1455, + "step": 860 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 0.2761685252189636, + "learning_rate": 0.0002, + "loss": 1.1066, + "step": 870 + }, + { + "epoch": 0.6388384754990926, + "grad_norm": 0.3826720118522644, + "learning_rate": 0.0002, + "loss": 1.0859, + "step": 880 + }, + { + "epoch": 0.6460980036297641, + "grad_norm": 0.30076679587364197, + "learning_rate": 0.0002, + "loss": 1.1182, + "step": 890 + }, + { + "epoch": 0.6533575317604355, + "grad_norm": 0.21997687220573425, + "learning_rate": 0.0002, + "loss": 1.0927, + "step": 900 + }, + { + "epoch": 0.6606170598911071, + "grad_norm": 0.32593777775764465, + "learning_rate": 0.0002, + "loss": 1.1603, + "step": 910 + }, + { + "epoch": 0.6678765880217786, + "grad_norm": 0.30347898602485657, + "learning_rate": 0.0002, + "loss": 1.1578, + "step": 920 + }, + { + "epoch": 0.6751361161524501, + "grad_norm": 0.44173774123191833, + "learning_rate": 0.0002, + "loss": 1.1642, + "step": 930 + }, + { + "epoch": 0.6823956442831216, + "grad_norm": 0.2507467269897461, + "learning_rate": 0.0002, + "loss": 1.1832, + "step": 940 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 0.29463833570480347, + "learning_rate": 0.0002, + "loss": 1.3001, + "step": 950 + }, + { + "epoch": 0.6969147005444646, + "grad_norm": 0.9363154172897339, + "learning_rate": 0.0002, + "loss": 1.1651, + "step": 960 + }, + { + "epoch": 0.7041742286751361, + "grad_norm": 0.3236212134361267, + "learning_rate": 0.0002, + "loss": 1.158, + "step": 970 + }, + { + "epoch": 0.7114337568058077, + "grad_norm": 0.3123254179954529, + "learning_rate": 0.0002, + "loss": 1.1078, + "step": 980 + }, + { + "epoch": 0.7186932849364791, + "grad_norm": 0.3395805060863495, + "learning_rate": 0.0002, + "loss": 1.1578, + "step": 990 + }, + { + "epoch": 0.7259528130671506, + "grad_norm": 0.3240964412689209, + "learning_rate": 0.0002, + "loss": 1.0754, + "step": 1000 + }, + { + "epoch": 0.7332123411978222, + "grad_norm": 0.31902948021888733, + "learning_rate": 0.0002, + "loss": 1.0313, + "step": 1010 + }, + { + "epoch": 0.7404718693284936, + "grad_norm": 0.4848408102989197, + "learning_rate": 0.0002, + "loss": 1.1664, + "step": 1020 + }, + { + "epoch": 0.7477313974591652, + "grad_norm": 0.33006033301353455, + "learning_rate": 0.0002, + "loss": 1.0838, + "step": 1030 + }, + { + "epoch": 0.7549909255898367, + "grad_norm": 0.2928730547428131, + "learning_rate": 0.0002, + "loss": 1.0979, + "step": 1040 + }, + { + "epoch": 0.7622504537205081, + "grad_norm": 0.3529164791107178, + "learning_rate": 0.0002, + "loss": 1.1351, + "step": 1050 + }, + { + "epoch": 0.7695099818511797, + "grad_norm": 0.2736213803291321, + "learning_rate": 0.0002, + "loss": 1.1274, + "step": 1060 + }, + { + "epoch": 0.7767695099818511, + "grad_norm": 0.7200686931610107, + "learning_rate": 0.0002, + "loss": 1.1169, + "step": 1070 + }, + { + "epoch": 0.7840290381125227, + "grad_norm": 0.33396708965301514, + "learning_rate": 0.0002, + "loss": 1.1171, + "step": 1080 + }, + { + "epoch": 0.7912885662431942, + "grad_norm": 1.5760449171066284, + "learning_rate": 0.0002, + "loss": 1.2447, + "step": 1090 + }, + { + "epoch": 0.7985480943738656, + "grad_norm": 0.28138381242752075, + "learning_rate": 0.0002, + "loss": 1.1126, + "step": 1100 + }, + { + "epoch": 0.8058076225045372, + "grad_norm": 0.2597472369670868, + "learning_rate": 0.0002, + "loss": 1.2012, + "step": 1110 + }, + { + "epoch": 0.8130671506352087, + "grad_norm": 0.3305445611476898, + "learning_rate": 0.0002, + "loss": 1.1177, + "step": 1120 + }, + { + "epoch": 0.8203266787658802, + "grad_norm": 0.3934599459171295, + "learning_rate": 0.0002, + "loss": 1.0849, + "step": 1130 + }, + { + "epoch": 0.8275862068965517, + "grad_norm": 0.3472191393375397, + "learning_rate": 0.0002, + "loss": 1.0889, + "step": 1140 + }, + { + "epoch": 0.8348457350272233, + "grad_norm": 0.2857365906238556, + "learning_rate": 0.0002, + "loss": 1.265, + "step": 1150 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.3207702934741974, + "learning_rate": 0.0002, + "loss": 1.03, + "step": 1160 + }, + { + "epoch": 0.8493647912885662, + "grad_norm": 0.3176484704017639, + "learning_rate": 0.0002, + "loss": 1.1669, + "step": 1170 + }, + { + "epoch": 0.8566243194192378, + "grad_norm": 0.40685558319091797, + "learning_rate": 0.0002, + "loss": 1.1386, + "step": 1180 + }, + { + "epoch": 0.8638838475499092, + "grad_norm": 0.31125199794769287, + "learning_rate": 0.0002, + "loss": 1.1383, + "step": 1190 + }, + { + "epoch": 0.8711433756805808, + "grad_norm": 0.7361181378364563, + "learning_rate": 0.0002, + "loss": 1.1512, + "step": 1200 + }, + { + "epoch": 0.8784029038112523, + "grad_norm": 0.33699527382850647, + "learning_rate": 0.0002, + "loss": 0.9983, + "step": 1210 + }, + { + "epoch": 0.8856624319419237, + "grad_norm": 0.3315220773220062, + "learning_rate": 0.0002, + "loss": 1.11, + "step": 1220 + }, + { + "epoch": 0.8929219600725953, + "grad_norm": 0.6256054043769836, + "learning_rate": 0.0002, + "loss": 1.0266, + "step": 1230 + }, + { + "epoch": 0.9001814882032668, + "grad_norm": 0.3692137897014618, + "learning_rate": 0.0002, + "loss": 1.1738, + "step": 1240 + }, + { + "epoch": 0.9074410163339383, + "grad_norm": 0.3538484573364258, + "learning_rate": 0.0002, + "loss": 1.1112, + "step": 1250 + }, + { + "epoch": 0.9147005444646098, + "grad_norm": 0.27863040566444397, + "learning_rate": 0.0002, + "loss": 1.0878, + "step": 1260 + }, + { + "epoch": 0.9219600725952813, + "grad_norm": 0.3322528302669525, + "learning_rate": 0.0002, + "loss": 1.0826, + "step": 1270 + }, + { + "epoch": 0.9292196007259528, + "grad_norm": 0.7553173303604126, + "learning_rate": 0.0002, + "loss": 1.0158, + "step": 1280 + }, + { + "epoch": 0.9364791288566243, + "grad_norm": 0.4856191575527191, + "learning_rate": 0.0002, + "loss": 1.198, + "step": 1290 + }, + { + "epoch": 0.9437386569872959, + "grad_norm": 0.3668074905872345, + "learning_rate": 0.0002, + "loss": 1.1678, + "step": 1300 + }, + { + "epoch": 0.9509981851179673, + "grad_norm": 0.29851067066192627, + "learning_rate": 0.0002, + "loss": 1.1809, + "step": 1310 + }, + { + "epoch": 0.9582577132486388, + "grad_norm": 0.276664674282074, + "learning_rate": 0.0002, + "loss": 1.1299, + "step": 1320 + }, + { + "epoch": 0.9655172413793104, + "grad_norm": 0.2941018342971802, + "learning_rate": 0.0002, + "loss": 1.0597, + "step": 1330 + }, + { + "epoch": 0.9727767695099818, + "grad_norm": 0.3505859076976776, + "learning_rate": 0.0002, + "loss": 1.1731, + "step": 1340 + }, + { + "epoch": 0.9800362976406534, + "grad_norm": 0.3067687451839447, + "learning_rate": 0.0002, + "loss": 1.1809, + "step": 1350 + }, + { + "epoch": 0.9872958257713249, + "grad_norm": 0.27151066064834595, + "learning_rate": 0.0002, + "loss": 1.1511, + "step": 1360 + }, + { + "epoch": 0.9945553539019963, + "grad_norm": 0.36370083689689636, + "learning_rate": 0.0002, + "loss": 1.1919, + "step": 1370 + }, + { + "epoch": 0.9996370235934664, + "eval_loss": 1.1381088495254517, + "eval_runtime": 96.0848, + "eval_samples_per_second": 4.538, + "eval_steps_per_second": 0.572, + "step": 1377 + }, + { + "epoch": 1.0018148820326678, + "grad_norm": 0.27980583906173706, + "learning_rate": 0.0002, + "loss": 1.0082, + "step": 1380 + }, + { + "epoch": 1.0090744101633393, + "grad_norm": 0.26713913679122925, + "learning_rate": 0.0002, + "loss": 1.0565, + "step": 1390 + }, + { + "epoch": 1.0163339382940109, + "grad_norm": 0.3089541494846344, + "learning_rate": 0.0002, + "loss": 1.0241, + "step": 1400 + }, + { + "epoch": 1.0235934664246824, + "grad_norm": 0.4188242256641388, + "learning_rate": 0.0002, + "loss": 0.9649, + "step": 1410 + }, + { + "epoch": 1.030852994555354, + "grad_norm": 0.5246463418006897, + "learning_rate": 0.0002, + "loss": 0.9821, + "step": 1420 + }, + { + "epoch": 1.0381125226860255, + "grad_norm": 0.2728777825832367, + "learning_rate": 0.0002, + "loss": 1.0143, + "step": 1430 + }, + { + "epoch": 1.0453720508166968, + "grad_norm": 0.38167616724967957, + "learning_rate": 0.0002, + "loss": 1.075, + "step": 1440 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 0.4439380168914795, + "learning_rate": 0.0002, + "loss": 1.0229, + "step": 1450 + }, + { + "epoch": 1.05989110707804, + "grad_norm": 0.30954182147979736, + "learning_rate": 0.0002, + "loss": 1.0451, + "step": 1460 + }, + { + "epoch": 1.0671506352087115, + "grad_norm": 0.4022280275821686, + "learning_rate": 0.0002, + "loss": 1.0762, + "step": 1470 + }, + { + "epoch": 1.074410163339383, + "grad_norm": 0.7390811443328857, + "learning_rate": 0.0002, + "loss": 1.0754, + "step": 1480 + }, + { + "epoch": 1.0816696914700545, + "grad_norm": 0.3885486423969269, + "learning_rate": 0.0002, + "loss": 1.1378, + "step": 1490 + }, + { + "epoch": 1.0889292196007259, + "grad_norm": 0.5275560617446899, + "learning_rate": 0.0002, + "loss": 1.005, + "step": 1500 + }, + { + "epoch": 1.0961887477313974, + "grad_norm": 0.35112282633781433, + "learning_rate": 0.0002, + "loss": 1.0147, + "step": 1510 + }, + { + "epoch": 1.103448275862069, + "grad_norm": 0.33714351058006287, + "learning_rate": 0.0002, + "loss": 1.0331, + "step": 1520 + }, + { + "epoch": 1.1107078039927405, + "grad_norm": 0.31221693754196167, + "learning_rate": 0.0002, + "loss": 0.8723, + "step": 1530 + }, + { + "epoch": 1.117967332123412, + "grad_norm": 0.27549654245376587, + "learning_rate": 0.0002, + "loss": 1.0701, + "step": 1540 + }, + { + "epoch": 1.1252268602540836, + "grad_norm": 0.8465521335601807, + "learning_rate": 0.0002, + "loss": 1.0114, + "step": 1550 + }, + { + "epoch": 1.132486388384755, + "grad_norm": 0.36125949025154114, + "learning_rate": 0.0002, + "loss": 0.9643, + "step": 1560 + }, + { + "epoch": 1.1397459165154264, + "grad_norm": 0.37420371174812317, + "learning_rate": 0.0002, + "loss": 1.0744, + "step": 1570 + }, + { + "epoch": 1.147005444646098, + "grad_norm": 0.3294760584831238, + "learning_rate": 0.0002, + "loss": 1.165, + "step": 1580 + }, + { + "epoch": 1.1542649727767695, + "grad_norm": 0.3881238102912903, + "learning_rate": 0.0002, + "loss": 0.9207, + "step": 1590 + }, + { + "epoch": 1.161524500907441, + "grad_norm": 0.4766491651535034, + "learning_rate": 0.0002, + "loss": 1.0983, + "step": 1600 + }, + { + "epoch": 1.1687840290381124, + "grad_norm": 0.395530104637146, + "learning_rate": 0.0002, + "loss": 1.0092, + "step": 1610 + }, + { + "epoch": 1.176043557168784, + "grad_norm": 0.3297106623649597, + "learning_rate": 0.0002, + "loss": 0.9895, + "step": 1620 + }, + { + "epoch": 1.1833030852994555, + "grad_norm": 0.39528271555900574, + "learning_rate": 0.0002, + "loss": 1.1121, + "step": 1630 + }, + { + "epoch": 1.190562613430127, + "grad_norm": 0.3370221257209778, + "learning_rate": 0.0002, + "loss": 0.9202, + "step": 1640 + }, + { + "epoch": 1.1978221415607986, + "grad_norm": 0.31922030448913574, + "learning_rate": 0.0002, + "loss": 1.102, + "step": 1650 + }, + { + "epoch": 1.2050816696914701, + "grad_norm": 0.6142027378082275, + "learning_rate": 0.0002, + "loss": 1.0565, + "step": 1660 + }, + { + "epoch": 1.2123411978221417, + "grad_norm": 0.44769710302352905, + "learning_rate": 0.0002, + "loss": 0.9541, + "step": 1670 + }, + { + "epoch": 1.219600725952813, + "grad_norm": 0.41233646869659424, + "learning_rate": 0.0002, + "loss": 1.2501, + "step": 1680 + }, + { + "epoch": 1.2268602540834845, + "grad_norm": 0.2928866147994995, + "learning_rate": 0.0002, + "loss": 1.0147, + "step": 1690 + }, + { + "epoch": 1.234119782214156, + "grad_norm": 0.36913734674453735, + "learning_rate": 0.0002, + "loss": 0.9074, + "step": 1700 + }, + { + "epoch": 1.2413793103448276, + "grad_norm": 0.5281891226768494, + "learning_rate": 0.0002, + "loss": 1.0847, + "step": 1710 + }, + { + "epoch": 1.2486388384754992, + "grad_norm": 0.3374697268009186, + "learning_rate": 0.0002, + "loss": 0.9482, + "step": 1720 + }, + { + "epoch": 1.2558983666061705, + "grad_norm": 0.3802020847797394, + "learning_rate": 0.0002, + "loss": 0.9826, + "step": 1730 + }, + { + "epoch": 1.263157894736842, + "grad_norm": 0.38048651814460754, + "learning_rate": 0.0002, + "loss": 1.0191, + "step": 1740 + }, + { + "epoch": 1.2704174228675136, + "grad_norm": 0.6676169633865356, + "learning_rate": 0.0002, + "loss": 1.0142, + "step": 1750 + }, + { + "epoch": 1.2776769509981851, + "grad_norm": 0.4075961410999298, + "learning_rate": 0.0002, + "loss": 0.992, + "step": 1760 + }, + { + "epoch": 1.2849364791288567, + "grad_norm": 0.4374721348285675, + "learning_rate": 0.0002, + "loss": 1.0301, + "step": 1770 + }, + { + "epoch": 1.2921960072595282, + "grad_norm": 0.4638824164867401, + "learning_rate": 0.0002, + "loss": 1.0601, + "step": 1780 + }, + { + "epoch": 1.2994555353901998, + "grad_norm": 0.38631564378738403, + "learning_rate": 0.0002, + "loss": 0.9593, + "step": 1790 + }, + { + "epoch": 1.306715063520871, + "grad_norm": 0.35873809456825256, + "learning_rate": 0.0002, + "loss": 1.0042, + "step": 1800 + }, + { + "epoch": 1.3139745916515426, + "grad_norm": 0.33602237701416016, + "learning_rate": 0.0002, + "loss": 0.9835, + "step": 1810 + }, + { + "epoch": 1.3212341197822142, + "grad_norm": 0.46696463227272034, + "learning_rate": 0.0002, + "loss": 1.1032, + "step": 1820 + }, + { + "epoch": 1.3284936479128857, + "grad_norm": 0.368958979845047, + "learning_rate": 0.0002, + "loss": 1.0123, + "step": 1830 + }, + { + "epoch": 1.335753176043557, + "grad_norm": 0.3160957396030426, + "learning_rate": 0.0002, + "loss": 1.0456, + "step": 1840 + }, + { + "epoch": 1.3430127041742286, + "grad_norm": 0.4511511027812958, + "learning_rate": 0.0002, + "loss": 1.0264, + "step": 1850 + }, + { + "epoch": 1.3502722323049001, + "grad_norm": 0.6769845485687256, + "learning_rate": 0.0002, + "loss": 1.0179, + "step": 1860 + }, + { + "epoch": 1.3575317604355717, + "grad_norm": 0.3749309480190277, + "learning_rate": 0.0002, + "loss": 1.011, + "step": 1870 + }, + { + "epoch": 1.3647912885662432, + "grad_norm": 0.553798496723175, + "learning_rate": 0.0002, + "loss": 1.0223, + "step": 1880 + }, + { + "epoch": 1.3720508166969148, + "grad_norm": 0.3538985550403595, + "learning_rate": 0.0002, + "loss": 0.9634, + "step": 1890 + }, + { + "epoch": 1.3793103448275863, + "grad_norm": 0.5501534938812256, + "learning_rate": 0.0002, + "loss": 1.0275, + "step": 1900 + }, + { + "epoch": 1.3865698729582578, + "grad_norm": 0.4432051181793213, + "learning_rate": 0.0002, + "loss": 1.0461, + "step": 1910 + }, + { + "epoch": 1.3938294010889292, + "grad_norm": 0.41755786538124084, + "learning_rate": 0.0002, + "loss": 0.9082, + "step": 1920 + }, + { + "epoch": 1.4010889292196007, + "grad_norm": 0.5626114010810852, + "learning_rate": 0.0002, + "loss": 1.0858, + "step": 1930 + }, + { + "epoch": 1.4083484573502723, + "grad_norm": 0.44038185477256775, + "learning_rate": 0.0002, + "loss": 1.0687, + "step": 1940 + }, + { + "epoch": 1.4156079854809438, + "grad_norm": 0.3397001624107361, + "learning_rate": 0.0002, + "loss": 0.9454, + "step": 1950 + }, + { + "epoch": 1.4228675136116151, + "grad_norm": 0.4325368106365204, + "learning_rate": 0.0002, + "loss": 0.943, + "step": 1960 + }, + { + "epoch": 1.4301270417422867, + "grad_norm": 0.3900907039642334, + "learning_rate": 0.0002, + "loss": 1.0488, + "step": 1970 + }, + { + "epoch": 1.4373865698729582, + "grad_norm": 0.369612455368042, + "learning_rate": 0.0002, + "loss": 0.9699, + "step": 1980 + }, + { + "epoch": 1.4446460980036298, + "grad_norm": 0.4389338791370392, + "learning_rate": 0.0002, + "loss": 1.0609, + "step": 1990 + }, + { + "epoch": 1.4519056261343013, + "grad_norm": 1.694450855255127, + "learning_rate": 0.0002, + "loss": 1.042, + "step": 2000 + }, + { + "epoch": 1.4591651542649728, + "grad_norm": 0.516957700252533, + "learning_rate": 0.0002, + "loss": 0.997, + "step": 2010 + }, + { + "epoch": 1.4664246823956444, + "grad_norm": 0.45515501499176025, + "learning_rate": 0.0002, + "loss": 1.0979, + "step": 2020 + }, + { + "epoch": 1.4736842105263157, + "grad_norm": 0.4153139591217041, + "learning_rate": 0.0002, + "loss": 1.0188, + "step": 2030 + }, + { + "epoch": 1.4809437386569873, + "grad_norm": 0.44353052973747253, + "learning_rate": 0.0002, + "loss": 1.1521, + "step": 2040 + }, + { + "epoch": 1.4882032667876588, + "grad_norm": 0.570554256439209, + "learning_rate": 0.0002, + "loss": 0.9653, + "step": 2050 + }, + { + "epoch": 1.4954627949183303, + "grad_norm": 0.5742740035057068, + "learning_rate": 0.0002, + "loss": 1.0352, + "step": 2060 + }, + { + "epoch": 1.5027223230490017, + "grad_norm": 0.5890517830848694, + "learning_rate": 0.0002, + "loss": 1.0963, + "step": 2070 + }, + { + "epoch": 1.5099818511796732, + "grad_norm": 0.4162650406360626, + "learning_rate": 0.0002, + "loss": 1.0639, + "step": 2080 + }, + { + "epoch": 1.5172413793103448, + "grad_norm": 0.7334717512130737, + "learning_rate": 0.0002, + "loss": 1.071, + "step": 2090 + }, + { + "epoch": 1.5245009074410163, + "grad_norm": 0.2963249683380127, + "learning_rate": 0.0002, + "loss": 0.8957, + "step": 2100 + }, + { + "epoch": 1.5317604355716878, + "grad_norm": 0.30676454305648804, + "learning_rate": 0.0002, + "loss": 1.0446, + "step": 2110 + }, + { + "epoch": 1.5390199637023594, + "grad_norm": 0.35984641313552856, + "learning_rate": 0.0002, + "loss": 0.9967, + "step": 2120 + }, + { + "epoch": 1.546279491833031, + "grad_norm": 0.3384549617767334, + "learning_rate": 0.0002, + "loss": 1.0352, + "step": 2130 + }, + { + "epoch": 1.5535390199637025, + "grad_norm": 0.4725518226623535, + "learning_rate": 0.0002, + "loss": 0.9874, + "step": 2140 + }, + { + "epoch": 1.560798548094374, + "grad_norm": 0.3252685070037842, + "learning_rate": 0.0002, + "loss": 1.1166, + "step": 2150 + }, + { + "epoch": 1.5680580762250453, + "grad_norm": 0.45043081045150757, + "learning_rate": 0.0002, + "loss": 0.9127, + "step": 2160 + }, + { + "epoch": 1.5753176043557169, + "grad_norm": 0.374208003282547, + "learning_rate": 0.0002, + "loss": 1.0767, + "step": 2170 + }, + { + "epoch": 1.5825771324863884, + "grad_norm": 0.5118404030799866, + "learning_rate": 0.0002, + "loss": 1.0255, + "step": 2180 + }, + { + "epoch": 1.5898366606170597, + "grad_norm": 0.482170969247818, + "learning_rate": 0.0002, + "loss": 1.0062, + "step": 2190 + }, + { + "epoch": 1.5970961887477313, + "grad_norm": 0.5337533950805664, + "learning_rate": 0.0002, + "loss": 1.0872, + "step": 2200 + }, + { + "epoch": 1.6043557168784028, + "grad_norm": 0.5195064544677734, + "learning_rate": 0.0002, + "loss": 1.0405, + "step": 2210 + }, + { + "epoch": 1.6116152450090744, + "grad_norm": 0.30807098746299744, + "learning_rate": 0.0002, + "loss": 1.0454, + "step": 2220 + }, + { + "epoch": 1.618874773139746, + "grad_norm": 0.3962925672531128, + "learning_rate": 0.0002, + "loss": 1.0293, + "step": 2230 + }, + { + "epoch": 1.6261343012704175, + "grad_norm": 0.7636962532997131, + "learning_rate": 0.0002, + "loss": 1.0137, + "step": 2240 + }, + { + "epoch": 1.633393829401089, + "grad_norm": 0.32380592823028564, + "learning_rate": 0.0002, + "loss": 0.991, + "step": 2250 + }, + { + "epoch": 1.6406533575317606, + "grad_norm": 0.5767741799354553, + "learning_rate": 0.0002, + "loss": 0.9471, + "step": 2260 + }, + { + "epoch": 1.647912885662432, + "grad_norm": 0.39964812994003296, + "learning_rate": 0.0002, + "loss": 0.9524, + "step": 2270 + }, + { + "epoch": 1.6551724137931034, + "grad_norm": 0.622629702091217, + "learning_rate": 0.0002, + "loss": 0.9866, + "step": 2280 + }, + { + "epoch": 1.662431941923775, + "grad_norm": 0.40202152729034424, + "learning_rate": 0.0002, + "loss": 0.9609, + "step": 2290 + }, + { + "epoch": 1.6696914700544465, + "grad_norm": 0.4467349052429199, + "learning_rate": 0.0002, + "loss": 0.9859, + "step": 2300 + }, + { + "epoch": 1.6769509981851178, + "grad_norm": 0.5026949048042297, + "learning_rate": 0.0002, + "loss": 1.0312, + "step": 2310 + }, + { + "epoch": 1.6842105263157894, + "grad_norm": 0.43754541873931885, + "learning_rate": 0.0002, + "loss": 1.0776, + "step": 2320 + }, + { + "epoch": 1.691470054446461, + "grad_norm": 0.42869430780410767, + "learning_rate": 0.0002, + "loss": 0.9786, + "step": 2330 + }, + { + "epoch": 1.6987295825771325, + "grad_norm": 0.4192679524421692, + "learning_rate": 0.0002, + "loss": 1.0747, + "step": 2340 + }, + { + "epoch": 1.705989110707804, + "grad_norm": 0.3243155777454376, + "learning_rate": 0.0002, + "loss": 1.0054, + "step": 2350 + }, + { + "epoch": 1.7132486388384756, + "grad_norm": 1.0514075756072998, + "learning_rate": 0.0002, + "loss": 0.9165, + "step": 2360 + }, + { + "epoch": 1.720508166969147, + "grad_norm": 0.4719122648239136, + "learning_rate": 0.0002, + "loss": 1.1353, + "step": 2370 + }, + { + "epoch": 1.7277676950998186, + "grad_norm": 0.3846144676208496, + "learning_rate": 0.0002, + "loss": 0.9913, + "step": 2380 + }, + { + "epoch": 1.73502722323049, + "grad_norm": 0.7266581058502197, + "learning_rate": 0.0002, + "loss": 0.9935, + "step": 2390 + }, + { + "epoch": 1.7422867513611615, + "grad_norm": 0.6183241605758667, + "learning_rate": 0.0002, + "loss": 0.9509, + "step": 2400 + }, + { + "epoch": 1.749546279491833, + "grad_norm": 0.3658260405063629, + "learning_rate": 0.0002, + "loss": 0.9713, + "step": 2410 + }, + { + "epoch": 1.7568058076225044, + "grad_norm": 0.6036322712898254, + "learning_rate": 0.0002, + "loss": 1.0033, + "step": 2420 + }, + { + "epoch": 1.764065335753176, + "grad_norm": 0.7872936129570007, + "learning_rate": 0.0002, + "loss": 1.0526, + "step": 2430 + }, + { + "epoch": 1.7713248638838475, + "grad_norm": 0.35946124792099, + "learning_rate": 0.0002, + "loss": 0.99, + "step": 2440 + }, + { + "epoch": 1.778584392014519, + "grad_norm": 0.3740338981151581, + "learning_rate": 0.0002, + "loss": 0.8845, + "step": 2450 + }, + { + "epoch": 1.7858439201451906, + "grad_norm": 0.6150230169296265, + "learning_rate": 0.0002, + "loss": 0.9333, + "step": 2460 + }, + { + "epoch": 1.793103448275862, + "grad_norm": 0.4726075530052185, + "learning_rate": 0.0002, + "loss": 1.0373, + "step": 2470 + }, + { + "epoch": 1.8003629764065336, + "grad_norm": 0.31292253732681274, + "learning_rate": 0.0002, + "loss": 0.974, + "step": 2480 + }, + { + "epoch": 1.8076225045372052, + "grad_norm": 0.4463104009628296, + "learning_rate": 0.0002, + "loss": 1.0205, + "step": 2490 + }, + { + "epoch": 1.8148820326678767, + "grad_norm": 0.7848200798034668, + "learning_rate": 0.0002, + "loss": 1.0771, + "step": 2500 + }, + { + "epoch": 1.822141560798548, + "grad_norm": 0.5562082529067993, + "learning_rate": 0.0002, + "loss": 0.9508, + "step": 2510 + }, + { + "epoch": 1.8294010889292196, + "grad_norm": 0.39892756938934326, + "learning_rate": 0.0002, + "loss": 0.9343, + "step": 2520 + }, + { + "epoch": 1.8366606170598911, + "grad_norm": 1.2923320531845093, + "learning_rate": 0.0002, + "loss": 1.0489, + "step": 2530 + }, + { + "epoch": 1.8439201451905625, + "grad_norm": 0.6316490769386292, + "learning_rate": 0.0002, + "loss": 1.1203, + "step": 2540 + }, + { + "epoch": 1.851179673321234, + "grad_norm": 0.46100497245788574, + "learning_rate": 0.0002, + "loss": 0.9131, + "step": 2550 + }, + { + "epoch": 1.8584392014519056, + "grad_norm": 0.7902987003326416, + "learning_rate": 0.0002, + "loss": 1.0977, + "step": 2560 + }, + { + "epoch": 1.865698729582577, + "grad_norm": 0.4596365690231323, + "learning_rate": 0.0002, + "loss": 0.9702, + "step": 2570 + }, + { + "epoch": 1.8729582577132486, + "grad_norm": 0.6592172384262085, + "learning_rate": 0.0002, + "loss": 0.9322, + "step": 2580 + }, + { + "epoch": 1.8802177858439202, + "grad_norm": 0.5764662027359009, + "learning_rate": 0.0002, + "loss": 0.9549, + "step": 2590 + }, + { + "epoch": 1.8874773139745917, + "grad_norm": 0.8421637415885925, + "learning_rate": 0.0002, + "loss": 0.9485, + "step": 2600 + }, + { + "epoch": 1.8947368421052633, + "grad_norm": 0.5635305047035217, + "learning_rate": 0.0002, + "loss": 1.012, + "step": 2610 + }, + { + "epoch": 1.9019963702359348, + "grad_norm": 0.46231237053871155, + "learning_rate": 0.0002, + "loss": 0.8907, + "step": 2620 + }, + { + "epoch": 1.9092558983666061, + "grad_norm": 0.3944607079029083, + "learning_rate": 0.0002, + "loss": 0.9543, + "step": 2630 + }, + { + "epoch": 1.9165154264972777, + "grad_norm": 0.4753907322883606, + "learning_rate": 0.0002, + "loss": 0.9964, + "step": 2640 + }, + { + "epoch": 1.9237749546279492, + "grad_norm": 0.4151090979576111, + "learning_rate": 0.0002, + "loss": 1.0217, + "step": 2650 + }, + { + "epoch": 1.9310344827586206, + "grad_norm": 0.6793725490570068, + "learning_rate": 0.0002, + "loss": 1.0397, + "step": 2660 + }, + { + "epoch": 1.938294010889292, + "grad_norm": 0.339755117893219, + "learning_rate": 0.0002, + "loss": 1.0401, + "step": 2670 + }, + { + "epoch": 1.9455535390199636, + "grad_norm": 0.40789374709129333, + "learning_rate": 0.0002, + "loss": 1.082, + "step": 2680 + }, + { + "epoch": 1.9528130671506352, + "grad_norm": 0.3750005066394806, + "learning_rate": 0.0002, + "loss": 0.9973, + "step": 2690 + }, + { + "epoch": 1.9600725952813067, + "grad_norm": 0.39684441685676575, + "learning_rate": 0.0002, + "loss": 0.9837, + "step": 2700 + }, + { + "epoch": 1.9673321234119783, + "grad_norm": 0.378287672996521, + "learning_rate": 0.0002, + "loss": 1.1204, + "step": 2710 + }, + { + "epoch": 1.9745916515426498, + "grad_norm": 0.3668482005596161, + "learning_rate": 0.0002, + "loss": 0.9284, + "step": 2720 + }, + { + "epoch": 1.9818511796733214, + "grad_norm": 0.49997565150260925, + "learning_rate": 0.0002, + "loss": 1.0512, + "step": 2730 + }, + { + "epoch": 1.989110707803993, + "grad_norm": 0.36852124333381653, + "learning_rate": 0.0002, + "loss": 0.9311, + "step": 2740 + }, + { + "epoch": 1.9963702359346642, + "grad_norm": 0.5203380584716797, + "learning_rate": 0.0002, + "loss": 1.0898, + "step": 2750 + }, + { + "epoch": 2.0, + "eval_loss": 1.1086540222167969, + "eval_runtime": 95.6866, + "eval_samples_per_second": 4.557, + "eval_steps_per_second": 0.575, + "step": 2755 + }, + { + "epoch": 2.0036297640653356, + "grad_norm": 0.35921573638916016, + "learning_rate": 0.0002, + "loss": 0.9791, + "step": 2760 + }, + { + "epoch": 2.010889292196007, + "grad_norm": 1.013861894607544, + "learning_rate": 0.0002, + "loss": 0.8968, + "step": 2770 + }, + { + "epoch": 2.0181488203266786, + "grad_norm": 0.4425240159034729, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 2780 + }, + { + "epoch": 2.02540834845735, + "grad_norm": 0.60174161195755, + "learning_rate": 0.0002, + "loss": 0.8758, + "step": 2790 + }, + { + "epoch": 2.0326678765880217, + "grad_norm": 0.47582098841667175, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 2800 + }, + { + "epoch": 2.0399274047186933, + "grad_norm": 0.6012811660766602, + "learning_rate": 0.0002, + "loss": 0.8898, + "step": 2810 + }, + { + "epoch": 2.047186932849365, + "grad_norm": 0.4444408118724823, + "learning_rate": 0.0002, + "loss": 0.8109, + "step": 2820 + }, + { + "epoch": 2.0544464609800364, + "grad_norm": 0.4864003360271454, + "learning_rate": 0.0002, + "loss": 0.8621, + "step": 2830 + }, + { + "epoch": 2.061705989110708, + "grad_norm": 0.5104215741157532, + "learning_rate": 0.0002, + "loss": 0.8817, + "step": 2840 + }, + { + "epoch": 2.0689655172413794, + "grad_norm": 0.6218489408493042, + "learning_rate": 0.0002, + "loss": 0.7789, + "step": 2850 + }, + { + "epoch": 2.076225045372051, + "grad_norm": 0.705784261226654, + "learning_rate": 0.0002, + "loss": 0.8778, + "step": 2860 + }, + { + "epoch": 2.0834845735027225, + "grad_norm": 0.48091503977775574, + "learning_rate": 0.0002, + "loss": 0.7851, + "step": 2870 + }, + { + "epoch": 2.0907441016333936, + "grad_norm": 0.5062456727027893, + "learning_rate": 0.0002, + "loss": 0.8444, + "step": 2880 + }, + { + "epoch": 2.098003629764065, + "grad_norm": 0.4862022399902344, + "learning_rate": 0.0002, + "loss": 0.7898, + "step": 2890 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 0.48264044523239136, + "learning_rate": 0.0002, + "loss": 0.8593, + "step": 2900 + }, + { + "epoch": 2.1125226860254083, + "grad_norm": 0.43744346499443054, + "learning_rate": 0.0002, + "loss": 0.8325, + "step": 2910 + }, + { + "epoch": 2.11978221415608, + "grad_norm": 0.5480492115020752, + "learning_rate": 0.0002, + "loss": 0.9099, + "step": 2920 + }, + { + "epoch": 2.1270417422867514, + "grad_norm": 0.5068560838699341, + "learning_rate": 0.0002, + "loss": 0.7727, + "step": 2930 + }, + { + "epoch": 2.134301270417423, + "grad_norm": 0.4650852680206299, + "learning_rate": 0.0002, + "loss": 0.8403, + "step": 2940 + }, + { + "epoch": 2.1415607985480944, + "grad_norm": 0.4929981231689453, + "learning_rate": 0.0002, + "loss": 0.8467, + "step": 2950 + }, + { + "epoch": 2.148820326678766, + "grad_norm": 0.6537389755249023, + "learning_rate": 0.0002, + "loss": 0.8747, + "step": 2960 + }, + { + "epoch": 2.1560798548094375, + "grad_norm": 0.8032940626144409, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 2970 + }, + { + "epoch": 2.163339382940109, + "grad_norm": 0.7131643891334534, + "learning_rate": 0.0002, + "loss": 0.7604, + "step": 2980 + }, + { + "epoch": 2.1705989110707806, + "grad_norm": 0.6034275889396667, + "learning_rate": 0.0002, + "loss": 0.8424, + "step": 2990 + }, + { + "epoch": 2.1778584392014517, + "grad_norm": 0.6081095933914185, + "learning_rate": 0.0002, + "loss": 0.8838, + "step": 3000 + }, + { + "epoch": 2.1851179673321233, + "grad_norm": 0.5706912875175476, + "learning_rate": 0.0002, + "loss": 0.9013, + "step": 3010 + }, + { + "epoch": 2.192377495462795, + "grad_norm": 0.6742380261421204, + "learning_rate": 0.0002, + "loss": 0.8473, + "step": 3020 + }, + { + "epoch": 2.1996370235934664, + "grad_norm": 0.4847496449947357, + "learning_rate": 0.0002, + "loss": 0.8372, + "step": 3030 + }, + { + "epoch": 2.206896551724138, + "grad_norm": 0.5529342889785767, + "learning_rate": 0.0002, + "loss": 0.8487, + "step": 3040 + }, + { + "epoch": 2.2141560798548094, + "grad_norm": 0.6108783483505249, + "learning_rate": 0.0002, + "loss": 0.9057, + "step": 3050 + }, + { + "epoch": 2.221415607985481, + "grad_norm": 0.8841571807861328, + "learning_rate": 0.0002, + "loss": 0.9659, + "step": 3060 + }, + { + "epoch": 2.2286751361161525, + "grad_norm": 0.4227530360221863, + "learning_rate": 0.0002, + "loss": 0.8538, + "step": 3070 + }, + { + "epoch": 2.235934664246824, + "grad_norm": 0.461935818195343, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 3080 + }, + { + "epoch": 2.2431941923774956, + "grad_norm": 0.5407412648200989, + "learning_rate": 0.0002, + "loss": 0.8542, + "step": 3090 + }, + { + "epoch": 2.250453720508167, + "grad_norm": 0.5057998895645142, + "learning_rate": 0.0002, + "loss": 0.8392, + "step": 3100 + }, + { + "epoch": 2.2577132486388383, + "grad_norm": 0.530057966709137, + "learning_rate": 0.0002, + "loss": 0.7898, + "step": 3110 + }, + { + "epoch": 2.26497277676951, + "grad_norm": 0.5066842436790466, + "learning_rate": 0.0002, + "loss": 0.8507, + "step": 3120 + }, + { + "epoch": 2.2722323049001814, + "grad_norm": 0.5069178342819214, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 3130 + }, + { + "epoch": 2.279491833030853, + "grad_norm": 0.6095499396324158, + "learning_rate": 0.0002, + "loss": 0.8458, + "step": 3140 + }, + { + "epoch": 2.2867513611615244, + "grad_norm": 0.49524766206741333, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 3150 + }, + { + "epoch": 2.294010889292196, + "grad_norm": 0.5334409475326538, + "learning_rate": 0.0002, + "loss": 0.7568, + "step": 3160 + }, + { + "epoch": 2.3012704174228675, + "grad_norm": 1.681748867034912, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 3170 + }, + { + "epoch": 2.308529945553539, + "grad_norm": 0.7225565314292908, + "learning_rate": 0.0002, + "loss": 0.8155, + "step": 3180 + }, + { + "epoch": 2.3157894736842106, + "grad_norm": 0.5379388928413391, + "learning_rate": 0.0002, + "loss": 0.8034, + "step": 3190 + }, + { + "epoch": 2.323049001814882, + "grad_norm": 0.45770326256752014, + "learning_rate": 0.0002, + "loss": 0.8833, + "step": 3200 + }, + { + "epoch": 2.3303085299455537, + "grad_norm": 0.6984533071517944, + "learning_rate": 0.0002, + "loss": 0.8419, + "step": 3210 + }, + { + "epoch": 2.337568058076225, + "grad_norm": 0.6725744605064392, + "learning_rate": 0.0002, + "loss": 0.7414, + "step": 3220 + }, + { + "epoch": 2.344827586206897, + "grad_norm": 1.1247471570968628, + "learning_rate": 0.0002, + "loss": 0.8657, + "step": 3230 + }, + { + "epoch": 2.352087114337568, + "grad_norm": 1.0240263938903809, + "learning_rate": 0.0002, + "loss": 0.7782, + "step": 3240 + }, + { + "epoch": 2.3593466424682394, + "grad_norm": 0.5608096122741699, + "learning_rate": 0.0002, + "loss": 0.826, + "step": 3250 + }, + { + "epoch": 2.366606170598911, + "grad_norm": 0.8294990062713623, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 3260 + }, + { + "epoch": 2.3738656987295825, + "grad_norm": 0.6734224557876587, + "learning_rate": 0.0002, + "loss": 0.9845, + "step": 3270 + }, + { + "epoch": 2.381125226860254, + "grad_norm": 0.6862800717353821, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 3280 + }, + { + "epoch": 2.3883847549909256, + "grad_norm": 0.5442930459976196, + "learning_rate": 0.0002, + "loss": 0.7752, + "step": 3290 + }, + { + "epoch": 2.395644283121597, + "grad_norm": 0.5745394229888916, + "learning_rate": 0.0002, + "loss": 0.8543, + "step": 3300 + }, + { + "epoch": 2.4029038112522687, + "grad_norm": 0.6257799863815308, + "learning_rate": 0.0002, + "loss": 0.833, + "step": 3310 + }, + { + "epoch": 2.4101633393829403, + "grad_norm": 0.5608420968055725, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 3320 + }, + { + "epoch": 2.417422867513612, + "grad_norm": 0.5512017011642456, + "learning_rate": 0.0002, + "loss": 0.8745, + "step": 3330 + }, + { + "epoch": 2.4246823956442833, + "grad_norm": 0.7452999353408813, + "learning_rate": 0.0002, + "loss": 0.8642, + "step": 3340 + }, + { + "epoch": 2.4319419237749544, + "grad_norm": 0.4604301452636719, + "learning_rate": 0.0002, + "loss": 0.8417, + "step": 3350 + }, + { + "epoch": 2.439201451905626, + "grad_norm": 0.8225823640823364, + "learning_rate": 0.0002, + "loss": 0.9148, + "step": 3360 + }, + { + "epoch": 2.4464609800362975, + "grad_norm": 0.8604981303215027, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 3370 + }, + { + "epoch": 2.453720508166969, + "grad_norm": 0.6620925664901733, + "learning_rate": 0.0002, + "loss": 0.9276, + "step": 3380 + }, + { + "epoch": 2.4609800362976406, + "grad_norm": 0.4750158488750458, + "learning_rate": 0.0002, + "loss": 0.8381, + "step": 3390 + }, + { + "epoch": 2.468239564428312, + "grad_norm": 0.6061418056488037, + "learning_rate": 0.0002, + "loss": 0.8249, + "step": 3400 + }, + { + "epoch": 2.4754990925589837, + "grad_norm": 0.5977247357368469, + "learning_rate": 0.0002, + "loss": 0.8944, + "step": 3410 + }, + { + "epoch": 2.4827586206896552, + "grad_norm": 0.5004227757453918, + "learning_rate": 0.0002, + "loss": 0.8366, + "step": 3420 + }, + { + "epoch": 2.490018148820327, + "grad_norm": 0.46428972482681274, + "learning_rate": 0.0002, + "loss": 0.9133, + "step": 3430 + }, + { + "epoch": 2.4972776769509983, + "grad_norm": 0.7658395171165466, + "learning_rate": 0.0002, + "loss": 0.8534, + "step": 3440 + }, + { + "epoch": 2.50453720508167, + "grad_norm": 0.621819794178009, + "learning_rate": 0.0002, + "loss": 0.9066, + "step": 3450 + }, + { + "epoch": 2.511796733212341, + "grad_norm": 0.4556088149547577, + "learning_rate": 0.0002, + "loss": 0.8481, + "step": 3460 + }, + { + "epoch": 2.519056261343013, + "grad_norm": 0.6124140024185181, + "learning_rate": 0.0002, + "loss": 0.8522, + "step": 3470 + }, + { + "epoch": 2.526315789473684, + "grad_norm": 0.6256231665611267, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 3480 + }, + { + "epoch": 2.5335753176043556, + "grad_norm": 0.5464141964912415, + "learning_rate": 0.0002, + "loss": 0.9097, + "step": 3490 + }, + { + "epoch": 2.540834845735027, + "grad_norm": 0.51471346616745, + "learning_rate": 0.0002, + "loss": 0.8708, + "step": 3500 + }, + { + "epoch": 2.5480943738656987, + "grad_norm": 0.5326165556907654, + "learning_rate": 0.0002, + "loss": 0.9042, + "step": 3510 + }, + { + "epoch": 2.5553539019963702, + "grad_norm": 0.4750378429889679, + "learning_rate": 0.0002, + "loss": 0.8749, + "step": 3520 + }, + { + "epoch": 2.562613430127042, + "grad_norm": 0.5292279124259949, + "learning_rate": 0.0002, + "loss": 0.8248, + "step": 3530 + }, + { + "epoch": 2.5698729582577133, + "grad_norm": 0.6145227551460266, + "learning_rate": 0.0002, + "loss": 0.8808, + "step": 3540 + }, + { + "epoch": 2.577132486388385, + "grad_norm": 0.8275189399719238, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 3550 + }, + { + "epoch": 2.5843920145190564, + "grad_norm": 0.5037438273429871, + "learning_rate": 0.0002, + "loss": 0.875, + "step": 3560 + }, + { + "epoch": 2.5916515426497275, + "grad_norm": 0.5838707685470581, + "learning_rate": 0.0002, + "loss": 0.8324, + "step": 3570 + }, + { + "epoch": 2.5989110707803995, + "grad_norm": 0.5398710370063782, + "learning_rate": 0.0002, + "loss": 0.8716, + "step": 3580 + }, + { + "epoch": 2.6061705989110706, + "grad_norm": 0.6115376949310303, + "learning_rate": 0.0002, + "loss": 0.7843, + "step": 3590 + }, + { + "epoch": 2.613430127041742, + "grad_norm": 1.118809461593628, + "learning_rate": 0.0002, + "loss": 0.8142, + "step": 3600 + }, + { + "epoch": 2.6206896551724137, + "grad_norm": 0.7811771631240845, + "learning_rate": 0.0002, + "loss": 0.8282, + "step": 3610 + }, + { + "epoch": 2.6279491833030852, + "grad_norm": 0.5753175020217896, + "learning_rate": 0.0002, + "loss": 0.7997, + "step": 3620 + }, + { + "epoch": 2.635208711433757, + "grad_norm": 0.550829291343689, + "learning_rate": 0.0002, + "loss": 0.7705, + "step": 3630 + }, + { + "epoch": 2.6424682395644283, + "grad_norm": 0.5360019207000732, + "learning_rate": 0.0002, + "loss": 0.8564, + "step": 3640 + }, + { + "epoch": 2.6497277676951, + "grad_norm": 0.63050377368927, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 3650 + }, + { + "epoch": 2.6569872958257714, + "grad_norm": 0.5833110213279724, + "learning_rate": 0.0002, + "loss": 0.9156, + "step": 3660 + }, + { + "epoch": 2.664246823956443, + "grad_norm": 0.5543047189712524, + "learning_rate": 0.0002, + "loss": 0.8033, + "step": 3670 + }, + { + "epoch": 2.671506352087114, + "grad_norm": 0.6842212080955505, + "learning_rate": 0.0002, + "loss": 0.8867, + "step": 3680 + }, + { + "epoch": 2.678765880217786, + "grad_norm": 0.6859333515167236, + "learning_rate": 0.0002, + "loss": 0.8569, + "step": 3690 + }, + { + "epoch": 2.686025408348457, + "grad_norm": 0.7038410902023315, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 3700 + }, + { + "epoch": 2.6932849364791287, + "grad_norm": 0.447233647108078, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 3710 + }, + { + "epoch": 2.7005444646098002, + "grad_norm": 0.6453872919082642, + "learning_rate": 0.0002, + "loss": 0.8951, + "step": 3720 + }, + { + "epoch": 2.707803992740472, + "grad_norm": 0.8025672435760498, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 3730 + }, + { + "epoch": 2.7150635208711433, + "grad_norm": 0.5997087955474854, + "learning_rate": 0.0002, + "loss": 0.8488, + "step": 3740 + }, + { + "epoch": 2.722323049001815, + "grad_norm": 0.6901142001152039, + "learning_rate": 0.0002, + "loss": 0.8308, + "step": 3750 + }, + { + "epoch": 2.7295825771324864, + "grad_norm": 1.036145567893982, + "learning_rate": 0.0002, + "loss": 0.8517, + "step": 3760 + }, + { + "epoch": 2.736842105263158, + "grad_norm": 0.7207072377204895, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 3770 + }, + { + "epoch": 2.7441016333938295, + "grad_norm": 1.0452989339828491, + "learning_rate": 0.0002, + "loss": 0.8009, + "step": 3780 + }, + { + "epoch": 2.751361161524501, + "grad_norm": 0.5615278482437134, + "learning_rate": 0.0002, + "loss": 0.8868, + "step": 3790 + }, + { + "epoch": 2.7586206896551726, + "grad_norm": 0.46439215540885925, + "learning_rate": 0.0002, + "loss": 0.9047, + "step": 3800 + }, + { + "epoch": 2.7658802177858437, + "grad_norm": 0.7134917974472046, + "learning_rate": 0.0002, + "loss": 0.7847, + "step": 3810 + }, + { + "epoch": 2.7731397459165157, + "grad_norm": 0.5139115452766418, + "learning_rate": 0.0002, + "loss": 0.8315, + "step": 3820 + }, + { + "epoch": 2.780399274047187, + "grad_norm": 0.8595781326293945, + "learning_rate": 0.0002, + "loss": 0.8995, + "step": 3830 + }, + { + "epoch": 2.7876588021778583, + "grad_norm": 0.544614851474762, + "learning_rate": 0.0002, + "loss": 0.9079, + "step": 3840 + }, + { + "epoch": 2.79491833030853, + "grad_norm": 0.6073850393295288, + "learning_rate": 0.0002, + "loss": 0.8322, + "step": 3850 + }, + { + "epoch": 2.8021778584392014, + "grad_norm": 0.8224069476127625, + "learning_rate": 0.0002, + "loss": 0.8277, + "step": 3860 + }, + { + "epoch": 2.809437386569873, + "grad_norm": 0.5347970128059387, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 3870 + }, + { + "epoch": 2.8166969147005445, + "grad_norm": 0.6799601912498474, + "learning_rate": 0.0002, + "loss": 0.8219, + "step": 3880 + }, + { + "epoch": 2.823956442831216, + "grad_norm": 0.5219197869300842, + "learning_rate": 0.0002, + "loss": 0.8796, + "step": 3890 + }, + { + "epoch": 2.8312159709618876, + "grad_norm": 0.5710130333900452, + "learning_rate": 0.0002, + "loss": 0.8649, + "step": 3900 + }, + { + "epoch": 2.838475499092559, + "grad_norm": 0.5857213139533997, + "learning_rate": 0.0002, + "loss": 0.8741, + "step": 3910 + }, + { + "epoch": 2.8457350272232302, + "grad_norm": 0.5206644535064697, + "learning_rate": 0.0002, + "loss": 0.8154, + "step": 3920 + }, + { + "epoch": 2.8529945553539022, + "grad_norm": 1.2902015447616577, + "learning_rate": 0.0002, + "loss": 0.9328, + "step": 3930 + }, + { + "epoch": 2.8602540834845733, + "grad_norm": 0.5252797603607178, + "learning_rate": 0.0002, + "loss": 0.8192, + "step": 3940 + }, + { + "epoch": 2.867513611615245, + "grad_norm": 0.5925108790397644, + "learning_rate": 0.0002, + "loss": 0.9047, + "step": 3950 + }, + { + "epoch": 2.8747731397459164, + "grad_norm": 0.9719610810279846, + "learning_rate": 0.0002, + "loss": 0.8611, + "step": 3960 + }, + { + "epoch": 2.882032667876588, + "grad_norm": 0.6834747195243835, + "learning_rate": 0.0002, + "loss": 0.8531, + "step": 3970 + }, + { + "epoch": 2.8892921960072595, + "grad_norm": 0.6952353119850159, + "learning_rate": 0.0002, + "loss": 0.8124, + "step": 3980 + }, + { + "epoch": 2.896551724137931, + "grad_norm": 0.49889910221099854, + "learning_rate": 0.0002, + "loss": 0.8365, + "step": 3990 + }, + { + "epoch": 2.9038112522686026, + "grad_norm": 0.5007026791572571, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 4000 + }, + { + "epoch": 2.911070780399274, + "grad_norm": 0.5474239587783813, + "learning_rate": 0.0002, + "loss": 0.7752, + "step": 4010 + }, + { + "epoch": 2.9183303085299457, + "grad_norm": 0.6617428660392761, + "learning_rate": 0.0002, + "loss": 0.8579, + "step": 4020 + }, + { + "epoch": 2.925589836660617, + "grad_norm": 0.6097776293754578, + "learning_rate": 0.0002, + "loss": 0.8583, + "step": 4030 + }, + { + "epoch": 2.9328493647912888, + "grad_norm": 0.5985828638076782, + "learning_rate": 0.0002, + "loss": 0.8296, + "step": 4040 + }, + { + "epoch": 2.94010889292196, + "grad_norm": 0.769488513469696, + "learning_rate": 0.0002, + "loss": 0.9134, + "step": 4050 + }, + { + "epoch": 2.9473684210526314, + "grad_norm": 0.5167055130004883, + "learning_rate": 0.0002, + "loss": 0.8148, + "step": 4060 + }, + { + "epoch": 2.954627949183303, + "grad_norm": 0.6265496015548706, + "learning_rate": 0.0002, + "loss": 0.8665, + "step": 4070 + }, + { + "epoch": 2.9618874773139745, + "grad_norm": 1.2644082307815552, + "learning_rate": 0.0002, + "loss": 0.9218, + "step": 4080 + }, + { + "epoch": 2.969147005444646, + "grad_norm": 0.6007561087608337, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 4090 + }, + { + "epoch": 2.9764065335753176, + "grad_norm": 0.47984927892684937, + "learning_rate": 0.0002, + "loss": 0.8262, + "step": 4100 + }, + { + "epoch": 2.983666061705989, + "grad_norm": 1.128198504447937, + "learning_rate": 0.0002, + "loss": 0.7897, + "step": 4110 + }, + { + "epoch": 2.9909255898366607, + "grad_norm": 0.526292085647583, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 4120 + }, + { + "epoch": 2.9981851179673322, + "grad_norm": 0.5462674498558044, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 4130 + }, + { + "epoch": 2.9996370235934666, + "eval_loss": 1.1093357801437378, + "eval_runtime": 46.2498, + "eval_samples_per_second": 9.427, + "eval_steps_per_second": 1.189, + "step": 4132 + }, + { + "epoch": 3.0054446460980038, + "grad_norm": 0.48288026452064514, + "learning_rate": 0.0002, + "loss": 0.721, + "step": 4140 + }, + { + "epoch": 3.0127041742286753, + "grad_norm": 1.0181782245635986, + "learning_rate": 0.0002, + "loss": 0.6769, + "step": 4150 + }, + { + "epoch": 3.019963702359347, + "grad_norm": 0.7718019485473633, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 4160 + }, + { + "epoch": 3.027223230490018, + "grad_norm": 0.7492219805717468, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 4170 + }, + { + "epoch": 3.0344827586206895, + "grad_norm": 0.9363632798194885, + "learning_rate": 0.0002, + "loss": 0.6678, + "step": 4180 + }, + { + "epoch": 3.041742286751361, + "grad_norm": 0.6888533234596252, + "learning_rate": 0.0002, + "loss": 0.7187, + "step": 4190 + }, + { + "epoch": 3.0490018148820326, + "grad_norm": 0.7072834968566895, + "learning_rate": 0.0002, + "loss": 0.6469, + "step": 4200 + }, + { + "epoch": 3.056261343012704, + "grad_norm": 0.7182047963142395, + "learning_rate": 0.0002, + "loss": 0.6387, + "step": 4210 + }, + { + "epoch": 3.0635208711433757, + "grad_norm": 0.7194355130195618, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 4220 + }, + { + "epoch": 3.0707803992740472, + "grad_norm": 0.9454023838043213, + "learning_rate": 0.0002, + "loss": 0.5812, + "step": 4230 + }, + { + "epoch": 3.0780399274047188, + "grad_norm": 0.838657557964325, + "learning_rate": 0.0002, + "loss": 0.6036, + "step": 4240 + }, + { + "epoch": 3.0852994555353903, + "grad_norm": 0.740113377571106, + "learning_rate": 0.0002, + "loss": 0.646, + "step": 4250 + }, + { + "epoch": 3.092558983666062, + "grad_norm": 0.6616561412811279, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 4260 + }, + { + "epoch": 3.0998185117967334, + "grad_norm": 0.8846506476402283, + "learning_rate": 0.0002, + "loss": 0.6462, + "step": 4270 + }, + { + "epoch": 3.107078039927405, + "grad_norm": 0.6322125792503357, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 4280 + }, + { + "epoch": 3.114337568058076, + "grad_norm": 0.7461467385292053, + "learning_rate": 0.0002, + "loss": 0.5953, + "step": 4290 + }, + { + "epoch": 3.1215970961887476, + "grad_norm": 0.8251287341117859, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 4300 + }, + { + "epoch": 3.128856624319419, + "grad_norm": 0.8767673373222351, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 4310 + }, + { + "epoch": 3.1361161524500907, + "grad_norm": 0.7758759260177612, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 4320 + }, + { + "epoch": 3.143375680580762, + "grad_norm": 1.1056879758834839, + "learning_rate": 0.0002, + "loss": 0.6624, + "step": 4330 + }, + { + "epoch": 3.1506352087114338, + "grad_norm": 0.8259835243225098, + "learning_rate": 0.0002, + "loss": 0.691, + "step": 4340 + }, + { + "epoch": 3.1578947368421053, + "grad_norm": 0.6607027053833008, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 4350 + }, + { + "epoch": 3.165154264972777, + "grad_norm": 0.7983301281929016, + "learning_rate": 0.0002, + "loss": 0.5911, + "step": 4360 + }, + { + "epoch": 3.1724137931034484, + "grad_norm": 0.6725239157676697, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 4370 + }, + { + "epoch": 3.17967332123412, + "grad_norm": 0.9052095413208008, + "learning_rate": 0.0002, + "loss": 0.5966, + "step": 4380 + }, + { + "epoch": 3.1869328493647915, + "grad_norm": 0.8131307363510132, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 4390 + }, + { + "epoch": 3.1941923774954626, + "grad_norm": 0.6435626149177551, + "learning_rate": 0.0002, + "loss": 0.6384, + "step": 4400 + }, + { + "epoch": 3.201451905626134, + "grad_norm": 0.84367436170578, + "learning_rate": 0.0002, + "loss": 0.5819, + "step": 4410 + }, + { + "epoch": 3.2087114337568057, + "grad_norm": 1.5018867254257202, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 4420 + }, + { + "epoch": 3.215970961887477, + "grad_norm": 0.7019091844558716, + "learning_rate": 0.0002, + "loss": 0.6838, + "step": 4430 + }, + { + "epoch": 3.2232304900181488, + "grad_norm": 0.9164197444915771, + "learning_rate": 0.0002, + "loss": 0.6153, + "step": 4440 + }, + { + "epoch": 3.2304900181488203, + "grad_norm": 0.7890861630439758, + "learning_rate": 0.0002, + "loss": 0.6618, + "step": 4450 + }, + { + "epoch": 3.237749546279492, + "grad_norm": 0.6517660617828369, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 4460 + }, + { + "epoch": 3.2450090744101634, + "grad_norm": 1.10188889503479, + "learning_rate": 0.0002, + "loss": 0.6699, + "step": 4470 + }, + { + "epoch": 3.252268602540835, + "grad_norm": 0.8158330917358398, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 4480 + }, + { + "epoch": 3.2595281306715065, + "grad_norm": 0.7663109302520752, + "learning_rate": 0.0002, + "loss": 0.7757, + "step": 4490 + }, + { + "epoch": 3.266787658802178, + "grad_norm": 0.8473444581031799, + "learning_rate": 0.0002, + "loss": 0.6539, + "step": 4500 + }, + { + "epoch": 3.274047186932849, + "grad_norm": 0.9724768996238708, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 4510 + }, + { + "epoch": 3.281306715063521, + "grad_norm": 0.8516759276390076, + "learning_rate": 0.0002, + "loss": 0.5464, + "step": 4520 + }, + { + "epoch": 3.288566243194192, + "grad_norm": 0.7543437480926514, + "learning_rate": 0.0002, + "loss": 0.6534, + "step": 4530 + }, + { + "epoch": 3.2958257713248638, + "grad_norm": 1.0472029447555542, + "learning_rate": 0.0002, + "loss": 0.6095, + "step": 4540 + }, + { + "epoch": 3.3030852994555353, + "grad_norm": 0.6240826845169067, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 4550 + }, + { + "epoch": 3.310344827586207, + "grad_norm": 0.9957774877548218, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 4560 + }, + { + "epoch": 3.3176043557168784, + "grad_norm": 0.6448912620544434, + "learning_rate": 0.0002, + "loss": 0.618, + "step": 4570 + }, + { + "epoch": 3.32486388384755, + "grad_norm": 0.7519692778587341, + "learning_rate": 0.0002, + "loss": 0.6188, + "step": 4580 + }, + { + "epoch": 3.3321234119782215, + "grad_norm": 0.7367453575134277, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 4590 + }, + { + "epoch": 3.339382940108893, + "grad_norm": 0.8064960837364197, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 4600 + }, + { + "epoch": 3.3466424682395646, + "grad_norm": 0.7664631009101868, + "learning_rate": 0.0002, + "loss": 0.6062, + "step": 4610 + }, + { + "epoch": 3.353901996370236, + "grad_norm": 0.7803396582603455, + "learning_rate": 0.0002, + "loss": 0.6834, + "step": 4620 + }, + { + "epoch": 3.3611615245009077, + "grad_norm": 0.9141599535942078, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 4630 + }, + { + "epoch": 3.3684210526315788, + "grad_norm": 0.9719856381416321, + "learning_rate": 0.0002, + "loss": 0.6889, + "step": 4640 + }, + { + "epoch": 3.3756805807622503, + "grad_norm": 0.9223218560218811, + "learning_rate": 0.0002, + "loss": 0.6914, + "step": 4650 + }, + { + "epoch": 3.382940108892922, + "grad_norm": 0.7289277911186218, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 4660 + }, + { + "epoch": 3.3901996370235934, + "grad_norm": 1.039724349975586, + "learning_rate": 0.0002, + "loss": 0.595, + "step": 4670 + }, + { + "epoch": 3.397459165154265, + "grad_norm": 1.397438883781433, + "learning_rate": 0.0002, + "loss": 0.8121, + "step": 4680 + }, + { + "epoch": 3.4047186932849365, + "grad_norm": 1.0069999694824219, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 4690 + }, + { + "epoch": 3.411978221415608, + "grad_norm": 0.816291332244873, + "learning_rate": 0.0002, + "loss": 0.6598, + "step": 4700 + }, + { + "epoch": 3.4192377495462796, + "grad_norm": 1.2831530570983887, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 4710 + }, + { + "epoch": 3.426497277676951, + "grad_norm": 0.9573889970779419, + "learning_rate": 0.0002, + "loss": 0.6625, + "step": 4720 + }, + { + "epoch": 3.4337568058076227, + "grad_norm": 0.7685632705688477, + "learning_rate": 0.0002, + "loss": 0.7279, + "step": 4730 + }, + { + "epoch": 3.441016333938294, + "grad_norm": 0.7019195556640625, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 4740 + }, + { + "epoch": 3.4482758620689653, + "grad_norm": 0.7244833707809448, + "learning_rate": 0.0002, + "loss": 0.7606, + "step": 4750 + }, + { + "epoch": 3.455535390199637, + "grad_norm": 1.3468551635742188, + "learning_rate": 0.0002, + "loss": 0.6951, + "step": 4760 + }, + { + "epoch": 3.4627949183303084, + "grad_norm": 0.822846531867981, + "learning_rate": 0.0002, + "loss": 0.6945, + "step": 4770 + }, + { + "epoch": 3.47005444646098, + "grad_norm": 0.7311608195304871, + "learning_rate": 0.0002, + "loss": 0.6431, + "step": 4780 + }, + { + "epoch": 3.4773139745916515, + "grad_norm": 0.9466770887374878, + "learning_rate": 0.0002, + "loss": 0.7019, + "step": 4790 + }, + { + "epoch": 3.484573502722323, + "grad_norm": 1.1527155637741089, + "learning_rate": 0.0002, + "loss": 0.7767, + "step": 4800 + }, + { + "epoch": 3.4918330308529946, + "grad_norm": 1.1288906335830688, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 4810 + }, + { + "epoch": 3.499092558983666, + "grad_norm": 0.9096164107322693, + "learning_rate": 0.0002, + "loss": 0.6564, + "step": 4820 + }, + { + "epoch": 3.5063520871143377, + "grad_norm": 0.7988565564155579, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 4830 + }, + { + "epoch": 3.513611615245009, + "grad_norm": 0.7183415293693542, + "learning_rate": 0.0002, + "loss": 0.7004, + "step": 4840 + }, + { + "epoch": 3.5208711433756807, + "grad_norm": 0.6614915132522583, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 4850 + }, + { + "epoch": 3.528130671506352, + "grad_norm": 0.8609521985054016, + "learning_rate": 0.0002, + "loss": 0.7271, + "step": 4860 + }, + { + "epoch": 3.535390199637024, + "grad_norm": 0.86552894115448, + "learning_rate": 0.0002, + "loss": 0.6664, + "step": 4870 + }, + { + "epoch": 3.542649727767695, + "grad_norm": 0.6926496028900146, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 4880 + }, + { + "epoch": 3.5499092558983665, + "grad_norm": 0.8157467246055603, + "learning_rate": 0.0002, + "loss": 0.7117, + "step": 4890 + }, + { + "epoch": 3.557168784029038, + "grad_norm": 0.9085357189178467, + "learning_rate": 0.0002, + "loss": 0.6201, + "step": 4900 + }, + { + "epoch": 3.5644283121597096, + "grad_norm": 0.6322644948959351, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 4910 + }, + { + "epoch": 3.571687840290381, + "grad_norm": 1.263205885887146, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 4920 + }, + { + "epoch": 3.5789473684210527, + "grad_norm": 0.8901070356369019, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 4930 + }, + { + "epoch": 3.586206896551724, + "grad_norm": 0.7983952164649963, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 4940 + }, + { + "epoch": 3.5934664246823957, + "grad_norm": 0.9887813925743103, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 4950 + }, + { + "epoch": 3.6007259528130673, + "grad_norm": 0.7895187735557556, + "learning_rate": 0.0002, + "loss": 0.6502, + "step": 4960 + }, + { + "epoch": 3.6079854809437384, + "grad_norm": 0.9685819745063782, + "learning_rate": 0.0002, + "loss": 0.7111, + "step": 4970 + }, + { + "epoch": 3.6152450090744104, + "grad_norm": 0.6576591730117798, + "learning_rate": 0.0002, + "loss": 0.6915, + "step": 4980 + }, + { + "epoch": 3.6225045372050815, + "grad_norm": 0.856985330581665, + "learning_rate": 0.0002, + "loss": 0.6195, + "step": 4990 + }, + { + "epoch": 3.629764065335753, + "grad_norm": 0.7230252623558044, + "learning_rate": 0.0002, + "loss": 0.6318, + "step": 5000 + }, + { + "epoch": 3.6370235934664246, + "grad_norm": 0.8260893821716309, + "learning_rate": 0.0002, + "loss": 0.742, + "step": 5010 + }, + { + "epoch": 3.644283121597096, + "grad_norm": 0.7635950446128845, + "learning_rate": 0.0002, + "loss": 0.7223, + "step": 5020 + }, + { + "epoch": 3.6515426497277677, + "grad_norm": 0.7060768604278564, + "learning_rate": 0.0002, + "loss": 0.6837, + "step": 5030 + }, + { + "epoch": 3.658802177858439, + "grad_norm": 0.8020303249359131, + "learning_rate": 0.0002, + "loss": 0.6921, + "step": 5040 + }, + { + "epoch": 3.6660617059891107, + "grad_norm": 0.8530341386795044, + "learning_rate": 0.0002, + "loss": 0.6446, + "step": 5050 + }, + { + "epoch": 3.6733212341197823, + "grad_norm": 0.6667101979255676, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 5060 + }, + { + "epoch": 3.680580762250454, + "grad_norm": 0.7385406494140625, + "learning_rate": 0.0002, + "loss": 0.7081, + "step": 5070 + }, + { + "epoch": 3.6878402903811254, + "grad_norm": 0.7753380537033081, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 5080 + }, + { + "epoch": 3.695099818511797, + "grad_norm": 0.7516207098960876, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 5090 + }, + { + "epoch": 3.702359346642468, + "grad_norm": 0.8171586394309998, + "learning_rate": 0.0002, + "loss": 0.672, + "step": 5100 + }, + { + "epoch": 3.70961887477314, + "grad_norm": 1.0796279907226562, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 5110 + }, + { + "epoch": 3.716878402903811, + "grad_norm": 0.6957688927650452, + "learning_rate": 0.0002, + "loss": 0.5948, + "step": 5120 + }, + { + "epoch": 3.7241379310344827, + "grad_norm": 0.8550161719322205, + "learning_rate": 0.0002, + "loss": 0.7515, + "step": 5130 + }, + { + "epoch": 3.731397459165154, + "grad_norm": 0.9396728277206421, + "learning_rate": 0.0002, + "loss": 0.7286, + "step": 5140 + }, + { + "epoch": 3.7386569872958257, + "grad_norm": 1.4264805316925049, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 5150 + }, + { + "epoch": 3.7459165154264973, + "grad_norm": 0.8725108504295349, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 5160 + }, + { + "epoch": 3.753176043557169, + "grad_norm": 1.0346195697784424, + "learning_rate": 0.0002, + "loss": 0.6778, + "step": 5170 + }, + { + "epoch": 3.7604355716878404, + "grad_norm": 0.5395554304122925, + "learning_rate": 0.0002, + "loss": 0.6371, + "step": 5180 + }, + { + "epoch": 3.767695099818512, + "grad_norm": 1.3153616189956665, + "learning_rate": 0.0002, + "loss": 0.7308, + "step": 5190 + }, + { + "epoch": 3.7749546279491835, + "grad_norm": 0.9879828691482544, + "learning_rate": 0.0002, + "loss": 0.78, + "step": 5200 + }, + { + "epoch": 3.7822141560798546, + "grad_norm": 0.8876672983169556, + "learning_rate": 0.0002, + "loss": 0.7068, + "step": 5210 + }, + { + "epoch": 3.7894736842105265, + "grad_norm": 0.8363267779350281, + "learning_rate": 0.0002, + "loss": 0.6283, + "step": 5220 + }, + { + "epoch": 3.7967332123411976, + "grad_norm": 0.637294590473175, + "learning_rate": 0.0002, + "loss": 0.6255, + "step": 5230 + }, + { + "epoch": 3.803992740471869, + "grad_norm": 1.1408970355987549, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 5240 + }, + { + "epoch": 3.8112522686025407, + "grad_norm": 1.0128360986709595, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 5250 + }, + { + "epoch": 3.8185117967332123, + "grad_norm": 0.8061144351959229, + "learning_rate": 0.0002, + "loss": 0.6764, + "step": 5260 + }, + { + "epoch": 3.825771324863884, + "grad_norm": 0.9626626968383789, + "learning_rate": 0.0002, + "loss": 0.7254, + "step": 5270 + }, + { + "epoch": 3.8330308529945554, + "grad_norm": 0.9013627171516418, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 5280 + }, + { + "epoch": 3.840290381125227, + "grad_norm": 0.8411344289779663, + "learning_rate": 0.0002, + "loss": 0.6806, + "step": 5290 + }, + { + "epoch": 3.8475499092558985, + "grad_norm": 0.7426059246063232, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 5300 + }, + { + "epoch": 3.85480943738657, + "grad_norm": 1.003413438796997, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 5310 + }, + { + "epoch": 3.862068965517241, + "grad_norm": 0.7527840733528137, + "learning_rate": 0.0002, + "loss": 0.8554, + "step": 5320 + }, + { + "epoch": 3.869328493647913, + "grad_norm": 0.738610565662384, + "learning_rate": 0.0002, + "loss": 0.7521, + "step": 5330 + }, + { + "epoch": 3.876588021778584, + "grad_norm": 0.7277999520301819, + "learning_rate": 0.0002, + "loss": 0.7266, + "step": 5340 + }, + { + "epoch": 3.8838475499092557, + "grad_norm": 0.5951359272003174, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 5350 + }, + { + "epoch": 3.8911070780399273, + "grad_norm": 1.043884038925171, + "learning_rate": 0.0002, + "loss": 0.7447, + "step": 5360 + }, + { + "epoch": 3.898366606170599, + "grad_norm": 0.8436498045921326, + "learning_rate": 0.0002, + "loss": 0.6862, + "step": 5370 + }, + { + "epoch": 3.9056261343012704, + "grad_norm": 0.5603365302085876, + "learning_rate": 0.0002, + "loss": 0.665, + "step": 5380 + }, + { + "epoch": 3.912885662431942, + "grad_norm": 1.0128886699676514, + "learning_rate": 0.0002, + "loss": 0.7098, + "step": 5390 + }, + { + "epoch": 3.9201451905626135, + "grad_norm": 0.7970930337905884, + "learning_rate": 0.0002, + "loss": 0.6707, + "step": 5400 + }, + { + "epoch": 3.927404718693285, + "grad_norm": 0.7699369192123413, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 5410 + }, + { + "epoch": 3.9346642468239565, + "grad_norm": 0.800561249256134, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 5420 + }, + { + "epoch": 3.941923774954628, + "grad_norm": 0.8020331859588623, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 5430 + }, + { + "epoch": 3.9491833030852996, + "grad_norm": 0.7461140155792236, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 5440 + }, + { + "epoch": 3.9564428312159707, + "grad_norm": 0.8346918821334839, + "learning_rate": 0.0002, + "loss": 0.7013, + "step": 5450 + }, + { + "epoch": 3.9637023593466427, + "grad_norm": 0.9723302125930786, + "learning_rate": 0.0002, + "loss": 0.6289, + "step": 5460 + }, + { + "epoch": 3.970961887477314, + "grad_norm": 0.6809740662574768, + "learning_rate": 0.0002, + "loss": 0.8029, + "step": 5470 + }, + { + "epoch": 3.9782214156079854, + "grad_norm": 0.7353498339653015, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 5480 + }, + { + "epoch": 3.985480943738657, + "grad_norm": 0.748009443283081, + "learning_rate": 0.0002, + "loss": 0.6722, + "step": 5490 + }, + { + "epoch": 3.9927404718693285, + "grad_norm": 1.3656195402145386, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 5500 + }, + { + "epoch": 4.0, + "grad_norm": 0.8402108550071716, + "learning_rate": 0.0002, + "loss": 0.7368, + "step": 5510 + }, + { + "epoch": 4.0, + "eval_loss": 1.17229425907135, + "eval_runtime": 46.2554, + "eval_samples_per_second": 9.426, + "eval_steps_per_second": 1.189, + "step": 5510 + }, + { + "epoch": 4.007259528130671, + "grad_norm": 0.8601235747337341, + "learning_rate": 0.0002, + "loss": 0.4637, + "step": 5520 + }, + { + "epoch": 4.014519056261343, + "grad_norm": 1.2635200023651123, + "learning_rate": 0.0002, + "loss": 0.4717, + "step": 5530 + }, + { + "epoch": 4.021778584392014, + "grad_norm": 1.0257477760314941, + "learning_rate": 0.0002, + "loss": 0.503, + "step": 5540 + }, + { + "epoch": 4.029038112522686, + "grad_norm": 0.9436745047569275, + "learning_rate": 0.0002, + "loss": 0.4547, + "step": 5550 + }, + { + "epoch": 4.036297640653357, + "grad_norm": 0.9443606734275818, + "learning_rate": 0.0002, + "loss": 0.459, + "step": 5560 + }, + { + "epoch": 4.043557168784029, + "grad_norm": 1.3965742588043213, + "learning_rate": 0.0002, + "loss": 0.5386, + "step": 5570 + }, + { + "epoch": 4.0508166969147, + "grad_norm": 0.8973520398139954, + "learning_rate": 0.0002, + "loss": 0.4248, + "step": 5580 + }, + { + "epoch": 4.058076225045372, + "grad_norm": 0.9998409748077393, + "learning_rate": 0.0002, + "loss": 0.4111, + "step": 5590 + }, + { + "epoch": 4.0653357531760435, + "grad_norm": 1.1213387250900269, + "learning_rate": 0.0002, + "loss": 0.4828, + "step": 5600 + }, + { + "epoch": 4.072595281306715, + "grad_norm": 0.7064558863639832, + "learning_rate": 0.0002, + "loss": 0.439, + "step": 5610 + }, + { + "epoch": 4.0798548094373865, + "grad_norm": 1.2390803098678589, + "learning_rate": 0.0002, + "loss": 0.4607, + "step": 5620 + }, + { + "epoch": 4.087114337568058, + "grad_norm": 1.123469591140747, + "learning_rate": 0.0002, + "loss": 0.5014, + "step": 5630 + }, + { + "epoch": 4.09437386569873, + "grad_norm": 1.229573369026184, + "learning_rate": 0.0002, + "loss": 0.513, + "step": 5640 + }, + { + "epoch": 4.101633393829401, + "grad_norm": 1.7182831764221191, + "learning_rate": 0.0002, + "loss": 0.5258, + "step": 5650 + }, + { + "epoch": 4.108892921960073, + "grad_norm": 0.894903302192688, + "learning_rate": 0.0002, + "loss": 0.5371, + "step": 5660 + }, + { + "epoch": 4.116152450090744, + "grad_norm": 0.8754552006721497, + "learning_rate": 0.0002, + "loss": 0.4813, + "step": 5670 + }, + { + "epoch": 4.123411978221416, + "grad_norm": 1.2401553392410278, + "learning_rate": 0.0002, + "loss": 0.491, + "step": 5680 + }, + { + "epoch": 4.130671506352087, + "grad_norm": 0.8631148934364319, + "learning_rate": 0.0002, + "loss": 0.4549, + "step": 5690 + }, + { + "epoch": 4.137931034482759, + "grad_norm": 1.1798022985458374, + "learning_rate": 0.0002, + "loss": 0.487, + "step": 5700 + }, + { + "epoch": 4.14519056261343, + "grad_norm": 0.8344549536705017, + "learning_rate": 0.0002, + "loss": 0.4522, + "step": 5710 + }, + { + "epoch": 4.152450090744102, + "grad_norm": 1.2342697381973267, + "learning_rate": 0.0002, + "loss": 0.4559, + "step": 5720 + }, + { + "epoch": 4.159709618874773, + "grad_norm": 1.1601094007492065, + "learning_rate": 0.0002, + "loss": 0.53, + "step": 5730 + }, + { + "epoch": 4.166969147005445, + "grad_norm": 1.2925703525543213, + "learning_rate": 0.0002, + "loss": 0.4755, + "step": 5740 + }, + { + "epoch": 4.174228675136116, + "grad_norm": 1.0870997905731201, + "learning_rate": 0.0002, + "loss": 0.4973, + "step": 5750 + }, + { + "epoch": 4.181488203266787, + "grad_norm": 0.9077792763710022, + "learning_rate": 0.0002, + "loss": 0.5184, + "step": 5760 + }, + { + "epoch": 4.188747731397459, + "grad_norm": 1.009273886680603, + "learning_rate": 0.0002, + "loss": 0.4606, + "step": 5770 + }, + { + "epoch": 4.19600725952813, + "grad_norm": 1.2465530633926392, + "learning_rate": 0.0002, + "loss": 0.5383, + "step": 5780 + }, + { + "epoch": 4.203266787658802, + "grad_norm": 1.2261253595352173, + "learning_rate": 0.0002, + "loss": 0.4938, + "step": 5790 + }, + { + "epoch": 4.2105263157894735, + "grad_norm": 1.1498041152954102, + "learning_rate": 0.0002, + "loss": 0.5255, + "step": 5800 + }, + { + "epoch": 4.217785843920145, + "grad_norm": 1.1966725587844849, + "learning_rate": 0.0002, + "loss": 0.5269, + "step": 5810 + }, + { + "epoch": 4.2250453720508165, + "grad_norm": 1.2651296854019165, + "learning_rate": 0.0002, + "loss": 0.5626, + "step": 5820 + }, + { + "epoch": 4.2323049001814885, + "grad_norm": 1.0388574600219727, + "learning_rate": 0.0002, + "loss": 0.5213, + "step": 5830 + }, + { + "epoch": 4.23956442831216, + "grad_norm": 1.3042771816253662, + "learning_rate": 0.0002, + "loss": 0.4965, + "step": 5840 + }, + { + "epoch": 4.246823956442832, + "grad_norm": 1.1127727031707764, + "learning_rate": 0.0002, + "loss": 0.5116, + "step": 5850 + }, + { + "epoch": 4.254083484573503, + "grad_norm": 0.9653958082199097, + "learning_rate": 0.0002, + "loss": 0.5197, + "step": 5860 + }, + { + "epoch": 4.261343012704174, + "grad_norm": 1.0500504970550537, + "learning_rate": 0.0002, + "loss": 0.4414, + "step": 5870 + }, + { + "epoch": 4.268602540834846, + "grad_norm": 1.1476165056228638, + "learning_rate": 0.0002, + "loss": 0.544, + "step": 5880 + }, + { + "epoch": 4.275862068965517, + "grad_norm": 0.9424414038658142, + "learning_rate": 0.0002, + "loss": 0.4667, + "step": 5890 + }, + { + "epoch": 4.283121597096189, + "grad_norm": 1.3309166431427002, + "learning_rate": 0.0002, + "loss": 0.5039, + "step": 5900 + }, + { + "epoch": 4.29038112522686, + "grad_norm": 1.3025873899459839, + "learning_rate": 0.0002, + "loss": 0.5472, + "step": 5910 + }, + { + "epoch": 4.297640653357532, + "grad_norm": 1.1442325115203857, + "learning_rate": 0.0002, + "loss": 0.4644, + "step": 5920 + }, + { + "epoch": 4.304900181488203, + "grad_norm": 0.9820859432220459, + "learning_rate": 0.0002, + "loss": 0.5066, + "step": 5930 + }, + { + "epoch": 4.312159709618875, + "grad_norm": 0.9615740180015564, + "learning_rate": 0.0002, + "loss": 0.5224, + "step": 5940 + }, + { + "epoch": 4.319419237749546, + "grad_norm": 1.1627109050750732, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 5950 + }, + { + "epoch": 4.326678765880218, + "grad_norm": 0.9381322860717773, + "learning_rate": 0.0002, + "loss": 0.4852, + "step": 5960 + }, + { + "epoch": 4.333938294010889, + "grad_norm": 0.8154335618019104, + "learning_rate": 0.0002, + "loss": 0.4532, + "step": 5970 + }, + { + "epoch": 4.341197822141561, + "grad_norm": 0.877671480178833, + "learning_rate": 0.0002, + "loss": 0.5237, + "step": 5980 + }, + { + "epoch": 4.348457350272232, + "grad_norm": 1.1742031574249268, + "learning_rate": 0.0002, + "loss": 0.6113, + "step": 5990 + }, + { + "epoch": 4.3557168784029034, + "grad_norm": 1.0352917909622192, + "learning_rate": 0.0002, + "loss": 0.5704, + "step": 6000 + }, + { + "epoch": 4.362976406533575, + "grad_norm": 0.9963878989219666, + "learning_rate": 0.0002, + "loss": 0.4996, + "step": 6010 + }, + { + "epoch": 4.3702359346642465, + "grad_norm": 1.1892237663269043, + "learning_rate": 0.0002, + "loss": 0.4813, + "step": 6020 + }, + { + "epoch": 4.3774954627949185, + "grad_norm": 1.2516111135482788, + "learning_rate": 0.0002, + "loss": 0.5224, + "step": 6030 + }, + { + "epoch": 4.38475499092559, + "grad_norm": 1.2111951112747192, + "learning_rate": 0.0002, + "loss": 0.5952, + "step": 6040 + }, + { + "epoch": 4.392014519056262, + "grad_norm": 1.0820083618164062, + "learning_rate": 0.0002, + "loss": 0.4275, + "step": 6050 + }, + { + "epoch": 4.399274047186933, + "grad_norm": 1.033915638923645, + "learning_rate": 0.0002, + "loss": 0.5117, + "step": 6060 + }, + { + "epoch": 4.406533575317605, + "grad_norm": 1.0635870695114136, + "learning_rate": 0.0002, + "loss": 0.5431, + "step": 6070 + }, + { + "epoch": 4.413793103448276, + "grad_norm": 1.0520414113998413, + "learning_rate": 0.0002, + "loss": 0.5341, + "step": 6080 + }, + { + "epoch": 4.421052631578947, + "grad_norm": 1.0821926593780518, + "learning_rate": 0.0002, + "loss": 0.512, + "step": 6090 + }, + { + "epoch": 4.428312159709619, + "grad_norm": 1.0533246994018555, + "learning_rate": 0.0002, + "loss": 0.5065, + "step": 6100 + }, + { + "epoch": 4.43557168784029, + "grad_norm": 0.9231932759284973, + "learning_rate": 0.0002, + "loss": 0.4577, + "step": 6110 + }, + { + "epoch": 4.442831215970962, + "grad_norm": 0.9910260438919067, + "learning_rate": 0.0002, + "loss": 0.583, + "step": 6120 + }, + { + "epoch": 4.450090744101633, + "grad_norm": 1.061949372291565, + "learning_rate": 0.0002, + "loss": 0.4717, + "step": 6130 + }, + { + "epoch": 4.457350272232305, + "grad_norm": 1.2927039861679077, + "learning_rate": 0.0002, + "loss": 0.5893, + "step": 6140 + }, + { + "epoch": 4.464609800362976, + "grad_norm": 1.3966081142425537, + "learning_rate": 0.0002, + "loss": 0.4684, + "step": 6150 + }, + { + "epoch": 4.471869328493648, + "grad_norm": 1.3835992813110352, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 6160 + }, + { + "epoch": 4.479128856624319, + "grad_norm": 1.0892692804336548, + "learning_rate": 0.0002, + "loss": 0.5911, + "step": 6170 + }, + { + "epoch": 4.486388384754991, + "grad_norm": 1.0318800210952759, + "learning_rate": 0.0002, + "loss": 0.478, + "step": 6180 + }, + { + "epoch": 4.493647912885662, + "grad_norm": 0.8174677491188049, + "learning_rate": 0.0002, + "loss": 0.5198, + "step": 6190 + }, + { + "epoch": 4.500907441016334, + "grad_norm": 1.4157509803771973, + "learning_rate": 0.0002, + "loss": 0.5387, + "step": 6200 + }, + { + "epoch": 4.508166969147005, + "grad_norm": 1.5244114398956299, + "learning_rate": 0.0002, + "loss": 0.5868, + "step": 6210 + }, + { + "epoch": 4.5154264972776765, + "grad_norm": 0.8164850473403931, + "learning_rate": 0.0002, + "loss": 0.4642, + "step": 6220 + }, + { + "epoch": 4.5226860254083485, + "grad_norm": 1.2904746532440186, + "learning_rate": 0.0002, + "loss": 0.522, + "step": 6230 + }, + { + "epoch": 4.52994555353902, + "grad_norm": 0.7987732887268066, + "learning_rate": 0.0002, + "loss": 0.5103, + "step": 6240 + }, + { + "epoch": 4.537205081669692, + "grad_norm": 0.831040620803833, + "learning_rate": 0.0002, + "loss": 0.4615, + "step": 6250 + }, + { + "epoch": 4.544464609800363, + "grad_norm": 0.9545485973358154, + "learning_rate": 0.0002, + "loss": 0.5065, + "step": 6260 + }, + { + "epoch": 4.551724137931035, + "grad_norm": 0.9291793704032898, + "learning_rate": 0.0002, + "loss": 0.5515, + "step": 6270 + }, + { + "epoch": 4.558983666061706, + "grad_norm": 0.8977208733558655, + "learning_rate": 0.0002, + "loss": 0.4535, + "step": 6280 + }, + { + "epoch": 4.566243194192378, + "grad_norm": 1.1768537759780884, + "learning_rate": 0.0002, + "loss": 0.544, + "step": 6290 + }, + { + "epoch": 4.573502722323049, + "grad_norm": 1.0688952207565308, + "learning_rate": 0.0002, + "loss": 0.5925, + "step": 6300 + }, + { + "epoch": 4.580762250453721, + "grad_norm": 0.8800966739654541, + "learning_rate": 0.0002, + "loss": 0.5207, + "step": 6310 + }, + { + "epoch": 4.588021778584392, + "grad_norm": 1.0911834239959717, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 6320 + }, + { + "epoch": 4.595281306715064, + "grad_norm": 1.1420872211456299, + "learning_rate": 0.0002, + "loss": 0.5109, + "step": 6330 + }, + { + "epoch": 4.602540834845735, + "grad_norm": 1.0215224027633667, + "learning_rate": 0.0002, + "loss": 0.5147, + "step": 6340 + }, + { + "epoch": 4.609800362976406, + "grad_norm": 0.9685489535331726, + "learning_rate": 0.0002, + "loss": 0.592, + "step": 6350 + }, + { + "epoch": 4.617059891107078, + "grad_norm": 1.12773597240448, + "learning_rate": 0.0002, + "loss": 0.5775, + "step": 6360 + }, + { + "epoch": 4.624319419237749, + "grad_norm": 1.0663973093032837, + "learning_rate": 0.0002, + "loss": 0.5966, + "step": 6370 + }, + { + "epoch": 4.631578947368421, + "grad_norm": 1.1707262992858887, + "learning_rate": 0.0002, + "loss": 0.512, + "step": 6380 + }, + { + "epoch": 4.638838475499092, + "grad_norm": 1.0672980546951294, + "learning_rate": 0.0002, + "loss": 0.5497, + "step": 6390 + }, + { + "epoch": 4.646098003629764, + "grad_norm": 1.1464333534240723, + "learning_rate": 0.0002, + "loss": 0.5699, + "step": 6400 + }, + { + "epoch": 4.653357531760435, + "grad_norm": 1.070230484008789, + "learning_rate": 0.0002, + "loss": 0.5514, + "step": 6410 + }, + { + "epoch": 4.660617059891107, + "grad_norm": 0.9673764109611511, + "learning_rate": 0.0002, + "loss": 0.5013, + "step": 6420 + }, + { + "epoch": 4.6678765880217785, + "grad_norm": 1.0189043283462524, + "learning_rate": 0.0002, + "loss": 0.5901, + "step": 6430 + }, + { + "epoch": 4.67513611615245, + "grad_norm": 1.185896396636963, + "learning_rate": 0.0002, + "loss": 0.5193, + "step": 6440 + }, + { + "epoch": 4.682395644283122, + "grad_norm": 1.0682812929153442, + "learning_rate": 0.0002, + "loss": 0.5318, + "step": 6450 + }, + { + "epoch": 4.689655172413794, + "grad_norm": 1.3586071729660034, + "learning_rate": 0.0002, + "loss": 0.5773, + "step": 6460 + }, + { + "epoch": 4.696914700544465, + "grad_norm": 0.6561792492866516, + "learning_rate": 0.0002, + "loss": 0.5482, + "step": 6470 + }, + { + "epoch": 4.704174228675136, + "grad_norm": 1.1394113302230835, + "learning_rate": 0.0002, + "loss": 0.5711, + "step": 6480 + }, + { + "epoch": 4.711433756805808, + "grad_norm": 0.9683151245117188, + "learning_rate": 0.0002, + "loss": 0.5325, + "step": 6490 + }, + { + "epoch": 4.718693284936479, + "grad_norm": 1.0247553586959839, + "learning_rate": 0.0002, + "loss": 0.5139, + "step": 6500 + }, + { + "epoch": 4.725952813067151, + "grad_norm": 0.8046169281005859, + "learning_rate": 0.0002, + "loss": 0.5794, + "step": 6510 + }, + { + "epoch": 4.733212341197822, + "grad_norm": 1.0710240602493286, + "learning_rate": 0.0002, + "loss": 0.5471, + "step": 6520 + }, + { + "epoch": 4.740471869328494, + "grad_norm": 0.9438924193382263, + "learning_rate": 0.0002, + "loss": 0.4805, + "step": 6530 + }, + { + "epoch": 4.747731397459165, + "grad_norm": 0.869162380695343, + "learning_rate": 0.0002, + "loss": 0.5404, + "step": 6540 + }, + { + "epoch": 4.754990925589837, + "grad_norm": 0.9776787161827087, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 6550 + }, + { + "epoch": 4.762250453720508, + "grad_norm": 1.1990505456924438, + "learning_rate": 0.0002, + "loss": 0.5288, + "step": 6560 + }, + { + "epoch": 4.769509981851179, + "grad_norm": 1.0582209825515747, + "learning_rate": 0.0002, + "loss": 0.5539, + "step": 6570 + }, + { + "epoch": 4.776769509981851, + "grad_norm": 0.9966367483139038, + "learning_rate": 0.0002, + "loss": 0.489, + "step": 6580 + }, + { + "epoch": 4.784029038112522, + "grad_norm": 0.9130612015724182, + "learning_rate": 0.0002, + "loss": 0.5514, + "step": 6590 + }, + { + "epoch": 4.791288566243194, + "grad_norm": 1.0950500965118408, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 6600 + }, + { + "epoch": 4.798548094373865, + "grad_norm": 1.108681321144104, + "learning_rate": 0.0002, + "loss": 0.5266, + "step": 6610 + }, + { + "epoch": 4.805807622504537, + "grad_norm": 1.1873763799667358, + "learning_rate": 0.0002, + "loss": 0.5875, + "step": 6620 + }, + { + "epoch": 4.8130671506352085, + "grad_norm": 1.305367112159729, + "learning_rate": 0.0002, + "loss": 0.5736, + "step": 6630 + }, + { + "epoch": 4.8203266787658805, + "grad_norm": 1.2801482677459717, + "learning_rate": 0.0002, + "loss": 0.5636, + "step": 6640 + }, + { + "epoch": 4.827586206896552, + "grad_norm": 1.26764976978302, + "learning_rate": 0.0002, + "loss": 0.582, + "step": 6650 + }, + { + "epoch": 4.834845735027224, + "grad_norm": 1.0018208026885986, + "learning_rate": 0.0002, + "loss": 0.5259, + "step": 6660 + }, + { + "epoch": 4.842105263157895, + "grad_norm": 1.2326326370239258, + "learning_rate": 0.0002, + "loss": 0.548, + "step": 6670 + }, + { + "epoch": 4.849364791288567, + "grad_norm": 0.9707282781600952, + "learning_rate": 0.0002, + "loss": 0.5933, + "step": 6680 + }, + { + "epoch": 4.856624319419238, + "grad_norm": 1.2772048711776733, + "learning_rate": 0.0002, + "loss": 0.5612, + "step": 6690 + }, + { + "epoch": 4.863883847549909, + "grad_norm": 2.6652262210845947, + "learning_rate": 0.0002, + "loss": 0.5346, + "step": 6700 + }, + { + "epoch": 4.871143375680581, + "grad_norm": 1.215828537940979, + "learning_rate": 0.0002, + "loss": 0.5428, + "step": 6710 + }, + { + "epoch": 4.878402903811252, + "grad_norm": 1.3704510927200317, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 6720 + }, + { + "epoch": 4.885662431941924, + "grad_norm": 0.7781757116317749, + "learning_rate": 0.0002, + "loss": 0.4963, + "step": 6730 + }, + { + "epoch": 4.892921960072595, + "grad_norm": 1.1883646249771118, + "learning_rate": 0.0002, + "loss": 0.5989, + "step": 6740 + }, + { + "epoch": 4.900181488203267, + "grad_norm": 0.9216066002845764, + "learning_rate": 0.0002, + "loss": 0.6067, + "step": 6750 + }, + { + "epoch": 4.907441016333938, + "grad_norm": 1.0558464527130127, + "learning_rate": 0.0002, + "loss": 0.5085, + "step": 6760 + }, + { + "epoch": 4.91470054446461, + "grad_norm": 1.032656192779541, + "learning_rate": 0.0002, + "loss": 0.5216, + "step": 6770 + }, + { + "epoch": 4.921960072595281, + "grad_norm": 1.1261441707611084, + "learning_rate": 0.0002, + "loss": 0.5426, + "step": 6780 + }, + { + "epoch": 4.929219600725952, + "grad_norm": 1.2178640365600586, + "learning_rate": 0.0002, + "loss": 0.5295, + "step": 6790 + }, + { + "epoch": 4.936479128856624, + "grad_norm": 1.5369361639022827, + "learning_rate": 0.0002, + "loss": 0.5476, + "step": 6800 + }, + { + "epoch": 4.943738656987296, + "grad_norm": 1.1188377141952515, + "learning_rate": 0.0002, + "loss": 0.5358, + "step": 6810 + }, + { + "epoch": 4.950998185117967, + "grad_norm": 1.2506113052368164, + "learning_rate": 0.0002, + "loss": 0.5483, + "step": 6820 + }, + { + "epoch": 4.9582577132486385, + "grad_norm": 0.8776047825813293, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 6830 + }, + { + "epoch": 4.9655172413793105, + "grad_norm": 0.9700555205345154, + "learning_rate": 0.0002, + "loss": 0.5764, + "step": 6840 + }, + { + "epoch": 4.972776769509982, + "grad_norm": 1.2713534832000732, + "learning_rate": 0.0002, + "loss": 0.5396, + "step": 6850 + }, + { + "epoch": 4.980036297640654, + "grad_norm": 0.9855955243110657, + "learning_rate": 0.0002, + "loss": 0.5451, + "step": 6860 + }, + { + "epoch": 4.987295825771325, + "grad_norm": 0.8734853863716125, + "learning_rate": 0.0002, + "loss": 0.5884, + "step": 6870 + }, + { + "epoch": 4.994555353901997, + "grad_norm": 0.8065403699874878, + "learning_rate": 0.0002, + "loss": 0.5189, + "step": 6880 + }, + { + "epoch": 4.999637023593467, + "eval_loss": 1.3302682638168335, + "eval_runtime": 46.2496, + "eval_samples_per_second": 9.427, + "eval_steps_per_second": 1.189, + "step": 6887 + }, + { + "epoch": 5.001814882032668, + "grad_norm": 0.5163813829421997, + "learning_rate": 0.0002, + "loss": 0.4889, + "step": 6890 + }, + { + "epoch": 5.00907441016334, + "grad_norm": 1.1496137380599976, + "learning_rate": 0.0002, + "loss": 0.3545, + "step": 6900 + }, + { + "epoch": 5.016333938294011, + "grad_norm": 1.0133885145187378, + "learning_rate": 0.0002, + "loss": 0.39, + "step": 6910 + }, + { + "epoch": 5.023593466424682, + "grad_norm": 0.9479621052742004, + "learning_rate": 0.0002, + "loss": 0.3693, + "step": 6920 + }, + { + "epoch": 5.030852994555354, + "grad_norm": 0.8587583303451538, + "learning_rate": 0.0002, + "loss": 0.4012, + "step": 6930 + }, + { + "epoch": 5.038112522686025, + "grad_norm": 1.3314697742462158, + "learning_rate": 0.0002, + "loss": 0.3428, + "step": 6940 + }, + { + "epoch": 5.045372050816697, + "grad_norm": 1.195448875427246, + "learning_rate": 0.0002, + "loss": 0.3909, + "step": 6950 + }, + { + "epoch": 5.052631578947368, + "grad_norm": 1.2482256889343262, + "learning_rate": 0.0002, + "loss": 0.3322, + "step": 6960 + }, + { + "epoch": 5.05989110707804, + "grad_norm": 1.2011528015136719, + "learning_rate": 0.0002, + "loss": 0.3893, + "step": 6970 + }, + { + "epoch": 5.067150635208711, + "grad_norm": 1.3997188806533813, + "learning_rate": 0.0002, + "loss": 0.3265, + "step": 6980 + }, + { + "epoch": 5.074410163339383, + "grad_norm": 1.2147513628005981, + "learning_rate": 0.0002, + "loss": 0.3716, + "step": 6990 + }, + { + "epoch": 5.081669691470054, + "grad_norm": 1.6030137538909912, + "learning_rate": 0.0002, + "loss": 0.4053, + "step": 7000 + }, + { + "epoch": 5.088929219600726, + "grad_norm": 0.9466970562934875, + "learning_rate": 0.0002, + "loss": 0.3665, + "step": 7010 + }, + { + "epoch": 5.096188747731397, + "grad_norm": 1.4593411684036255, + "learning_rate": 0.0002, + "loss": 0.3451, + "step": 7020 + }, + { + "epoch": 5.103448275862069, + "grad_norm": 1.2196033000946045, + "learning_rate": 0.0002, + "loss": 0.3843, + "step": 7030 + }, + { + "epoch": 5.1107078039927405, + "grad_norm": 1.1341328620910645, + "learning_rate": 0.0002, + "loss": 0.3896, + "step": 7040 + }, + { + "epoch": 5.117967332123412, + "grad_norm": 1.2248976230621338, + "learning_rate": 0.0002, + "loss": 0.3627, + "step": 7050 + }, + { + "epoch": 5.125226860254084, + "grad_norm": 1.1620593070983887, + "learning_rate": 0.0002, + "loss": 0.3784, + "step": 7060 + }, + { + "epoch": 5.132486388384755, + "grad_norm": 0.9300723671913147, + "learning_rate": 0.0002, + "loss": 0.3678, + "step": 7070 + }, + { + "epoch": 5.139745916515427, + "grad_norm": 1.2265169620513916, + "learning_rate": 0.0002, + "loss": 0.3756, + "step": 7080 + }, + { + "epoch": 5.147005444646098, + "grad_norm": 1.4430373907089233, + "learning_rate": 0.0002, + "loss": 0.3595, + "step": 7090 + }, + { + "epoch": 5.15426497277677, + "grad_norm": 1.0821576118469238, + "learning_rate": 0.0002, + "loss": 0.3788, + "step": 7100 + }, + { + "epoch": 5.161524500907441, + "grad_norm": 1.2574739456176758, + "learning_rate": 0.0002, + "loss": 0.383, + "step": 7110 + }, + { + "epoch": 5.168784029038113, + "grad_norm": 1.1806069612503052, + "learning_rate": 0.0002, + "loss": 0.3692, + "step": 7120 + }, + { + "epoch": 5.176043557168784, + "grad_norm": 0.9900956153869629, + "learning_rate": 0.0002, + "loss": 0.3978, + "step": 7130 + }, + { + "epoch": 5.183303085299456, + "grad_norm": 1.2414425611495972, + "learning_rate": 0.0002, + "loss": 0.4358, + "step": 7140 + }, + { + "epoch": 5.190562613430127, + "grad_norm": 0.8220699429512024, + "learning_rate": 0.0002, + "loss": 0.3485, + "step": 7150 + }, + { + "epoch": 5.197822141560798, + "grad_norm": 1.29408860206604, + "learning_rate": 0.0002, + "loss": 0.3517, + "step": 7160 + }, + { + "epoch": 5.20508166969147, + "grad_norm": 0.8510639071464539, + "learning_rate": 0.0002, + "loss": 0.3405, + "step": 7170 + }, + { + "epoch": 5.212341197822141, + "grad_norm": 1.3448902368545532, + "learning_rate": 0.0002, + "loss": 0.4233, + "step": 7180 + }, + { + "epoch": 5.219600725952813, + "grad_norm": 1.054451584815979, + "learning_rate": 0.0002, + "loss": 0.3808, + "step": 7190 + }, + { + "epoch": 5.226860254083484, + "grad_norm": 1.3752713203430176, + "learning_rate": 0.0002, + "loss": 0.368, + "step": 7200 + }, + { + "epoch": 5.234119782214156, + "grad_norm": 1.4848095178604126, + "learning_rate": 0.0002, + "loss": 0.3844, + "step": 7210 + }, + { + "epoch": 5.241379310344827, + "grad_norm": 1.428842544555664, + "learning_rate": 0.0002, + "loss": 0.4187, + "step": 7220 + }, + { + "epoch": 5.248638838475499, + "grad_norm": 1.1703591346740723, + "learning_rate": 0.0002, + "loss": 0.3778, + "step": 7230 + }, + { + "epoch": 5.2558983666061705, + "grad_norm": 1.2180451154708862, + "learning_rate": 0.0002, + "loss": 0.417, + "step": 7240 + }, + { + "epoch": 5.2631578947368425, + "grad_norm": 1.094045877456665, + "learning_rate": 0.0002, + "loss": 0.3656, + "step": 7250 + }, + { + "epoch": 5.270417422867514, + "grad_norm": 0.9545766115188599, + "learning_rate": 0.0002, + "loss": 0.4331, + "step": 7260 + }, + { + "epoch": 5.277676950998185, + "grad_norm": 0.8356652855873108, + "learning_rate": 0.0002, + "loss": 0.3642, + "step": 7270 + }, + { + "epoch": 5.284936479128857, + "grad_norm": 1.148160457611084, + "learning_rate": 0.0002, + "loss": 0.3576, + "step": 7280 + }, + { + "epoch": 5.292196007259528, + "grad_norm": 1.2009977102279663, + "learning_rate": 0.0002, + "loss": 0.4178, + "step": 7290 + }, + { + "epoch": 5.2994555353902, + "grad_norm": 1.3283873796463013, + "learning_rate": 0.0002, + "loss": 0.3977, + "step": 7300 + }, + { + "epoch": 5.306715063520871, + "grad_norm": 0.9850481748580933, + "learning_rate": 0.0002, + "loss": 0.3853, + "step": 7310 + }, + { + "epoch": 5.313974591651543, + "grad_norm": 1.367550015449524, + "learning_rate": 0.0002, + "loss": 0.3645, + "step": 7320 + }, + { + "epoch": 5.321234119782214, + "grad_norm": 0.8602936863899231, + "learning_rate": 0.0002, + "loss": 0.3898, + "step": 7330 + }, + { + "epoch": 5.328493647912886, + "grad_norm": 1.1130679845809937, + "learning_rate": 0.0002, + "loss": 0.4173, + "step": 7340 + }, + { + "epoch": 5.335753176043557, + "grad_norm": 1.3002253770828247, + "learning_rate": 0.0002, + "loss": 0.3642, + "step": 7350 + }, + { + "epoch": 5.343012704174229, + "grad_norm": 1.6235289573669434, + "learning_rate": 0.0002, + "loss": 0.4138, + "step": 7360 + }, + { + "epoch": 5.3502722323049, + "grad_norm": 1.156379222869873, + "learning_rate": 0.0002, + "loss": 0.4779, + "step": 7370 + }, + { + "epoch": 5.357531760435572, + "grad_norm": 1.0569308996200562, + "learning_rate": 0.0002, + "loss": 0.3222, + "step": 7380 + }, + { + "epoch": 5.364791288566243, + "grad_norm": 1.6674021482467651, + "learning_rate": 0.0002, + "loss": 0.3573, + "step": 7390 + }, + { + "epoch": 5.372050816696914, + "grad_norm": 1.2962018251419067, + "learning_rate": 0.0002, + "loss": 0.4325, + "step": 7400 + }, + { + "epoch": 5.379310344827586, + "grad_norm": 1.1904195547103882, + "learning_rate": 0.0002, + "loss": 0.3809, + "step": 7410 + }, + { + "epoch": 5.386569872958257, + "grad_norm": 1.316245675086975, + "learning_rate": 0.0002, + "loss": 0.3728, + "step": 7420 + }, + { + "epoch": 5.393829401088929, + "grad_norm": 1.127570390701294, + "learning_rate": 0.0002, + "loss": 0.4096, + "step": 7430 + }, + { + "epoch": 5.4010889292196005, + "grad_norm": 1.3895777463912964, + "learning_rate": 0.0002, + "loss": 0.3933, + "step": 7440 + }, + { + "epoch": 5.4083484573502725, + "grad_norm": 1.626830816268921, + "learning_rate": 0.0002, + "loss": 0.4085, + "step": 7450 + }, + { + "epoch": 5.415607985480944, + "grad_norm": 1.3703926801681519, + "learning_rate": 0.0002, + "loss": 0.4186, + "step": 7460 + }, + { + "epoch": 5.422867513611616, + "grad_norm": 1.3854840993881226, + "learning_rate": 0.0002, + "loss": 0.3517, + "step": 7470 + }, + { + "epoch": 5.430127041742287, + "grad_norm": 1.107065200805664, + "learning_rate": 0.0002, + "loss": 0.3714, + "step": 7480 + }, + { + "epoch": 5.437386569872959, + "grad_norm": 0.7843456268310547, + "learning_rate": 0.0002, + "loss": 0.3855, + "step": 7490 + }, + { + "epoch": 5.44464609800363, + "grad_norm": 1.6692372560501099, + "learning_rate": 0.0002, + "loss": 0.4159, + "step": 7500 + }, + { + "epoch": 5.451905626134302, + "grad_norm": 1.2583858966827393, + "learning_rate": 0.0002, + "loss": 0.4185, + "step": 7510 + }, + { + "epoch": 5.459165154264973, + "grad_norm": 1.6827000379562378, + "learning_rate": 0.0002, + "loss": 0.4401, + "step": 7520 + }, + { + "epoch": 5.466424682395644, + "grad_norm": 1.6680560111999512, + "learning_rate": 0.0002, + "loss": 0.397, + "step": 7530 + }, + { + "epoch": 5.473684210526316, + "grad_norm": 1.3696072101593018, + "learning_rate": 0.0002, + "loss": 0.4193, + "step": 7540 + }, + { + "epoch": 5.480943738656987, + "grad_norm": 1.4523496627807617, + "learning_rate": 0.0002, + "loss": 0.4244, + "step": 7550 + }, + { + "epoch": 5.488203266787659, + "grad_norm": 1.3432692289352417, + "learning_rate": 0.0002, + "loss": 0.3609, + "step": 7560 + }, + { + "epoch": 5.49546279491833, + "grad_norm": 1.363818645477295, + "learning_rate": 0.0002, + "loss": 0.3675, + "step": 7570 + }, + { + "epoch": 5.502722323049002, + "grad_norm": 1.0176721811294556, + "learning_rate": 0.0002, + "loss": 0.3726, + "step": 7580 + }, + { + "epoch": 5.509981851179673, + "grad_norm": 1.1625547409057617, + "learning_rate": 0.0002, + "loss": 0.3751, + "step": 7590 + }, + { + "epoch": 5.517241379310345, + "grad_norm": 1.2480388879776, + "learning_rate": 0.0002, + "loss": 0.433, + "step": 7600 + }, + { + "epoch": 5.524500907441016, + "grad_norm": 1.341509222984314, + "learning_rate": 0.0002, + "loss": 0.4511, + "step": 7610 + }, + { + "epoch": 5.531760435571687, + "grad_norm": 1.7048436403274536, + "learning_rate": 0.0002, + "loss": 0.4642, + "step": 7620 + }, + { + "epoch": 5.539019963702359, + "grad_norm": 1.1435480117797852, + "learning_rate": 0.0002, + "loss": 0.4509, + "step": 7630 + }, + { + "epoch": 5.5462794918330305, + "grad_norm": 1.2381842136383057, + "learning_rate": 0.0002, + "loss": 0.4528, + "step": 7640 + }, + { + "epoch": 5.5535390199637025, + "grad_norm": 1.50786292552948, + "learning_rate": 0.0002, + "loss": 0.4496, + "step": 7650 + }, + { + "epoch": 5.560798548094374, + "grad_norm": 1.2263519763946533, + "learning_rate": 0.0002, + "loss": 0.4242, + "step": 7660 + }, + { + "epoch": 5.568058076225046, + "grad_norm": 1.2864696979522705, + "learning_rate": 0.0002, + "loss": 0.418, + "step": 7670 + }, + { + "epoch": 5.575317604355717, + "grad_norm": 1.4443191289901733, + "learning_rate": 0.0002, + "loss": 0.3832, + "step": 7680 + }, + { + "epoch": 5.582577132486389, + "grad_norm": 1.3360971212387085, + "learning_rate": 0.0002, + "loss": 0.3964, + "step": 7690 + }, + { + "epoch": 5.58983666061706, + "grad_norm": 1.391828179359436, + "learning_rate": 0.0002, + "loss": 0.4639, + "step": 7700 + }, + { + "epoch": 5.597096188747732, + "grad_norm": 1.3699384927749634, + "learning_rate": 0.0002, + "loss": 0.4722, + "step": 7710 + }, + { + "epoch": 5.604355716878403, + "grad_norm": 1.3778468370437622, + "learning_rate": 0.0002, + "loss": 0.4302, + "step": 7720 + }, + { + "epoch": 5.611615245009075, + "grad_norm": 1.1009501218795776, + "learning_rate": 0.0002, + "loss": 0.4179, + "step": 7730 + }, + { + "epoch": 5.618874773139746, + "grad_norm": 1.0410021543502808, + "learning_rate": 0.0002, + "loss": 0.4104, + "step": 7740 + }, + { + "epoch": 5.626134301270417, + "grad_norm": 1.1012226343154907, + "learning_rate": 0.0002, + "loss": 0.4489, + "step": 7750 + }, + { + "epoch": 5.633393829401089, + "grad_norm": 1.3246384859085083, + "learning_rate": 0.0002, + "loss": 0.4544, + "step": 7760 + }, + { + "epoch": 5.64065335753176, + "grad_norm": 1.4301716089248657, + "learning_rate": 0.0002, + "loss": 0.4381, + "step": 7770 + }, + { + "epoch": 5.647912885662432, + "grad_norm": 1.1368978023529053, + "learning_rate": 0.0002, + "loss": 0.4297, + "step": 7780 + }, + { + "epoch": 5.655172413793103, + "grad_norm": 1.3493064641952515, + "learning_rate": 0.0002, + "loss": 0.4063, + "step": 7790 + }, + { + "epoch": 5.662431941923775, + "grad_norm": 1.3328721523284912, + "learning_rate": 0.0002, + "loss": 0.4562, + "step": 7800 + }, + { + "epoch": 5.669691470054446, + "grad_norm": 1.3235671520233154, + "learning_rate": 0.0002, + "loss": 0.4075, + "step": 7810 + }, + { + "epoch": 5.676950998185118, + "grad_norm": 1.1961841583251953, + "learning_rate": 0.0002, + "loss": 0.4589, + "step": 7820 + }, + { + "epoch": 5.684210526315789, + "grad_norm": 1.4189636707305908, + "learning_rate": 0.0002, + "loss": 0.4503, + "step": 7830 + }, + { + "epoch": 5.691470054446461, + "grad_norm": 1.3551312685012817, + "learning_rate": 0.0002, + "loss": 0.4452, + "step": 7840 + }, + { + "epoch": 5.6987295825771325, + "grad_norm": 1.449987769126892, + "learning_rate": 0.0002, + "loss": 0.4268, + "step": 7850 + }, + { + "epoch": 5.7059891107078045, + "grad_norm": 1.1225156784057617, + "learning_rate": 0.0002, + "loss": 0.4141, + "step": 7860 + }, + { + "epoch": 5.713248638838476, + "grad_norm": 1.4734594821929932, + "learning_rate": 0.0002, + "loss": 0.41, + "step": 7870 + }, + { + "epoch": 5.720508166969147, + "grad_norm": 1.3793359994888306, + "learning_rate": 0.0002, + "loss": 0.4013, + "step": 7880 + }, + { + "epoch": 5.727767695099819, + "grad_norm": 1.2431834936141968, + "learning_rate": 0.0002, + "loss": 0.4065, + "step": 7890 + }, + { + "epoch": 5.73502722323049, + "grad_norm": 1.1158313751220703, + "learning_rate": 0.0002, + "loss": 0.4595, + "step": 7900 + }, + { + "epoch": 5.742286751361162, + "grad_norm": 1.212248682975769, + "learning_rate": 0.0002, + "loss": 0.4342, + "step": 7910 + }, + { + "epoch": 5.749546279491833, + "grad_norm": 1.5259995460510254, + "learning_rate": 0.0002, + "loss": 0.4611, + "step": 7920 + }, + { + "epoch": 5.756805807622505, + "grad_norm": 1.3909121751785278, + "learning_rate": 0.0002, + "loss": 0.4483, + "step": 7930 + }, + { + "epoch": 5.764065335753176, + "grad_norm": 1.2511249780654907, + "learning_rate": 0.0002, + "loss": 0.4325, + "step": 7940 + }, + { + "epoch": 5.771324863883848, + "grad_norm": 1.2511906623840332, + "learning_rate": 0.0002, + "loss": 0.4048, + "step": 7950 + }, + { + "epoch": 5.778584392014519, + "grad_norm": 1.1489921808242798, + "learning_rate": 0.0002, + "loss": 0.3715, + "step": 7960 + }, + { + "epoch": 5.78584392014519, + "grad_norm": 1.028943419456482, + "learning_rate": 0.0002, + "loss": 0.4196, + "step": 7970 + }, + { + "epoch": 5.793103448275862, + "grad_norm": 1.0820423364639282, + "learning_rate": 0.0002, + "loss": 0.4334, + "step": 7980 + }, + { + "epoch": 5.800362976406533, + "grad_norm": 1.296520471572876, + "learning_rate": 0.0002, + "loss": 0.3917, + "step": 7990 + }, + { + "epoch": 5.807622504537205, + "grad_norm": 1.3597749471664429, + "learning_rate": 0.0002, + "loss": 0.4509, + "step": 8000 + }, + { + "epoch": 5.814882032667876, + "grad_norm": 0.8741790652275085, + "learning_rate": 0.0002, + "loss": 0.4535, + "step": 8010 + }, + { + "epoch": 5.822141560798548, + "grad_norm": 1.1471822261810303, + "learning_rate": 0.0002, + "loss": 0.4239, + "step": 8020 + }, + { + "epoch": 5.829401088929219, + "grad_norm": 1.2997334003448486, + "learning_rate": 0.0002, + "loss": 0.5042, + "step": 8030 + }, + { + "epoch": 5.836660617059891, + "grad_norm": 1.1027175188064575, + "learning_rate": 0.0002, + "loss": 0.4758, + "step": 8040 + }, + { + "epoch": 5.8439201451905625, + "grad_norm": 1.2695307731628418, + "learning_rate": 0.0002, + "loss": 0.4192, + "step": 8050 + }, + { + "epoch": 5.8511796733212345, + "grad_norm": 1.5275461673736572, + "learning_rate": 0.0002, + "loss": 0.5173, + "step": 8060 + }, + { + "epoch": 5.8584392014519056, + "grad_norm": 1.3059501647949219, + "learning_rate": 0.0002, + "loss": 0.5012, + "step": 8070 + }, + { + "epoch": 5.8656987295825775, + "grad_norm": 1.57442045211792, + "learning_rate": 0.0002, + "loss": 0.4425, + "step": 8080 + }, + { + "epoch": 5.872958257713249, + "grad_norm": 1.119564414024353, + "learning_rate": 0.0002, + "loss": 0.4261, + "step": 8090 + }, + { + "epoch": 5.88021778584392, + "grad_norm": 1.6517373323440552, + "learning_rate": 0.0002, + "loss": 0.465, + "step": 8100 + }, + { + "epoch": 5.887477313974592, + "grad_norm": 1.4093554019927979, + "learning_rate": 0.0002, + "loss": 0.4406, + "step": 8110 + }, + { + "epoch": 5.894736842105263, + "grad_norm": 1.278843641281128, + "learning_rate": 0.0002, + "loss": 0.4433, + "step": 8120 + }, + { + "epoch": 5.901996370235935, + "grad_norm": 1.2042944431304932, + "learning_rate": 0.0002, + "loss": 0.4007, + "step": 8130 + }, + { + "epoch": 5.909255898366606, + "grad_norm": 1.1788326501846313, + "learning_rate": 0.0002, + "loss": 0.3972, + "step": 8140 + }, + { + "epoch": 5.916515426497278, + "grad_norm": 1.4364569187164307, + "learning_rate": 0.0002, + "loss": 0.4506, + "step": 8150 + }, + { + "epoch": 5.923774954627949, + "grad_norm": 1.1704283952713013, + "learning_rate": 0.0002, + "loss": 0.4651, + "step": 8160 + }, + { + "epoch": 5.931034482758621, + "grad_norm": 1.040814995765686, + "learning_rate": 0.0002, + "loss": 0.3972, + "step": 8170 + }, + { + "epoch": 5.938294010889292, + "grad_norm": 1.1367416381835938, + "learning_rate": 0.0002, + "loss": 0.4038, + "step": 8180 + }, + { + "epoch": 5.945553539019964, + "grad_norm": 1.3401511907577515, + "learning_rate": 0.0002, + "loss": 0.4387, + "step": 8190 + }, + { + "epoch": 5.952813067150635, + "grad_norm": 1.1154041290283203, + "learning_rate": 0.0002, + "loss": 0.4396, + "step": 8200 + }, + { + "epoch": 5.960072595281307, + "grad_norm": 1.426089882850647, + "learning_rate": 0.0002, + "loss": 0.4744, + "step": 8210 + }, + { + "epoch": 5.967332123411978, + "grad_norm": 1.3170222043991089, + "learning_rate": 0.0002, + "loss": 0.4105, + "step": 8220 + }, + { + "epoch": 5.974591651542649, + "grad_norm": 1.1960029602050781, + "learning_rate": 0.0002, + "loss": 0.4137, + "step": 8230 + }, + { + "epoch": 5.981851179673321, + "grad_norm": 1.0843931436538696, + "learning_rate": 0.0002, + "loss": 0.423, + "step": 8240 + }, + { + "epoch": 5.9891107078039925, + "grad_norm": 1.050421118736267, + "learning_rate": 0.0002, + "loss": 0.459, + "step": 8250 + }, + { + "epoch": 5.9963702359346644, + "grad_norm": 1.0183138847351074, + "learning_rate": 0.0002, + "loss": 0.3993, + "step": 8260 + }, + { + "epoch": 6.0, + "eval_loss": 1.4677470922470093, + "eval_runtime": 46.2504, + "eval_samples_per_second": 9.427, + "eval_steps_per_second": 1.189, + "step": 8265 + }, + { + "epoch": 6.0036297640653356, + "grad_norm": 1.1702998876571655, + "learning_rate": 0.0002, + "loss": 0.3947, + "step": 8270 + }, + { + "epoch": 6.0108892921960075, + "grad_norm": 1.5389727354049683, + "learning_rate": 0.0002, + "loss": 0.2854, + "step": 8280 + }, + { + "epoch": 6.018148820326679, + "grad_norm": 1.502568244934082, + "learning_rate": 0.0002, + "loss": 0.2603, + "step": 8290 + }, + { + "epoch": 6.025408348457351, + "grad_norm": 1.3846043348312378, + "learning_rate": 0.0002, + "loss": 0.3329, + "step": 8300 + }, + { + "epoch": 6.032667876588022, + "grad_norm": 1.173553228378296, + "learning_rate": 0.0002, + "loss": 0.2651, + "step": 8310 + }, + { + "epoch": 6.039927404718694, + "grad_norm": 1.5325932502746582, + "learning_rate": 0.0002, + "loss": 0.3142, + "step": 8320 + }, + { + "epoch": 6.047186932849365, + "grad_norm": 1.303783655166626, + "learning_rate": 0.0002, + "loss": 0.2511, + "step": 8330 + }, + { + "epoch": 6.054446460980036, + "grad_norm": 0.9408994913101196, + "learning_rate": 0.0002, + "loss": 0.2352, + "step": 8340 + }, + { + "epoch": 6.061705989110708, + "grad_norm": 1.5430388450622559, + "learning_rate": 0.0002, + "loss": 0.2548, + "step": 8350 + }, + { + "epoch": 6.068965517241379, + "grad_norm": 0.8765342235565186, + "learning_rate": 0.0002, + "loss": 0.2682, + "step": 8360 + }, + { + "epoch": 6.076225045372051, + "grad_norm": 1.2363157272338867, + "learning_rate": 0.0002, + "loss": 0.2614, + "step": 8370 + }, + { + "epoch": 6.083484573502722, + "grad_norm": 1.21284818649292, + "learning_rate": 0.0002, + "loss": 0.294, + "step": 8380 + }, + { + "epoch": 6.090744101633394, + "grad_norm": 1.3261712789535522, + "learning_rate": 0.0002, + "loss": 0.2498, + "step": 8390 + }, + { + "epoch": 6.098003629764065, + "grad_norm": 1.077317714691162, + "learning_rate": 0.0002, + "loss": 0.2649, + "step": 8400 + }, + { + "epoch": 6.105263157894737, + "grad_norm": 0.9873808026313782, + "learning_rate": 0.0002, + "loss": 0.269, + "step": 8410 + }, + { + "epoch": 6.112522686025408, + "grad_norm": 1.032258152961731, + "learning_rate": 0.0002, + "loss": 0.2736, + "step": 8420 + }, + { + "epoch": 6.11978221415608, + "grad_norm": 1.1014811992645264, + "learning_rate": 0.0002, + "loss": 0.2854, + "step": 8430 + }, + { + "epoch": 6.127041742286751, + "grad_norm": 1.4264203310012817, + "learning_rate": 0.0002, + "loss": 0.2924, + "step": 8440 + }, + { + "epoch": 6.1343012704174225, + "grad_norm": 1.4086531400680542, + "learning_rate": 0.0002, + "loss": 0.3388, + "step": 8450 + }, + { + "epoch": 6.1415607985480944, + "grad_norm": 1.3842453956604004, + "learning_rate": 0.0002, + "loss": 0.2786, + "step": 8460 + }, + { + "epoch": 6.1488203266787655, + "grad_norm": 1.4356757402420044, + "learning_rate": 0.0002, + "loss": 0.3201, + "step": 8470 + }, + { + "epoch": 6.1560798548094375, + "grad_norm": 1.193315029144287, + "learning_rate": 0.0002, + "loss": 0.2908, + "step": 8480 + }, + { + "epoch": 6.163339382940109, + "grad_norm": 1.0623924732208252, + "learning_rate": 0.0002, + "loss": 0.342, + "step": 8490 + }, + { + "epoch": 6.170598911070781, + "grad_norm": 1.5484434366226196, + "learning_rate": 0.0002, + "loss": 0.3257, + "step": 8500 + }, + { + "epoch": 6.177858439201452, + "grad_norm": 1.3520029783248901, + "learning_rate": 0.0002, + "loss": 0.2861, + "step": 8510 + }, + { + "epoch": 6.185117967332124, + "grad_norm": 1.2773103713989258, + "learning_rate": 0.0002, + "loss": 0.3242, + "step": 8520 + }, + { + "epoch": 6.192377495462795, + "grad_norm": 1.4675105810165405, + "learning_rate": 0.0002, + "loss": 0.3108, + "step": 8530 + }, + { + "epoch": 6.199637023593467, + "grad_norm": 1.2118732929229736, + "learning_rate": 0.0002, + "loss": 0.3044, + "step": 8540 + }, + { + "epoch": 6.206896551724138, + "grad_norm": 1.264024257659912, + "learning_rate": 0.0002, + "loss": 0.2726, + "step": 8550 + }, + { + "epoch": 6.21415607985481, + "grad_norm": 1.406931757926941, + "learning_rate": 0.0002, + "loss": 0.306, + "step": 8560 + }, + { + "epoch": 6.221415607985481, + "grad_norm": 1.385459542274475, + "learning_rate": 0.0002, + "loss": 0.2904, + "step": 8570 + }, + { + "epoch": 6.228675136116152, + "grad_norm": 1.9336168766021729, + "learning_rate": 0.0002, + "loss": 0.3413, + "step": 8580 + }, + { + "epoch": 6.235934664246824, + "grad_norm": 0.9880136847496033, + "learning_rate": 0.0002, + "loss": 0.2769, + "step": 8590 + }, + { + "epoch": 6.243194192377495, + "grad_norm": 1.3870339393615723, + "learning_rate": 0.0002, + "loss": 0.3035, + "step": 8600 + }, + { + "epoch": 6.250453720508167, + "grad_norm": 1.2303647994995117, + "learning_rate": 0.0002, + "loss": 0.286, + "step": 8610 + }, + { + "epoch": 6.257713248638838, + "grad_norm": 1.5406211614608765, + "learning_rate": 0.0002, + "loss": 0.3113, + "step": 8620 + }, + { + "epoch": 6.26497277676951, + "grad_norm": 1.2436790466308594, + "learning_rate": 0.0002, + "loss": 0.292, + "step": 8630 + }, + { + "epoch": 6.272232304900181, + "grad_norm": 0.8844212293624878, + "learning_rate": 0.0002, + "loss": 0.3102, + "step": 8640 + }, + { + "epoch": 6.279491833030853, + "grad_norm": 1.2846336364746094, + "learning_rate": 0.0002, + "loss": 0.3373, + "step": 8650 + }, + { + "epoch": 6.286751361161524, + "grad_norm": 1.593814730644226, + "learning_rate": 0.0002, + "loss": 0.3535, + "step": 8660 + }, + { + "epoch": 6.2940108892921955, + "grad_norm": 1.2277469635009766, + "learning_rate": 0.0002, + "loss": 0.3413, + "step": 8670 + }, + { + "epoch": 6.3012704174228675, + "grad_norm": 1.2574384212493896, + "learning_rate": 0.0002, + "loss": 0.2958, + "step": 8680 + }, + { + "epoch": 6.308529945553539, + "grad_norm": 1.335150957107544, + "learning_rate": 0.0002, + "loss": 0.3251, + "step": 8690 + }, + { + "epoch": 6.315789473684211, + "grad_norm": 1.3140437602996826, + "learning_rate": 0.0002, + "loss": 0.3009, + "step": 8700 + }, + { + "epoch": 6.323049001814882, + "grad_norm": 1.1689209938049316, + "learning_rate": 0.0002, + "loss": 0.2783, + "step": 8710 + }, + { + "epoch": 6.330308529945554, + "grad_norm": 1.6448503732681274, + "learning_rate": 0.0002, + "loss": 0.3476, + "step": 8720 + }, + { + "epoch": 6.337568058076225, + "grad_norm": 0.9944145679473877, + "learning_rate": 0.0002, + "loss": 0.2934, + "step": 8730 + }, + { + "epoch": 6.344827586206897, + "grad_norm": 1.1775634288787842, + "learning_rate": 0.0002, + "loss": 0.3315, + "step": 8740 + }, + { + "epoch": 6.352087114337568, + "grad_norm": 1.8438639640808105, + "learning_rate": 0.0002, + "loss": 0.3514, + "step": 8750 + }, + { + "epoch": 6.35934664246824, + "grad_norm": 1.062495470046997, + "learning_rate": 0.0002, + "loss": 0.303, + "step": 8760 + }, + { + "epoch": 6.366606170598911, + "grad_norm": 1.3224315643310547, + "learning_rate": 0.0002, + "loss": 0.2737, + "step": 8770 + }, + { + "epoch": 6.373865698729583, + "grad_norm": 1.399844765663147, + "learning_rate": 0.0002, + "loss": 0.3445, + "step": 8780 + }, + { + "epoch": 6.381125226860254, + "grad_norm": 1.0409915447235107, + "learning_rate": 0.0002, + "loss": 0.3277, + "step": 8790 + }, + { + "epoch": 6.388384754990925, + "grad_norm": 1.5657726526260376, + "learning_rate": 0.0002, + "loss": 0.3218, + "step": 8800 + }, + { + "epoch": 6.395644283121597, + "grad_norm": 1.4098644256591797, + "learning_rate": 0.0002, + "loss": 0.3031, + "step": 8810 + }, + { + "epoch": 6.402903811252268, + "grad_norm": 1.5154732465744019, + "learning_rate": 0.0002, + "loss": 0.3133, + "step": 8820 + }, + { + "epoch": 6.41016333938294, + "grad_norm": 1.1139698028564453, + "learning_rate": 0.0002, + "loss": 0.3111, + "step": 8830 + }, + { + "epoch": 6.417422867513611, + "grad_norm": 1.4149729013442993, + "learning_rate": 0.0002, + "loss": 0.3553, + "step": 8840 + }, + { + "epoch": 6.424682395644283, + "grad_norm": 1.2632299661636353, + "learning_rate": 0.0002, + "loss": 0.287, + "step": 8850 + }, + { + "epoch": 6.431941923774954, + "grad_norm": 1.6636109352111816, + "learning_rate": 0.0002, + "loss": 0.3198, + "step": 8860 + }, + { + "epoch": 6.439201451905626, + "grad_norm": 1.4149386882781982, + "learning_rate": 0.0002, + "loss": 0.3749, + "step": 8870 + }, + { + "epoch": 6.4464609800362975, + "grad_norm": 1.1396206617355347, + "learning_rate": 0.0002, + "loss": 0.3504, + "step": 8880 + }, + { + "epoch": 6.4537205081669695, + "grad_norm": 1.2188775539398193, + "learning_rate": 0.0002, + "loss": 0.3328, + "step": 8890 + }, + { + "epoch": 6.460980036297641, + "grad_norm": 0.9740369319915771, + "learning_rate": 0.0002, + "loss": 0.3427, + "step": 8900 + }, + { + "epoch": 6.468239564428313, + "grad_norm": 1.228569746017456, + "learning_rate": 0.0002, + "loss": 0.3223, + "step": 8910 + }, + { + "epoch": 6.475499092558984, + "grad_norm": 1.5019789934158325, + "learning_rate": 0.0002, + "loss": 0.3151, + "step": 8920 + }, + { + "epoch": 6.482758620689655, + "grad_norm": 1.3320101499557495, + "learning_rate": 0.0002, + "loss": 0.2916, + "step": 8930 + }, + { + "epoch": 6.490018148820327, + "grad_norm": 1.5551502704620361, + "learning_rate": 0.0002, + "loss": 0.298, + "step": 8940 + }, + { + "epoch": 6.497277676950998, + "grad_norm": 1.470131754875183, + "learning_rate": 0.0002, + "loss": 0.3238, + "step": 8950 + }, + { + "epoch": 6.50453720508167, + "grad_norm": 1.1803025007247925, + "learning_rate": 0.0002, + "loss": 0.2808, + "step": 8960 + }, + { + "epoch": 6.511796733212341, + "grad_norm": 1.3505640029907227, + "learning_rate": 0.0002, + "loss": 0.3025, + "step": 8970 + }, + { + "epoch": 6.519056261343013, + "grad_norm": 1.13093900680542, + "learning_rate": 0.0002, + "loss": 0.3124, + "step": 8980 + }, + { + "epoch": 6.526315789473684, + "grad_norm": 1.347386121749878, + "learning_rate": 0.0002, + "loss": 0.3454, + "step": 8990 + }, + { + "epoch": 6.533575317604356, + "grad_norm": 1.7879165410995483, + "learning_rate": 0.0002, + "loss": 0.3532, + "step": 9000 + }, + { + "epoch": 6.540834845735027, + "grad_norm": 1.2168169021606445, + "learning_rate": 0.0002, + "loss": 0.3382, + "step": 9010 + }, + { + "epoch": 6.548094373865698, + "grad_norm": 1.1758877038955688, + "learning_rate": 0.0002, + "loss": 0.3413, + "step": 9020 + }, + { + "epoch": 6.55535390199637, + "grad_norm": 1.7366445064544678, + "learning_rate": 0.0002, + "loss": 0.2806, + "step": 9030 + }, + { + "epoch": 6.562613430127042, + "grad_norm": 1.5919222831726074, + "learning_rate": 0.0002, + "loss": 0.3437, + "step": 9040 + }, + { + "epoch": 6.569872958257713, + "grad_norm": 1.336863398551941, + "learning_rate": 0.0002, + "loss": 0.3261, + "step": 9050 + }, + { + "epoch": 6.577132486388384, + "grad_norm": 1.1769421100616455, + "learning_rate": 0.0002, + "loss": 0.3103, + "step": 9060 + }, + { + "epoch": 6.584392014519056, + "grad_norm": 1.0048751831054688, + "learning_rate": 0.0002, + "loss": 0.3295, + "step": 9070 + }, + { + "epoch": 6.5916515426497275, + "grad_norm": 1.5268515348434448, + "learning_rate": 0.0002, + "loss": 0.3156, + "step": 9080 + }, + { + "epoch": 6.5989110707803995, + "grad_norm": 1.434610366821289, + "learning_rate": 0.0002, + "loss": 0.3752, + "step": 9090 + }, + { + "epoch": 6.606170598911071, + "grad_norm": 1.1151410341262817, + "learning_rate": 0.0002, + "loss": 0.3375, + "step": 9100 + }, + { + "epoch": 6.613430127041743, + "grad_norm": 1.6690642833709717, + "learning_rate": 0.0002, + "loss": 0.363, + "step": 9110 + }, + { + "epoch": 6.620689655172414, + "grad_norm": 1.4495552778244019, + "learning_rate": 0.0002, + "loss": 0.3703, + "step": 9120 + }, + { + "epoch": 6.627949183303086, + "grad_norm": 1.377621054649353, + "learning_rate": 0.0002, + "loss": 0.3648, + "step": 9130 + }, + { + "epoch": 6.635208711433757, + "grad_norm": 1.5459434986114502, + "learning_rate": 0.0002, + "loss": 0.3766, + "step": 9140 + }, + { + "epoch": 6.642468239564428, + "grad_norm": 1.0920850038528442, + "learning_rate": 0.0002, + "loss": 0.3196, + "step": 9150 + }, + { + "epoch": 6.6497277676951, + "grad_norm": 1.6708381175994873, + "learning_rate": 0.0002, + "loss": 0.3505, + "step": 9160 + }, + { + "epoch": 6.656987295825771, + "grad_norm": 1.747514009475708, + "learning_rate": 0.0002, + "loss": 0.368, + "step": 9170 + }, + { + "epoch": 6.664246823956443, + "grad_norm": 1.133466362953186, + "learning_rate": 0.0002, + "loss": 0.3099, + "step": 9180 + }, + { + "epoch": 6.671506352087114, + "grad_norm": 1.394358515739441, + "learning_rate": 0.0002, + "loss": 0.3175, + "step": 9190 + }, + { + "epoch": 6.678765880217786, + "grad_norm": 0.9258374571800232, + "learning_rate": 0.0002, + "loss": 0.2981, + "step": 9200 + }, + { + "epoch": 6.686025408348457, + "grad_norm": 1.3750739097595215, + "learning_rate": 0.0002, + "loss": 0.3723, + "step": 9210 + }, + { + "epoch": 6.693284936479129, + "grad_norm": 0.8604967594146729, + "learning_rate": 0.0002, + "loss": 0.3441, + "step": 9220 + }, + { + "epoch": 6.7005444646098, + "grad_norm": 1.6074559688568115, + "learning_rate": 0.0002, + "loss": 0.3775, + "step": 9230 + }, + { + "epoch": 6.707803992740472, + "grad_norm": 0.9576877355575562, + "learning_rate": 0.0002, + "loss": 0.3139, + "step": 9240 + }, + { + "epoch": 6.715063520871143, + "grad_norm": 1.7193048000335693, + "learning_rate": 0.0002, + "loss": 0.3633, + "step": 9250 + }, + { + "epoch": 6.722323049001815, + "grad_norm": 1.3131844997406006, + "learning_rate": 0.0002, + "loss": 0.3139, + "step": 9260 + }, + { + "epoch": 6.729582577132486, + "grad_norm": 1.2978184223175049, + "learning_rate": 0.0002, + "loss": 0.3121, + "step": 9270 + }, + { + "epoch": 6.7368421052631575, + "grad_norm": 1.4792617559432983, + "learning_rate": 0.0002, + "loss": 0.3534, + "step": 9280 + }, + { + "epoch": 6.7441016333938295, + "grad_norm": 1.1265567541122437, + "learning_rate": 0.0002, + "loss": 0.3429, + "step": 9290 + }, + { + "epoch": 6.751361161524501, + "grad_norm": 1.8553377389907837, + "learning_rate": 0.0002, + "loss": 0.3526, + "step": 9300 + }, + { + "epoch": 6.758620689655173, + "grad_norm": 1.3602519035339355, + "learning_rate": 0.0002, + "loss": 0.3666, + "step": 9310 + }, + { + "epoch": 6.765880217785844, + "grad_norm": 1.2874794006347656, + "learning_rate": 0.0002, + "loss": 0.2922, + "step": 9320 + }, + { + "epoch": 6.773139745916516, + "grad_norm": 1.4834712743759155, + "learning_rate": 0.0002, + "loss": 0.3816, + "step": 9330 + }, + { + "epoch": 6.780399274047187, + "grad_norm": 2.0824034214019775, + "learning_rate": 0.0002, + "loss": 0.3557, + "step": 9340 + }, + { + "epoch": 6.787658802177859, + "grad_norm": 1.2267698049545288, + "learning_rate": 0.0002, + "loss": 0.3174, + "step": 9350 + }, + { + "epoch": 6.79491833030853, + "grad_norm": 1.4485498666763306, + "learning_rate": 0.0002, + "loss": 0.3665, + "step": 9360 + }, + { + "epoch": 6.802177858439201, + "grad_norm": 1.3199396133422852, + "learning_rate": 0.0002, + "loss": 0.3676, + "step": 9370 + }, + { + "epoch": 6.809437386569873, + "grad_norm": 1.2552456855773926, + "learning_rate": 0.0002, + "loss": 0.298, + "step": 9380 + }, + { + "epoch": 6.816696914700545, + "grad_norm": 1.3895127773284912, + "learning_rate": 0.0002, + "loss": 0.3152, + "step": 9390 + }, + { + "epoch": 6.823956442831216, + "grad_norm": 1.7637823820114136, + "learning_rate": 0.0002, + "loss": 0.3375, + "step": 9400 + }, + { + "epoch": 6.831215970961887, + "grad_norm": 1.6004475355148315, + "learning_rate": 0.0002, + "loss": 0.3234, + "step": 9410 + }, + { + "epoch": 6.838475499092559, + "grad_norm": 1.4133695363998413, + "learning_rate": 0.0002, + "loss": 0.3364, + "step": 9420 + }, + { + "epoch": 6.84573502722323, + "grad_norm": 1.1583502292633057, + "learning_rate": 0.0002, + "loss": 0.3656, + "step": 9430 + }, + { + "epoch": 6.852994555353902, + "grad_norm": 1.3769075870513916, + "learning_rate": 0.0002, + "loss": 0.3499, + "step": 9440 + }, + { + "epoch": 6.860254083484573, + "grad_norm": 1.1831218004226685, + "learning_rate": 0.0002, + "loss": 0.3333, + "step": 9450 + }, + { + "epoch": 6.867513611615245, + "grad_norm": 1.6092621088027954, + "learning_rate": 0.0002, + "loss": 0.3501, + "step": 9460 + }, + { + "epoch": 6.874773139745916, + "grad_norm": 1.3850210905075073, + "learning_rate": 0.0002, + "loss": 0.3933, + "step": 9470 + }, + { + "epoch": 6.882032667876588, + "grad_norm": 1.4119619131088257, + "learning_rate": 0.0002, + "loss": 0.3868, + "step": 9480 + }, + { + "epoch": 6.8892921960072595, + "grad_norm": 1.3494242429733276, + "learning_rate": 0.0002, + "loss": 0.3939, + "step": 9490 + }, + { + "epoch": 6.896551724137931, + "grad_norm": 1.3130041360855103, + "learning_rate": 0.0002, + "loss": 0.3217, + "step": 9500 + }, + { + "epoch": 6.903811252268603, + "grad_norm": 1.169256329536438, + "learning_rate": 0.0002, + "loss": 0.3738, + "step": 9510 + }, + { + "epoch": 6.911070780399274, + "grad_norm": 1.7475035190582275, + "learning_rate": 0.0002, + "loss": 0.408, + "step": 9520 + }, + { + "epoch": 6.918330308529946, + "grad_norm": 1.440434217453003, + "learning_rate": 0.0002, + "loss": 0.3407, + "step": 9530 + }, + { + "epoch": 6.925589836660617, + "grad_norm": 1.6768704652786255, + "learning_rate": 0.0002, + "loss": 0.3707, + "step": 9540 + }, + { + "epoch": 6.932849364791289, + "grad_norm": 1.3720577955245972, + "learning_rate": 0.0002, + "loss": 0.3283, + "step": 9550 + }, + { + "epoch": 6.94010889292196, + "grad_norm": 1.8140523433685303, + "learning_rate": 0.0002, + "loss": 0.3257, + "step": 9560 + }, + { + "epoch": 6.947368421052632, + "grad_norm": 1.1828241348266602, + "learning_rate": 0.0002, + "loss": 0.3308, + "step": 9570 + }, + { + "epoch": 6.954627949183303, + "grad_norm": 1.2755135297775269, + "learning_rate": 0.0002, + "loss": 0.3536, + "step": 9580 + }, + { + "epoch": 6.961887477313975, + "grad_norm": 1.622009038925171, + "learning_rate": 0.0002, + "loss": 0.3711, + "step": 9590 + }, + { + "epoch": 6.969147005444646, + "grad_norm": 1.1543664932250977, + "learning_rate": 0.0002, + "loss": 0.3529, + "step": 9600 + }, + { + "epoch": 6.976406533575318, + "grad_norm": 1.6755319833755493, + "learning_rate": 0.0002, + "loss": 0.416, + "step": 9610 + }, + { + "epoch": 6.983666061705989, + "grad_norm": 1.3726437091827393, + "learning_rate": 0.0002, + "loss": 0.339, + "step": 9620 + }, + { + "epoch": 6.99092558983666, + "grad_norm": 1.1605958938598633, + "learning_rate": 0.0002, + "loss": 0.3709, + "step": 9630 + }, + { + "epoch": 6.998185117967332, + "grad_norm": 1.5371781587600708, + "learning_rate": 0.0002, + "loss": 0.371, + "step": 9640 + }, + { + "epoch": 6.999637023593467, + "eval_loss": 1.6280181407928467, + "eval_runtime": 46.1964, + "eval_samples_per_second": 9.438, + "eval_steps_per_second": 1.191, + "step": 9642 + }, + { + "epoch": 7.005444646098003, + "grad_norm": 1.1645569801330566, + "learning_rate": 0.0002, + "loss": 0.2449, + "step": 9650 + }, + { + "epoch": 7.012704174228675, + "grad_norm": 0.7663792967796326, + "learning_rate": 0.0002, + "loss": 0.251, + "step": 9660 + }, + { + "epoch": 7.019963702359346, + "grad_norm": 1.5808782577514648, + "learning_rate": 0.0002, + "loss": 0.2553, + "step": 9670 + }, + { + "epoch": 7.027223230490018, + "grad_norm": 1.046607255935669, + "learning_rate": 0.0002, + "loss": 0.2349, + "step": 9680 + }, + { + "epoch": 7.0344827586206895, + "grad_norm": 1.2008668184280396, + "learning_rate": 0.0002, + "loss": 0.2078, + "step": 9690 + }, + { + "epoch": 7.0417422867513615, + "grad_norm": 1.9596126079559326, + "learning_rate": 0.0002, + "loss": 0.2519, + "step": 9700 + }, + { + "epoch": 7.049001814882033, + "grad_norm": 1.0400182008743286, + "learning_rate": 0.0002, + "loss": 0.2275, + "step": 9710 + }, + { + "epoch": 7.056261343012705, + "grad_norm": 1.3162504434585571, + "learning_rate": 0.0002, + "loss": 0.2136, + "step": 9720 + }, + { + "epoch": 7.063520871143376, + "grad_norm": 1.180074691772461, + "learning_rate": 0.0002, + "loss": 0.2227, + "step": 9730 + }, + { + "epoch": 7.070780399274047, + "grad_norm": 1.2093719244003296, + "learning_rate": 0.0002, + "loss": 0.2206, + "step": 9740 + }, + { + "epoch": 7.078039927404719, + "grad_norm": 1.4838900566101074, + "learning_rate": 0.0002, + "loss": 0.2387, + "step": 9750 + }, + { + "epoch": 7.08529945553539, + "grad_norm": 1.2319235801696777, + "learning_rate": 0.0002, + "loss": 0.2391, + "step": 9760 + }, + { + "epoch": 7.092558983666062, + "grad_norm": 1.2346558570861816, + "learning_rate": 0.0002, + "loss": 0.2624, + "step": 9770 + }, + { + "epoch": 7.099818511796733, + "grad_norm": 1.0748975276947021, + "learning_rate": 0.0002, + "loss": 0.2321, + "step": 9780 + }, + { + "epoch": 7.107078039927405, + "grad_norm": 1.0162630081176758, + "learning_rate": 0.0002, + "loss": 0.2751, + "step": 9790 + }, + { + "epoch": 7.114337568058076, + "grad_norm": 1.0014166831970215, + "learning_rate": 0.0002, + "loss": 0.2142, + "step": 9800 + }, + { + "epoch": 7.121597096188748, + "grad_norm": 1.0928411483764648, + "learning_rate": 0.0002, + "loss": 0.2439, + "step": 9810 + }, + { + "epoch": 7.128856624319419, + "grad_norm": 1.181496500968933, + "learning_rate": 0.0002, + "loss": 0.2261, + "step": 9820 + }, + { + "epoch": 7.136116152450091, + "grad_norm": 1.5846176147460938, + "learning_rate": 0.0002, + "loss": 0.2345, + "step": 9830 + }, + { + "epoch": 7.143375680580762, + "grad_norm": 0.8734912872314453, + "learning_rate": 0.0002, + "loss": 0.2282, + "step": 9840 + }, + { + "epoch": 7.150635208711433, + "grad_norm": 1.1599528789520264, + "learning_rate": 0.0002, + "loss": 0.2127, + "step": 9850 + }, + { + "epoch": 7.157894736842105, + "grad_norm": 1.168256402015686, + "learning_rate": 0.0002, + "loss": 0.2614, + "step": 9860 + }, + { + "epoch": 7.165154264972776, + "grad_norm": 1.4439860582351685, + "learning_rate": 0.0002, + "loss": 0.2418, + "step": 9870 + }, + { + "epoch": 7.172413793103448, + "grad_norm": 1.3615007400512695, + "learning_rate": 0.0002, + "loss": 0.2183, + "step": 9880 + }, + { + "epoch": 7.1796733212341195, + "grad_norm": 1.1908115148544312, + "learning_rate": 0.0002, + "loss": 0.2366, + "step": 9890 + }, + { + "epoch": 7.1869328493647915, + "grad_norm": 1.452515959739685, + "learning_rate": 0.0002, + "loss": 0.2338, + "step": 9900 + }, + { + "epoch": 7.194192377495463, + "grad_norm": 0.8387667536735535, + "learning_rate": 0.0002, + "loss": 0.2203, + "step": 9910 + }, + { + "epoch": 7.201451905626135, + "grad_norm": 1.3990435600280762, + "learning_rate": 0.0002, + "loss": 0.2117, + "step": 9920 + }, + { + "epoch": 7.208711433756806, + "grad_norm": 1.057800531387329, + "learning_rate": 0.0002, + "loss": 0.2188, + "step": 9930 + }, + { + "epoch": 7.215970961887478, + "grad_norm": 1.3718253374099731, + "learning_rate": 0.0002, + "loss": 0.2516, + "step": 9940 + }, + { + "epoch": 7.223230490018149, + "grad_norm": 1.2011432647705078, + "learning_rate": 0.0002, + "loss": 0.2084, + "step": 9950 + }, + { + "epoch": 7.230490018148821, + "grad_norm": 1.1608737707138062, + "learning_rate": 0.0002, + "loss": 0.2063, + "step": 9960 + }, + { + "epoch": 7.237749546279492, + "grad_norm": 1.7522791624069214, + "learning_rate": 0.0002, + "loss": 0.2275, + "step": 9970 + }, + { + "epoch": 7.245009074410163, + "grad_norm": 1.0787912607192993, + "learning_rate": 0.0002, + "loss": 0.2358, + "step": 9980 + }, + { + "epoch": 7.252268602540835, + "grad_norm": 1.8227689266204834, + "learning_rate": 0.0002, + "loss": 0.2361, + "step": 9990 + }, + { + "epoch": 7.259528130671506, + "grad_norm": 1.1438913345336914, + "learning_rate": 0.0002, + "loss": 0.2217, + "step": 10000 + }, + { + "epoch": 7.266787658802178, + "grad_norm": 1.331770420074463, + "learning_rate": 0.0002, + "loss": 0.2438, + "step": 10010 + }, + { + "epoch": 7.274047186932849, + "grad_norm": 1.2809056043624878, + "learning_rate": 0.0002, + "loss": 0.2622, + "step": 10020 + }, + { + "epoch": 7.281306715063521, + "grad_norm": 1.2245303392410278, + "learning_rate": 0.0002, + "loss": 0.2634, + "step": 10030 + }, + { + "epoch": 7.288566243194192, + "grad_norm": 1.2359435558319092, + "learning_rate": 0.0002, + "loss": 0.2706, + "step": 10040 + }, + { + "epoch": 7.295825771324864, + "grad_norm": 1.3707170486450195, + "learning_rate": 0.0002, + "loss": 0.2427, + "step": 10050 + }, + { + "epoch": 7.303085299455535, + "grad_norm": 1.7405836582183838, + "learning_rate": 0.0002, + "loss": 0.2497, + "step": 10060 + }, + { + "epoch": 7.310344827586207, + "grad_norm": 1.446069359779358, + "learning_rate": 0.0002, + "loss": 0.2628, + "step": 10070 + }, + { + "epoch": 7.317604355716878, + "grad_norm": 1.48823082447052, + "learning_rate": 0.0002, + "loss": 0.2457, + "step": 10080 + }, + { + "epoch": 7.3248638838475495, + "grad_norm": 1.1720311641693115, + "learning_rate": 0.0002, + "loss": 0.2391, + "step": 10090 + }, + { + "epoch": 7.3321234119782215, + "grad_norm": 1.5485225915908813, + "learning_rate": 0.0002, + "loss": 0.2701, + "step": 10100 + }, + { + "epoch": 7.339382940108893, + "grad_norm": 1.6018894910812378, + "learning_rate": 0.0002, + "loss": 0.2855, + "step": 10110 + }, + { + "epoch": 7.346642468239565, + "grad_norm": 1.4753694534301758, + "learning_rate": 0.0002, + "loss": 0.2662, + "step": 10120 + }, + { + "epoch": 7.353901996370236, + "grad_norm": 1.3604710102081299, + "learning_rate": 0.0002, + "loss": 0.2323, + "step": 10130 + }, + { + "epoch": 7.361161524500908, + "grad_norm": 1.5755873918533325, + "learning_rate": 0.0002, + "loss": 0.2664, + "step": 10140 + }, + { + "epoch": 7.368421052631579, + "grad_norm": 0.9421692490577698, + "learning_rate": 0.0002, + "loss": 0.215, + "step": 10150 + }, + { + "epoch": 7.375680580762251, + "grad_norm": 1.3055956363677979, + "learning_rate": 0.0002, + "loss": 0.297, + "step": 10160 + }, + { + "epoch": 7.382940108892922, + "grad_norm": 1.4764302968978882, + "learning_rate": 0.0002, + "loss": 0.2385, + "step": 10170 + }, + { + "epoch": 7.390199637023594, + "grad_norm": 1.3726946115493774, + "learning_rate": 0.0002, + "loss": 0.2724, + "step": 10180 + }, + { + "epoch": 7.397459165154265, + "grad_norm": 1.446473240852356, + "learning_rate": 0.0002, + "loss": 0.2599, + "step": 10190 + }, + { + "epoch": 7.404718693284936, + "grad_norm": 1.489094614982605, + "learning_rate": 0.0002, + "loss": 0.2837, + "step": 10200 + }, + { + "epoch": 7.411978221415608, + "grad_norm": 1.247572898864746, + "learning_rate": 0.0002, + "loss": 0.2786, + "step": 10210 + }, + { + "epoch": 7.419237749546279, + "grad_norm": 1.2741918563842773, + "learning_rate": 0.0002, + "loss": 0.2498, + "step": 10220 + }, + { + "epoch": 7.426497277676951, + "grad_norm": 1.0347636938095093, + "learning_rate": 0.0002, + "loss": 0.2649, + "step": 10230 + }, + { + "epoch": 7.433756805807622, + "grad_norm": 1.3295499086380005, + "learning_rate": 0.0002, + "loss": 0.2419, + "step": 10240 + }, + { + "epoch": 7.441016333938294, + "grad_norm": 1.6056840419769287, + "learning_rate": 0.0002, + "loss": 0.226, + "step": 10250 + }, + { + "epoch": 7.448275862068965, + "grad_norm": 1.4824398756027222, + "learning_rate": 0.0002, + "loss": 0.2571, + "step": 10260 + }, + { + "epoch": 7.455535390199637, + "grad_norm": 1.6259359121322632, + "learning_rate": 0.0002, + "loss": 0.2939, + "step": 10270 + }, + { + "epoch": 7.462794918330308, + "grad_norm": 1.5065499544143677, + "learning_rate": 0.0002, + "loss": 0.2873, + "step": 10280 + }, + { + "epoch": 7.47005444646098, + "grad_norm": 1.3505364656448364, + "learning_rate": 0.0002, + "loss": 0.2614, + "step": 10290 + }, + { + "epoch": 7.4773139745916515, + "grad_norm": 1.4457359313964844, + "learning_rate": 0.0002, + "loss": 0.2862, + "step": 10300 + }, + { + "epoch": 7.4845735027223235, + "grad_norm": 1.0782662630081177, + "learning_rate": 0.0002, + "loss": 0.233, + "step": 10310 + }, + { + "epoch": 7.491833030852995, + "grad_norm": 1.4209016561508179, + "learning_rate": 0.0002, + "loss": 0.2506, + "step": 10320 + }, + { + "epoch": 7.499092558983666, + "grad_norm": 1.4511336088180542, + "learning_rate": 0.0002, + "loss": 0.2741, + "step": 10330 + }, + { + "epoch": 7.506352087114338, + "grad_norm": 1.306691288948059, + "learning_rate": 0.0002, + "loss": 0.2736, + "step": 10340 + }, + { + "epoch": 7.513611615245009, + "grad_norm": 1.0647870302200317, + "learning_rate": 0.0002, + "loss": 0.2528, + "step": 10350 + }, + { + "epoch": 7.520871143375681, + "grad_norm": 1.0374330282211304, + "learning_rate": 0.0002, + "loss": 0.2402, + "step": 10360 + }, + { + "epoch": 7.528130671506352, + "grad_norm": 0.8428803086280823, + "learning_rate": 0.0002, + "loss": 0.3057, + "step": 10370 + }, + { + "epoch": 7.535390199637024, + "grad_norm": 1.3868707418441772, + "learning_rate": 0.0002, + "loss": 0.2446, + "step": 10380 + }, + { + "epoch": 7.542649727767695, + "grad_norm": 1.4324088096618652, + "learning_rate": 0.0002, + "loss": 0.2851, + "step": 10390 + }, + { + "epoch": 7.549909255898367, + "grad_norm": 1.6413776874542236, + "learning_rate": 0.0002, + "loss": 0.292, + "step": 10400 + }, + { + "epoch": 7.557168784029038, + "grad_norm": 1.4302188158035278, + "learning_rate": 0.0002, + "loss": 0.2585, + "step": 10410 + }, + { + "epoch": 7.564428312159709, + "grad_norm": 1.3648983240127563, + "learning_rate": 0.0002, + "loss": 0.306, + "step": 10420 + }, + { + "epoch": 7.571687840290381, + "grad_norm": 1.4480061531066895, + "learning_rate": 0.0002, + "loss": 0.2927, + "step": 10430 + }, + { + "epoch": 7.578947368421053, + "grad_norm": 1.0944541692733765, + "learning_rate": 0.0002, + "loss": 0.2711, + "step": 10440 + }, + { + "epoch": 7.586206896551724, + "grad_norm": 1.4632091522216797, + "learning_rate": 0.0002, + "loss": 0.2698, + "step": 10450 + }, + { + "epoch": 7.593466424682395, + "grad_norm": 1.562364935874939, + "learning_rate": 0.0002, + "loss": 0.2994, + "step": 10460 + }, + { + "epoch": 7.600725952813067, + "grad_norm": 1.2138582468032837, + "learning_rate": 0.0002, + "loss": 0.271, + "step": 10470 + }, + { + "epoch": 7.607985480943738, + "grad_norm": 1.467578411102295, + "learning_rate": 0.0002, + "loss": 0.2638, + "step": 10480 + }, + { + "epoch": 7.61524500907441, + "grad_norm": 1.3470213413238525, + "learning_rate": 0.0002, + "loss": 0.2949, + "step": 10490 + }, + { + "epoch": 7.6225045372050815, + "grad_norm": 1.5385268926620483, + "learning_rate": 0.0002, + "loss": 0.2518, + "step": 10500 + }, + { + "epoch": 7.6297640653357535, + "grad_norm": 1.1245018243789673, + "learning_rate": 0.0002, + "loss": 0.2527, + "step": 10510 + }, + { + "epoch": 7.637023593466425, + "grad_norm": 1.3161317110061646, + "learning_rate": 0.0002, + "loss": 0.267, + "step": 10520 + }, + { + "epoch": 7.6442831215970966, + "grad_norm": 1.0402427911758423, + "learning_rate": 0.0002, + "loss": 0.2803, + "step": 10530 + }, + { + "epoch": 7.651542649727768, + "grad_norm": 1.2699987888336182, + "learning_rate": 0.0002, + "loss": 0.2676, + "step": 10540 + }, + { + "epoch": 7.658802177858439, + "grad_norm": 1.47243332862854, + "learning_rate": 0.0002, + "loss": 0.2835, + "step": 10550 + }, + { + "epoch": 7.666061705989111, + "grad_norm": 1.1261144876480103, + "learning_rate": 0.0002, + "loss": 0.2624, + "step": 10560 + }, + { + "epoch": 7.673321234119782, + "grad_norm": 1.5402237176895142, + "learning_rate": 0.0002, + "loss": 0.282, + "step": 10570 + }, + { + "epoch": 7.680580762250454, + "grad_norm": 1.1316986083984375, + "learning_rate": 0.0002, + "loss": 0.2866, + "step": 10580 + }, + { + "epoch": 7.687840290381125, + "grad_norm": 1.2155439853668213, + "learning_rate": 0.0002, + "loss": 0.2593, + "step": 10590 + }, + { + "epoch": 7.695099818511797, + "grad_norm": 1.566380500793457, + "learning_rate": 0.0002, + "loss": 0.274, + "step": 10600 + }, + { + "epoch": 7.702359346642468, + "grad_norm": 1.7367318868637085, + "learning_rate": 0.0002, + "loss": 0.2664, + "step": 10610 + }, + { + "epoch": 7.70961887477314, + "grad_norm": 1.5213567018508911, + "learning_rate": 0.0002, + "loss": 0.3115, + "step": 10620 + }, + { + "epoch": 7.716878402903811, + "grad_norm": 1.3955585956573486, + "learning_rate": 0.0002, + "loss": 0.2863, + "step": 10630 + }, + { + "epoch": 7.724137931034483, + "grad_norm": 1.321916937828064, + "learning_rate": 0.0002, + "loss": 0.2683, + "step": 10640 + }, + { + "epoch": 7.731397459165154, + "grad_norm": 1.8494919538497925, + "learning_rate": 0.0002, + "loss": 0.306, + "step": 10650 + }, + { + "epoch": 7.738656987295826, + "grad_norm": 1.5309828519821167, + "learning_rate": 0.0002, + "loss": 0.2544, + "step": 10660 + }, + { + "epoch": 7.745916515426497, + "grad_norm": 1.3796069622039795, + "learning_rate": 0.0002, + "loss": 0.2693, + "step": 10670 + }, + { + "epoch": 7.753176043557168, + "grad_norm": 1.2416858673095703, + "learning_rate": 0.0002, + "loss": 0.2872, + "step": 10680 + }, + { + "epoch": 7.76043557168784, + "grad_norm": 1.4447332620620728, + "learning_rate": 0.0002, + "loss": 0.2729, + "step": 10690 + }, + { + "epoch": 7.7676950998185115, + "grad_norm": 1.2003352642059326, + "learning_rate": 0.0002, + "loss": 0.2736, + "step": 10700 + }, + { + "epoch": 7.7749546279491835, + "grad_norm": 1.3607908487319946, + "learning_rate": 0.0002, + "loss": 0.2771, + "step": 10710 + }, + { + "epoch": 7.782214156079855, + "grad_norm": 1.1789227724075317, + "learning_rate": 0.0002, + "loss": 0.2739, + "step": 10720 + }, + { + "epoch": 7.7894736842105265, + "grad_norm": 1.2998148202896118, + "learning_rate": 0.0002, + "loss": 0.2927, + "step": 10730 + }, + { + "epoch": 7.796733212341198, + "grad_norm": 1.8224656581878662, + "learning_rate": 0.0002, + "loss": 0.2825, + "step": 10740 + }, + { + "epoch": 7.80399274047187, + "grad_norm": 1.2510570287704468, + "learning_rate": 0.0002, + "loss": 0.2864, + "step": 10750 + }, + { + "epoch": 7.811252268602541, + "grad_norm": 1.065926194190979, + "learning_rate": 0.0002, + "loss": 0.3007, + "step": 10760 + }, + { + "epoch": 7.818511796733213, + "grad_norm": 1.0313589572906494, + "learning_rate": 0.0002, + "loss": 0.277, + "step": 10770 + }, + { + "epoch": 7.825771324863884, + "grad_norm": 1.10769784450531, + "learning_rate": 0.0002, + "loss": 0.2954, + "step": 10780 + }, + { + "epoch": 7.833030852994556, + "grad_norm": 1.4168727397918701, + "learning_rate": 0.0002, + "loss": 0.2893, + "step": 10790 + }, + { + "epoch": 7.840290381125227, + "grad_norm": 1.8239266872406006, + "learning_rate": 0.0002, + "loss": 0.2903, + "step": 10800 + }, + { + "epoch": 7.847549909255898, + "grad_norm": 1.5748721361160278, + "learning_rate": 0.0002, + "loss": 0.2573, + "step": 10810 + }, + { + "epoch": 7.85480943738657, + "grad_norm": 1.5762766599655151, + "learning_rate": 0.0002, + "loss": 0.3282, + "step": 10820 + }, + { + "epoch": 7.862068965517241, + "grad_norm": 1.1119135618209839, + "learning_rate": 0.0002, + "loss": 0.2981, + "step": 10830 + }, + { + "epoch": 7.869328493647913, + "grad_norm": 1.478314995765686, + "learning_rate": 0.0002, + "loss": 0.3037, + "step": 10840 + }, + { + "epoch": 7.876588021778584, + "grad_norm": 1.2225514650344849, + "learning_rate": 0.0002, + "loss": 0.2866, + "step": 10850 + }, + { + "epoch": 7.883847549909256, + "grad_norm": 1.503473162651062, + "learning_rate": 0.0002, + "loss": 0.2795, + "step": 10860 + }, + { + "epoch": 7.891107078039927, + "grad_norm": 1.0334484577178955, + "learning_rate": 0.0002, + "loss": 0.2732, + "step": 10870 + }, + { + "epoch": 7.898366606170599, + "grad_norm": 1.2068367004394531, + "learning_rate": 0.0002, + "loss": 0.3206, + "step": 10880 + }, + { + "epoch": 7.90562613430127, + "grad_norm": 1.3105504512786865, + "learning_rate": 0.0002, + "loss": 0.2936, + "step": 10890 + }, + { + "epoch": 7.9128856624319415, + "grad_norm": 1.2941272258758545, + "learning_rate": 0.0002, + "loss": 0.3063, + "step": 10900 + }, + { + "epoch": 7.9201451905626135, + "grad_norm": 1.2809823751449585, + "learning_rate": 0.0002, + "loss": 0.2862, + "step": 10910 + }, + { + "epoch": 7.927404718693285, + "grad_norm": 1.5362727642059326, + "learning_rate": 0.0002, + "loss": 0.3202, + "step": 10920 + }, + { + "epoch": 7.9346642468239565, + "grad_norm": 1.5019413232803345, + "learning_rate": 0.0002, + "loss": 0.3005, + "step": 10930 + }, + { + "epoch": 7.941923774954628, + "grad_norm": 1.5947920083999634, + "learning_rate": 0.0002, + "loss": 0.2976, + "step": 10940 + }, + { + "epoch": 7.9491833030853, + "grad_norm": 1.9482372999191284, + "learning_rate": 0.0002, + "loss": 0.2976, + "step": 10950 + }, + { + "epoch": 7.956442831215971, + "grad_norm": 1.8445630073547363, + "learning_rate": 0.0002, + "loss": 0.3297, + "step": 10960 + }, + { + "epoch": 7.963702359346643, + "grad_norm": 1.4342153072357178, + "learning_rate": 0.0002, + "loss": 0.2843, + "step": 10970 + }, + { + "epoch": 7.970961887477314, + "grad_norm": 1.3202505111694336, + "learning_rate": 0.0002, + "loss": 0.3066, + "step": 10980 + }, + { + "epoch": 7.978221415607986, + "grad_norm": 1.186015009880066, + "learning_rate": 0.0002, + "loss": 0.2785, + "step": 10990 + }, + { + "epoch": 7.985480943738657, + "grad_norm": 1.2714571952819824, + "learning_rate": 0.0002, + "loss": 0.324, + "step": 11000 + }, + { + "epoch": 7.992740471869329, + "grad_norm": 0.9723673462867737, + "learning_rate": 0.0002, + "loss": 0.2795, + "step": 11010 + }, + { + "epoch": 7.997096188747731, + "eval_loss": 1.7563573122024536, + "eval_runtime": 46.2097, + "eval_samples_per_second": 9.435, + "eval_steps_per_second": 1.19, + "step": 11016 + } + ], + "logging_steps": 10, + "max_steps": 11016, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.6582302183981056e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-11016/training_args.bin b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-11016/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..140bceabdec44fc473842cb9ace4a62a4dc9fe89 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-11016/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4abec5480aff960b5a7b03a48be1cdb1ae29ba8792469c451f5c2f269e4be93b +size 5560 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377/README.md b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377/adapter_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377/adapter_model.safetensors b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..50be09609fd83716946490443d34010d2770b093 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94721f474e700335b4e7d287e4ace795275fe6ce10243090004d47de0d72cc6f +size 143153376 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377/optimizer.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b21fe839855af6807a65cfe584b243175fdf6e2 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff779cf41c9d8ba5d9c69b4f9dab3d83650430b82da4bc81d777594ea2625626 +size 72886650 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377/rng_state.pth b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a1dd6057ad2a35105ac4f8a2d7298a68767ebd70 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c0c855d1585015f23e10dfd2b87b895f7082d0deb7de58a335a2c4c760766ac +size 14244 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377/scheduler.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e248b8433174e9b822c1813f26e4a98349a0b55 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fc8e94bd1f293c364bb03428d8b8836c3baea7e161d96b587666edd284ae6bd +size 1064 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377/special_tokens_map.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377/tokenizer.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377/tokenizer.model b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377/tokenizer_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377/trainer_state.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..44bc92c6a39237e8538a7b3d9491d5cd66170d7f --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377/trainer_state.json @@ -0,0 +1,1000 @@ +{ + "best_metric": 1.1381088495254517, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377", + "epoch": 0.9996370235934664, + "eval_steps": 10, + "global_step": 1377, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007259528130671506, + "grad_norm": 0.46010470390319824, + "learning_rate": 0.0002, + "loss": 2.4936, + "step": 10 + }, + { + "epoch": 0.014519056261343012, + "grad_norm": 0.5103179216384888, + "learning_rate": 0.0002, + "loss": 1.8442, + "step": 20 + }, + { + "epoch": 0.021778584392014518, + "grad_norm": 0.6282716989517212, + "learning_rate": 0.0002, + "loss": 1.559, + "step": 30 + }, + { + "epoch": 0.029038112522686024, + "grad_norm": 1.2480497360229492, + "learning_rate": 0.0002, + "loss": 1.3618, + "step": 40 + }, + { + "epoch": 0.036297640653357534, + "grad_norm": 0.4114855229854584, + "learning_rate": 0.0002, + "loss": 1.3874, + "step": 50 + }, + { + "epoch": 0.043557168784029036, + "grad_norm": 0.49482840299606323, + "learning_rate": 0.0002, + "loss": 1.2836, + "step": 60 + }, + { + "epoch": 0.050816696914700546, + "grad_norm": 0.4536272883415222, + "learning_rate": 0.0002, + "loss": 1.1728, + "step": 70 + }, + { + "epoch": 0.05807622504537205, + "grad_norm": 0.32328274846076965, + "learning_rate": 0.0002, + "loss": 1.1125, + "step": 80 + }, + { + "epoch": 0.06533575317604355, + "grad_norm": 0.40990468859672546, + "learning_rate": 0.0002, + "loss": 1.2447, + "step": 90 + }, + { + "epoch": 0.07259528130671507, + "grad_norm": 0.37273502349853516, + "learning_rate": 0.0002, + "loss": 1.2216, + "step": 100 + }, + { + "epoch": 0.07985480943738657, + "grad_norm": 0.3903871476650238, + "learning_rate": 0.0002, + "loss": 1.2259, + "step": 110 + }, + { + "epoch": 0.08711433756805807, + "grad_norm": 0.3272787928581238, + "learning_rate": 0.0002, + "loss": 1.3718, + "step": 120 + }, + { + "epoch": 0.09437386569872959, + "grad_norm": 0.3622824251651764, + "learning_rate": 0.0002, + "loss": 1.1832, + "step": 130 + }, + { + "epoch": 0.10163339382940109, + "grad_norm": 0.3503916561603546, + "learning_rate": 0.0002, + "loss": 1.2966, + "step": 140 + }, + { + "epoch": 0.1088929219600726, + "grad_norm": 0.32787832617759705, + "learning_rate": 0.0002, + "loss": 1.0913, + "step": 150 + }, + { + "epoch": 0.1161524500907441, + "grad_norm": 0.5822657942771912, + "learning_rate": 0.0002, + "loss": 1.2143, + "step": 160 + }, + { + "epoch": 0.12341197822141561, + "grad_norm": 0.28028249740600586, + "learning_rate": 0.0002, + "loss": 1.2091, + "step": 170 + }, + { + "epoch": 0.1306715063520871, + "grad_norm": 0.33602750301361084, + "learning_rate": 0.0002, + "loss": 1.1731, + "step": 180 + }, + { + "epoch": 0.13793103448275862, + "grad_norm": 0.29106274247169495, + "learning_rate": 0.0002, + "loss": 1.127, + "step": 190 + }, + { + "epoch": 0.14519056261343014, + "grad_norm": 0.38753363490104675, + "learning_rate": 0.0002, + "loss": 1.3261, + "step": 200 + }, + { + "epoch": 0.15245009074410162, + "grad_norm": 0.361009418964386, + "learning_rate": 0.0002, + "loss": 1.1536, + "step": 210 + }, + { + "epoch": 0.15970961887477314, + "grad_norm": 0.6743836402893066, + "learning_rate": 0.0002, + "loss": 1.1483, + "step": 220 + }, + { + "epoch": 0.16696914700544466, + "grad_norm": 0.3923613727092743, + "learning_rate": 0.0002, + "loss": 1.264, + "step": 230 + }, + { + "epoch": 0.17422867513611615, + "grad_norm": 0.2809699773788452, + "learning_rate": 0.0002, + "loss": 1.0437, + "step": 240 + }, + { + "epoch": 0.18148820326678766, + "grad_norm": 0.3631494641304016, + "learning_rate": 0.0002, + "loss": 1.1826, + "step": 250 + }, + { + "epoch": 0.18874773139745918, + "grad_norm": 0.24658171832561493, + "learning_rate": 0.0002, + "loss": 1.1584, + "step": 260 + }, + { + "epoch": 0.19600725952813067, + "grad_norm": 0.5780664682388306, + "learning_rate": 0.0002, + "loss": 1.0065, + "step": 270 + }, + { + "epoch": 0.20326678765880218, + "grad_norm": 0.3056720495223999, + "learning_rate": 0.0002, + "loss": 1.1251, + "step": 280 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 0.2857084572315216, + "learning_rate": 0.0002, + "loss": 1.1503, + "step": 290 + }, + { + "epoch": 0.2177858439201452, + "grad_norm": 0.6645345687866211, + "learning_rate": 0.0002, + "loss": 1.1215, + "step": 300 + }, + { + "epoch": 0.2250453720508167, + "grad_norm": 0.5966078639030457, + "learning_rate": 0.0002, + "loss": 1.127, + "step": 310 + }, + { + "epoch": 0.2323049001814882, + "grad_norm": 0.40937140583992004, + "learning_rate": 0.0002, + "loss": 1.1974, + "step": 320 + }, + { + "epoch": 0.2395644283121597, + "grad_norm": 0.5642806887626648, + "learning_rate": 0.0002, + "loss": 1.2385, + "step": 330 + }, + { + "epoch": 0.24682395644283123, + "grad_norm": 0.2750748097896576, + "learning_rate": 0.0002, + "loss": 1.1703, + "step": 340 + }, + { + "epoch": 0.2540834845735027, + "grad_norm": 0.34350234270095825, + "learning_rate": 0.0002, + "loss": 1.2598, + "step": 350 + }, + { + "epoch": 0.2613430127041742, + "grad_norm": 0.6767239570617676, + "learning_rate": 0.0002, + "loss": 1.1942, + "step": 360 + }, + { + "epoch": 0.26860254083484575, + "grad_norm": 0.31006959080696106, + "learning_rate": 0.0002, + "loss": 1.1436, + "step": 370 + }, + { + "epoch": 0.27586206896551724, + "grad_norm": 0.3825474679470062, + "learning_rate": 0.0002, + "loss": 1.2084, + "step": 380 + }, + { + "epoch": 0.2831215970961887, + "grad_norm": 0.30241551995277405, + "learning_rate": 0.0002, + "loss": 1.1523, + "step": 390 + }, + { + "epoch": 0.29038112522686027, + "grad_norm": 0.2962397336959839, + "learning_rate": 0.0002, + "loss": 1.1298, + "step": 400 + }, + { + "epoch": 0.29764065335753176, + "grad_norm": 0.2600369155406952, + "learning_rate": 0.0002, + "loss": 1.1299, + "step": 410 + }, + { + "epoch": 0.30490018148820325, + "grad_norm": 0.3675060272216797, + "learning_rate": 0.0002, + "loss": 1.1366, + "step": 420 + }, + { + "epoch": 0.3121597096188748, + "grad_norm": 0.3429498076438904, + "learning_rate": 0.0002, + "loss": 1.156, + "step": 430 + }, + { + "epoch": 0.3194192377495463, + "grad_norm": 0.34311825037002563, + "learning_rate": 0.0002, + "loss": 1.2741, + "step": 440 + }, + { + "epoch": 0.32667876588021777, + "grad_norm": 0.37872210144996643, + "learning_rate": 0.0002, + "loss": 1.3523, + "step": 450 + }, + { + "epoch": 0.3339382940108893, + "grad_norm": 0.33271121978759766, + "learning_rate": 0.0002, + "loss": 1.1365, + "step": 460 + }, + { + "epoch": 0.3411978221415608, + "grad_norm": 0.34605276584625244, + "learning_rate": 0.0002, + "loss": 1.1572, + "step": 470 + }, + { + "epoch": 0.3484573502722323, + "grad_norm": 0.41050562262535095, + "learning_rate": 0.0002, + "loss": 1.2755, + "step": 480 + }, + { + "epoch": 0.35571687840290384, + "grad_norm": 0.2066836953163147, + "learning_rate": 0.0002, + "loss": 1.075, + "step": 490 + }, + { + "epoch": 0.3629764065335753, + "grad_norm": 0.2859014868736267, + "learning_rate": 0.0002, + "loss": 1.2072, + "step": 500 + }, + { + "epoch": 0.3702359346642468, + "grad_norm": 0.28763777017593384, + "learning_rate": 0.0002, + "loss": 1.1435, + "step": 510 + }, + { + "epoch": 0.37749546279491836, + "grad_norm": 0.2730471193790436, + "learning_rate": 0.0002, + "loss": 1.1341, + "step": 520 + }, + { + "epoch": 0.38475499092558985, + "grad_norm": 0.3968936800956726, + "learning_rate": 0.0002, + "loss": 1.348, + "step": 530 + }, + { + "epoch": 0.39201451905626133, + "grad_norm": 0.3624701201915741, + "learning_rate": 0.0002, + "loss": 1.2363, + "step": 540 + }, + { + "epoch": 0.3992740471869328, + "grad_norm": 0.3303608298301697, + "learning_rate": 0.0002, + "loss": 1.1206, + "step": 550 + }, + { + "epoch": 0.40653357531760437, + "grad_norm": 0.33507466316223145, + "learning_rate": 0.0002, + "loss": 1.203, + "step": 560 + }, + { + "epoch": 0.41379310344827586, + "grad_norm": 0.3297670781612396, + "learning_rate": 0.0002, + "loss": 1.2294, + "step": 570 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 0.32334890961647034, + "learning_rate": 0.0002, + "loss": 1.2, + "step": 580 + }, + { + "epoch": 0.4283121597096189, + "grad_norm": 0.30281195044517517, + "learning_rate": 0.0002, + "loss": 1.112, + "step": 590 + }, + { + "epoch": 0.4355716878402904, + "grad_norm": 0.5900027751922607, + "learning_rate": 0.0002, + "loss": 1.1352, + "step": 600 + }, + { + "epoch": 0.44283121597096187, + "grad_norm": 0.28437477350234985, + "learning_rate": 0.0002, + "loss": 1.1575, + "step": 610 + }, + { + "epoch": 0.4500907441016334, + "grad_norm": 0.39601704478263855, + "learning_rate": 0.0002, + "loss": 1.2009, + "step": 620 + }, + { + "epoch": 0.4573502722323049, + "grad_norm": 0.41971510648727417, + "learning_rate": 0.0002, + "loss": 1.2302, + "step": 630 + }, + { + "epoch": 0.4646098003629764, + "grad_norm": 0.33814409375190735, + "learning_rate": 0.0002, + "loss": 1.236, + "step": 640 + }, + { + "epoch": 0.47186932849364793, + "grad_norm": 0.575718104839325, + "learning_rate": 0.0002, + "loss": 1.2471, + "step": 650 + }, + { + "epoch": 0.4791288566243194, + "grad_norm": 0.37927401065826416, + "learning_rate": 0.0002, + "loss": 1.1012, + "step": 660 + }, + { + "epoch": 0.4863883847549909, + "grad_norm": 0.3224332630634308, + "learning_rate": 0.0002, + "loss": 1.1552, + "step": 670 + }, + { + "epoch": 0.49364791288566245, + "grad_norm": 0.32683515548706055, + "learning_rate": 0.0002, + "loss": 1.1354, + "step": 680 + }, + { + "epoch": 0.5009074410163339, + "grad_norm": 0.4316163659095764, + "learning_rate": 0.0002, + "loss": 1.207, + "step": 690 + }, + { + "epoch": 0.5081669691470054, + "grad_norm": 0.342602401971817, + "learning_rate": 0.0002, + "loss": 1.2116, + "step": 700 + }, + { + "epoch": 0.515426497277677, + "grad_norm": 0.2794898748397827, + "learning_rate": 0.0002, + "loss": 1.0433, + "step": 710 + }, + { + "epoch": 0.5226860254083484, + "grad_norm": 0.3322339951992035, + "learning_rate": 0.0002, + "loss": 1.2395, + "step": 720 + }, + { + "epoch": 0.52994555353902, + "grad_norm": 0.3088509142398834, + "learning_rate": 0.0002, + "loss": 1.0762, + "step": 730 + }, + { + "epoch": 0.5372050816696915, + "grad_norm": 0.24444378912448883, + "learning_rate": 0.0002, + "loss": 1.1495, + "step": 740 + }, + { + "epoch": 0.5444646098003629, + "grad_norm": 0.3483171761035919, + "learning_rate": 0.0002, + "loss": 1.2176, + "step": 750 + }, + { + "epoch": 0.5517241379310345, + "grad_norm": 0.3425690531730652, + "learning_rate": 0.0002, + "loss": 1.2248, + "step": 760 + }, + { + "epoch": 0.558983666061706, + "grad_norm": 0.31841927766799927, + "learning_rate": 0.0002, + "loss": 1.1127, + "step": 770 + }, + { + "epoch": 0.5662431941923775, + "grad_norm": 0.39423868060112, + "learning_rate": 0.0002, + "loss": 1.1841, + "step": 780 + }, + { + "epoch": 0.573502722323049, + "grad_norm": 0.30328479409217834, + "learning_rate": 0.0002, + "loss": 1.0583, + "step": 790 + }, + { + "epoch": 0.5807622504537205, + "grad_norm": 0.24475938081741333, + "learning_rate": 0.0002, + "loss": 1.0718, + "step": 800 + }, + { + "epoch": 0.588021778584392, + "grad_norm": 0.37132805585861206, + "learning_rate": 0.0002, + "loss": 1.2006, + "step": 810 + }, + { + "epoch": 0.5952813067150635, + "grad_norm": 0.32195979356765747, + "learning_rate": 0.0002, + "loss": 1.1544, + "step": 820 + }, + { + "epoch": 0.6025408348457351, + "grad_norm": 0.2848738729953766, + "learning_rate": 0.0002, + "loss": 0.9937, + "step": 830 + }, + { + "epoch": 0.6098003629764065, + "grad_norm": 0.28015264868736267, + "learning_rate": 0.0002, + "loss": 1.0937, + "step": 840 + }, + { + "epoch": 0.617059891107078, + "grad_norm": 0.37796008586883545, + "learning_rate": 0.0002, + "loss": 1.1234, + "step": 850 + }, + { + "epoch": 0.6243194192377496, + "grad_norm": 0.39311841130256653, + "learning_rate": 0.0002, + "loss": 1.1455, + "step": 860 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 0.2761685252189636, + "learning_rate": 0.0002, + "loss": 1.1066, + "step": 870 + }, + { + "epoch": 0.6388384754990926, + "grad_norm": 0.3826720118522644, + "learning_rate": 0.0002, + "loss": 1.0859, + "step": 880 + }, + { + "epoch": 0.6460980036297641, + "grad_norm": 0.30076679587364197, + "learning_rate": 0.0002, + "loss": 1.1182, + "step": 890 + }, + { + "epoch": 0.6533575317604355, + "grad_norm": 0.21997687220573425, + "learning_rate": 0.0002, + "loss": 1.0927, + "step": 900 + }, + { + "epoch": 0.6606170598911071, + "grad_norm": 0.32593777775764465, + "learning_rate": 0.0002, + "loss": 1.1603, + "step": 910 + }, + { + "epoch": 0.6678765880217786, + "grad_norm": 0.30347898602485657, + "learning_rate": 0.0002, + "loss": 1.1578, + "step": 920 + }, + { + "epoch": 0.6751361161524501, + "grad_norm": 0.44173774123191833, + "learning_rate": 0.0002, + "loss": 1.1642, + "step": 930 + }, + { + "epoch": 0.6823956442831216, + "grad_norm": 0.2507467269897461, + "learning_rate": 0.0002, + "loss": 1.1832, + "step": 940 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 0.29463833570480347, + "learning_rate": 0.0002, + "loss": 1.3001, + "step": 950 + }, + { + "epoch": 0.6969147005444646, + "grad_norm": 0.9363154172897339, + "learning_rate": 0.0002, + "loss": 1.1651, + "step": 960 + }, + { + "epoch": 0.7041742286751361, + "grad_norm": 0.3236212134361267, + "learning_rate": 0.0002, + "loss": 1.158, + "step": 970 + }, + { + "epoch": 0.7114337568058077, + "grad_norm": 0.3123254179954529, + "learning_rate": 0.0002, + "loss": 1.1078, + "step": 980 + }, + { + "epoch": 0.7186932849364791, + "grad_norm": 0.3395805060863495, + "learning_rate": 0.0002, + "loss": 1.1578, + "step": 990 + }, + { + "epoch": 0.7259528130671506, + "grad_norm": 0.3240964412689209, + "learning_rate": 0.0002, + "loss": 1.0754, + "step": 1000 + }, + { + "epoch": 0.7332123411978222, + "grad_norm": 0.31902948021888733, + "learning_rate": 0.0002, + "loss": 1.0313, + "step": 1010 + }, + { + "epoch": 0.7404718693284936, + "grad_norm": 0.4848408102989197, + "learning_rate": 0.0002, + "loss": 1.1664, + "step": 1020 + }, + { + "epoch": 0.7477313974591652, + "grad_norm": 0.33006033301353455, + "learning_rate": 0.0002, + "loss": 1.0838, + "step": 1030 + }, + { + "epoch": 0.7549909255898367, + "grad_norm": 0.2928730547428131, + "learning_rate": 0.0002, + "loss": 1.0979, + "step": 1040 + }, + { + "epoch": 0.7622504537205081, + "grad_norm": 0.3529164791107178, + "learning_rate": 0.0002, + "loss": 1.1351, + "step": 1050 + }, + { + "epoch": 0.7695099818511797, + "grad_norm": 0.2736213803291321, + "learning_rate": 0.0002, + "loss": 1.1274, + "step": 1060 + }, + { + "epoch": 0.7767695099818511, + "grad_norm": 0.7200686931610107, + "learning_rate": 0.0002, + "loss": 1.1169, + "step": 1070 + }, + { + "epoch": 0.7840290381125227, + "grad_norm": 0.33396708965301514, + "learning_rate": 0.0002, + "loss": 1.1171, + "step": 1080 + }, + { + "epoch": 0.7912885662431942, + "grad_norm": 1.5760449171066284, + "learning_rate": 0.0002, + "loss": 1.2447, + "step": 1090 + }, + { + "epoch": 0.7985480943738656, + "grad_norm": 0.28138381242752075, + "learning_rate": 0.0002, + "loss": 1.1126, + "step": 1100 + }, + { + "epoch": 0.8058076225045372, + "grad_norm": 0.2597472369670868, + "learning_rate": 0.0002, + "loss": 1.2012, + "step": 1110 + }, + { + "epoch": 0.8130671506352087, + "grad_norm": 0.3305445611476898, + "learning_rate": 0.0002, + "loss": 1.1177, + "step": 1120 + }, + { + "epoch": 0.8203266787658802, + "grad_norm": 0.3934599459171295, + "learning_rate": 0.0002, + "loss": 1.0849, + "step": 1130 + }, + { + "epoch": 0.8275862068965517, + "grad_norm": 0.3472191393375397, + "learning_rate": 0.0002, + "loss": 1.0889, + "step": 1140 + }, + { + "epoch": 0.8348457350272233, + "grad_norm": 0.2857365906238556, + "learning_rate": 0.0002, + "loss": 1.265, + "step": 1150 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.3207702934741974, + "learning_rate": 0.0002, + "loss": 1.03, + "step": 1160 + }, + { + "epoch": 0.8493647912885662, + "grad_norm": 0.3176484704017639, + "learning_rate": 0.0002, + "loss": 1.1669, + "step": 1170 + }, + { + "epoch": 0.8566243194192378, + "grad_norm": 0.40685558319091797, + "learning_rate": 0.0002, + "loss": 1.1386, + "step": 1180 + }, + { + "epoch": 0.8638838475499092, + "grad_norm": 0.31125199794769287, + "learning_rate": 0.0002, + "loss": 1.1383, + "step": 1190 + }, + { + "epoch": 0.8711433756805808, + "grad_norm": 0.7361181378364563, + "learning_rate": 0.0002, + "loss": 1.1512, + "step": 1200 + }, + { + "epoch": 0.8784029038112523, + "grad_norm": 0.33699527382850647, + "learning_rate": 0.0002, + "loss": 0.9983, + "step": 1210 + }, + { + "epoch": 0.8856624319419237, + "grad_norm": 0.3315220773220062, + "learning_rate": 0.0002, + "loss": 1.11, + "step": 1220 + }, + { + "epoch": 0.8929219600725953, + "grad_norm": 0.6256054043769836, + "learning_rate": 0.0002, + "loss": 1.0266, + "step": 1230 + }, + { + "epoch": 0.9001814882032668, + "grad_norm": 0.3692137897014618, + "learning_rate": 0.0002, + "loss": 1.1738, + "step": 1240 + }, + { + "epoch": 0.9074410163339383, + "grad_norm": 0.3538484573364258, + "learning_rate": 0.0002, + "loss": 1.1112, + "step": 1250 + }, + { + "epoch": 0.9147005444646098, + "grad_norm": 0.27863040566444397, + "learning_rate": 0.0002, + "loss": 1.0878, + "step": 1260 + }, + { + "epoch": 0.9219600725952813, + "grad_norm": 0.3322528302669525, + "learning_rate": 0.0002, + "loss": 1.0826, + "step": 1270 + }, + { + "epoch": 0.9292196007259528, + "grad_norm": 0.7553173303604126, + "learning_rate": 0.0002, + "loss": 1.0158, + "step": 1280 + }, + { + "epoch": 0.9364791288566243, + "grad_norm": 0.4856191575527191, + "learning_rate": 0.0002, + "loss": 1.198, + "step": 1290 + }, + { + "epoch": 0.9437386569872959, + "grad_norm": 0.3668074905872345, + "learning_rate": 0.0002, + "loss": 1.1678, + "step": 1300 + }, + { + "epoch": 0.9509981851179673, + "grad_norm": 0.29851067066192627, + "learning_rate": 0.0002, + "loss": 1.1809, + "step": 1310 + }, + { + "epoch": 0.9582577132486388, + "grad_norm": 0.276664674282074, + "learning_rate": 0.0002, + "loss": 1.1299, + "step": 1320 + }, + { + "epoch": 0.9655172413793104, + "grad_norm": 0.2941018342971802, + "learning_rate": 0.0002, + "loss": 1.0597, + "step": 1330 + }, + { + "epoch": 0.9727767695099818, + "grad_norm": 0.3505859076976776, + "learning_rate": 0.0002, + "loss": 1.1731, + "step": 1340 + }, + { + "epoch": 0.9800362976406534, + "grad_norm": 0.3067687451839447, + "learning_rate": 0.0002, + "loss": 1.1809, + "step": 1350 + }, + { + "epoch": 0.9872958257713249, + "grad_norm": 0.27151066064834595, + "learning_rate": 0.0002, + "loss": 1.1511, + "step": 1360 + }, + { + "epoch": 0.9945553539019963, + "grad_norm": 0.36370083689689636, + "learning_rate": 0.0002, + "loss": 1.1919, + "step": 1370 + }, + { + "epoch": 0.9996370235934664, + "eval_loss": 1.1381088495254517, + "eval_runtime": 96.0848, + "eval_samples_per_second": 4.538, + "eval_steps_per_second": 0.572, + "step": 1377 + } + ], + "logging_steps": 10, + "max_steps": 11016, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.07535596027904e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377/training_args.bin b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..140bceabdec44fc473842cb9ace4a62a4dc9fe89 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4abec5480aff960b5a7b03a48be1cdb1ae29ba8792469c451f5c2f269e4be93b +size 5560 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755/README.md b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755/adapter_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755/adapter_model.safetensors b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..57c0547536cf633208bd0753b6f45625ae2084ca --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:160428ee21539129b32c4f50e46b1c3c4e1b361d3d8a42304e1e889b4311be3f +size 143153376 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755/optimizer.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..2ec8fef898c8a9d69fd77cbd4c39bcaab6095ea7 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6219d7e87d25414277667ab600b4628a4df624a52ae01b039c4d2501eb65a758 +size 72886650 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755/rng_state.pth b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..11de8bffda431939f1acb049557abb89aa5f64c3 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b622e5fc2fcbc23cb0255cd1db1954304c4d3fd922f5914715c3f7cbf19176a +size 14244 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755/scheduler.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..fb63a029f86fa9cb62df58e019414d8be905de14 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:736497d6fe23b956ec574624f614b5057faee2badff1718a5f7fc886a837e97e +size 1064 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755/special_tokens_map.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755/tokenizer.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755/tokenizer.model b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755/tokenizer_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755/trainer_state.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e11b3edc57e74e2ec6e2734caa22f66ef6e95f84 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755/trainer_state.json @@ -0,0 +1,1974 @@ +{ + "best_metric": 1.1086540222167969, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755", + "epoch": 2.0, + "eval_steps": 10, + "global_step": 2755, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007259528130671506, + "grad_norm": 0.46010470390319824, + "learning_rate": 0.0002, + "loss": 2.4936, + "step": 10 + }, + { + "epoch": 0.014519056261343012, + "grad_norm": 0.5103179216384888, + "learning_rate": 0.0002, + "loss": 1.8442, + "step": 20 + }, + { + "epoch": 0.021778584392014518, + "grad_norm": 0.6282716989517212, + "learning_rate": 0.0002, + "loss": 1.559, + "step": 30 + }, + { + "epoch": 0.029038112522686024, + "grad_norm": 1.2480497360229492, + "learning_rate": 0.0002, + "loss": 1.3618, + "step": 40 + }, + { + "epoch": 0.036297640653357534, + "grad_norm": 0.4114855229854584, + "learning_rate": 0.0002, + "loss": 1.3874, + "step": 50 + }, + { + "epoch": 0.043557168784029036, + "grad_norm": 0.49482840299606323, + "learning_rate": 0.0002, + "loss": 1.2836, + "step": 60 + }, + { + "epoch": 0.050816696914700546, + "grad_norm": 0.4536272883415222, + "learning_rate": 0.0002, + "loss": 1.1728, + "step": 70 + }, + { + "epoch": 0.05807622504537205, + "grad_norm": 0.32328274846076965, + "learning_rate": 0.0002, + "loss": 1.1125, + "step": 80 + }, + { + "epoch": 0.06533575317604355, + "grad_norm": 0.40990468859672546, + "learning_rate": 0.0002, + "loss": 1.2447, + "step": 90 + }, + { + "epoch": 0.07259528130671507, + "grad_norm": 0.37273502349853516, + "learning_rate": 0.0002, + "loss": 1.2216, + "step": 100 + }, + { + "epoch": 0.07985480943738657, + "grad_norm": 0.3903871476650238, + "learning_rate": 0.0002, + "loss": 1.2259, + "step": 110 + }, + { + "epoch": 0.08711433756805807, + "grad_norm": 0.3272787928581238, + "learning_rate": 0.0002, + "loss": 1.3718, + "step": 120 + }, + { + "epoch": 0.09437386569872959, + "grad_norm": 0.3622824251651764, + "learning_rate": 0.0002, + "loss": 1.1832, + "step": 130 + }, + { + "epoch": 0.10163339382940109, + "grad_norm": 0.3503916561603546, + "learning_rate": 0.0002, + "loss": 1.2966, + "step": 140 + }, + { + "epoch": 0.1088929219600726, + "grad_norm": 0.32787832617759705, + "learning_rate": 0.0002, + "loss": 1.0913, + "step": 150 + }, + { + "epoch": 0.1161524500907441, + "grad_norm": 0.5822657942771912, + "learning_rate": 0.0002, + "loss": 1.2143, + "step": 160 + }, + { + "epoch": 0.12341197822141561, + "grad_norm": 0.28028249740600586, + "learning_rate": 0.0002, + "loss": 1.2091, + "step": 170 + }, + { + "epoch": 0.1306715063520871, + "grad_norm": 0.33602750301361084, + "learning_rate": 0.0002, + "loss": 1.1731, + "step": 180 + }, + { + "epoch": 0.13793103448275862, + "grad_norm": 0.29106274247169495, + "learning_rate": 0.0002, + "loss": 1.127, + "step": 190 + }, + { + "epoch": 0.14519056261343014, + "grad_norm": 0.38753363490104675, + "learning_rate": 0.0002, + "loss": 1.3261, + "step": 200 + }, + { + "epoch": 0.15245009074410162, + "grad_norm": 0.361009418964386, + "learning_rate": 0.0002, + "loss": 1.1536, + "step": 210 + }, + { + "epoch": 0.15970961887477314, + "grad_norm": 0.6743836402893066, + "learning_rate": 0.0002, + "loss": 1.1483, + "step": 220 + }, + { + "epoch": 0.16696914700544466, + "grad_norm": 0.3923613727092743, + "learning_rate": 0.0002, + "loss": 1.264, + "step": 230 + }, + { + "epoch": 0.17422867513611615, + "grad_norm": 0.2809699773788452, + "learning_rate": 0.0002, + "loss": 1.0437, + "step": 240 + }, + { + "epoch": 0.18148820326678766, + "grad_norm": 0.3631494641304016, + "learning_rate": 0.0002, + "loss": 1.1826, + "step": 250 + }, + { + "epoch": 0.18874773139745918, + "grad_norm": 0.24658171832561493, + "learning_rate": 0.0002, + "loss": 1.1584, + "step": 260 + }, + { + "epoch": 0.19600725952813067, + "grad_norm": 0.5780664682388306, + "learning_rate": 0.0002, + "loss": 1.0065, + "step": 270 + }, + { + "epoch": 0.20326678765880218, + "grad_norm": 0.3056720495223999, + "learning_rate": 0.0002, + "loss": 1.1251, + "step": 280 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 0.2857084572315216, + "learning_rate": 0.0002, + "loss": 1.1503, + "step": 290 + }, + { + "epoch": 0.2177858439201452, + "grad_norm": 0.6645345687866211, + "learning_rate": 0.0002, + "loss": 1.1215, + "step": 300 + }, + { + "epoch": 0.2250453720508167, + "grad_norm": 0.5966078639030457, + "learning_rate": 0.0002, + "loss": 1.127, + "step": 310 + }, + { + "epoch": 0.2323049001814882, + "grad_norm": 0.40937140583992004, + "learning_rate": 0.0002, + "loss": 1.1974, + "step": 320 + }, + { + "epoch": 0.2395644283121597, + "grad_norm": 0.5642806887626648, + "learning_rate": 0.0002, + "loss": 1.2385, + "step": 330 + }, + { + "epoch": 0.24682395644283123, + "grad_norm": 0.2750748097896576, + "learning_rate": 0.0002, + "loss": 1.1703, + "step": 340 + }, + { + "epoch": 0.2540834845735027, + "grad_norm": 0.34350234270095825, + "learning_rate": 0.0002, + "loss": 1.2598, + "step": 350 + }, + { + "epoch": 0.2613430127041742, + "grad_norm": 0.6767239570617676, + "learning_rate": 0.0002, + "loss": 1.1942, + "step": 360 + }, + { + "epoch": 0.26860254083484575, + "grad_norm": 0.31006959080696106, + "learning_rate": 0.0002, + "loss": 1.1436, + "step": 370 + }, + { + "epoch": 0.27586206896551724, + "grad_norm": 0.3825474679470062, + "learning_rate": 0.0002, + "loss": 1.2084, + "step": 380 + }, + { + "epoch": 0.2831215970961887, + "grad_norm": 0.30241551995277405, + "learning_rate": 0.0002, + "loss": 1.1523, + "step": 390 + }, + { + "epoch": 0.29038112522686027, + "grad_norm": 0.2962397336959839, + "learning_rate": 0.0002, + "loss": 1.1298, + "step": 400 + }, + { + "epoch": 0.29764065335753176, + "grad_norm": 0.2600369155406952, + "learning_rate": 0.0002, + "loss": 1.1299, + "step": 410 + }, + { + "epoch": 0.30490018148820325, + "grad_norm": 0.3675060272216797, + "learning_rate": 0.0002, + "loss": 1.1366, + "step": 420 + }, + { + "epoch": 0.3121597096188748, + "grad_norm": 0.3429498076438904, + "learning_rate": 0.0002, + "loss": 1.156, + "step": 430 + }, + { + "epoch": 0.3194192377495463, + "grad_norm": 0.34311825037002563, + "learning_rate": 0.0002, + "loss": 1.2741, + "step": 440 + }, + { + "epoch": 0.32667876588021777, + "grad_norm": 0.37872210144996643, + "learning_rate": 0.0002, + "loss": 1.3523, + "step": 450 + }, + { + "epoch": 0.3339382940108893, + "grad_norm": 0.33271121978759766, + "learning_rate": 0.0002, + "loss": 1.1365, + "step": 460 + }, + { + "epoch": 0.3411978221415608, + "grad_norm": 0.34605276584625244, + "learning_rate": 0.0002, + "loss": 1.1572, + "step": 470 + }, + { + "epoch": 0.3484573502722323, + "grad_norm": 0.41050562262535095, + "learning_rate": 0.0002, + "loss": 1.2755, + "step": 480 + }, + { + "epoch": 0.35571687840290384, + "grad_norm": 0.2066836953163147, + "learning_rate": 0.0002, + "loss": 1.075, + "step": 490 + }, + { + "epoch": 0.3629764065335753, + "grad_norm": 0.2859014868736267, + "learning_rate": 0.0002, + "loss": 1.2072, + "step": 500 + }, + { + "epoch": 0.3702359346642468, + "grad_norm": 0.28763777017593384, + "learning_rate": 0.0002, + "loss": 1.1435, + "step": 510 + }, + { + "epoch": 0.37749546279491836, + "grad_norm": 0.2730471193790436, + "learning_rate": 0.0002, + "loss": 1.1341, + "step": 520 + }, + { + "epoch": 0.38475499092558985, + "grad_norm": 0.3968936800956726, + "learning_rate": 0.0002, + "loss": 1.348, + "step": 530 + }, + { + "epoch": 0.39201451905626133, + "grad_norm": 0.3624701201915741, + "learning_rate": 0.0002, + "loss": 1.2363, + "step": 540 + }, + { + "epoch": 0.3992740471869328, + "grad_norm": 0.3303608298301697, + "learning_rate": 0.0002, + "loss": 1.1206, + "step": 550 + }, + { + "epoch": 0.40653357531760437, + "grad_norm": 0.33507466316223145, + "learning_rate": 0.0002, + "loss": 1.203, + "step": 560 + }, + { + "epoch": 0.41379310344827586, + "grad_norm": 0.3297670781612396, + "learning_rate": 0.0002, + "loss": 1.2294, + "step": 570 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 0.32334890961647034, + "learning_rate": 0.0002, + "loss": 1.2, + "step": 580 + }, + { + "epoch": 0.4283121597096189, + "grad_norm": 0.30281195044517517, + "learning_rate": 0.0002, + "loss": 1.112, + "step": 590 + }, + { + "epoch": 0.4355716878402904, + "grad_norm": 0.5900027751922607, + "learning_rate": 0.0002, + "loss": 1.1352, + "step": 600 + }, + { + "epoch": 0.44283121597096187, + "grad_norm": 0.28437477350234985, + "learning_rate": 0.0002, + "loss": 1.1575, + "step": 610 + }, + { + "epoch": 0.4500907441016334, + "grad_norm": 0.39601704478263855, + "learning_rate": 0.0002, + "loss": 1.2009, + "step": 620 + }, + { + "epoch": 0.4573502722323049, + "grad_norm": 0.41971510648727417, + "learning_rate": 0.0002, + "loss": 1.2302, + "step": 630 + }, + { + "epoch": 0.4646098003629764, + "grad_norm": 0.33814409375190735, + "learning_rate": 0.0002, + "loss": 1.236, + "step": 640 + }, + { + "epoch": 0.47186932849364793, + "grad_norm": 0.575718104839325, + "learning_rate": 0.0002, + "loss": 1.2471, + "step": 650 + }, + { + "epoch": 0.4791288566243194, + "grad_norm": 0.37927401065826416, + "learning_rate": 0.0002, + "loss": 1.1012, + "step": 660 + }, + { + "epoch": 0.4863883847549909, + "grad_norm": 0.3224332630634308, + "learning_rate": 0.0002, + "loss": 1.1552, + "step": 670 + }, + { + "epoch": 0.49364791288566245, + "grad_norm": 0.32683515548706055, + "learning_rate": 0.0002, + "loss": 1.1354, + "step": 680 + }, + { + "epoch": 0.5009074410163339, + "grad_norm": 0.4316163659095764, + "learning_rate": 0.0002, + "loss": 1.207, + "step": 690 + }, + { + "epoch": 0.5081669691470054, + "grad_norm": 0.342602401971817, + "learning_rate": 0.0002, + "loss": 1.2116, + "step": 700 + }, + { + "epoch": 0.515426497277677, + "grad_norm": 0.2794898748397827, + "learning_rate": 0.0002, + "loss": 1.0433, + "step": 710 + }, + { + "epoch": 0.5226860254083484, + "grad_norm": 0.3322339951992035, + "learning_rate": 0.0002, + "loss": 1.2395, + "step": 720 + }, + { + "epoch": 0.52994555353902, + "grad_norm": 0.3088509142398834, + "learning_rate": 0.0002, + "loss": 1.0762, + "step": 730 + }, + { + "epoch": 0.5372050816696915, + "grad_norm": 0.24444378912448883, + "learning_rate": 0.0002, + "loss": 1.1495, + "step": 740 + }, + { + "epoch": 0.5444646098003629, + "grad_norm": 0.3483171761035919, + "learning_rate": 0.0002, + "loss": 1.2176, + "step": 750 + }, + { + "epoch": 0.5517241379310345, + "grad_norm": 0.3425690531730652, + "learning_rate": 0.0002, + "loss": 1.2248, + "step": 760 + }, + { + "epoch": 0.558983666061706, + "grad_norm": 0.31841927766799927, + "learning_rate": 0.0002, + "loss": 1.1127, + "step": 770 + }, + { + "epoch": 0.5662431941923775, + "grad_norm": 0.39423868060112, + "learning_rate": 0.0002, + "loss": 1.1841, + "step": 780 + }, + { + "epoch": 0.573502722323049, + "grad_norm": 0.30328479409217834, + "learning_rate": 0.0002, + "loss": 1.0583, + "step": 790 + }, + { + "epoch": 0.5807622504537205, + "grad_norm": 0.24475938081741333, + "learning_rate": 0.0002, + "loss": 1.0718, + "step": 800 + }, + { + "epoch": 0.588021778584392, + "grad_norm": 0.37132805585861206, + "learning_rate": 0.0002, + "loss": 1.2006, + "step": 810 + }, + { + "epoch": 0.5952813067150635, + "grad_norm": 0.32195979356765747, + "learning_rate": 0.0002, + "loss": 1.1544, + "step": 820 + }, + { + "epoch": 0.6025408348457351, + "grad_norm": 0.2848738729953766, + "learning_rate": 0.0002, + "loss": 0.9937, + "step": 830 + }, + { + "epoch": 0.6098003629764065, + "grad_norm": 0.28015264868736267, + "learning_rate": 0.0002, + "loss": 1.0937, + "step": 840 + }, + { + "epoch": 0.617059891107078, + "grad_norm": 0.37796008586883545, + "learning_rate": 0.0002, + "loss": 1.1234, + "step": 850 + }, + { + "epoch": 0.6243194192377496, + "grad_norm": 0.39311841130256653, + "learning_rate": 0.0002, + "loss": 1.1455, + "step": 860 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 0.2761685252189636, + "learning_rate": 0.0002, + "loss": 1.1066, + "step": 870 + }, + { + "epoch": 0.6388384754990926, + "grad_norm": 0.3826720118522644, + "learning_rate": 0.0002, + "loss": 1.0859, + "step": 880 + }, + { + "epoch": 0.6460980036297641, + "grad_norm": 0.30076679587364197, + "learning_rate": 0.0002, + "loss": 1.1182, + "step": 890 + }, + { + "epoch": 0.6533575317604355, + "grad_norm": 0.21997687220573425, + "learning_rate": 0.0002, + "loss": 1.0927, + "step": 900 + }, + { + "epoch": 0.6606170598911071, + "grad_norm": 0.32593777775764465, + "learning_rate": 0.0002, + "loss": 1.1603, + "step": 910 + }, + { + "epoch": 0.6678765880217786, + "grad_norm": 0.30347898602485657, + "learning_rate": 0.0002, + "loss": 1.1578, + "step": 920 + }, + { + "epoch": 0.6751361161524501, + "grad_norm": 0.44173774123191833, + "learning_rate": 0.0002, + "loss": 1.1642, + "step": 930 + }, + { + "epoch": 0.6823956442831216, + "grad_norm": 0.2507467269897461, + "learning_rate": 0.0002, + "loss": 1.1832, + "step": 940 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 0.29463833570480347, + "learning_rate": 0.0002, + "loss": 1.3001, + "step": 950 + }, + { + "epoch": 0.6969147005444646, + "grad_norm": 0.9363154172897339, + "learning_rate": 0.0002, + "loss": 1.1651, + "step": 960 + }, + { + "epoch": 0.7041742286751361, + "grad_norm": 0.3236212134361267, + "learning_rate": 0.0002, + "loss": 1.158, + "step": 970 + }, + { + "epoch": 0.7114337568058077, + "grad_norm": 0.3123254179954529, + "learning_rate": 0.0002, + "loss": 1.1078, + "step": 980 + }, + { + "epoch": 0.7186932849364791, + "grad_norm": 0.3395805060863495, + "learning_rate": 0.0002, + "loss": 1.1578, + "step": 990 + }, + { + "epoch": 0.7259528130671506, + "grad_norm": 0.3240964412689209, + "learning_rate": 0.0002, + "loss": 1.0754, + "step": 1000 + }, + { + "epoch": 0.7332123411978222, + "grad_norm": 0.31902948021888733, + "learning_rate": 0.0002, + "loss": 1.0313, + "step": 1010 + }, + { + "epoch": 0.7404718693284936, + "grad_norm": 0.4848408102989197, + "learning_rate": 0.0002, + "loss": 1.1664, + "step": 1020 + }, + { + "epoch": 0.7477313974591652, + "grad_norm": 0.33006033301353455, + "learning_rate": 0.0002, + "loss": 1.0838, + "step": 1030 + }, + { + "epoch": 0.7549909255898367, + "grad_norm": 0.2928730547428131, + "learning_rate": 0.0002, + "loss": 1.0979, + "step": 1040 + }, + { + "epoch": 0.7622504537205081, + "grad_norm": 0.3529164791107178, + "learning_rate": 0.0002, + "loss": 1.1351, + "step": 1050 + }, + { + "epoch": 0.7695099818511797, + "grad_norm": 0.2736213803291321, + "learning_rate": 0.0002, + "loss": 1.1274, + "step": 1060 + }, + { + "epoch": 0.7767695099818511, + "grad_norm": 0.7200686931610107, + "learning_rate": 0.0002, + "loss": 1.1169, + "step": 1070 + }, + { + "epoch": 0.7840290381125227, + "grad_norm": 0.33396708965301514, + "learning_rate": 0.0002, + "loss": 1.1171, + "step": 1080 + }, + { + "epoch": 0.7912885662431942, + "grad_norm": 1.5760449171066284, + "learning_rate": 0.0002, + "loss": 1.2447, + "step": 1090 + }, + { + "epoch": 0.7985480943738656, + "grad_norm": 0.28138381242752075, + "learning_rate": 0.0002, + "loss": 1.1126, + "step": 1100 + }, + { + "epoch": 0.8058076225045372, + "grad_norm": 0.2597472369670868, + "learning_rate": 0.0002, + "loss": 1.2012, + "step": 1110 + }, + { + "epoch": 0.8130671506352087, + "grad_norm": 0.3305445611476898, + "learning_rate": 0.0002, + "loss": 1.1177, + "step": 1120 + }, + { + "epoch": 0.8203266787658802, + "grad_norm": 0.3934599459171295, + "learning_rate": 0.0002, + "loss": 1.0849, + "step": 1130 + }, + { + "epoch": 0.8275862068965517, + "grad_norm": 0.3472191393375397, + "learning_rate": 0.0002, + "loss": 1.0889, + "step": 1140 + }, + { + "epoch": 0.8348457350272233, + "grad_norm": 0.2857365906238556, + "learning_rate": 0.0002, + "loss": 1.265, + "step": 1150 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.3207702934741974, + "learning_rate": 0.0002, + "loss": 1.03, + "step": 1160 + }, + { + "epoch": 0.8493647912885662, + "grad_norm": 0.3176484704017639, + "learning_rate": 0.0002, + "loss": 1.1669, + "step": 1170 + }, + { + "epoch": 0.8566243194192378, + "grad_norm": 0.40685558319091797, + "learning_rate": 0.0002, + "loss": 1.1386, + "step": 1180 + }, + { + "epoch": 0.8638838475499092, + "grad_norm": 0.31125199794769287, + "learning_rate": 0.0002, + "loss": 1.1383, + "step": 1190 + }, + { + "epoch": 0.8711433756805808, + "grad_norm": 0.7361181378364563, + "learning_rate": 0.0002, + "loss": 1.1512, + "step": 1200 + }, + { + "epoch": 0.8784029038112523, + "grad_norm": 0.33699527382850647, + "learning_rate": 0.0002, + "loss": 0.9983, + "step": 1210 + }, + { + "epoch": 0.8856624319419237, + "grad_norm": 0.3315220773220062, + "learning_rate": 0.0002, + "loss": 1.11, + "step": 1220 + }, + { + "epoch": 0.8929219600725953, + "grad_norm": 0.6256054043769836, + "learning_rate": 0.0002, + "loss": 1.0266, + "step": 1230 + }, + { + "epoch": 0.9001814882032668, + "grad_norm": 0.3692137897014618, + "learning_rate": 0.0002, + "loss": 1.1738, + "step": 1240 + }, + { + "epoch": 0.9074410163339383, + "grad_norm": 0.3538484573364258, + "learning_rate": 0.0002, + "loss": 1.1112, + "step": 1250 + }, + { + "epoch": 0.9147005444646098, + "grad_norm": 0.27863040566444397, + "learning_rate": 0.0002, + "loss": 1.0878, + "step": 1260 + }, + { + "epoch": 0.9219600725952813, + "grad_norm": 0.3322528302669525, + "learning_rate": 0.0002, + "loss": 1.0826, + "step": 1270 + }, + { + "epoch": 0.9292196007259528, + "grad_norm": 0.7553173303604126, + "learning_rate": 0.0002, + "loss": 1.0158, + "step": 1280 + }, + { + "epoch": 0.9364791288566243, + "grad_norm": 0.4856191575527191, + "learning_rate": 0.0002, + "loss": 1.198, + "step": 1290 + }, + { + "epoch": 0.9437386569872959, + "grad_norm": 0.3668074905872345, + "learning_rate": 0.0002, + "loss": 1.1678, + "step": 1300 + }, + { + "epoch": 0.9509981851179673, + "grad_norm": 0.29851067066192627, + "learning_rate": 0.0002, + "loss": 1.1809, + "step": 1310 + }, + { + "epoch": 0.9582577132486388, + "grad_norm": 0.276664674282074, + "learning_rate": 0.0002, + "loss": 1.1299, + "step": 1320 + }, + { + "epoch": 0.9655172413793104, + "grad_norm": 0.2941018342971802, + "learning_rate": 0.0002, + "loss": 1.0597, + "step": 1330 + }, + { + "epoch": 0.9727767695099818, + "grad_norm": 0.3505859076976776, + "learning_rate": 0.0002, + "loss": 1.1731, + "step": 1340 + }, + { + "epoch": 0.9800362976406534, + "grad_norm": 0.3067687451839447, + "learning_rate": 0.0002, + "loss": 1.1809, + "step": 1350 + }, + { + "epoch": 0.9872958257713249, + "grad_norm": 0.27151066064834595, + "learning_rate": 0.0002, + "loss": 1.1511, + "step": 1360 + }, + { + "epoch": 0.9945553539019963, + "grad_norm": 0.36370083689689636, + "learning_rate": 0.0002, + "loss": 1.1919, + "step": 1370 + }, + { + "epoch": 0.9996370235934664, + "eval_loss": 1.1381088495254517, + "eval_runtime": 96.0848, + "eval_samples_per_second": 4.538, + "eval_steps_per_second": 0.572, + "step": 1377 + }, + { + "epoch": 1.0018148820326678, + "grad_norm": 0.27980583906173706, + "learning_rate": 0.0002, + "loss": 1.0082, + "step": 1380 + }, + { + "epoch": 1.0090744101633393, + "grad_norm": 0.26713913679122925, + "learning_rate": 0.0002, + "loss": 1.0565, + "step": 1390 + }, + { + "epoch": 1.0163339382940109, + "grad_norm": 0.3089541494846344, + "learning_rate": 0.0002, + "loss": 1.0241, + "step": 1400 + }, + { + "epoch": 1.0235934664246824, + "grad_norm": 0.4188242256641388, + "learning_rate": 0.0002, + "loss": 0.9649, + "step": 1410 + }, + { + "epoch": 1.030852994555354, + "grad_norm": 0.5246463418006897, + "learning_rate": 0.0002, + "loss": 0.9821, + "step": 1420 + }, + { + "epoch": 1.0381125226860255, + "grad_norm": 0.2728777825832367, + "learning_rate": 0.0002, + "loss": 1.0143, + "step": 1430 + }, + { + "epoch": 1.0453720508166968, + "grad_norm": 0.38167616724967957, + "learning_rate": 0.0002, + "loss": 1.075, + "step": 1440 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 0.4439380168914795, + "learning_rate": 0.0002, + "loss": 1.0229, + "step": 1450 + }, + { + "epoch": 1.05989110707804, + "grad_norm": 0.30954182147979736, + "learning_rate": 0.0002, + "loss": 1.0451, + "step": 1460 + }, + { + "epoch": 1.0671506352087115, + "grad_norm": 0.4022280275821686, + "learning_rate": 0.0002, + "loss": 1.0762, + "step": 1470 + }, + { + "epoch": 1.074410163339383, + "grad_norm": 0.7390811443328857, + "learning_rate": 0.0002, + "loss": 1.0754, + "step": 1480 + }, + { + "epoch": 1.0816696914700545, + "grad_norm": 0.3885486423969269, + "learning_rate": 0.0002, + "loss": 1.1378, + "step": 1490 + }, + { + "epoch": 1.0889292196007259, + "grad_norm": 0.5275560617446899, + "learning_rate": 0.0002, + "loss": 1.005, + "step": 1500 + }, + { + "epoch": 1.0961887477313974, + "grad_norm": 0.35112282633781433, + "learning_rate": 0.0002, + "loss": 1.0147, + "step": 1510 + }, + { + "epoch": 1.103448275862069, + "grad_norm": 0.33714351058006287, + "learning_rate": 0.0002, + "loss": 1.0331, + "step": 1520 + }, + { + "epoch": 1.1107078039927405, + "grad_norm": 0.31221693754196167, + "learning_rate": 0.0002, + "loss": 0.8723, + "step": 1530 + }, + { + "epoch": 1.117967332123412, + "grad_norm": 0.27549654245376587, + "learning_rate": 0.0002, + "loss": 1.0701, + "step": 1540 + }, + { + "epoch": 1.1252268602540836, + "grad_norm": 0.8465521335601807, + "learning_rate": 0.0002, + "loss": 1.0114, + "step": 1550 + }, + { + "epoch": 1.132486388384755, + "grad_norm": 0.36125949025154114, + "learning_rate": 0.0002, + "loss": 0.9643, + "step": 1560 + }, + { + "epoch": 1.1397459165154264, + "grad_norm": 0.37420371174812317, + "learning_rate": 0.0002, + "loss": 1.0744, + "step": 1570 + }, + { + "epoch": 1.147005444646098, + "grad_norm": 0.3294760584831238, + "learning_rate": 0.0002, + "loss": 1.165, + "step": 1580 + }, + { + "epoch": 1.1542649727767695, + "grad_norm": 0.3881238102912903, + "learning_rate": 0.0002, + "loss": 0.9207, + "step": 1590 + }, + { + "epoch": 1.161524500907441, + "grad_norm": 0.4766491651535034, + "learning_rate": 0.0002, + "loss": 1.0983, + "step": 1600 + }, + { + "epoch": 1.1687840290381124, + "grad_norm": 0.395530104637146, + "learning_rate": 0.0002, + "loss": 1.0092, + "step": 1610 + }, + { + "epoch": 1.176043557168784, + "grad_norm": 0.3297106623649597, + "learning_rate": 0.0002, + "loss": 0.9895, + "step": 1620 + }, + { + "epoch": 1.1833030852994555, + "grad_norm": 0.39528271555900574, + "learning_rate": 0.0002, + "loss": 1.1121, + "step": 1630 + }, + { + "epoch": 1.190562613430127, + "grad_norm": 0.3370221257209778, + "learning_rate": 0.0002, + "loss": 0.9202, + "step": 1640 + }, + { + "epoch": 1.1978221415607986, + "grad_norm": 0.31922030448913574, + "learning_rate": 0.0002, + "loss": 1.102, + "step": 1650 + }, + { + "epoch": 1.2050816696914701, + "grad_norm": 0.6142027378082275, + "learning_rate": 0.0002, + "loss": 1.0565, + "step": 1660 + }, + { + "epoch": 1.2123411978221417, + "grad_norm": 0.44769710302352905, + "learning_rate": 0.0002, + "loss": 0.9541, + "step": 1670 + }, + { + "epoch": 1.219600725952813, + "grad_norm": 0.41233646869659424, + "learning_rate": 0.0002, + "loss": 1.2501, + "step": 1680 + }, + { + "epoch": 1.2268602540834845, + "grad_norm": 0.2928866147994995, + "learning_rate": 0.0002, + "loss": 1.0147, + "step": 1690 + }, + { + "epoch": 1.234119782214156, + "grad_norm": 0.36913734674453735, + "learning_rate": 0.0002, + "loss": 0.9074, + "step": 1700 + }, + { + "epoch": 1.2413793103448276, + "grad_norm": 0.5281891226768494, + "learning_rate": 0.0002, + "loss": 1.0847, + "step": 1710 + }, + { + "epoch": 1.2486388384754992, + "grad_norm": 0.3374697268009186, + "learning_rate": 0.0002, + "loss": 0.9482, + "step": 1720 + }, + { + "epoch": 1.2558983666061705, + "grad_norm": 0.3802020847797394, + "learning_rate": 0.0002, + "loss": 0.9826, + "step": 1730 + }, + { + "epoch": 1.263157894736842, + "grad_norm": 0.38048651814460754, + "learning_rate": 0.0002, + "loss": 1.0191, + "step": 1740 + }, + { + "epoch": 1.2704174228675136, + "grad_norm": 0.6676169633865356, + "learning_rate": 0.0002, + "loss": 1.0142, + "step": 1750 + }, + { + "epoch": 1.2776769509981851, + "grad_norm": 0.4075961410999298, + "learning_rate": 0.0002, + "loss": 0.992, + "step": 1760 + }, + { + "epoch": 1.2849364791288567, + "grad_norm": 0.4374721348285675, + "learning_rate": 0.0002, + "loss": 1.0301, + "step": 1770 + }, + { + "epoch": 1.2921960072595282, + "grad_norm": 0.4638824164867401, + "learning_rate": 0.0002, + "loss": 1.0601, + "step": 1780 + }, + { + "epoch": 1.2994555353901998, + "grad_norm": 0.38631564378738403, + "learning_rate": 0.0002, + "loss": 0.9593, + "step": 1790 + }, + { + "epoch": 1.306715063520871, + "grad_norm": 0.35873809456825256, + "learning_rate": 0.0002, + "loss": 1.0042, + "step": 1800 + }, + { + "epoch": 1.3139745916515426, + "grad_norm": 0.33602237701416016, + "learning_rate": 0.0002, + "loss": 0.9835, + "step": 1810 + }, + { + "epoch": 1.3212341197822142, + "grad_norm": 0.46696463227272034, + "learning_rate": 0.0002, + "loss": 1.1032, + "step": 1820 + }, + { + "epoch": 1.3284936479128857, + "grad_norm": 0.368958979845047, + "learning_rate": 0.0002, + "loss": 1.0123, + "step": 1830 + }, + { + "epoch": 1.335753176043557, + "grad_norm": 0.3160957396030426, + "learning_rate": 0.0002, + "loss": 1.0456, + "step": 1840 + }, + { + "epoch": 1.3430127041742286, + "grad_norm": 0.4511511027812958, + "learning_rate": 0.0002, + "loss": 1.0264, + "step": 1850 + }, + { + "epoch": 1.3502722323049001, + "grad_norm": 0.6769845485687256, + "learning_rate": 0.0002, + "loss": 1.0179, + "step": 1860 + }, + { + "epoch": 1.3575317604355717, + "grad_norm": 0.3749309480190277, + "learning_rate": 0.0002, + "loss": 1.011, + "step": 1870 + }, + { + "epoch": 1.3647912885662432, + "grad_norm": 0.553798496723175, + "learning_rate": 0.0002, + "loss": 1.0223, + "step": 1880 + }, + { + "epoch": 1.3720508166969148, + "grad_norm": 0.3538985550403595, + "learning_rate": 0.0002, + "loss": 0.9634, + "step": 1890 + }, + { + "epoch": 1.3793103448275863, + "grad_norm": 0.5501534938812256, + "learning_rate": 0.0002, + "loss": 1.0275, + "step": 1900 + }, + { + "epoch": 1.3865698729582578, + "grad_norm": 0.4432051181793213, + "learning_rate": 0.0002, + "loss": 1.0461, + "step": 1910 + }, + { + "epoch": 1.3938294010889292, + "grad_norm": 0.41755786538124084, + "learning_rate": 0.0002, + "loss": 0.9082, + "step": 1920 + }, + { + "epoch": 1.4010889292196007, + "grad_norm": 0.5626114010810852, + "learning_rate": 0.0002, + "loss": 1.0858, + "step": 1930 + }, + { + "epoch": 1.4083484573502723, + "grad_norm": 0.44038185477256775, + "learning_rate": 0.0002, + "loss": 1.0687, + "step": 1940 + }, + { + "epoch": 1.4156079854809438, + "grad_norm": 0.3397001624107361, + "learning_rate": 0.0002, + "loss": 0.9454, + "step": 1950 + }, + { + "epoch": 1.4228675136116151, + "grad_norm": 0.4325368106365204, + "learning_rate": 0.0002, + "loss": 0.943, + "step": 1960 + }, + { + "epoch": 1.4301270417422867, + "grad_norm": 0.3900907039642334, + "learning_rate": 0.0002, + "loss": 1.0488, + "step": 1970 + }, + { + "epoch": 1.4373865698729582, + "grad_norm": 0.369612455368042, + "learning_rate": 0.0002, + "loss": 0.9699, + "step": 1980 + }, + { + "epoch": 1.4446460980036298, + "grad_norm": 0.4389338791370392, + "learning_rate": 0.0002, + "loss": 1.0609, + "step": 1990 + }, + { + "epoch": 1.4519056261343013, + "grad_norm": 1.694450855255127, + "learning_rate": 0.0002, + "loss": 1.042, + "step": 2000 + }, + { + "epoch": 1.4591651542649728, + "grad_norm": 0.516957700252533, + "learning_rate": 0.0002, + "loss": 0.997, + "step": 2010 + }, + { + "epoch": 1.4664246823956444, + "grad_norm": 0.45515501499176025, + "learning_rate": 0.0002, + "loss": 1.0979, + "step": 2020 + }, + { + "epoch": 1.4736842105263157, + "grad_norm": 0.4153139591217041, + "learning_rate": 0.0002, + "loss": 1.0188, + "step": 2030 + }, + { + "epoch": 1.4809437386569873, + "grad_norm": 0.44353052973747253, + "learning_rate": 0.0002, + "loss": 1.1521, + "step": 2040 + }, + { + "epoch": 1.4882032667876588, + "grad_norm": 0.570554256439209, + "learning_rate": 0.0002, + "loss": 0.9653, + "step": 2050 + }, + { + "epoch": 1.4954627949183303, + "grad_norm": 0.5742740035057068, + "learning_rate": 0.0002, + "loss": 1.0352, + "step": 2060 + }, + { + "epoch": 1.5027223230490017, + "grad_norm": 0.5890517830848694, + "learning_rate": 0.0002, + "loss": 1.0963, + "step": 2070 + }, + { + "epoch": 1.5099818511796732, + "grad_norm": 0.4162650406360626, + "learning_rate": 0.0002, + "loss": 1.0639, + "step": 2080 + }, + { + "epoch": 1.5172413793103448, + "grad_norm": 0.7334717512130737, + "learning_rate": 0.0002, + "loss": 1.071, + "step": 2090 + }, + { + "epoch": 1.5245009074410163, + "grad_norm": 0.2963249683380127, + "learning_rate": 0.0002, + "loss": 0.8957, + "step": 2100 + }, + { + "epoch": 1.5317604355716878, + "grad_norm": 0.30676454305648804, + "learning_rate": 0.0002, + "loss": 1.0446, + "step": 2110 + }, + { + "epoch": 1.5390199637023594, + "grad_norm": 0.35984641313552856, + "learning_rate": 0.0002, + "loss": 0.9967, + "step": 2120 + }, + { + "epoch": 1.546279491833031, + "grad_norm": 0.3384549617767334, + "learning_rate": 0.0002, + "loss": 1.0352, + "step": 2130 + }, + { + "epoch": 1.5535390199637025, + "grad_norm": 0.4725518226623535, + "learning_rate": 0.0002, + "loss": 0.9874, + "step": 2140 + }, + { + "epoch": 1.560798548094374, + "grad_norm": 0.3252685070037842, + "learning_rate": 0.0002, + "loss": 1.1166, + "step": 2150 + }, + { + "epoch": 1.5680580762250453, + "grad_norm": 0.45043081045150757, + "learning_rate": 0.0002, + "loss": 0.9127, + "step": 2160 + }, + { + "epoch": 1.5753176043557169, + "grad_norm": 0.374208003282547, + "learning_rate": 0.0002, + "loss": 1.0767, + "step": 2170 + }, + { + "epoch": 1.5825771324863884, + "grad_norm": 0.5118404030799866, + "learning_rate": 0.0002, + "loss": 1.0255, + "step": 2180 + }, + { + "epoch": 1.5898366606170597, + "grad_norm": 0.482170969247818, + "learning_rate": 0.0002, + "loss": 1.0062, + "step": 2190 + }, + { + "epoch": 1.5970961887477313, + "grad_norm": 0.5337533950805664, + "learning_rate": 0.0002, + "loss": 1.0872, + "step": 2200 + }, + { + "epoch": 1.6043557168784028, + "grad_norm": 0.5195064544677734, + "learning_rate": 0.0002, + "loss": 1.0405, + "step": 2210 + }, + { + "epoch": 1.6116152450090744, + "grad_norm": 0.30807098746299744, + "learning_rate": 0.0002, + "loss": 1.0454, + "step": 2220 + }, + { + "epoch": 1.618874773139746, + "grad_norm": 0.3962925672531128, + "learning_rate": 0.0002, + "loss": 1.0293, + "step": 2230 + }, + { + "epoch": 1.6261343012704175, + "grad_norm": 0.7636962532997131, + "learning_rate": 0.0002, + "loss": 1.0137, + "step": 2240 + }, + { + "epoch": 1.633393829401089, + "grad_norm": 0.32380592823028564, + "learning_rate": 0.0002, + "loss": 0.991, + "step": 2250 + }, + { + "epoch": 1.6406533575317606, + "grad_norm": 0.5767741799354553, + "learning_rate": 0.0002, + "loss": 0.9471, + "step": 2260 + }, + { + "epoch": 1.647912885662432, + "grad_norm": 0.39964812994003296, + "learning_rate": 0.0002, + "loss": 0.9524, + "step": 2270 + }, + { + "epoch": 1.6551724137931034, + "grad_norm": 0.622629702091217, + "learning_rate": 0.0002, + "loss": 0.9866, + "step": 2280 + }, + { + "epoch": 1.662431941923775, + "grad_norm": 0.40202152729034424, + "learning_rate": 0.0002, + "loss": 0.9609, + "step": 2290 + }, + { + "epoch": 1.6696914700544465, + "grad_norm": 0.4467349052429199, + "learning_rate": 0.0002, + "loss": 0.9859, + "step": 2300 + }, + { + "epoch": 1.6769509981851178, + "grad_norm": 0.5026949048042297, + "learning_rate": 0.0002, + "loss": 1.0312, + "step": 2310 + }, + { + "epoch": 1.6842105263157894, + "grad_norm": 0.43754541873931885, + "learning_rate": 0.0002, + "loss": 1.0776, + "step": 2320 + }, + { + "epoch": 1.691470054446461, + "grad_norm": 0.42869430780410767, + "learning_rate": 0.0002, + "loss": 0.9786, + "step": 2330 + }, + { + "epoch": 1.6987295825771325, + "grad_norm": 0.4192679524421692, + "learning_rate": 0.0002, + "loss": 1.0747, + "step": 2340 + }, + { + "epoch": 1.705989110707804, + "grad_norm": 0.3243155777454376, + "learning_rate": 0.0002, + "loss": 1.0054, + "step": 2350 + }, + { + "epoch": 1.7132486388384756, + "grad_norm": 1.0514075756072998, + "learning_rate": 0.0002, + "loss": 0.9165, + "step": 2360 + }, + { + "epoch": 1.720508166969147, + "grad_norm": 0.4719122648239136, + "learning_rate": 0.0002, + "loss": 1.1353, + "step": 2370 + }, + { + "epoch": 1.7277676950998186, + "grad_norm": 0.3846144676208496, + "learning_rate": 0.0002, + "loss": 0.9913, + "step": 2380 + }, + { + "epoch": 1.73502722323049, + "grad_norm": 0.7266581058502197, + "learning_rate": 0.0002, + "loss": 0.9935, + "step": 2390 + }, + { + "epoch": 1.7422867513611615, + "grad_norm": 0.6183241605758667, + "learning_rate": 0.0002, + "loss": 0.9509, + "step": 2400 + }, + { + "epoch": 1.749546279491833, + "grad_norm": 0.3658260405063629, + "learning_rate": 0.0002, + "loss": 0.9713, + "step": 2410 + }, + { + "epoch": 1.7568058076225044, + "grad_norm": 0.6036322712898254, + "learning_rate": 0.0002, + "loss": 1.0033, + "step": 2420 + }, + { + "epoch": 1.764065335753176, + "grad_norm": 0.7872936129570007, + "learning_rate": 0.0002, + "loss": 1.0526, + "step": 2430 + }, + { + "epoch": 1.7713248638838475, + "grad_norm": 0.35946124792099, + "learning_rate": 0.0002, + "loss": 0.99, + "step": 2440 + }, + { + "epoch": 1.778584392014519, + "grad_norm": 0.3740338981151581, + "learning_rate": 0.0002, + "loss": 0.8845, + "step": 2450 + }, + { + "epoch": 1.7858439201451906, + "grad_norm": 0.6150230169296265, + "learning_rate": 0.0002, + "loss": 0.9333, + "step": 2460 + }, + { + "epoch": 1.793103448275862, + "grad_norm": 0.4726075530052185, + "learning_rate": 0.0002, + "loss": 1.0373, + "step": 2470 + }, + { + "epoch": 1.8003629764065336, + "grad_norm": 0.31292253732681274, + "learning_rate": 0.0002, + "loss": 0.974, + "step": 2480 + }, + { + "epoch": 1.8076225045372052, + "grad_norm": 0.4463104009628296, + "learning_rate": 0.0002, + "loss": 1.0205, + "step": 2490 + }, + { + "epoch": 1.8148820326678767, + "grad_norm": 0.7848200798034668, + "learning_rate": 0.0002, + "loss": 1.0771, + "step": 2500 + }, + { + "epoch": 1.822141560798548, + "grad_norm": 0.5562082529067993, + "learning_rate": 0.0002, + "loss": 0.9508, + "step": 2510 + }, + { + "epoch": 1.8294010889292196, + "grad_norm": 0.39892756938934326, + "learning_rate": 0.0002, + "loss": 0.9343, + "step": 2520 + }, + { + "epoch": 1.8366606170598911, + "grad_norm": 1.2923320531845093, + "learning_rate": 0.0002, + "loss": 1.0489, + "step": 2530 + }, + { + "epoch": 1.8439201451905625, + "grad_norm": 0.6316490769386292, + "learning_rate": 0.0002, + "loss": 1.1203, + "step": 2540 + }, + { + "epoch": 1.851179673321234, + "grad_norm": 0.46100497245788574, + "learning_rate": 0.0002, + "loss": 0.9131, + "step": 2550 + }, + { + "epoch": 1.8584392014519056, + "grad_norm": 0.7902987003326416, + "learning_rate": 0.0002, + "loss": 1.0977, + "step": 2560 + }, + { + "epoch": 1.865698729582577, + "grad_norm": 0.4596365690231323, + "learning_rate": 0.0002, + "loss": 0.9702, + "step": 2570 + }, + { + "epoch": 1.8729582577132486, + "grad_norm": 0.6592172384262085, + "learning_rate": 0.0002, + "loss": 0.9322, + "step": 2580 + }, + { + "epoch": 1.8802177858439202, + "grad_norm": 0.5764662027359009, + "learning_rate": 0.0002, + "loss": 0.9549, + "step": 2590 + }, + { + "epoch": 1.8874773139745917, + "grad_norm": 0.8421637415885925, + "learning_rate": 0.0002, + "loss": 0.9485, + "step": 2600 + }, + { + "epoch": 1.8947368421052633, + "grad_norm": 0.5635305047035217, + "learning_rate": 0.0002, + "loss": 1.012, + "step": 2610 + }, + { + "epoch": 1.9019963702359348, + "grad_norm": 0.46231237053871155, + "learning_rate": 0.0002, + "loss": 0.8907, + "step": 2620 + }, + { + "epoch": 1.9092558983666061, + "grad_norm": 0.3944607079029083, + "learning_rate": 0.0002, + "loss": 0.9543, + "step": 2630 + }, + { + "epoch": 1.9165154264972777, + "grad_norm": 0.4753907322883606, + "learning_rate": 0.0002, + "loss": 0.9964, + "step": 2640 + }, + { + "epoch": 1.9237749546279492, + "grad_norm": 0.4151090979576111, + "learning_rate": 0.0002, + "loss": 1.0217, + "step": 2650 + }, + { + "epoch": 1.9310344827586206, + "grad_norm": 0.6793725490570068, + "learning_rate": 0.0002, + "loss": 1.0397, + "step": 2660 + }, + { + "epoch": 1.938294010889292, + "grad_norm": 0.339755117893219, + "learning_rate": 0.0002, + "loss": 1.0401, + "step": 2670 + }, + { + "epoch": 1.9455535390199636, + "grad_norm": 0.40789374709129333, + "learning_rate": 0.0002, + "loss": 1.082, + "step": 2680 + }, + { + "epoch": 1.9528130671506352, + "grad_norm": 0.3750005066394806, + "learning_rate": 0.0002, + "loss": 0.9973, + "step": 2690 + }, + { + "epoch": 1.9600725952813067, + "grad_norm": 0.39684441685676575, + "learning_rate": 0.0002, + "loss": 0.9837, + "step": 2700 + }, + { + "epoch": 1.9673321234119783, + "grad_norm": 0.378287672996521, + "learning_rate": 0.0002, + "loss": 1.1204, + "step": 2710 + }, + { + "epoch": 1.9745916515426498, + "grad_norm": 0.3668482005596161, + "learning_rate": 0.0002, + "loss": 0.9284, + "step": 2720 + }, + { + "epoch": 1.9818511796733214, + "grad_norm": 0.49997565150260925, + "learning_rate": 0.0002, + "loss": 1.0512, + "step": 2730 + }, + { + "epoch": 1.989110707803993, + "grad_norm": 0.36852124333381653, + "learning_rate": 0.0002, + "loss": 0.9311, + "step": 2740 + }, + { + "epoch": 1.9963702359346642, + "grad_norm": 0.5203380584716797, + "learning_rate": 0.0002, + "loss": 1.0898, + "step": 2750 + }, + { + "epoch": 2.0, + "eval_loss": 1.1086540222167969, + "eval_runtime": 95.6866, + "eval_samples_per_second": 4.557, + "eval_steps_per_second": 0.575, + "step": 2755 + } + ], + "logging_steps": 10, + "max_steps": 11016, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.415071192055808e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755/training_args.bin b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..140bceabdec44fc473842cb9ace4a62a4dc9fe89 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4abec5480aff960b5a7b03a48be1cdb1ae29ba8792469c451f5c2f269e4be93b +size 5560 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-4132/README.md b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-4132/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-4132/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-4132/adapter_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-4132/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-4132/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-4132/adapter_model.safetensors b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-4132/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f86d1076a959ad7c340c5f949c6b4b4b5c701e81 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-4132/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec236235430d88b0ad03c049b7c5fb8da1beded051d5a85162681953de990a28 +size 143153376 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-4132/optimizer.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-4132/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..254c107dc7d441bd7e87c04f7cd394d5cc445643 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-4132/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:983c73e9e8d3c89c114e978e78a75ed19d4826eeb7fb5778ed383f7d848f3d28 +size 72886650 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-4132/rng_state.pth b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-4132/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..2e974d8deb466a24553e90c17446b11000cf19e9 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-4132/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d996d97ac7036342ea56710f25417defb3ec407c108a59efb25cf82648499bd1 +size 14244 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-4132/scheduler.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-4132/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..e4000c9c3ae30d23b98f9d56765f7e8540cc8a8d --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-4132/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f54153354bf146aa410b1e9645486e50d7214025e24901ae1b6c5a987a1a46e9 +size 1064 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-4132/special_tokens_map.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-4132/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-4132/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-4132/tokenizer.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-4132/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-4132/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-4132/tokenizer.model b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-4132/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-4132/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-4132/tokenizer_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-4132/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-4132/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-4132/trainer_state.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-4132/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7434b2f03b41ca20d41d38a0fd1103894b284f71 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-4132/trainer_state.json @@ -0,0 +1,2948 @@ +{ + "best_metric": 1.1086540222167969, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755", + "epoch": 2.9996370235934666, + "eval_steps": 10, + "global_step": 4132, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007259528130671506, + "grad_norm": 0.46010470390319824, + "learning_rate": 0.0002, + "loss": 2.4936, + "step": 10 + }, + { + "epoch": 0.014519056261343012, + "grad_norm": 0.5103179216384888, + "learning_rate": 0.0002, + "loss": 1.8442, + "step": 20 + }, + { + "epoch": 0.021778584392014518, + "grad_norm": 0.6282716989517212, + "learning_rate": 0.0002, + "loss": 1.559, + "step": 30 + }, + { + "epoch": 0.029038112522686024, + "grad_norm": 1.2480497360229492, + "learning_rate": 0.0002, + "loss": 1.3618, + "step": 40 + }, + { + "epoch": 0.036297640653357534, + "grad_norm": 0.4114855229854584, + "learning_rate": 0.0002, + "loss": 1.3874, + "step": 50 + }, + { + "epoch": 0.043557168784029036, + "grad_norm": 0.49482840299606323, + "learning_rate": 0.0002, + "loss": 1.2836, + "step": 60 + }, + { + "epoch": 0.050816696914700546, + "grad_norm": 0.4536272883415222, + "learning_rate": 0.0002, + "loss": 1.1728, + "step": 70 + }, + { + "epoch": 0.05807622504537205, + "grad_norm": 0.32328274846076965, + "learning_rate": 0.0002, + "loss": 1.1125, + "step": 80 + }, + { + "epoch": 0.06533575317604355, + "grad_norm": 0.40990468859672546, + "learning_rate": 0.0002, + "loss": 1.2447, + "step": 90 + }, + { + "epoch": 0.07259528130671507, + "grad_norm": 0.37273502349853516, + "learning_rate": 0.0002, + "loss": 1.2216, + "step": 100 + }, + { + "epoch": 0.07985480943738657, + "grad_norm": 0.3903871476650238, + "learning_rate": 0.0002, + "loss": 1.2259, + "step": 110 + }, + { + "epoch": 0.08711433756805807, + "grad_norm": 0.3272787928581238, + "learning_rate": 0.0002, + "loss": 1.3718, + "step": 120 + }, + { + "epoch": 0.09437386569872959, + "grad_norm": 0.3622824251651764, + "learning_rate": 0.0002, + "loss": 1.1832, + "step": 130 + }, + { + "epoch": 0.10163339382940109, + "grad_norm": 0.3503916561603546, + "learning_rate": 0.0002, + "loss": 1.2966, + "step": 140 + }, + { + "epoch": 0.1088929219600726, + "grad_norm": 0.32787832617759705, + "learning_rate": 0.0002, + "loss": 1.0913, + "step": 150 + }, + { + "epoch": 0.1161524500907441, + "grad_norm": 0.5822657942771912, + "learning_rate": 0.0002, + "loss": 1.2143, + "step": 160 + }, + { + "epoch": 0.12341197822141561, + "grad_norm": 0.28028249740600586, + "learning_rate": 0.0002, + "loss": 1.2091, + "step": 170 + }, + { + "epoch": 0.1306715063520871, + "grad_norm": 0.33602750301361084, + "learning_rate": 0.0002, + "loss": 1.1731, + "step": 180 + }, + { + "epoch": 0.13793103448275862, + "grad_norm": 0.29106274247169495, + "learning_rate": 0.0002, + "loss": 1.127, + "step": 190 + }, + { + "epoch": 0.14519056261343014, + "grad_norm": 0.38753363490104675, + "learning_rate": 0.0002, + "loss": 1.3261, + "step": 200 + }, + { + "epoch": 0.15245009074410162, + "grad_norm": 0.361009418964386, + "learning_rate": 0.0002, + "loss": 1.1536, + "step": 210 + }, + { + "epoch": 0.15970961887477314, + "grad_norm": 0.6743836402893066, + "learning_rate": 0.0002, + "loss": 1.1483, + "step": 220 + }, + { + "epoch": 0.16696914700544466, + "grad_norm": 0.3923613727092743, + "learning_rate": 0.0002, + "loss": 1.264, + "step": 230 + }, + { + "epoch": 0.17422867513611615, + "grad_norm": 0.2809699773788452, + "learning_rate": 0.0002, + "loss": 1.0437, + "step": 240 + }, + { + "epoch": 0.18148820326678766, + "grad_norm": 0.3631494641304016, + "learning_rate": 0.0002, + "loss": 1.1826, + "step": 250 + }, + { + "epoch": 0.18874773139745918, + "grad_norm": 0.24658171832561493, + "learning_rate": 0.0002, + "loss": 1.1584, + "step": 260 + }, + { + "epoch": 0.19600725952813067, + "grad_norm": 0.5780664682388306, + "learning_rate": 0.0002, + "loss": 1.0065, + "step": 270 + }, + { + "epoch": 0.20326678765880218, + "grad_norm": 0.3056720495223999, + "learning_rate": 0.0002, + "loss": 1.1251, + "step": 280 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 0.2857084572315216, + "learning_rate": 0.0002, + "loss": 1.1503, + "step": 290 + }, + { + "epoch": 0.2177858439201452, + "grad_norm": 0.6645345687866211, + "learning_rate": 0.0002, + "loss": 1.1215, + "step": 300 + }, + { + "epoch": 0.2250453720508167, + "grad_norm": 0.5966078639030457, + "learning_rate": 0.0002, + "loss": 1.127, + "step": 310 + }, + { + "epoch": 0.2323049001814882, + "grad_norm": 0.40937140583992004, + "learning_rate": 0.0002, + "loss": 1.1974, + "step": 320 + }, + { + "epoch": 0.2395644283121597, + "grad_norm": 0.5642806887626648, + "learning_rate": 0.0002, + "loss": 1.2385, + "step": 330 + }, + { + "epoch": 0.24682395644283123, + "grad_norm": 0.2750748097896576, + "learning_rate": 0.0002, + "loss": 1.1703, + "step": 340 + }, + { + "epoch": 0.2540834845735027, + "grad_norm": 0.34350234270095825, + "learning_rate": 0.0002, + "loss": 1.2598, + "step": 350 + }, + { + "epoch": 0.2613430127041742, + "grad_norm": 0.6767239570617676, + "learning_rate": 0.0002, + "loss": 1.1942, + "step": 360 + }, + { + "epoch": 0.26860254083484575, + "grad_norm": 0.31006959080696106, + "learning_rate": 0.0002, + "loss": 1.1436, + "step": 370 + }, + { + "epoch": 0.27586206896551724, + "grad_norm": 0.3825474679470062, + "learning_rate": 0.0002, + "loss": 1.2084, + "step": 380 + }, + { + "epoch": 0.2831215970961887, + "grad_norm": 0.30241551995277405, + "learning_rate": 0.0002, + "loss": 1.1523, + "step": 390 + }, + { + "epoch": 0.29038112522686027, + "grad_norm": 0.2962397336959839, + "learning_rate": 0.0002, + "loss": 1.1298, + "step": 400 + }, + { + "epoch": 0.29764065335753176, + "grad_norm": 0.2600369155406952, + "learning_rate": 0.0002, + "loss": 1.1299, + "step": 410 + }, + { + "epoch": 0.30490018148820325, + "grad_norm": 0.3675060272216797, + "learning_rate": 0.0002, + "loss": 1.1366, + "step": 420 + }, + { + "epoch": 0.3121597096188748, + "grad_norm": 0.3429498076438904, + "learning_rate": 0.0002, + "loss": 1.156, + "step": 430 + }, + { + "epoch": 0.3194192377495463, + "grad_norm": 0.34311825037002563, + "learning_rate": 0.0002, + "loss": 1.2741, + "step": 440 + }, + { + "epoch": 0.32667876588021777, + "grad_norm": 0.37872210144996643, + "learning_rate": 0.0002, + "loss": 1.3523, + "step": 450 + }, + { + "epoch": 0.3339382940108893, + "grad_norm": 0.33271121978759766, + "learning_rate": 0.0002, + "loss": 1.1365, + "step": 460 + }, + { + "epoch": 0.3411978221415608, + "grad_norm": 0.34605276584625244, + "learning_rate": 0.0002, + "loss": 1.1572, + "step": 470 + }, + { + "epoch": 0.3484573502722323, + "grad_norm": 0.41050562262535095, + "learning_rate": 0.0002, + "loss": 1.2755, + "step": 480 + }, + { + "epoch": 0.35571687840290384, + "grad_norm": 0.2066836953163147, + "learning_rate": 0.0002, + "loss": 1.075, + "step": 490 + }, + { + "epoch": 0.3629764065335753, + "grad_norm": 0.2859014868736267, + "learning_rate": 0.0002, + "loss": 1.2072, + "step": 500 + }, + { + "epoch": 0.3702359346642468, + "grad_norm": 0.28763777017593384, + "learning_rate": 0.0002, + "loss": 1.1435, + "step": 510 + }, + { + "epoch": 0.37749546279491836, + "grad_norm": 0.2730471193790436, + "learning_rate": 0.0002, + "loss": 1.1341, + "step": 520 + }, + { + "epoch": 0.38475499092558985, + "grad_norm": 0.3968936800956726, + "learning_rate": 0.0002, + "loss": 1.348, + "step": 530 + }, + { + "epoch": 0.39201451905626133, + "grad_norm": 0.3624701201915741, + "learning_rate": 0.0002, + "loss": 1.2363, + "step": 540 + }, + { + "epoch": 0.3992740471869328, + "grad_norm": 0.3303608298301697, + "learning_rate": 0.0002, + "loss": 1.1206, + "step": 550 + }, + { + "epoch": 0.40653357531760437, + "grad_norm": 0.33507466316223145, + "learning_rate": 0.0002, + "loss": 1.203, + "step": 560 + }, + { + "epoch": 0.41379310344827586, + "grad_norm": 0.3297670781612396, + "learning_rate": 0.0002, + "loss": 1.2294, + "step": 570 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 0.32334890961647034, + "learning_rate": 0.0002, + "loss": 1.2, + "step": 580 + }, + { + "epoch": 0.4283121597096189, + "grad_norm": 0.30281195044517517, + "learning_rate": 0.0002, + "loss": 1.112, + "step": 590 + }, + { + "epoch": 0.4355716878402904, + "grad_norm": 0.5900027751922607, + "learning_rate": 0.0002, + "loss": 1.1352, + "step": 600 + }, + { + "epoch": 0.44283121597096187, + "grad_norm": 0.28437477350234985, + "learning_rate": 0.0002, + "loss": 1.1575, + "step": 610 + }, + { + "epoch": 0.4500907441016334, + "grad_norm": 0.39601704478263855, + "learning_rate": 0.0002, + "loss": 1.2009, + "step": 620 + }, + { + "epoch": 0.4573502722323049, + "grad_norm": 0.41971510648727417, + "learning_rate": 0.0002, + "loss": 1.2302, + "step": 630 + }, + { + "epoch": 0.4646098003629764, + "grad_norm": 0.33814409375190735, + "learning_rate": 0.0002, + "loss": 1.236, + "step": 640 + }, + { + "epoch": 0.47186932849364793, + "grad_norm": 0.575718104839325, + "learning_rate": 0.0002, + "loss": 1.2471, + "step": 650 + }, + { + "epoch": 0.4791288566243194, + "grad_norm": 0.37927401065826416, + "learning_rate": 0.0002, + "loss": 1.1012, + "step": 660 + }, + { + "epoch": 0.4863883847549909, + "grad_norm": 0.3224332630634308, + "learning_rate": 0.0002, + "loss": 1.1552, + "step": 670 + }, + { + "epoch": 0.49364791288566245, + "grad_norm": 0.32683515548706055, + "learning_rate": 0.0002, + "loss": 1.1354, + "step": 680 + }, + { + "epoch": 0.5009074410163339, + "grad_norm": 0.4316163659095764, + "learning_rate": 0.0002, + "loss": 1.207, + "step": 690 + }, + { + "epoch": 0.5081669691470054, + "grad_norm": 0.342602401971817, + "learning_rate": 0.0002, + "loss": 1.2116, + "step": 700 + }, + { + "epoch": 0.515426497277677, + "grad_norm": 0.2794898748397827, + "learning_rate": 0.0002, + "loss": 1.0433, + "step": 710 + }, + { + "epoch": 0.5226860254083484, + "grad_norm": 0.3322339951992035, + "learning_rate": 0.0002, + "loss": 1.2395, + "step": 720 + }, + { + "epoch": 0.52994555353902, + "grad_norm": 0.3088509142398834, + "learning_rate": 0.0002, + "loss": 1.0762, + "step": 730 + }, + { + "epoch": 0.5372050816696915, + "grad_norm": 0.24444378912448883, + "learning_rate": 0.0002, + "loss": 1.1495, + "step": 740 + }, + { + "epoch": 0.5444646098003629, + "grad_norm": 0.3483171761035919, + "learning_rate": 0.0002, + "loss": 1.2176, + "step": 750 + }, + { + "epoch": 0.5517241379310345, + "grad_norm": 0.3425690531730652, + "learning_rate": 0.0002, + "loss": 1.2248, + "step": 760 + }, + { + "epoch": 0.558983666061706, + "grad_norm": 0.31841927766799927, + "learning_rate": 0.0002, + "loss": 1.1127, + "step": 770 + }, + { + "epoch": 0.5662431941923775, + "grad_norm": 0.39423868060112, + "learning_rate": 0.0002, + "loss": 1.1841, + "step": 780 + }, + { + "epoch": 0.573502722323049, + "grad_norm": 0.30328479409217834, + "learning_rate": 0.0002, + "loss": 1.0583, + "step": 790 + }, + { + "epoch": 0.5807622504537205, + "grad_norm": 0.24475938081741333, + "learning_rate": 0.0002, + "loss": 1.0718, + "step": 800 + }, + { + "epoch": 0.588021778584392, + "grad_norm": 0.37132805585861206, + "learning_rate": 0.0002, + "loss": 1.2006, + "step": 810 + }, + { + "epoch": 0.5952813067150635, + "grad_norm": 0.32195979356765747, + "learning_rate": 0.0002, + "loss": 1.1544, + "step": 820 + }, + { + "epoch": 0.6025408348457351, + "grad_norm": 0.2848738729953766, + "learning_rate": 0.0002, + "loss": 0.9937, + "step": 830 + }, + { + "epoch": 0.6098003629764065, + "grad_norm": 0.28015264868736267, + "learning_rate": 0.0002, + "loss": 1.0937, + "step": 840 + }, + { + "epoch": 0.617059891107078, + "grad_norm": 0.37796008586883545, + "learning_rate": 0.0002, + "loss": 1.1234, + "step": 850 + }, + { + "epoch": 0.6243194192377496, + "grad_norm": 0.39311841130256653, + "learning_rate": 0.0002, + "loss": 1.1455, + "step": 860 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 0.2761685252189636, + "learning_rate": 0.0002, + "loss": 1.1066, + "step": 870 + }, + { + "epoch": 0.6388384754990926, + "grad_norm": 0.3826720118522644, + "learning_rate": 0.0002, + "loss": 1.0859, + "step": 880 + }, + { + "epoch": 0.6460980036297641, + "grad_norm": 0.30076679587364197, + "learning_rate": 0.0002, + "loss": 1.1182, + "step": 890 + }, + { + "epoch": 0.6533575317604355, + "grad_norm": 0.21997687220573425, + "learning_rate": 0.0002, + "loss": 1.0927, + "step": 900 + }, + { + "epoch": 0.6606170598911071, + "grad_norm": 0.32593777775764465, + "learning_rate": 0.0002, + "loss": 1.1603, + "step": 910 + }, + { + "epoch": 0.6678765880217786, + "grad_norm": 0.30347898602485657, + "learning_rate": 0.0002, + "loss": 1.1578, + "step": 920 + }, + { + "epoch": 0.6751361161524501, + "grad_norm": 0.44173774123191833, + "learning_rate": 0.0002, + "loss": 1.1642, + "step": 930 + }, + { + "epoch": 0.6823956442831216, + "grad_norm": 0.2507467269897461, + "learning_rate": 0.0002, + "loss": 1.1832, + "step": 940 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 0.29463833570480347, + "learning_rate": 0.0002, + "loss": 1.3001, + "step": 950 + }, + { + "epoch": 0.6969147005444646, + "grad_norm": 0.9363154172897339, + "learning_rate": 0.0002, + "loss": 1.1651, + "step": 960 + }, + { + "epoch": 0.7041742286751361, + "grad_norm": 0.3236212134361267, + "learning_rate": 0.0002, + "loss": 1.158, + "step": 970 + }, + { + "epoch": 0.7114337568058077, + "grad_norm": 0.3123254179954529, + "learning_rate": 0.0002, + "loss": 1.1078, + "step": 980 + }, + { + "epoch": 0.7186932849364791, + "grad_norm": 0.3395805060863495, + "learning_rate": 0.0002, + "loss": 1.1578, + "step": 990 + }, + { + "epoch": 0.7259528130671506, + "grad_norm": 0.3240964412689209, + "learning_rate": 0.0002, + "loss": 1.0754, + "step": 1000 + }, + { + "epoch": 0.7332123411978222, + "grad_norm": 0.31902948021888733, + "learning_rate": 0.0002, + "loss": 1.0313, + "step": 1010 + }, + { + "epoch": 0.7404718693284936, + "grad_norm": 0.4848408102989197, + "learning_rate": 0.0002, + "loss": 1.1664, + "step": 1020 + }, + { + "epoch": 0.7477313974591652, + "grad_norm": 0.33006033301353455, + "learning_rate": 0.0002, + "loss": 1.0838, + "step": 1030 + }, + { + "epoch": 0.7549909255898367, + "grad_norm": 0.2928730547428131, + "learning_rate": 0.0002, + "loss": 1.0979, + "step": 1040 + }, + { + "epoch": 0.7622504537205081, + "grad_norm": 0.3529164791107178, + "learning_rate": 0.0002, + "loss": 1.1351, + "step": 1050 + }, + { + "epoch": 0.7695099818511797, + "grad_norm": 0.2736213803291321, + "learning_rate": 0.0002, + "loss": 1.1274, + "step": 1060 + }, + { + "epoch": 0.7767695099818511, + "grad_norm": 0.7200686931610107, + "learning_rate": 0.0002, + "loss": 1.1169, + "step": 1070 + }, + { + "epoch": 0.7840290381125227, + "grad_norm": 0.33396708965301514, + "learning_rate": 0.0002, + "loss": 1.1171, + "step": 1080 + }, + { + "epoch": 0.7912885662431942, + "grad_norm": 1.5760449171066284, + "learning_rate": 0.0002, + "loss": 1.2447, + "step": 1090 + }, + { + "epoch": 0.7985480943738656, + "grad_norm": 0.28138381242752075, + "learning_rate": 0.0002, + "loss": 1.1126, + "step": 1100 + }, + { + "epoch": 0.8058076225045372, + "grad_norm": 0.2597472369670868, + "learning_rate": 0.0002, + "loss": 1.2012, + "step": 1110 + }, + { + "epoch": 0.8130671506352087, + "grad_norm": 0.3305445611476898, + "learning_rate": 0.0002, + "loss": 1.1177, + "step": 1120 + }, + { + "epoch": 0.8203266787658802, + "grad_norm": 0.3934599459171295, + "learning_rate": 0.0002, + "loss": 1.0849, + "step": 1130 + }, + { + "epoch": 0.8275862068965517, + "grad_norm": 0.3472191393375397, + "learning_rate": 0.0002, + "loss": 1.0889, + "step": 1140 + }, + { + "epoch": 0.8348457350272233, + "grad_norm": 0.2857365906238556, + "learning_rate": 0.0002, + "loss": 1.265, + "step": 1150 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.3207702934741974, + "learning_rate": 0.0002, + "loss": 1.03, + "step": 1160 + }, + { + "epoch": 0.8493647912885662, + "grad_norm": 0.3176484704017639, + "learning_rate": 0.0002, + "loss": 1.1669, + "step": 1170 + }, + { + "epoch": 0.8566243194192378, + "grad_norm": 0.40685558319091797, + "learning_rate": 0.0002, + "loss": 1.1386, + "step": 1180 + }, + { + "epoch": 0.8638838475499092, + "grad_norm": 0.31125199794769287, + "learning_rate": 0.0002, + "loss": 1.1383, + "step": 1190 + }, + { + "epoch": 0.8711433756805808, + "grad_norm": 0.7361181378364563, + "learning_rate": 0.0002, + "loss": 1.1512, + "step": 1200 + }, + { + "epoch": 0.8784029038112523, + "grad_norm": 0.33699527382850647, + "learning_rate": 0.0002, + "loss": 0.9983, + "step": 1210 + }, + { + "epoch": 0.8856624319419237, + "grad_norm": 0.3315220773220062, + "learning_rate": 0.0002, + "loss": 1.11, + "step": 1220 + }, + { + "epoch": 0.8929219600725953, + "grad_norm": 0.6256054043769836, + "learning_rate": 0.0002, + "loss": 1.0266, + "step": 1230 + }, + { + "epoch": 0.9001814882032668, + "grad_norm": 0.3692137897014618, + "learning_rate": 0.0002, + "loss": 1.1738, + "step": 1240 + }, + { + "epoch": 0.9074410163339383, + "grad_norm": 0.3538484573364258, + "learning_rate": 0.0002, + "loss": 1.1112, + "step": 1250 + }, + { + "epoch": 0.9147005444646098, + "grad_norm": 0.27863040566444397, + "learning_rate": 0.0002, + "loss": 1.0878, + "step": 1260 + }, + { + "epoch": 0.9219600725952813, + "grad_norm": 0.3322528302669525, + "learning_rate": 0.0002, + "loss": 1.0826, + "step": 1270 + }, + { + "epoch": 0.9292196007259528, + "grad_norm": 0.7553173303604126, + "learning_rate": 0.0002, + "loss": 1.0158, + "step": 1280 + }, + { + "epoch": 0.9364791288566243, + "grad_norm": 0.4856191575527191, + "learning_rate": 0.0002, + "loss": 1.198, + "step": 1290 + }, + { + "epoch": 0.9437386569872959, + "grad_norm": 0.3668074905872345, + "learning_rate": 0.0002, + "loss": 1.1678, + "step": 1300 + }, + { + "epoch": 0.9509981851179673, + "grad_norm": 0.29851067066192627, + "learning_rate": 0.0002, + "loss": 1.1809, + "step": 1310 + }, + { + "epoch": 0.9582577132486388, + "grad_norm": 0.276664674282074, + "learning_rate": 0.0002, + "loss": 1.1299, + "step": 1320 + }, + { + "epoch": 0.9655172413793104, + "grad_norm": 0.2941018342971802, + "learning_rate": 0.0002, + "loss": 1.0597, + "step": 1330 + }, + { + "epoch": 0.9727767695099818, + "grad_norm": 0.3505859076976776, + "learning_rate": 0.0002, + "loss": 1.1731, + "step": 1340 + }, + { + "epoch": 0.9800362976406534, + "grad_norm": 0.3067687451839447, + "learning_rate": 0.0002, + "loss": 1.1809, + "step": 1350 + }, + { + "epoch": 0.9872958257713249, + "grad_norm": 0.27151066064834595, + "learning_rate": 0.0002, + "loss": 1.1511, + "step": 1360 + }, + { + "epoch": 0.9945553539019963, + "grad_norm": 0.36370083689689636, + "learning_rate": 0.0002, + "loss": 1.1919, + "step": 1370 + }, + { + "epoch": 0.9996370235934664, + "eval_loss": 1.1381088495254517, + "eval_runtime": 96.0848, + "eval_samples_per_second": 4.538, + "eval_steps_per_second": 0.572, + "step": 1377 + }, + { + "epoch": 1.0018148820326678, + "grad_norm": 0.27980583906173706, + "learning_rate": 0.0002, + "loss": 1.0082, + "step": 1380 + }, + { + "epoch": 1.0090744101633393, + "grad_norm": 0.26713913679122925, + "learning_rate": 0.0002, + "loss": 1.0565, + "step": 1390 + }, + { + "epoch": 1.0163339382940109, + "grad_norm": 0.3089541494846344, + "learning_rate": 0.0002, + "loss": 1.0241, + "step": 1400 + }, + { + "epoch": 1.0235934664246824, + "grad_norm": 0.4188242256641388, + "learning_rate": 0.0002, + "loss": 0.9649, + "step": 1410 + }, + { + "epoch": 1.030852994555354, + "grad_norm": 0.5246463418006897, + "learning_rate": 0.0002, + "loss": 0.9821, + "step": 1420 + }, + { + "epoch": 1.0381125226860255, + "grad_norm": 0.2728777825832367, + "learning_rate": 0.0002, + "loss": 1.0143, + "step": 1430 + }, + { + "epoch": 1.0453720508166968, + "grad_norm": 0.38167616724967957, + "learning_rate": 0.0002, + "loss": 1.075, + "step": 1440 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 0.4439380168914795, + "learning_rate": 0.0002, + "loss": 1.0229, + "step": 1450 + }, + { + "epoch": 1.05989110707804, + "grad_norm": 0.30954182147979736, + "learning_rate": 0.0002, + "loss": 1.0451, + "step": 1460 + }, + { + "epoch": 1.0671506352087115, + "grad_norm": 0.4022280275821686, + "learning_rate": 0.0002, + "loss": 1.0762, + "step": 1470 + }, + { + "epoch": 1.074410163339383, + "grad_norm": 0.7390811443328857, + "learning_rate": 0.0002, + "loss": 1.0754, + "step": 1480 + }, + { + "epoch": 1.0816696914700545, + "grad_norm": 0.3885486423969269, + "learning_rate": 0.0002, + "loss": 1.1378, + "step": 1490 + }, + { + "epoch": 1.0889292196007259, + "grad_norm": 0.5275560617446899, + "learning_rate": 0.0002, + "loss": 1.005, + "step": 1500 + }, + { + "epoch": 1.0961887477313974, + "grad_norm": 0.35112282633781433, + "learning_rate": 0.0002, + "loss": 1.0147, + "step": 1510 + }, + { + "epoch": 1.103448275862069, + "grad_norm": 0.33714351058006287, + "learning_rate": 0.0002, + "loss": 1.0331, + "step": 1520 + }, + { + "epoch": 1.1107078039927405, + "grad_norm": 0.31221693754196167, + "learning_rate": 0.0002, + "loss": 0.8723, + "step": 1530 + }, + { + "epoch": 1.117967332123412, + "grad_norm": 0.27549654245376587, + "learning_rate": 0.0002, + "loss": 1.0701, + "step": 1540 + }, + { + "epoch": 1.1252268602540836, + "grad_norm": 0.8465521335601807, + "learning_rate": 0.0002, + "loss": 1.0114, + "step": 1550 + }, + { + "epoch": 1.132486388384755, + "grad_norm": 0.36125949025154114, + "learning_rate": 0.0002, + "loss": 0.9643, + "step": 1560 + }, + { + "epoch": 1.1397459165154264, + "grad_norm": 0.37420371174812317, + "learning_rate": 0.0002, + "loss": 1.0744, + "step": 1570 + }, + { + "epoch": 1.147005444646098, + "grad_norm": 0.3294760584831238, + "learning_rate": 0.0002, + "loss": 1.165, + "step": 1580 + }, + { + "epoch": 1.1542649727767695, + "grad_norm": 0.3881238102912903, + "learning_rate": 0.0002, + "loss": 0.9207, + "step": 1590 + }, + { + "epoch": 1.161524500907441, + "grad_norm": 0.4766491651535034, + "learning_rate": 0.0002, + "loss": 1.0983, + "step": 1600 + }, + { + "epoch": 1.1687840290381124, + "grad_norm": 0.395530104637146, + "learning_rate": 0.0002, + "loss": 1.0092, + "step": 1610 + }, + { + "epoch": 1.176043557168784, + "grad_norm": 0.3297106623649597, + "learning_rate": 0.0002, + "loss": 0.9895, + "step": 1620 + }, + { + "epoch": 1.1833030852994555, + "grad_norm": 0.39528271555900574, + "learning_rate": 0.0002, + "loss": 1.1121, + "step": 1630 + }, + { + "epoch": 1.190562613430127, + "grad_norm": 0.3370221257209778, + "learning_rate": 0.0002, + "loss": 0.9202, + "step": 1640 + }, + { + "epoch": 1.1978221415607986, + "grad_norm": 0.31922030448913574, + "learning_rate": 0.0002, + "loss": 1.102, + "step": 1650 + }, + { + "epoch": 1.2050816696914701, + "grad_norm": 0.6142027378082275, + "learning_rate": 0.0002, + "loss": 1.0565, + "step": 1660 + }, + { + "epoch": 1.2123411978221417, + "grad_norm": 0.44769710302352905, + "learning_rate": 0.0002, + "loss": 0.9541, + "step": 1670 + }, + { + "epoch": 1.219600725952813, + "grad_norm": 0.41233646869659424, + "learning_rate": 0.0002, + "loss": 1.2501, + "step": 1680 + }, + { + "epoch": 1.2268602540834845, + "grad_norm": 0.2928866147994995, + "learning_rate": 0.0002, + "loss": 1.0147, + "step": 1690 + }, + { + "epoch": 1.234119782214156, + "grad_norm": 0.36913734674453735, + "learning_rate": 0.0002, + "loss": 0.9074, + "step": 1700 + }, + { + "epoch": 1.2413793103448276, + "grad_norm": 0.5281891226768494, + "learning_rate": 0.0002, + "loss": 1.0847, + "step": 1710 + }, + { + "epoch": 1.2486388384754992, + "grad_norm": 0.3374697268009186, + "learning_rate": 0.0002, + "loss": 0.9482, + "step": 1720 + }, + { + "epoch": 1.2558983666061705, + "grad_norm": 0.3802020847797394, + "learning_rate": 0.0002, + "loss": 0.9826, + "step": 1730 + }, + { + "epoch": 1.263157894736842, + "grad_norm": 0.38048651814460754, + "learning_rate": 0.0002, + "loss": 1.0191, + "step": 1740 + }, + { + "epoch": 1.2704174228675136, + "grad_norm": 0.6676169633865356, + "learning_rate": 0.0002, + "loss": 1.0142, + "step": 1750 + }, + { + "epoch": 1.2776769509981851, + "grad_norm": 0.4075961410999298, + "learning_rate": 0.0002, + "loss": 0.992, + "step": 1760 + }, + { + "epoch": 1.2849364791288567, + "grad_norm": 0.4374721348285675, + "learning_rate": 0.0002, + "loss": 1.0301, + "step": 1770 + }, + { + "epoch": 1.2921960072595282, + "grad_norm": 0.4638824164867401, + "learning_rate": 0.0002, + "loss": 1.0601, + "step": 1780 + }, + { + "epoch": 1.2994555353901998, + "grad_norm": 0.38631564378738403, + "learning_rate": 0.0002, + "loss": 0.9593, + "step": 1790 + }, + { + "epoch": 1.306715063520871, + "grad_norm": 0.35873809456825256, + "learning_rate": 0.0002, + "loss": 1.0042, + "step": 1800 + }, + { + "epoch": 1.3139745916515426, + "grad_norm": 0.33602237701416016, + "learning_rate": 0.0002, + "loss": 0.9835, + "step": 1810 + }, + { + "epoch": 1.3212341197822142, + "grad_norm": 0.46696463227272034, + "learning_rate": 0.0002, + "loss": 1.1032, + "step": 1820 + }, + { + "epoch": 1.3284936479128857, + "grad_norm": 0.368958979845047, + "learning_rate": 0.0002, + "loss": 1.0123, + "step": 1830 + }, + { + "epoch": 1.335753176043557, + "grad_norm": 0.3160957396030426, + "learning_rate": 0.0002, + "loss": 1.0456, + "step": 1840 + }, + { + "epoch": 1.3430127041742286, + "grad_norm": 0.4511511027812958, + "learning_rate": 0.0002, + "loss": 1.0264, + "step": 1850 + }, + { + "epoch": 1.3502722323049001, + "grad_norm": 0.6769845485687256, + "learning_rate": 0.0002, + "loss": 1.0179, + "step": 1860 + }, + { + "epoch": 1.3575317604355717, + "grad_norm": 0.3749309480190277, + "learning_rate": 0.0002, + "loss": 1.011, + "step": 1870 + }, + { + "epoch": 1.3647912885662432, + "grad_norm": 0.553798496723175, + "learning_rate": 0.0002, + "loss": 1.0223, + "step": 1880 + }, + { + "epoch": 1.3720508166969148, + "grad_norm": 0.3538985550403595, + "learning_rate": 0.0002, + "loss": 0.9634, + "step": 1890 + }, + { + "epoch": 1.3793103448275863, + "grad_norm": 0.5501534938812256, + "learning_rate": 0.0002, + "loss": 1.0275, + "step": 1900 + }, + { + "epoch": 1.3865698729582578, + "grad_norm": 0.4432051181793213, + "learning_rate": 0.0002, + "loss": 1.0461, + "step": 1910 + }, + { + "epoch": 1.3938294010889292, + "grad_norm": 0.41755786538124084, + "learning_rate": 0.0002, + "loss": 0.9082, + "step": 1920 + }, + { + "epoch": 1.4010889292196007, + "grad_norm": 0.5626114010810852, + "learning_rate": 0.0002, + "loss": 1.0858, + "step": 1930 + }, + { + "epoch": 1.4083484573502723, + "grad_norm": 0.44038185477256775, + "learning_rate": 0.0002, + "loss": 1.0687, + "step": 1940 + }, + { + "epoch": 1.4156079854809438, + "grad_norm": 0.3397001624107361, + "learning_rate": 0.0002, + "loss": 0.9454, + "step": 1950 + }, + { + "epoch": 1.4228675136116151, + "grad_norm": 0.4325368106365204, + "learning_rate": 0.0002, + "loss": 0.943, + "step": 1960 + }, + { + "epoch": 1.4301270417422867, + "grad_norm": 0.3900907039642334, + "learning_rate": 0.0002, + "loss": 1.0488, + "step": 1970 + }, + { + "epoch": 1.4373865698729582, + "grad_norm": 0.369612455368042, + "learning_rate": 0.0002, + "loss": 0.9699, + "step": 1980 + }, + { + "epoch": 1.4446460980036298, + "grad_norm": 0.4389338791370392, + "learning_rate": 0.0002, + "loss": 1.0609, + "step": 1990 + }, + { + "epoch": 1.4519056261343013, + "grad_norm": 1.694450855255127, + "learning_rate": 0.0002, + "loss": 1.042, + "step": 2000 + }, + { + "epoch": 1.4591651542649728, + "grad_norm": 0.516957700252533, + "learning_rate": 0.0002, + "loss": 0.997, + "step": 2010 + }, + { + "epoch": 1.4664246823956444, + "grad_norm": 0.45515501499176025, + "learning_rate": 0.0002, + "loss": 1.0979, + "step": 2020 + }, + { + "epoch": 1.4736842105263157, + "grad_norm": 0.4153139591217041, + "learning_rate": 0.0002, + "loss": 1.0188, + "step": 2030 + }, + { + "epoch": 1.4809437386569873, + "grad_norm": 0.44353052973747253, + "learning_rate": 0.0002, + "loss": 1.1521, + "step": 2040 + }, + { + "epoch": 1.4882032667876588, + "grad_norm": 0.570554256439209, + "learning_rate": 0.0002, + "loss": 0.9653, + "step": 2050 + }, + { + "epoch": 1.4954627949183303, + "grad_norm": 0.5742740035057068, + "learning_rate": 0.0002, + "loss": 1.0352, + "step": 2060 + }, + { + "epoch": 1.5027223230490017, + "grad_norm": 0.5890517830848694, + "learning_rate": 0.0002, + "loss": 1.0963, + "step": 2070 + }, + { + "epoch": 1.5099818511796732, + "grad_norm": 0.4162650406360626, + "learning_rate": 0.0002, + "loss": 1.0639, + "step": 2080 + }, + { + "epoch": 1.5172413793103448, + "grad_norm": 0.7334717512130737, + "learning_rate": 0.0002, + "loss": 1.071, + "step": 2090 + }, + { + "epoch": 1.5245009074410163, + "grad_norm": 0.2963249683380127, + "learning_rate": 0.0002, + "loss": 0.8957, + "step": 2100 + }, + { + "epoch": 1.5317604355716878, + "grad_norm": 0.30676454305648804, + "learning_rate": 0.0002, + "loss": 1.0446, + "step": 2110 + }, + { + "epoch": 1.5390199637023594, + "grad_norm": 0.35984641313552856, + "learning_rate": 0.0002, + "loss": 0.9967, + "step": 2120 + }, + { + "epoch": 1.546279491833031, + "grad_norm": 0.3384549617767334, + "learning_rate": 0.0002, + "loss": 1.0352, + "step": 2130 + }, + { + "epoch": 1.5535390199637025, + "grad_norm": 0.4725518226623535, + "learning_rate": 0.0002, + "loss": 0.9874, + "step": 2140 + }, + { + "epoch": 1.560798548094374, + "grad_norm": 0.3252685070037842, + "learning_rate": 0.0002, + "loss": 1.1166, + "step": 2150 + }, + { + "epoch": 1.5680580762250453, + "grad_norm": 0.45043081045150757, + "learning_rate": 0.0002, + "loss": 0.9127, + "step": 2160 + }, + { + "epoch": 1.5753176043557169, + "grad_norm": 0.374208003282547, + "learning_rate": 0.0002, + "loss": 1.0767, + "step": 2170 + }, + { + "epoch": 1.5825771324863884, + "grad_norm": 0.5118404030799866, + "learning_rate": 0.0002, + "loss": 1.0255, + "step": 2180 + }, + { + "epoch": 1.5898366606170597, + "grad_norm": 0.482170969247818, + "learning_rate": 0.0002, + "loss": 1.0062, + "step": 2190 + }, + { + "epoch": 1.5970961887477313, + "grad_norm": 0.5337533950805664, + "learning_rate": 0.0002, + "loss": 1.0872, + "step": 2200 + }, + { + "epoch": 1.6043557168784028, + "grad_norm": 0.5195064544677734, + "learning_rate": 0.0002, + "loss": 1.0405, + "step": 2210 + }, + { + "epoch": 1.6116152450090744, + "grad_norm": 0.30807098746299744, + "learning_rate": 0.0002, + "loss": 1.0454, + "step": 2220 + }, + { + "epoch": 1.618874773139746, + "grad_norm": 0.3962925672531128, + "learning_rate": 0.0002, + "loss": 1.0293, + "step": 2230 + }, + { + "epoch": 1.6261343012704175, + "grad_norm": 0.7636962532997131, + "learning_rate": 0.0002, + "loss": 1.0137, + "step": 2240 + }, + { + "epoch": 1.633393829401089, + "grad_norm": 0.32380592823028564, + "learning_rate": 0.0002, + "loss": 0.991, + "step": 2250 + }, + { + "epoch": 1.6406533575317606, + "grad_norm": 0.5767741799354553, + "learning_rate": 0.0002, + "loss": 0.9471, + "step": 2260 + }, + { + "epoch": 1.647912885662432, + "grad_norm": 0.39964812994003296, + "learning_rate": 0.0002, + "loss": 0.9524, + "step": 2270 + }, + { + "epoch": 1.6551724137931034, + "grad_norm": 0.622629702091217, + "learning_rate": 0.0002, + "loss": 0.9866, + "step": 2280 + }, + { + "epoch": 1.662431941923775, + "grad_norm": 0.40202152729034424, + "learning_rate": 0.0002, + "loss": 0.9609, + "step": 2290 + }, + { + "epoch": 1.6696914700544465, + "grad_norm": 0.4467349052429199, + "learning_rate": 0.0002, + "loss": 0.9859, + "step": 2300 + }, + { + "epoch": 1.6769509981851178, + "grad_norm": 0.5026949048042297, + "learning_rate": 0.0002, + "loss": 1.0312, + "step": 2310 + }, + { + "epoch": 1.6842105263157894, + "grad_norm": 0.43754541873931885, + "learning_rate": 0.0002, + "loss": 1.0776, + "step": 2320 + }, + { + "epoch": 1.691470054446461, + "grad_norm": 0.42869430780410767, + "learning_rate": 0.0002, + "loss": 0.9786, + "step": 2330 + }, + { + "epoch": 1.6987295825771325, + "grad_norm": 0.4192679524421692, + "learning_rate": 0.0002, + "loss": 1.0747, + "step": 2340 + }, + { + "epoch": 1.705989110707804, + "grad_norm": 0.3243155777454376, + "learning_rate": 0.0002, + "loss": 1.0054, + "step": 2350 + }, + { + "epoch": 1.7132486388384756, + "grad_norm": 1.0514075756072998, + "learning_rate": 0.0002, + "loss": 0.9165, + "step": 2360 + }, + { + "epoch": 1.720508166969147, + "grad_norm": 0.4719122648239136, + "learning_rate": 0.0002, + "loss": 1.1353, + "step": 2370 + }, + { + "epoch": 1.7277676950998186, + "grad_norm": 0.3846144676208496, + "learning_rate": 0.0002, + "loss": 0.9913, + "step": 2380 + }, + { + "epoch": 1.73502722323049, + "grad_norm": 0.7266581058502197, + "learning_rate": 0.0002, + "loss": 0.9935, + "step": 2390 + }, + { + "epoch": 1.7422867513611615, + "grad_norm": 0.6183241605758667, + "learning_rate": 0.0002, + "loss": 0.9509, + "step": 2400 + }, + { + "epoch": 1.749546279491833, + "grad_norm": 0.3658260405063629, + "learning_rate": 0.0002, + "loss": 0.9713, + "step": 2410 + }, + { + "epoch": 1.7568058076225044, + "grad_norm": 0.6036322712898254, + "learning_rate": 0.0002, + "loss": 1.0033, + "step": 2420 + }, + { + "epoch": 1.764065335753176, + "grad_norm": 0.7872936129570007, + "learning_rate": 0.0002, + "loss": 1.0526, + "step": 2430 + }, + { + "epoch": 1.7713248638838475, + "grad_norm": 0.35946124792099, + "learning_rate": 0.0002, + "loss": 0.99, + "step": 2440 + }, + { + "epoch": 1.778584392014519, + "grad_norm": 0.3740338981151581, + "learning_rate": 0.0002, + "loss": 0.8845, + "step": 2450 + }, + { + "epoch": 1.7858439201451906, + "grad_norm": 0.6150230169296265, + "learning_rate": 0.0002, + "loss": 0.9333, + "step": 2460 + }, + { + "epoch": 1.793103448275862, + "grad_norm": 0.4726075530052185, + "learning_rate": 0.0002, + "loss": 1.0373, + "step": 2470 + }, + { + "epoch": 1.8003629764065336, + "grad_norm": 0.31292253732681274, + "learning_rate": 0.0002, + "loss": 0.974, + "step": 2480 + }, + { + "epoch": 1.8076225045372052, + "grad_norm": 0.4463104009628296, + "learning_rate": 0.0002, + "loss": 1.0205, + "step": 2490 + }, + { + "epoch": 1.8148820326678767, + "grad_norm": 0.7848200798034668, + "learning_rate": 0.0002, + "loss": 1.0771, + "step": 2500 + }, + { + "epoch": 1.822141560798548, + "grad_norm": 0.5562082529067993, + "learning_rate": 0.0002, + "loss": 0.9508, + "step": 2510 + }, + { + "epoch": 1.8294010889292196, + "grad_norm": 0.39892756938934326, + "learning_rate": 0.0002, + "loss": 0.9343, + "step": 2520 + }, + { + "epoch": 1.8366606170598911, + "grad_norm": 1.2923320531845093, + "learning_rate": 0.0002, + "loss": 1.0489, + "step": 2530 + }, + { + "epoch": 1.8439201451905625, + "grad_norm": 0.6316490769386292, + "learning_rate": 0.0002, + "loss": 1.1203, + "step": 2540 + }, + { + "epoch": 1.851179673321234, + "grad_norm": 0.46100497245788574, + "learning_rate": 0.0002, + "loss": 0.9131, + "step": 2550 + }, + { + "epoch": 1.8584392014519056, + "grad_norm": 0.7902987003326416, + "learning_rate": 0.0002, + "loss": 1.0977, + "step": 2560 + }, + { + "epoch": 1.865698729582577, + "grad_norm": 0.4596365690231323, + "learning_rate": 0.0002, + "loss": 0.9702, + "step": 2570 + }, + { + "epoch": 1.8729582577132486, + "grad_norm": 0.6592172384262085, + "learning_rate": 0.0002, + "loss": 0.9322, + "step": 2580 + }, + { + "epoch": 1.8802177858439202, + "grad_norm": 0.5764662027359009, + "learning_rate": 0.0002, + "loss": 0.9549, + "step": 2590 + }, + { + "epoch": 1.8874773139745917, + "grad_norm": 0.8421637415885925, + "learning_rate": 0.0002, + "loss": 0.9485, + "step": 2600 + }, + { + "epoch": 1.8947368421052633, + "grad_norm": 0.5635305047035217, + "learning_rate": 0.0002, + "loss": 1.012, + "step": 2610 + }, + { + "epoch": 1.9019963702359348, + "grad_norm": 0.46231237053871155, + "learning_rate": 0.0002, + "loss": 0.8907, + "step": 2620 + }, + { + "epoch": 1.9092558983666061, + "grad_norm": 0.3944607079029083, + "learning_rate": 0.0002, + "loss": 0.9543, + "step": 2630 + }, + { + "epoch": 1.9165154264972777, + "grad_norm": 0.4753907322883606, + "learning_rate": 0.0002, + "loss": 0.9964, + "step": 2640 + }, + { + "epoch": 1.9237749546279492, + "grad_norm": 0.4151090979576111, + "learning_rate": 0.0002, + "loss": 1.0217, + "step": 2650 + }, + { + "epoch": 1.9310344827586206, + "grad_norm": 0.6793725490570068, + "learning_rate": 0.0002, + "loss": 1.0397, + "step": 2660 + }, + { + "epoch": 1.938294010889292, + "grad_norm": 0.339755117893219, + "learning_rate": 0.0002, + "loss": 1.0401, + "step": 2670 + }, + { + "epoch": 1.9455535390199636, + "grad_norm": 0.40789374709129333, + "learning_rate": 0.0002, + "loss": 1.082, + "step": 2680 + }, + { + "epoch": 1.9528130671506352, + "grad_norm": 0.3750005066394806, + "learning_rate": 0.0002, + "loss": 0.9973, + "step": 2690 + }, + { + "epoch": 1.9600725952813067, + "grad_norm": 0.39684441685676575, + "learning_rate": 0.0002, + "loss": 0.9837, + "step": 2700 + }, + { + "epoch": 1.9673321234119783, + "grad_norm": 0.378287672996521, + "learning_rate": 0.0002, + "loss": 1.1204, + "step": 2710 + }, + { + "epoch": 1.9745916515426498, + "grad_norm": 0.3668482005596161, + "learning_rate": 0.0002, + "loss": 0.9284, + "step": 2720 + }, + { + "epoch": 1.9818511796733214, + "grad_norm": 0.49997565150260925, + "learning_rate": 0.0002, + "loss": 1.0512, + "step": 2730 + }, + { + "epoch": 1.989110707803993, + "grad_norm": 0.36852124333381653, + "learning_rate": 0.0002, + "loss": 0.9311, + "step": 2740 + }, + { + "epoch": 1.9963702359346642, + "grad_norm": 0.5203380584716797, + "learning_rate": 0.0002, + "loss": 1.0898, + "step": 2750 + }, + { + "epoch": 2.0, + "eval_loss": 1.1086540222167969, + "eval_runtime": 95.6866, + "eval_samples_per_second": 4.557, + "eval_steps_per_second": 0.575, + "step": 2755 + }, + { + "epoch": 2.0036297640653356, + "grad_norm": 0.35921573638916016, + "learning_rate": 0.0002, + "loss": 0.9791, + "step": 2760 + }, + { + "epoch": 2.010889292196007, + "grad_norm": 1.013861894607544, + "learning_rate": 0.0002, + "loss": 0.8968, + "step": 2770 + }, + { + "epoch": 2.0181488203266786, + "grad_norm": 0.4425240159034729, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 2780 + }, + { + "epoch": 2.02540834845735, + "grad_norm": 0.60174161195755, + "learning_rate": 0.0002, + "loss": 0.8758, + "step": 2790 + }, + { + "epoch": 2.0326678765880217, + "grad_norm": 0.47582098841667175, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 2800 + }, + { + "epoch": 2.0399274047186933, + "grad_norm": 0.6012811660766602, + "learning_rate": 0.0002, + "loss": 0.8898, + "step": 2810 + }, + { + "epoch": 2.047186932849365, + "grad_norm": 0.4444408118724823, + "learning_rate": 0.0002, + "loss": 0.8109, + "step": 2820 + }, + { + "epoch": 2.0544464609800364, + "grad_norm": 0.4864003360271454, + "learning_rate": 0.0002, + "loss": 0.8621, + "step": 2830 + }, + { + "epoch": 2.061705989110708, + "grad_norm": 0.5104215741157532, + "learning_rate": 0.0002, + "loss": 0.8817, + "step": 2840 + }, + { + "epoch": 2.0689655172413794, + "grad_norm": 0.6218489408493042, + "learning_rate": 0.0002, + "loss": 0.7789, + "step": 2850 + }, + { + "epoch": 2.076225045372051, + "grad_norm": 0.705784261226654, + "learning_rate": 0.0002, + "loss": 0.8778, + "step": 2860 + }, + { + "epoch": 2.0834845735027225, + "grad_norm": 0.48091503977775574, + "learning_rate": 0.0002, + "loss": 0.7851, + "step": 2870 + }, + { + "epoch": 2.0907441016333936, + "grad_norm": 0.5062456727027893, + "learning_rate": 0.0002, + "loss": 0.8444, + "step": 2880 + }, + { + "epoch": 2.098003629764065, + "grad_norm": 0.4862022399902344, + "learning_rate": 0.0002, + "loss": 0.7898, + "step": 2890 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 0.48264044523239136, + "learning_rate": 0.0002, + "loss": 0.8593, + "step": 2900 + }, + { + "epoch": 2.1125226860254083, + "grad_norm": 0.43744346499443054, + "learning_rate": 0.0002, + "loss": 0.8325, + "step": 2910 + }, + { + "epoch": 2.11978221415608, + "grad_norm": 0.5480492115020752, + "learning_rate": 0.0002, + "loss": 0.9099, + "step": 2920 + }, + { + "epoch": 2.1270417422867514, + "grad_norm": 0.5068560838699341, + "learning_rate": 0.0002, + "loss": 0.7727, + "step": 2930 + }, + { + "epoch": 2.134301270417423, + "grad_norm": 0.4650852680206299, + "learning_rate": 0.0002, + "loss": 0.8403, + "step": 2940 + }, + { + "epoch": 2.1415607985480944, + "grad_norm": 0.4929981231689453, + "learning_rate": 0.0002, + "loss": 0.8467, + "step": 2950 + }, + { + "epoch": 2.148820326678766, + "grad_norm": 0.6537389755249023, + "learning_rate": 0.0002, + "loss": 0.8747, + "step": 2960 + }, + { + "epoch": 2.1560798548094375, + "grad_norm": 0.8032940626144409, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 2970 + }, + { + "epoch": 2.163339382940109, + "grad_norm": 0.7131643891334534, + "learning_rate": 0.0002, + "loss": 0.7604, + "step": 2980 + }, + { + "epoch": 2.1705989110707806, + "grad_norm": 0.6034275889396667, + "learning_rate": 0.0002, + "loss": 0.8424, + "step": 2990 + }, + { + "epoch": 2.1778584392014517, + "grad_norm": 0.6081095933914185, + "learning_rate": 0.0002, + "loss": 0.8838, + "step": 3000 + }, + { + "epoch": 2.1851179673321233, + "grad_norm": 0.5706912875175476, + "learning_rate": 0.0002, + "loss": 0.9013, + "step": 3010 + }, + { + "epoch": 2.192377495462795, + "grad_norm": 0.6742380261421204, + "learning_rate": 0.0002, + "loss": 0.8473, + "step": 3020 + }, + { + "epoch": 2.1996370235934664, + "grad_norm": 0.4847496449947357, + "learning_rate": 0.0002, + "loss": 0.8372, + "step": 3030 + }, + { + "epoch": 2.206896551724138, + "grad_norm": 0.5529342889785767, + "learning_rate": 0.0002, + "loss": 0.8487, + "step": 3040 + }, + { + "epoch": 2.2141560798548094, + "grad_norm": 0.6108783483505249, + "learning_rate": 0.0002, + "loss": 0.9057, + "step": 3050 + }, + { + "epoch": 2.221415607985481, + "grad_norm": 0.8841571807861328, + "learning_rate": 0.0002, + "loss": 0.9659, + "step": 3060 + }, + { + "epoch": 2.2286751361161525, + "grad_norm": 0.4227530360221863, + "learning_rate": 0.0002, + "loss": 0.8538, + "step": 3070 + }, + { + "epoch": 2.235934664246824, + "grad_norm": 0.461935818195343, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 3080 + }, + { + "epoch": 2.2431941923774956, + "grad_norm": 0.5407412648200989, + "learning_rate": 0.0002, + "loss": 0.8542, + "step": 3090 + }, + { + "epoch": 2.250453720508167, + "grad_norm": 0.5057998895645142, + "learning_rate": 0.0002, + "loss": 0.8392, + "step": 3100 + }, + { + "epoch": 2.2577132486388383, + "grad_norm": 0.530057966709137, + "learning_rate": 0.0002, + "loss": 0.7898, + "step": 3110 + }, + { + "epoch": 2.26497277676951, + "grad_norm": 0.5066842436790466, + "learning_rate": 0.0002, + "loss": 0.8507, + "step": 3120 + }, + { + "epoch": 2.2722323049001814, + "grad_norm": 0.5069178342819214, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 3130 + }, + { + "epoch": 2.279491833030853, + "grad_norm": 0.6095499396324158, + "learning_rate": 0.0002, + "loss": 0.8458, + "step": 3140 + }, + { + "epoch": 2.2867513611615244, + "grad_norm": 0.49524766206741333, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 3150 + }, + { + "epoch": 2.294010889292196, + "grad_norm": 0.5334409475326538, + "learning_rate": 0.0002, + "loss": 0.7568, + "step": 3160 + }, + { + "epoch": 2.3012704174228675, + "grad_norm": 1.681748867034912, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 3170 + }, + { + "epoch": 2.308529945553539, + "grad_norm": 0.7225565314292908, + "learning_rate": 0.0002, + "loss": 0.8155, + "step": 3180 + }, + { + "epoch": 2.3157894736842106, + "grad_norm": 0.5379388928413391, + "learning_rate": 0.0002, + "loss": 0.8034, + "step": 3190 + }, + { + "epoch": 2.323049001814882, + "grad_norm": 0.45770326256752014, + "learning_rate": 0.0002, + "loss": 0.8833, + "step": 3200 + }, + { + "epoch": 2.3303085299455537, + "grad_norm": 0.6984533071517944, + "learning_rate": 0.0002, + "loss": 0.8419, + "step": 3210 + }, + { + "epoch": 2.337568058076225, + "grad_norm": 0.6725744605064392, + "learning_rate": 0.0002, + "loss": 0.7414, + "step": 3220 + }, + { + "epoch": 2.344827586206897, + "grad_norm": 1.1247471570968628, + "learning_rate": 0.0002, + "loss": 0.8657, + "step": 3230 + }, + { + "epoch": 2.352087114337568, + "grad_norm": 1.0240263938903809, + "learning_rate": 0.0002, + "loss": 0.7782, + "step": 3240 + }, + { + "epoch": 2.3593466424682394, + "grad_norm": 0.5608096122741699, + "learning_rate": 0.0002, + "loss": 0.826, + "step": 3250 + }, + { + "epoch": 2.366606170598911, + "grad_norm": 0.8294990062713623, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 3260 + }, + { + "epoch": 2.3738656987295825, + "grad_norm": 0.6734224557876587, + "learning_rate": 0.0002, + "loss": 0.9845, + "step": 3270 + }, + { + "epoch": 2.381125226860254, + "grad_norm": 0.6862800717353821, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 3280 + }, + { + "epoch": 2.3883847549909256, + "grad_norm": 0.5442930459976196, + "learning_rate": 0.0002, + "loss": 0.7752, + "step": 3290 + }, + { + "epoch": 2.395644283121597, + "grad_norm": 0.5745394229888916, + "learning_rate": 0.0002, + "loss": 0.8543, + "step": 3300 + }, + { + "epoch": 2.4029038112522687, + "grad_norm": 0.6257799863815308, + "learning_rate": 0.0002, + "loss": 0.833, + "step": 3310 + }, + { + "epoch": 2.4101633393829403, + "grad_norm": 0.5608420968055725, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 3320 + }, + { + "epoch": 2.417422867513612, + "grad_norm": 0.5512017011642456, + "learning_rate": 0.0002, + "loss": 0.8745, + "step": 3330 + }, + { + "epoch": 2.4246823956442833, + "grad_norm": 0.7452999353408813, + "learning_rate": 0.0002, + "loss": 0.8642, + "step": 3340 + }, + { + "epoch": 2.4319419237749544, + "grad_norm": 0.4604301452636719, + "learning_rate": 0.0002, + "loss": 0.8417, + "step": 3350 + }, + { + "epoch": 2.439201451905626, + "grad_norm": 0.8225823640823364, + "learning_rate": 0.0002, + "loss": 0.9148, + "step": 3360 + }, + { + "epoch": 2.4464609800362975, + "grad_norm": 0.8604981303215027, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 3370 + }, + { + "epoch": 2.453720508166969, + "grad_norm": 0.6620925664901733, + "learning_rate": 0.0002, + "loss": 0.9276, + "step": 3380 + }, + { + "epoch": 2.4609800362976406, + "grad_norm": 0.4750158488750458, + "learning_rate": 0.0002, + "loss": 0.8381, + "step": 3390 + }, + { + "epoch": 2.468239564428312, + "grad_norm": 0.6061418056488037, + "learning_rate": 0.0002, + "loss": 0.8249, + "step": 3400 + }, + { + "epoch": 2.4754990925589837, + "grad_norm": 0.5977247357368469, + "learning_rate": 0.0002, + "loss": 0.8944, + "step": 3410 + }, + { + "epoch": 2.4827586206896552, + "grad_norm": 0.5004227757453918, + "learning_rate": 0.0002, + "loss": 0.8366, + "step": 3420 + }, + { + "epoch": 2.490018148820327, + "grad_norm": 0.46428972482681274, + "learning_rate": 0.0002, + "loss": 0.9133, + "step": 3430 + }, + { + "epoch": 2.4972776769509983, + "grad_norm": 0.7658395171165466, + "learning_rate": 0.0002, + "loss": 0.8534, + "step": 3440 + }, + { + "epoch": 2.50453720508167, + "grad_norm": 0.621819794178009, + "learning_rate": 0.0002, + "loss": 0.9066, + "step": 3450 + }, + { + "epoch": 2.511796733212341, + "grad_norm": 0.4556088149547577, + "learning_rate": 0.0002, + "loss": 0.8481, + "step": 3460 + }, + { + "epoch": 2.519056261343013, + "grad_norm": 0.6124140024185181, + "learning_rate": 0.0002, + "loss": 0.8522, + "step": 3470 + }, + { + "epoch": 2.526315789473684, + "grad_norm": 0.6256231665611267, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 3480 + }, + { + "epoch": 2.5335753176043556, + "grad_norm": 0.5464141964912415, + "learning_rate": 0.0002, + "loss": 0.9097, + "step": 3490 + }, + { + "epoch": 2.540834845735027, + "grad_norm": 0.51471346616745, + "learning_rate": 0.0002, + "loss": 0.8708, + "step": 3500 + }, + { + "epoch": 2.5480943738656987, + "grad_norm": 0.5326165556907654, + "learning_rate": 0.0002, + "loss": 0.9042, + "step": 3510 + }, + { + "epoch": 2.5553539019963702, + "grad_norm": 0.4750378429889679, + "learning_rate": 0.0002, + "loss": 0.8749, + "step": 3520 + }, + { + "epoch": 2.562613430127042, + "grad_norm": 0.5292279124259949, + "learning_rate": 0.0002, + "loss": 0.8248, + "step": 3530 + }, + { + "epoch": 2.5698729582577133, + "grad_norm": 0.6145227551460266, + "learning_rate": 0.0002, + "loss": 0.8808, + "step": 3540 + }, + { + "epoch": 2.577132486388385, + "grad_norm": 0.8275189399719238, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 3550 + }, + { + "epoch": 2.5843920145190564, + "grad_norm": 0.5037438273429871, + "learning_rate": 0.0002, + "loss": 0.875, + "step": 3560 + }, + { + "epoch": 2.5916515426497275, + "grad_norm": 0.5838707685470581, + "learning_rate": 0.0002, + "loss": 0.8324, + "step": 3570 + }, + { + "epoch": 2.5989110707803995, + "grad_norm": 0.5398710370063782, + "learning_rate": 0.0002, + "loss": 0.8716, + "step": 3580 + }, + { + "epoch": 2.6061705989110706, + "grad_norm": 0.6115376949310303, + "learning_rate": 0.0002, + "loss": 0.7843, + "step": 3590 + }, + { + "epoch": 2.613430127041742, + "grad_norm": 1.118809461593628, + "learning_rate": 0.0002, + "loss": 0.8142, + "step": 3600 + }, + { + "epoch": 2.6206896551724137, + "grad_norm": 0.7811771631240845, + "learning_rate": 0.0002, + "loss": 0.8282, + "step": 3610 + }, + { + "epoch": 2.6279491833030852, + "grad_norm": 0.5753175020217896, + "learning_rate": 0.0002, + "loss": 0.7997, + "step": 3620 + }, + { + "epoch": 2.635208711433757, + "grad_norm": 0.550829291343689, + "learning_rate": 0.0002, + "loss": 0.7705, + "step": 3630 + }, + { + "epoch": 2.6424682395644283, + "grad_norm": 0.5360019207000732, + "learning_rate": 0.0002, + "loss": 0.8564, + "step": 3640 + }, + { + "epoch": 2.6497277676951, + "grad_norm": 0.63050377368927, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 3650 + }, + { + "epoch": 2.6569872958257714, + "grad_norm": 0.5833110213279724, + "learning_rate": 0.0002, + "loss": 0.9156, + "step": 3660 + }, + { + "epoch": 2.664246823956443, + "grad_norm": 0.5543047189712524, + "learning_rate": 0.0002, + "loss": 0.8033, + "step": 3670 + }, + { + "epoch": 2.671506352087114, + "grad_norm": 0.6842212080955505, + "learning_rate": 0.0002, + "loss": 0.8867, + "step": 3680 + }, + { + "epoch": 2.678765880217786, + "grad_norm": 0.6859333515167236, + "learning_rate": 0.0002, + "loss": 0.8569, + "step": 3690 + }, + { + "epoch": 2.686025408348457, + "grad_norm": 0.7038410902023315, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 3700 + }, + { + "epoch": 2.6932849364791287, + "grad_norm": 0.447233647108078, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 3710 + }, + { + "epoch": 2.7005444646098002, + "grad_norm": 0.6453872919082642, + "learning_rate": 0.0002, + "loss": 0.8951, + "step": 3720 + }, + { + "epoch": 2.707803992740472, + "grad_norm": 0.8025672435760498, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 3730 + }, + { + "epoch": 2.7150635208711433, + "grad_norm": 0.5997087955474854, + "learning_rate": 0.0002, + "loss": 0.8488, + "step": 3740 + }, + { + "epoch": 2.722323049001815, + "grad_norm": 0.6901142001152039, + "learning_rate": 0.0002, + "loss": 0.8308, + "step": 3750 + }, + { + "epoch": 2.7295825771324864, + "grad_norm": 1.036145567893982, + "learning_rate": 0.0002, + "loss": 0.8517, + "step": 3760 + }, + { + "epoch": 2.736842105263158, + "grad_norm": 0.7207072377204895, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 3770 + }, + { + "epoch": 2.7441016333938295, + "grad_norm": 1.0452989339828491, + "learning_rate": 0.0002, + "loss": 0.8009, + "step": 3780 + }, + { + "epoch": 2.751361161524501, + "grad_norm": 0.5615278482437134, + "learning_rate": 0.0002, + "loss": 0.8868, + "step": 3790 + }, + { + "epoch": 2.7586206896551726, + "grad_norm": 0.46439215540885925, + "learning_rate": 0.0002, + "loss": 0.9047, + "step": 3800 + }, + { + "epoch": 2.7658802177858437, + "grad_norm": 0.7134917974472046, + "learning_rate": 0.0002, + "loss": 0.7847, + "step": 3810 + }, + { + "epoch": 2.7731397459165157, + "grad_norm": 0.5139115452766418, + "learning_rate": 0.0002, + "loss": 0.8315, + "step": 3820 + }, + { + "epoch": 2.780399274047187, + "grad_norm": 0.8595781326293945, + "learning_rate": 0.0002, + "loss": 0.8995, + "step": 3830 + }, + { + "epoch": 2.7876588021778583, + "grad_norm": 0.544614851474762, + "learning_rate": 0.0002, + "loss": 0.9079, + "step": 3840 + }, + { + "epoch": 2.79491833030853, + "grad_norm": 0.6073850393295288, + "learning_rate": 0.0002, + "loss": 0.8322, + "step": 3850 + }, + { + "epoch": 2.8021778584392014, + "grad_norm": 0.8224069476127625, + "learning_rate": 0.0002, + "loss": 0.8277, + "step": 3860 + }, + { + "epoch": 2.809437386569873, + "grad_norm": 0.5347970128059387, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 3870 + }, + { + "epoch": 2.8166969147005445, + "grad_norm": 0.6799601912498474, + "learning_rate": 0.0002, + "loss": 0.8219, + "step": 3880 + }, + { + "epoch": 2.823956442831216, + "grad_norm": 0.5219197869300842, + "learning_rate": 0.0002, + "loss": 0.8796, + "step": 3890 + }, + { + "epoch": 2.8312159709618876, + "grad_norm": 0.5710130333900452, + "learning_rate": 0.0002, + "loss": 0.8649, + "step": 3900 + }, + { + "epoch": 2.838475499092559, + "grad_norm": 0.5857213139533997, + "learning_rate": 0.0002, + "loss": 0.8741, + "step": 3910 + }, + { + "epoch": 2.8457350272232302, + "grad_norm": 0.5206644535064697, + "learning_rate": 0.0002, + "loss": 0.8154, + "step": 3920 + }, + { + "epoch": 2.8529945553539022, + "grad_norm": 1.2902015447616577, + "learning_rate": 0.0002, + "loss": 0.9328, + "step": 3930 + }, + { + "epoch": 2.8602540834845733, + "grad_norm": 0.5252797603607178, + "learning_rate": 0.0002, + "loss": 0.8192, + "step": 3940 + }, + { + "epoch": 2.867513611615245, + "grad_norm": 0.5925108790397644, + "learning_rate": 0.0002, + "loss": 0.9047, + "step": 3950 + }, + { + "epoch": 2.8747731397459164, + "grad_norm": 0.9719610810279846, + "learning_rate": 0.0002, + "loss": 0.8611, + "step": 3960 + }, + { + "epoch": 2.882032667876588, + "grad_norm": 0.6834747195243835, + "learning_rate": 0.0002, + "loss": 0.8531, + "step": 3970 + }, + { + "epoch": 2.8892921960072595, + "grad_norm": 0.6952353119850159, + "learning_rate": 0.0002, + "loss": 0.8124, + "step": 3980 + }, + { + "epoch": 2.896551724137931, + "grad_norm": 0.49889910221099854, + "learning_rate": 0.0002, + "loss": 0.8365, + "step": 3990 + }, + { + "epoch": 2.9038112522686026, + "grad_norm": 0.5007026791572571, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 4000 + }, + { + "epoch": 2.911070780399274, + "grad_norm": 0.5474239587783813, + "learning_rate": 0.0002, + "loss": 0.7752, + "step": 4010 + }, + { + "epoch": 2.9183303085299457, + "grad_norm": 0.6617428660392761, + "learning_rate": 0.0002, + "loss": 0.8579, + "step": 4020 + }, + { + "epoch": 2.925589836660617, + "grad_norm": 0.6097776293754578, + "learning_rate": 0.0002, + "loss": 0.8583, + "step": 4030 + }, + { + "epoch": 2.9328493647912888, + "grad_norm": 0.5985828638076782, + "learning_rate": 0.0002, + "loss": 0.8296, + "step": 4040 + }, + { + "epoch": 2.94010889292196, + "grad_norm": 0.769488513469696, + "learning_rate": 0.0002, + "loss": 0.9134, + "step": 4050 + }, + { + "epoch": 2.9473684210526314, + "grad_norm": 0.5167055130004883, + "learning_rate": 0.0002, + "loss": 0.8148, + "step": 4060 + }, + { + "epoch": 2.954627949183303, + "grad_norm": 0.6265496015548706, + "learning_rate": 0.0002, + "loss": 0.8665, + "step": 4070 + }, + { + "epoch": 2.9618874773139745, + "grad_norm": 1.2644082307815552, + "learning_rate": 0.0002, + "loss": 0.9218, + "step": 4080 + }, + { + "epoch": 2.969147005444646, + "grad_norm": 0.6007561087608337, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 4090 + }, + { + "epoch": 2.9764065335753176, + "grad_norm": 0.47984927892684937, + "learning_rate": 0.0002, + "loss": 0.8262, + "step": 4100 + }, + { + "epoch": 2.983666061705989, + "grad_norm": 1.128198504447937, + "learning_rate": 0.0002, + "loss": 0.7897, + "step": 4110 + }, + { + "epoch": 2.9909255898366607, + "grad_norm": 0.526292085647583, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 4120 + }, + { + "epoch": 2.9981851179673322, + "grad_norm": 0.5462674498558044, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 4130 + }, + { + "epoch": 2.9996370235934666, + "eval_loss": 1.1093357801437378, + "eval_runtime": 46.2498, + "eval_samples_per_second": 9.427, + "eval_steps_per_second": 1.189, + "step": 4132 + } + ], + "logging_steps": 10, + "max_steps": 11016, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.122606788083712e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-4132/training_args.bin b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-4132/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..140bceabdec44fc473842cb9ace4a62a4dc9fe89 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-4132/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4abec5480aff960b5a7b03a48be1cdb1ae29ba8792469c451f5c2f269e4be93b +size 5560 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-5510/README.md b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-5510/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-5510/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-5510/adapter_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-5510/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-5510/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-5510/adapter_model.safetensors b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-5510/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7ea497b2c459907a15aa8a30d556a9e94b6b1d59 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-5510/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ace6f198b7b601c431a919517fd6ec1f8a057a534c1ecefd21c2c303d649b96f +size 143153376 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-5510/optimizer.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-5510/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..bb80c05d5b79d05ada17b8744555e2d8cd23b153 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-5510/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05edafa7d9883453eabeb3309388918a362ca33bf36981f47d6f6feb0faeb3ef +size 72886650 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-5510/rng_state.pth b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-5510/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..5969ad87c7b111906b62d5bcd54e63d4f91b81de --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-5510/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2961f1b3aaeaa95bebc77ef91a7b5d990dfe95bcaa7e2ad9a7fb22fa5ae8d2cf +size 14244 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-5510/scheduler.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-5510/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c580d4796cbe45bdcb2a39acc06afba66279aab5 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-5510/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35b86eb7ec49369219fd6c7b5185fd429cf5e2b9681284ddd6335300e9f880a5 +size 1064 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-5510/special_tokens_map.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-5510/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-5510/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-5510/tokenizer.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-5510/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-5510/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-5510/tokenizer.model b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-5510/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-5510/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-5510/tokenizer_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-5510/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-5510/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-5510/trainer_state.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-5510/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..446926ed076157d6bfdacd2e3ff5459f6c67760f --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-5510/trainer_state.json @@ -0,0 +1,3922 @@ +{ + "best_metric": 1.1086540222167969, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755", + "epoch": 4.0, + "eval_steps": 10, + "global_step": 5510, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007259528130671506, + "grad_norm": 0.46010470390319824, + "learning_rate": 0.0002, + "loss": 2.4936, + "step": 10 + }, + { + "epoch": 0.014519056261343012, + "grad_norm": 0.5103179216384888, + "learning_rate": 0.0002, + "loss": 1.8442, + "step": 20 + }, + { + "epoch": 0.021778584392014518, + "grad_norm": 0.6282716989517212, + "learning_rate": 0.0002, + "loss": 1.559, + "step": 30 + }, + { + "epoch": 0.029038112522686024, + "grad_norm": 1.2480497360229492, + "learning_rate": 0.0002, + "loss": 1.3618, + "step": 40 + }, + { + "epoch": 0.036297640653357534, + "grad_norm": 0.4114855229854584, + "learning_rate": 0.0002, + "loss": 1.3874, + "step": 50 + }, + { + "epoch": 0.043557168784029036, + "grad_norm": 0.49482840299606323, + "learning_rate": 0.0002, + "loss": 1.2836, + "step": 60 + }, + { + "epoch": 0.050816696914700546, + "grad_norm": 0.4536272883415222, + "learning_rate": 0.0002, + "loss": 1.1728, + "step": 70 + }, + { + "epoch": 0.05807622504537205, + "grad_norm": 0.32328274846076965, + "learning_rate": 0.0002, + "loss": 1.1125, + "step": 80 + }, + { + "epoch": 0.06533575317604355, + "grad_norm": 0.40990468859672546, + "learning_rate": 0.0002, + "loss": 1.2447, + "step": 90 + }, + { + "epoch": 0.07259528130671507, + "grad_norm": 0.37273502349853516, + "learning_rate": 0.0002, + "loss": 1.2216, + "step": 100 + }, + { + "epoch": 0.07985480943738657, + "grad_norm": 0.3903871476650238, + "learning_rate": 0.0002, + "loss": 1.2259, + "step": 110 + }, + { + "epoch": 0.08711433756805807, + "grad_norm": 0.3272787928581238, + "learning_rate": 0.0002, + "loss": 1.3718, + "step": 120 + }, + { + "epoch": 0.09437386569872959, + "grad_norm": 0.3622824251651764, + "learning_rate": 0.0002, + "loss": 1.1832, + "step": 130 + }, + { + "epoch": 0.10163339382940109, + "grad_norm": 0.3503916561603546, + "learning_rate": 0.0002, + "loss": 1.2966, + "step": 140 + }, + { + "epoch": 0.1088929219600726, + "grad_norm": 0.32787832617759705, + "learning_rate": 0.0002, + "loss": 1.0913, + "step": 150 + }, + { + "epoch": 0.1161524500907441, + "grad_norm": 0.5822657942771912, + "learning_rate": 0.0002, + "loss": 1.2143, + "step": 160 + }, + { + "epoch": 0.12341197822141561, + "grad_norm": 0.28028249740600586, + "learning_rate": 0.0002, + "loss": 1.2091, + "step": 170 + }, + { + "epoch": 0.1306715063520871, + "grad_norm": 0.33602750301361084, + "learning_rate": 0.0002, + "loss": 1.1731, + "step": 180 + }, + { + "epoch": 0.13793103448275862, + "grad_norm": 0.29106274247169495, + "learning_rate": 0.0002, + "loss": 1.127, + "step": 190 + }, + { + "epoch": 0.14519056261343014, + "grad_norm": 0.38753363490104675, + "learning_rate": 0.0002, + "loss": 1.3261, + "step": 200 + }, + { + "epoch": 0.15245009074410162, + "grad_norm": 0.361009418964386, + "learning_rate": 0.0002, + "loss": 1.1536, + "step": 210 + }, + { + "epoch": 0.15970961887477314, + "grad_norm": 0.6743836402893066, + "learning_rate": 0.0002, + "loss": 1.1483, + "step": 220 + }, + { + "epoch": 0.16696914700544466, + "grad_norm": 0.3923613727092743, + "learning_rate": 0.0002, + "loss": 1.264, + "step": 230 + }, + { + "epoch": 0.17422867513611615, + "grad_norm": 0.2809699773788452, + "learning_rate": 0.0002, + "loss": 1.0437, + "step": 240 + }, + { + "epoch": 0.18148820326678766, + "grad_norm": 0.3631494641304016, + "learning_rate": 0.0002, + "loss": 1.1826, + "step": 250 + }, + { + "epoch": 0.18874773139745918, + "grad_norm": 0.24658171832561493, + "learning_rate": 0.0002, + "loss": 1.1584, + "step": 260 + }, + { + "epoch": 0.19600725952813067, + "grad_norm": 0.5780664682388306, + "learning_rate": 0.0002, + "loss": 1.0065, + "step": 270 + }, + { + "epoch": 0.20326678765880218, + "grad_norm": 0.3056720495223999, + "learning_rate": 0.0002, + "loss": 1.1251, + "step": 280 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 0.2857084572315216, + "learning_rate": 0.0002, + "loss": 1.1503, + "step": 290 + }, + { + "epoch": 0.2177858439201452, + "grad_norm": 0.6645345687866211, + "learning_rate": 0.0002, + "loss": 1.1215, + "step": 300 + }, + { + "epoch": 0.2250453720508167, + "grad_norm": 0.5966078639030457, + "learning_rate": 0.0002, + "loss": 1.127, + "step": 310 + }, + { + "epoch": 0.2323049001814882, + "grad_norm": 0.40937140583992004, + "learning_rate": 0.0002, + "loss": 1.1974, + "step": 320 + }, + { + "epoch": 0.2395644283121597, + "grad_norm": 0.5642806887626648, + "learning_rate": 0.0002, + "loss": 1.2385, + "step": 330 + }, + { + "epoch": 0.24682395644283123, + "grad_norm": 0.2750748097896576, + "learning_rate": 0.0002, + "loss": 1.1703, + "step": 340 + }, + { + "epoch": 0.2540834845735027, + "grad_norm": 0.34350234270095825, + "learning_rate": 0.0002, + "loss": 1.2598, + "step": 350 + }, + { + "epoch": 0.2613430127041742, + "grad_norm": 0.6767239570617676, + "learning_rate": 0.0002, + "loss": 1.1942, + "step": 360 + }, + { + "epoch": 0.26860254083484575, + "grad_norm": 0.31006959080696106, + "learning_rate": 0.0002, + "loss": 1.1436, + "step": 370 + }, + { + "epoch": 0.27586206896551724, + "grad_norm": 0.3825474679470062, + "learning_rate": 0.0002, + "loss": 1.2084, + "step": 380 + }, + { + "epoch": 0.2831215970961887, + "grad_norm": 0.30241551995277405, + "learning_rate": 0.0002, + "loss": 1.1523, + "step": 390 + }, + { + "epoch": 0.29038112522686027, + "grad_norm": 0.2962397336959839, + "learning_rate": 0.0002, + "loss": 1.1298, + "step": 400 + }, + { + "epoch": 0.29764065335753176, + "grad_norm": 0.2600369155406952, + "learning_rate": 0.0002, + "loss": 1.1299, + "step": 410 + }, + { + "epoch": 0.30490018148820325, + "grad_norm": 0.3675060272216797, + "learning_rate": 0.0002, + "loss": 1.1366, + "step": 420 + }, + { + "epoch": 0.3121597096188748, + "grad_norm": 0.3429498076438904, + "learning_rate": 0.0002, + "loss": 1.156, + "step": 430 + }, + { + "epoch": 0.3194192377495463, + "grad_norm": 0.34311825037002563, + "learning_rate": 0.0002, + "loss": 1.2741, + "step": 440 + }, + { + "epoch": 0.32667876588021777, + "grad_norm": 0.37872210144996643, + "learning_rate": 0.0002, + "loss": 1.3523, + "step": 450 + }, + { + "epoch": 0.3339382940108893, + "grad_norm": 0.33271121978759766, + "learning_rate": 0.0002, + "loss": 1.1365, + "step": 460 + }, + { + "epoch": 0.3411978221415608, + "grad_norm": 0.34605276584625244, + "learning_rate": 0.0002, + "loss": 1.1572, + "step": 470 + }, + { + "epoch": 0.3484573502722323, + "grad_norm": 0.41050562262535095, + "learning_rate": 0.0002, + "loss": 1.2755, + "step": 480 + }, + { + "epoch": 0.35571687840290384, + "grad_norm": 0.2066836953163147, + "learning_rate": 0.0002, + "loss": 1.075, + "step": 490 + }, + { + "epoch": 0.3629764065335753, + "grad_norm": 0.2859014868736267, + "learning_rate": 0.0002, + "loss": 1.2072, + "step": 500 + }, + { + "epoch": 0.3702359346642468, + "grad_norm": 0.28763777017593384, + "learning_rate": 0.0002, + "loss": 1.1435, + "step": 510 + }, + { + "epoch": 0.37749546279491836, + "grad_norm": 0.2730471193790436, + "learning_rate": 0.0002, + "loss": 1.1341, + "step": 520 + }, + { + "epoch": 0.38475499092558985, + "grad_norm": 0.3968936800956726, + "learning_rate": 0.0002, + "loss": 1.348, + "step": 530 + }, + { + "epoch": 0.39201451905626133, + "grad_norm": 0.3624701201915741, + "learning_rate": 0.0002, + "loss": 1.2363, + "step": 540 + }, + { + "epoch": 0.3992740471869328, + "grad_norm": 0.3303608298301697, + "learning_rate": 0.0002, + "loss": 1.1206, + "step": 550 + }, + { + "epoch": 0.40653357531760437, + "grad_norm": 0.33507466316223145, + "learning_rate": 0.0002, + "loss": 1.203, + "step": 560 + }, + { + "epoch": 0.41379310344827586, + "grad_norm": 0.3297670781612396, + "learning_rate": 0.0002, + "loss": 1.2294, + "step": 570 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 0.32334890961647034, + "learning_rate": 0.0002, + "loss": 1.2, + "step": 580 + }, + { + "epoch": 0.4283121597096189, + "grad_norm": 0.30281195044517517, + "learning_rate": 0.0002, + "loss": 1.112, + "step": 590 + }, + { + "epoch": 0.4355716878402904, + "grad_norm": 0.5900027751922607, + "learning_rate": 0.0002, + "loss": 1.1352, + "step": 600 + }, + { + "epoch": 0.44283121597096187, + "grad_norm": 0.28437477350234985, + "learning_rate": 0.0002, + "loss": 1.1575, + "step": 610 + }, + { + "epoch": 0.4500907441016334, + "grad_norm": 0.39601704478263855, + "learning_rate": 0.0002, + "loss": 1.2009, + "step": 620 + }, + { + "epoch": 0.4573502722323049, + "grad_norm": 0.41971510648727417, + "learning_rate": 0.0002, + "loss": 1.2302, + "step": 630 + }, + { + "epoch": 0.4646098003629764, + "grad_norm": 0.33814409375190735, + "learning_rate": 0.0002, + "loss": 1.236, + "step": 640 + }, + { + "epoch": 0.47186932849364793, + "grad_norm": 0.575718104839325, + "learning_rate": 0.0002, + "loss": 1.2471, + "step": 650 + }, + { + "epoch": 0.4791288566243194, + "grad_norm": 0.37927401065826416, + "learning_rate": 0.0002, + "loss": 1.1012, + "step": 660 + }, + { + "epoch": 0.4863883847549909, + "grad_norm": 0.3224332630634308, + "learning_rate": 0.0002, + "loss": 1.1552, + "step": 670 + }, + { + "epoch": 0.49364791288566245, + "grad_norm": 0.32683515548706055, + "learning_rate": 0.0002, + "loss": 1.1354, + "step": 680 + }, + { + "epoch": 0.5009074410163339, + "grad_norm": 0.4316163659095764, + "learning_rate": 0.0002, + "loss": 1.207, + "step": 690 + }, + { + "epoch": 0.5081669691470054, + "grad_norm": 0.342602401971817, + "learning_rate": 0.0002, + "loss": 1.2116, + "step": 700 + }, + { + "epoch": 0.515426497277677, + "grad_norm": 0.2794898748397827, + "learning_rate": 0.0002, + "loss": 1.0433, + "step": 710 + }, + { + "epoch": 0.5226860254083484, + "grad_norm": 0.3322339951992035, + "learning_rate": 0.0002, + "loss": 1.2395, + "step": 720 + }, + { + "epoch": 0.52994555353902, + "grad_norm": 0.3088509142398834, + "learning_rate": 0.0002, + "loss": 1.0762, + "step": 730 + }, + { + "epoch": 0.5372050816696915, + "grad_norm": 0.24444378912448883, + "learning_rate": 0.0002, + "loss": 1.1495, + "step": 740 + }, + { + "epoch": 0.5444646098003629, + "grad_norm": 0.3483171761035919, + "learning_rate": 0.0002, + "loss": 1.2176, + "step": 750 + }, + { + "epoch": 0.5517241379310345, + "grad_norm": 0.3425690531730652, + "learning_rate": 0.0002, + "loss": 1.2248, + "step": 760 + }, + { + "epoch": 0.558983666061706, + "grad_norm": 0.31841927766799927, + "learning_rate": 0.0002, + "loss": 1.1127, + "step": 770 + }, + { + "epoch": 0.5662431941923775, + "grad_norm": 0.39423868060112, + "learning_rate": 0.0002, + "loss": 1.1841, + "step": 780 + }, + { + "epoch": 0.573502722323049, + "grad_norm": 0.30328479409217834, + "learning_rate": 0.0002, + "loss": 1.0583, + "step": 790 + }, + { + "epoch": 0.5807622504537205, + "grad_norm": 0.24475938081741333, + "learning_rate": 0.0002, + "loss": 1.0718, + "step": 800 + }, + { + "epoch": 0.588021778584392, + "grad_norm": 0.37132805585861206, + "learning_rate": 0.0002, + "loss": 1.2006, + "step": 810 + }, + { + "epoch": 0.5952813067150635, + "grad_norm": 0.32195979356765747, + "learning_rate": 0.0002, + "loss": 1.1544, + "step": 820 + }, + { + "epoch": 0.6025408348457351, + "grad_norm": 0.2848738729953766, + "learning_rate": 0.0002, + "loss": 0.9937, + "step": 830 + }, + { + "epoch": 0.6098003629764065, + "grad_norm": 0.28015264868736267, + "learning_rate": 0.0002, + "loss": 1.0937, + "step": 840 + }, + { + "epoch": 0.617059891107078, + "grad_norm": 0.37796008586883545, + "learning_rate": 0.0002, + "loss": 1.1234, + "step": 850 + }, + { + "epoch": 0.6243194192377496, + "grad_norm": 0.39311841130256653, + "learning_rate": 0.0002, + "loss": 1.1455, + "step": 860 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 0.2761685252189636, + "learning_rate": 0.0002, + "loss": 1.1066, + "step": 870 + }, + { + "epoch": 0.6388384754990926, + "grad_norm": 0.3826720118522644, + "learning_rate": 0.0002, + "loss": 1.0859, + "step": 880 + }, + { + "epoch": 0.6460980036297641, + "grad_norm": 0.30076679587364197, + "learning_rate": 0.0002, + "loss": 1.1182, + "step": 890 + }, + { + "epoch": 0.6533575317604355, + "grad_norm": 0.21997687220573425, + "learning_rate": 0.0002, + "loss": 1.0927, + "step": 900 + }, + { + "epoch": 0.6606170598911071, + "grad_norm": 0.32593777775764465, + "learning_rate": 0.0002, + "loss": 1.1603, + "step": 910 + }, + { + "epoch": 0.6678765880217786, + "grad_norm": 0.30347898602485657, + "learning_rate": 0.0002, + "loss": 1.1578, + "step": 920 + }, + { + "epoch": 0.6751361161524501, + "grad_norm": 0.44173774123191833, + "learning_rate": 0.0002, + "loss": 1.1642, + "step": 930 + }, + { + "epoch": 0.6823956442831216, + "grad_norm": 0.2507467269897461, + "learning_rate": 0.0002, + "loss": 1.1832, + "step": 940 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 0.29463833570480347, + "learning_rate": 0.0002, + "loss": 1.3001, + "step": 950 + }, + { + "epoch": 0.6969147005444646, + "grad_norm": 0.9363154172897339, + "learning_rate": 0.0002, + "loss": 1.1651, + "step": 960 + }, + { + "epoch": 0.7041742286751361, + "grad_norm": 0.3236212134361267, + "learning_rate": 0.0002, + "loss": 1.158, + "step": 970 + }, + { + "epoch": 0.7114337568058077, + "grad_norm": 0.3123254179954529, + "learning_rate": 0.0002, + "loss": 1.1078, + "step": 980 + }, + { + "epoch": 0.7186932849364791, + "grad_norm": 0.3395805060863495, + "learning_rate": 0.0002, + "loss": 1.1578, + "step": 990 + }, + { + "epoch": 0.7259528130671506, + "grad_norm": 0.3240964412689209, + "learning_rate": 0.0002, + "loss": 1.0754, + "step": 1000 + }, + { + "epoch": 0.7332123411978222, + "grad_norm": 0.31902948021888733, + "learning_rate": 0.0002, + "loss": 1.0313, + "step": 1010 + }, + { + "epoch": 0.7404718693284936, + "grad_norm": 0.4848408102989197, + "learning_rate": 0.0002, + "loss": 1.1664, + "step": 1020 + }, + { + "epoch": 0.7477313974591652, + "grad_norm": 0.33006033301353455, + "learning_rate": 0.0002, + "loss": 1.0838, + "step": 1030 + }, + { + "epoch": 0.7549909255898367, + "grad_norm": 0.2928730547428131, + "learning_rate": 0.0002, + "loss": 1.0979, + "step": 1040 + }, + { + "epoch": 0.7622504537205081, + "grad_norm": 0.3529164791107178, + "learning_rate": 0.0002, + "loss": 1.1351, + "step": 1050 + }, + { + "epoch": 0.7695099818511797, + "grad_norm": 0.2736213803291321, + "learning_rate": 0.0002, + "loss": 1.1274, + "step": 1060 + }, + { + "epoch": 0.7767695099818511, + "grad_norm": 0.7200686931610107, + "learning_rate": 0.0002, + "loss": 1.1169, + "step": 1070 + }, + { + "epoch": 0.7840290381125227, + "grad_norm": 0.33396708965301514, + "learning_rate": 0.0002, + "loss": 1.1171, + "step": 1080 + }, + { + "epoch": 0.7912885662431942, + "grad_norm": 1.5760449171066284, + "learning_rate": 0.0002, + "loss": 1.2447, + "step": 1090 + }, + { + "epoch": 0.7985480943738656, + "grad_norm": 0.28138381242752075, + "learning_rate": 0.0002, + "loss": 1.1126, + "step": 1100 + }, + { + "epoch": 0.8058076225045372, + "grad_norm": 0.2597472369670868, + "learning_rate": 0.0002, + "loss": 1.2012, + "step": 1110 + }, + { + "epoch": 0.8130671506352087, + "grad_norm": 0.3305445611476898, + "learning_rate": 0.0002, + "loss": 1.1177, + "step": 1120 + }, + { + "epoch": 0.8203266787658802, + "grad_norm": 0.3934599459171295, + "learning_rate": 0.0002, + "loss": 1.0849, + "step": 1130 + }, + { + "epoch": 0.8275862068965517, + "grad_norm": 0.3472191393375397, + "learning_rate": 0.0002, + "loss": 1.0889, + "step": 1140 + }, + { + "epoch": 0.8348457350272233, + "grad_norm": 0.2857365906238556, + "learning_rate": 0.0002, + "loss": 1.265, + "step": 1150 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.3207702934741974, + "learning_rate": 0.0002, + "loss": 1.03, + "step": 1160 + }, + { + "epoch": 0.8493647912885662, + "grad_norm": 0.3176484704017639, + "learning_rate": 0.0002, + "loss": 1.1669, + "step": 1170 + }, + { + "epoch": 0.8566243194192378, + "grad_norm": 0.40685558319091797, + "learning_rate": 0.0002, + "loss": 1.1386, + "step": 1180 + }, + { + "epoch": 0.8638838475499092, + "grad_norm": 0.31125199794769287, + "learning_rate": 0.0002, + "loss": 1.1383, + "step": 1190 + }, + { + "epoch": 0.8711433756805808, + "grad_norm": 0.7361181378364563, + "learning_rate": 0.0002, + "loss": 1.1512, + "step": 1200 + }, + { + "epoch": 0.8784029038112523, + "grad_norm": 0.33699527382850647, + "learning_rate": 0.0002, + "loss": 0.9983, + "step": 1210 + }, + { + "epoch": 0.8856624319419237, + "grad_norm": 0.3315220773220062, + "learning_rate": 0.0002, + "loss": 1.11, + "step": 1220 + }, + { + "epoch": 0.8929219600725953, + "grad_norm": 0.6256054043769836, + "learning_rate": 0.0002, + "loss": 1.0266, + "step": 1230 + }, + { + "epoch": 0.9001814882032668, + "grad_norm": 0.3692137897014618, + "learning_rate": 0.0002, + "loss": 1.1738, + "step": 1240 + }, + { + "epoch": 0.9074410163339383, + "grad_norm": 0.3538484573364258, + "learning_rate": 0.0002, + "loss": 1.1112, + "step": 1250 + }, + { + "epoch": 0.9147005444646098, + "grad_norm": 0.27863040566444397, + "learning_rate": 0.0002, + "loss": 1.0878, + "step": 1260 + }, + { + "epoch": 0.9219600725952813, + "grad_norm": 0.3322528302669525, + "learning_rate": 0.0002, + "loss": 1.0826, + "step": 1270 + }, + { + "epoch": 0.9292196007259528, + "grad_norm": 0.7553173303604126, + "learning_rate": 0.0002, + "loss": 1.0158, + "step": 1280 + }, + { + "epoch": 0.9364791288566243, + "grad_norm": 0.4856191575527191, + "learning_rate": 0.0002, + "loss": 1.198, + "step": 1290 + }, + { + "epoch": 0.9437386569872959, + "grad_norm": 0.3668074905872345, + "learning_rate": 0.0002, + "loss": 1.1678, + "step": 1300 + }, + { + "epoch": 0.9509981851179673, + "grad_norm": 0.29851067066192627, + "learning_rate": 0.0002, + "loss": 1.1809, + "step": 1310 + }, + { + "epoch": 0.9582577132486388, + "grad_norm": 0.276664674282074, + "learning_rate": 0.0002, + "loss": 1.1299, + "step": 1320 + }, + { + "epoch": 0.9655172413793104, + "grad_norm": 0.2941018342971802, + "learning_rate": 0.0002, + "loss": 1.0597, + "step": 1330 + }, + { + "epoch": 0.9727767695099818, + "grad_norm": 0.3505859076976776, + "learning_rate": 0.0002, + "loss": 1.1731, + "step": 1340 + }, + { + "epoch": 0.9800362976406534, + "grad_norm": 0.3067687451839447, + "learning_rate": 0.0002, + "loss": 1.1809, + "step": 1350 + }, + { + "epoch": 0.9872958257713249, + "grad_norm": 0.27151066064834595, + "learning_rate": 0.0002, + "loss": 1.1511, + "step": 1360 + }, + { + "epoch": 0.9945553539019963, + "grad_norm": 0.36370083689689636, + "learning_rate": 0.0002, + "loss": 1.1919, + "step": 1370 + }, + { + "epoch": 0.9996370235934664, + "eval_loss": 1.1381088495254517, + "eval_runtime": 96.0848, + "eval_samples_per_second": 4.538, + "eval_steps_per_second": 0.572, + "step": 1377 + }, + { + "epoch": 1.0018148820326678, + "grad_norm": 0.27980583906173706, + "learning_rate": 0.0002, + "loss": 1.0082, + "step": 1380 + }, + { + "epoch": 1.0090744101633393, + "grad_norm": 0.26713913679122925, + "learning_rate": 0.0002, + "loss": 1.0565, + "step": 1390 + }, + { + "epoch": 1.0163339382940109, + "grad_norm": 0.3089541494846344, + "learning_rate": 0.0002, + "loss": 1.0241, + "step": 1400 + }, + { + "epoch": 1.0235934664246824, + "grad_norm": 0.4188242256641388, + "learning_rate": 0.0002, + "loss": 0.9649, + "step": 1410 + }, + { + "epoch": 1.030852994555354, + "grad_norm": 0.5246463418006897, + "learning_rate": 0.0002, + "loss": 0.9821, + "step": 1420 + }, + { + "epoch": 1.0381125226860255, + "grad_norm": 0.2728777825832367, + "learning_rate": 0.0002, + "loss": 1.0143, + "step": 1430 + }, + { + "epoch": 1.0453720508166968, + "grad_norm": 0.38167616724967957, + "learning_rate": 0.0002, + "loss": 1.075, + "step": 1440 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 0.4439380168914795, + "learning_rate": 0.0002, + "loss": 1.0229, + "step": 1450 + }, + { + "epoch": 1.05989110707804, + "grad_norm": 0.30954182147979736, + "learning_rate": 0.0002, + "loss": 1.0451, + "step": 1460 + }, + { + "epoch": 1.0671506352087115, + "grad_norm": 0.4022280275821686, + "learning_rate": 0.0002, + "loss": 1.0762, + "step": 1470 + }, + { + "epoch": 1.074410163339383, + "grad_norm": 0.7390811443328857, + "learning_rate": 0.0002, + "loss": 1.0754, + "step": 1480 + }, + { + "epoch": 1.0816696914700545, + "grad_norm": 0.3885486423969269, + "learning_rate": 0.0002, + "loss": 1.1378, + "step": 1490 + }, + { + "epoch": 1.0889292196007259, + "grad_norm": 0.5275560617446899, + "learning_rate": 0.0002, + "loss": 1.005, + "step": 1500 + }, + { + "epoch": 1.0961887477313974, + "grad_norm": 0.35112282633781433, + "learning_rate": 0.0002, + "loss": 1.0147, + "step": 1510 + }, + { + "epoch": 1.103448275862069, + "grad_norm": 0.33714351058006287, + "learning_rate": 0.0002, + "loss": 1.0331, + "step": 1520 + }, + { + "epoch": 1.1107078039927405, + "grad_norm": 0.31221693754196167, + "learning_rate": 0.0002, + "loss": 0.8723, + "step": 1530 + }, + { + "epoch": 1.117967332123412, + "grad_norm": 0.27549654245376587, + "learning_rate": 0.0002, + "loss": 1.0701, + "step": 1540 + }, + { + "epoch": 1.1252268602540836, + "grad_norm": 0.8465521335601807, + "learning_rate": 0.0002, + "loss": 1.0114, + "step": 1550 + }, + { + "epoch": 1.132486388384755, + "grad_norm": 0.36125949025154114, + "learning_rate": 0.0002, + "loss": 0.9643, + "step": 1560 + }, + { + "epoch": 1.1397459165154264, + "grad_norm": 0.37420371174812317, + "learning_rate": 0.0002, + "loss": 1.0744, + "step": 1570 + }, + { + "epoch": 1.147005444646098, + "grad_norm": 0.3294760584831238, + "learning_rate": 0.0002, + "loss": 1.165, + "step": 1580 + }, + { + "epoch": 1.1542649727767695, + "grad_norm": 0.3881238102912903, + "learning_rate": 0.0002, + "loss": 0.9207, + "step": 1590 + }, + { + "epoch": 1.161524500907441, + "grad_norm": 0.4766491651535034, + "learning_rate": 0.0002, + "loss": 1.0983, + "step": 1600 + }, + { + "epoch": 1.1687840290381124, + "grad_norm": 0.395530104637146, + "learning_rate": 0.0002, + "loss": 1.0092, + "step": 1610 + }, + { + "epoch": 1.176043557168784, + "grad_norm": 0.3297106623649597, + "learning_rate": 0.0002, + "loss": 0.9895, + "step": 1620 + }, + { + "epoch": 1.1833030852994555, + "grad_norm": 0.39528271555900574, + "learning_rate": 0.0002, + "loss": 1.1121, + "step": 1630 + }, + { + "epoch": 1.190562613430127, + "grad_norm": 0.3370221257209778, + "learning_rate": 0.0002, + "loss": 0.9202, + "step": 1640 + }, + { + "epoch": 1.1978221415607986, + "grad_norm": 0.31922030448913574, + "learning_rate": 0.0002, + "loss": 1.102, + "step": 1650 + }, + { + "epoch": 1.2050816696914701, + "grad_norm": 0.6142027378082275, + "learning_rate": 0.0002, + "loss": 1.0565, + "step": 1660 + }, + { + "epoch": 1.2123411978221417, + "grad_norm": 0.44769710302352905, + "learning_rate": 0.0002, + "loss": 0.9541, + "step": 1670 + }, + { + "epoch": 1.219600725952813, + "grad_norm": 0.41233646869659424, + "learning_rate": 0.0002, + "loss": 1.2501, + "step": 1680 + }, + { + "epoch": 1.2268602540834845, + "grad_norm": 0.2928866147994995, + "learning_rate": 0.0002, + "loss": 1.0147, + "step": 1690 + }, + { + "epoch": 1.234119782214156, + "grad_norm": 0.36913734674453735, + "learning_rate": 0.0002, + "loss": 0.9074, + "step": 1700 + }, + { + "epoch": 1.2413793103448276, + "grad_norm": 0.5281891226768494, + "learning_rate": 0.0002, + "loss": 1.0847, + "step": 1710 + }, + { + "epoch": 1.2486388384754992, + "grad_norm": 0.3374697268009186, + "learning_rate": 0.0002, + "loss": 0.9482, + "step": 1720 + }, + { + "epoch": 1.2558983666061705, + "grad_norm": 0.3802020847797394, + "learning_rate": 0.0002, + "loss": 0.9826, + "step": 1730 + }, + { + "epoch": 1.263157894736842, + "grad_norm": 0.38048651814460754, + "learning_rate": 0.0002, + "loss": 1.0191, + "step": 1740 + }, + { + "epoch": 1.2704174228675136, + "grad_norm": 0.6676169633865356, + "learning_rate": 0.0002, + "loss": 1.0142, + "step": 1750 + }, + { + "epoch": 1.2776769509981851, + "grad_norm": 0.4075961410999298, + "learning_rate": 0.0002, + "loss": 0.992, + "step": 1760 + }, + { + "epoch": 1.2849364791288567, + "grad_norm": 0.4374721348285675, + "learning_rate": 0.0002, + "loss": 1.0301, + "step": 1770 + }, + { + "epoch": 1.2921960072595282, + "grad_norm": 0.4638824164867401, + "learning_rate": 0.0002, + "loss": 1.0601, + "step": 1780 + }, + { + "epoch": 1.2994555353901998, + "grad_norm": 0.38631564378738403, + "learning_rate": 0.0002, + "loss": 0.9593, + "step": 1790 + }, + { + "epoch": 1.306715063520871, + "grad_norm": 0.35873809456825256, + "learning_rate": 0.0002, + "loss": 1.0042, + "step": 1800 + }, + { + "epoch": 1.3139745916515426, + "grad_norm": 0.33602237701416016, + "learning_rate": 0.0002, + "loss": 0.9835, + "step": 1810 + }, + { + "epoch": 1.3212341197822142, + "grad_norm": 0.46696463227272034, + "learning_rate": 0.0002, + "loss": 1.1032, + "step": 1820 + }, + { + "epoch": 1.3284936479128857, + "grad_norm": 0.368958979845047, + "learning_rate": 0.0002, + "loss": 1.0123, + "step": 1830 + }, + { + "epoch": 1.335753176043557, + "grad_norm": 0.3160957396030426, + "learning_rate": 0.0002, + "loss": 1.0456, + "step": 1840 + }, + { + "epoch": 1.3430127041742286, + "grad_norm": 0.4511511027812958, + "learning_rate": 0.0002, + "loss": 1.0264, + "step": 1850 + }, + { + "epoch": 1.3502722323049001, + "grad_norm": 0.6769845485687256, + "learning_rate": 0.0002, + "loss": 1.0179, + "step": 1860 + }, + { + "epoch": 1.3575317604355717, + "grad_norm": 0.3749309480190277, + "learning_rate": 0.0002, + "loss": 1.011, + "step": 1870 + }, + { + "epoch": 1.3647912885662432, + "grad_norm": 0.553798496723175, + "learning_rate": 0.0002, + "loss": 1.0223, + "step": 1880 + }, + { + "epoch": 1.3720508166969148, + "grad_norm": 0.3538985550403595, + "learning_rate": 0.0002, + "loss": 0.9634, + "step": 1890 + }, + { + "epoch": 1.3793103448275863, + "grad_norm": 0.5501534938812256, + "learning_rate": 0.0002, + "loss": 1.0275, + "step": 1900 + }, + { + "epoch": 1.3865698729582578, + "grad_norm": 0.4432051181793213, + "learning_rate": 0.0002, + "loss": 1.0461, + "step": 1910 + }, + { + "epoch": 1.3938294010889292, + "grad_norm": 0.41755786538124084, + "learning_rate": 0.0002, + "loss": 0.9082, + "step": 1920 + }, + { + "epoch": 1.4010889292196007, + "grad_norm": 0.5626114010810852, + "learning_rate": 0.0002, + "loss": 1.0858, + "step": 1930 + }, + { + "epoch": 1.4083484573502723, + "grad_norm": 0.44038185477256775, + "learning_rate": 0.0002, + "loss": 1.0687, + "step": 1940 + }, + { + "epoch": 1.4156079854809438, + "grad_norm": 0.3397001624107361, + "learning_rate": 0.0002, + "loss": 0.9454, + "step": 1950 + }, + { + "epoch": 1.4228675136116151, + "grad_norm": 0.4325368106365204, + "learning_rate": 0.0002, + "loss": 0.943, + "step": 1960 + }, + { + "epoch": 1.4301270417422867, + "grad_norm": 0.3900907039642334, + "learning_rate": 0.0002, + "loss": 1.0488, + "step": 1970 + }, + { + "epoch": 1.4373865698729582, + "grad_norm": 0.369612455368042, + "learning_rate": 0.0002, + "loss": 0.9699, + "step": 1980 + }, + { + "epoch": 1.4446460980036298, + "grad_norm": 0.4389338791370392, + "learning_rate": 0.0002, + "loss": 1.0609, + "step": 1990 + }, + { + "epoch": 1.4519056261343013, + "grad_norm": 1.694450855255127, + "learning_rate": 0.0002, + "loss": 1.042, + "step": 2000 + }, + { + "epoch": 1.4591651542649728, + "grad_norm": 0.516957700252533, + "learning_rate": 0.0002, + "loss": 0.997, + "step": 2010 + }, + { + "epoch": 1.4664246823956444, + "grad_norm": 0.45515501499176025, + "learning_rate": 0.0002, + "loss": 1.0979, + "step": 2020 + }, + { + "epoch": 1.4736842105263157, + "grad_norm": 0.4153139591217041, + "learning_rate": 0.0002, + "loss": 1.0188, + "step": 2030 + }, + { + "epoch": 1.4809437386569873, + "grad_norm": 0.44353052973747253, + "learning_rate": 0.0002, + "loss": 1.1521, + "step": 2040 + }, + { + "epoch": 1.4882032667876588, + "grad_norm": 0.570554256439209, + "learning_rate": 0.0002, + "loss": 0.9653, + "step": 2050 + }, + { + "epoch": 1.4954627949183303, + "grad_norm": 0.5742740035057068, + "learning_rate": 0.0002, + "loss": 1.0352, + "step": 2060 + }, + { + "epoch": 1.5027223230490017, + "grad_norm": 0.5890517830848694, + "learning_rate": 0.0002, + "loss": 1.0963, + "step": 2070 + }, + { + "epoch": 1.5099818511796732, + "grad_norm": 0.4162650406360626, + "learning_rate": 0.0002, + "loss": 1.0639, + "step": 2080 + }, + { + "epoch": 1.5172413793103448, + "grad_norm": 0.7334717512130737, + "learning_rate": 0.0002, + "loss": 1.071, + "step": 2090 + }, + { + "epoch": 1.5245009074410163, + "grad_norm": 0.2963249683380127, + "learning_rate": 0.0002, + "loss": 0.8957, + "step": 2100 + }, + { + "epoch": 1.5317604355716878, + "grad_norm": 0.30676454305648804, + "learning_rate": 0.0002, + "loss": 1.0446, + "step": 2110 + }, + { + "epoch": 1.5390199637023594, + "grad_norm": 0.35984641313552856, + "learning_rate": 0.0002, + "loss": 0.9967, + "step": 2120 + }, + { + "epoch": 1.546279491833031, + "grad_norm": 0.3384549617767334, + "learning_rate": 0.0002, + "loss": 1.0352, + "step": 2130 + }, + { + "epoch": 1.5535390199637025, + "grad_norm": 0.4725518226623535, + "learning_rate": 0.0002, + "loss": 0.9874, + "step": 2140 + }, + { + "epoch": 1.560798548094374, + "grad_norm": 0.3252685070037842, + "learning_rate": 0.0002, + "loss": 1.1166, + "step": 2150 + }, + { + "epoch": 1.5680580762250453, + "grad_norm": 0.45043081045150757, + "learning_rate": 0.0002, + "loss": 0.9127, + "step": 2160 + }, + { + "epoch": 1.5753176043557169, + "grad_norm": 0.374208003282547, + "learning_rate": 0.0002, + "loss": 1.0767, + "step": 2170 + }, + { + "epoch": 1.5825771324863884, + "grad_norm": 0.5118404030799866, + "learning_rate": 0.0002, + "loss": 1.0255, + "step": 2180 + }, + { + "epoch": 1.5898366606170597, + "grad_norm": 0.482170969247818, + "learning_rate": 0.0002, + "loss": 1.0062, + "step": 2190 + }, + { + "epoch": 1.5970961887477313, + "grad_norm": 0.5337533950805664, + "learning_rate": 0.0002, + "loss": 1.0872, + "step": 2200 + }, + { + "epoch": 1.6043557168784028, + "grad_norm": 0.5195064544677734, + "learning_rate": 0.0002, + "loss": 1.0405, + "step": 2210 + }, + { + "epoch": 1.6116152450090744, + "grad_norm": 0.30807098746299744, + "learning_rate": 0.0002, + "loss": 1.0454, + "step": 2220 + }, + { + "epoch": 1.618874773139746, + "grad_norm": 0.3962925672531128, + "learning_rate": 0.0002, + "loss": 1.0293, + "step": 2230 + }, + { + "epoch": 1.6261343012704175, + "grad_norm": 0.7636962532997131, + "learning_rate": 0.0002, + "loss": 1.0137, + "step": 2240 + }, + { + "epoch": 1.633393829401089, + "grad_norm": 0.32380592823028564, + "learning_rate": 0.0002, + "loss": 0.991, + "step": 2250 + }, + { + "epoch": 1.6406533575317606, + "grad_norm": 0.5767741799354553, + "learning_rate": 0.0002, + "loss": 0.9471, + "step": 2260 + }, + { + "epoch": 1.647912885662432, + "grad_norm": 0.39964812994003296, + "learning_rate": 0.0002, + "loss": 0.9524, + "step": 2270 + }, + { + "epoch": 1.6551724137931034, + "grad_norm": 0.622629702091217, + "learning_rate": 0.0002, + "loss": 0.9866, + "step": 2280 + }, + { + "epoch": 1.662431941923775, + "grad_norm": 0.40202152729034424, + "learning_rate": 0.0002, + "loss": 0.9609, + "step": 2290 + }, + { + "epoch": 1.6696914700544465, + "grad_norm": 0.4467349052429199, + "learning_rate": 0.0002, + "loss": 0.9859, + "step": 2300 + }, + { + "epoch": 1.6769509981851178, + "grad_norm": 0.5026949048042297, + "learning_rate": 0.0002, + "loss": 1.0312, + "step": 2310 + }, + { + "epoch": 1.6842105263157894, + "grad_norm": 0.43754541873931885, + "learning_rate": 0.0002, + "loss": 1.0776, + "step": 2320 + }, + { + "epoch": 1.691470054446461, + "grad_norm": 0.42869430780410767, + "learning_rate": 0.0002, + "loss": 0.9786, + "step": 2330 + }, + { + "epoch": 1.6987295825771325, + "grad_norm": 0.4192679524421692, + "learning_rate": 0.0002, + "loss": 1.0747, + "step": 2340 + }, + { + "epoch": 1.705989110707804, + "grad_norm": 0.3243155777454376, + "learning_rate": 0.0002, + "loss": 1.0054, + "step": 2350 + }, + { + "epoch": 1.7132486388384756, + "grad_norm": 1.0514075756072998, + "learning_rate": 0.0002, + "loss": 0.9165, + "step": 2360 + }, + { + "epoch": 1.720508166969147, + "grad_norm": 0.4719122648239136, + "learning_rate": 0.0002, + "loss": 1.1353, + "step": 2370 + }, + { + "epoch": 1.7277676950998186, + "grad_norm": 0.3846144676208496, + "learning_rate": 0.0002, + "loss": 0.9913, + "step": 2380 + }, + { + "epoch": 1.73502722323049, + "grad_norm": 0.7266581058502197, + "learning_rate": 0.0002, + "loss": 0.9935, + "step": 2390 + }, + { + "epoch": 1.7422867513611615, + "grad_norm": 0.6183241605758667, + "learning_rate": 0.0002, + "loss": 0.9509, + "step": 2400 + }, + { + "epoch": 1.749546279491833, + "grad_norm": 0.3658260405063629, + "learning_rate": 0.0002, + "loss": 0.9713, + "step": 2410 + }, + { + "epoch": 1.7568058076225044, + "grad_norm": 0.6036322712898254, + "learning_rate": 0.0002, + "loss": 1.0033, + "step": 2420 + }, + { + "epoch": 1.764065335753176, + "grad_norm": 0.7872936129570007, + "learning_rate": 0.0002, + "loss": 1.0526, + "step": 2430 + }, + { + "epoch": 1.7713248638838475, + "grad_norm": 0.35946124792099, + "learning_rate": 0.0002, + "loss": 0.99, + "step": 2440 + }, + { + "epoch": 1.778584392014519, + "grad_norm": 0.3740338981151581, + "learning_rate": 0.0002, + "loss": 0.8845, + "step": 2450 + }, + { + "epoch": 1.7858439201451906, + "grad_norm": 0.6150230169296265, + "learning_rate": 0.0002, + "loss": 0.9333, + "step": 2460 + }, + { + "epoch": 1.793103448275862, + "grad_norm": 0.4726075530052185, + "learning_rate": 0.0002, + "loss": 1.0373, + "step": 2470 + }, + { + "epoch": 1.8003629764065336, + "grad_norm": 0.31292253732681274, + "learning_rate": 0.0002, + "loss": 0.974, + "step": 2480 + }, + { + "epoch": 1.8076225045372052, + "grad_norm": 0.4463104009628296, + "learning_rate": 0.0002, + "loss": 1.0205, + "step": 2490 + }, + { + "epoch": 1.8148820326678767, + "grad_norm": 0.7848200798034668, + "learning_rate": 0.0002, + "loss": 1.0771, + "step": 2500 + }, + { + "epoch": 1.822141560798548, + "grad_norm": 0.5562082529067993, + "learning_rate": 0.0002, + "loss": 0.9508, + "step": 2510 + }, + { + "epoch": 1.8294010889292196, + "grad_norm": 0.39892756938934326, + "learning_rate": 0.0002, + "loss": 0.9343, + "step": 2520 + }, + { + "epoch": 1.8366606170598911, + "grad_norm": 1.2923320531845093, + "learning_rate": 0.0002, + "loss": 1.0489, + "step": 2530 + }, + { + "epoch": 1.8439201451905625, + "grad_norm": 0.6316490769386292, + "learning_rate": 0.0002, + "loss": 1.1203, + "step": 2540 + }, + { + "epoch": 1.851179673321234, + "grad_norm": 0.46100497245788574, + "learning_rate": 0.0002, + "loss": 0.9131, + "step": 2550 + }, + { + "epoch": 1.8584392014519056, + "grad_norm": 0.7902987003326416, + "learning_rate": 0.0002, + "loss": 1.0977, + "step": 2560 + }, + { + "epoch": 1.865698729582577, + "grad_norm": 0.4596365690231323, + "learning_rate": 0.0002, + "loss": 0.9702, + "step": 2570 + }, + { + "epoch": 1.8729582577132486, + "grad_norm": 0.6592172384262085, + "learning_rate": 0.0002, + "loss": 0.9322, + "step": 2580 + }, + { + "epoch": 1.8802177858439202, + "grad_norm": 0.5764662027359009, + "learning_rate": 0.0002, + "loss": 0.9549, + "step": 2590 + }, + { + "epoch": 1.8874773139745917, + "grad_norm": 0.8421637415885925, + "learning_rate": 0.0002, + "loss": 0.9485, + "step": 2600 + }, + { + "epoch": 1.8947368421052633, + "grad_norm": 0.5635305047035217, + "learning_rate": 0.0002, + "loss": 1.012, + "step": 2610 + }, + { + "epoch": 1.9019963702359348, + "grad_norm": 0.46231237053871155, + "learning_rate": 0.0002, + "loss": 0.8907, + "step": 2620 + }, + { + "epoch": 1.9092558983666061, + "grad_norm": 0.3944607079029083, + "learning_rate": 0.0002, + "loss": 0.9543, + "step": 2630 + }, + { + "epoch": 1.9165154264972777, + "grad_norm": 0.4753907322883606, + "learning_rate": 0.0002, + "loss": 0.9964, + "step": 2640 + }, + { + "epoch": 1.9237749546279492, + "grad_norm": 0.4151090979576111, + "learning_rate": 0.0002, + "loss": 1.0217, + "step": 2650 + }, + { + "epoch": 1.9310344827586206, + "grad_norm": 0.6793725490570068, + "learning_rate": 0.0002, + "loss": 1.0397, + "step": 2660 + }, + { + "epoch": 1.938294010889292, + "grad_norm": 0.339755117893219, + "learning_rate": 0.0002, + "loss": 1.0401, + "step": 2670 + }, + { + "epoch": 1.9455535390199636, + "grad_norm": 0.40789374709129333, + "learning_rate": 0.0002, + "loss": 1.082, + "step": 2680 + }, + { + "epoch": 1.9528130671506352, + "grad_norm": 0.3750005066394806, + "learning_rate": 0.0002, + "loss": 0.9973, + "step": 2690 + }, + { + "epoch": 1.9600725952813067, + "grad_norm": 0.39684441685676575, + "learning_rate": 0.0002, + "loss": 0.9837, + "step": 2700 + }, + { + "epoch": 1.9673321234119783, + "grad_norm": 0.378287672996521, + "learning_rate": 0.0002, + "loss": 1.1204, + "step": 2710 + }, + { + "epoch": 1.9745916515426498, + "grad_norm": 0.3668482005596161, + "learning_rate": 0.0002, + "loss": 0.9284, + "step": 2720 + }, + { + "epoch": 1.9818511796733214, + "grad_norm": 0.49997565150260925, + "learning_rate": 0.0002, + "loss": 1.0512, + "step": 2730 + }, + { + "epoch": 1.989110707803993, + "grad_norm": 0.36852124333381653, + "learning_rate": 0.0002, + "loss": 0.9311, + "step": 2740 + }, + { + "epoch": 1.9963702359346642, + "grad_norm": 0.5203380584716797, + "learning_rate": 0.0002, + "loss": 1.0898, + "step": 2750 + }, + { + "epoch": 2.0, + "eval_loss": 1.1086540222167969, + "eval_runtime": 95.6866, + "eval_samples_per_second": 4.557, + "eval_steps_per_second": 0.575, + "step": 2755 + }, + { + "epoch": 2.0036297640653356, + "grad_norm": 0.35921573638916016, + "learning_rate": 0.0002, + "loss": 0.9791, + "step": 2760 + }, + { + "epoch": 2.010889292196007, + "grad_norm": 1.013861894607544, + "learning_rate": 0.0002, + "loss": 0.8968, + "step": 2770 + }, + { + "epoch": 2.0181488203266786, + "grad_norm": 0.4425240159034729, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 2780 + }, + { + "epoch": 2.02540834845735, + "grad_norm": 0.60174161195755, + "learning_rate": 0.0002, + "loss": 0.8758, + "step": 2790 + }, + { + "epoch": 2.0326678765880217, + "grad_norm": 0.47582098841667175, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 2800 + }, + { + "epoch": 2.0399274047186933, + "grad_norm": 0.6012811660766602, + "learning_rate": 0.0002, + "loss": 0.8898, + "step": 2810 + }, + { + "epoch": 2.047186932849365, + "grad_norm": 0.4444408118724823, + "learning_rate": 0.0002, + "loss": 0.8109, + "step": 2820 + }, + { + "epoch": 2.0544464609800364, + "grad_norm": 0.4864003360271454, + "learning_rate": 0.0002, + "loss": 0.8621, + "step": 2830 + }, + { + "epoch": 2.061705989110708, + "grad_norm": 0.5104215741157532, + "learning_rate": 0.0002, + "loss": 0.8817, + "step": 2840 + }, + { + "epoch": 2.0689655172413794, + "grad_norm": 0.6218489408493042, + "learning_rate": 0.0002, + "loss": 0.7789, + "step": 2850 + }, + { + "epoch": 2.076225045372051, + "grad_norm": 0.705784261226654, + "learning_rate": 0.0002, + "loss": 0.8778, + "step": 2860 + }, + { + "epoch": 2.0834845735027225, + "grad_norm": 0.48091503977775574, + "learning_rate": 0.0002, + "loss": 0.7851, + "step": 2870 + }, + { + "epoch": 2.0907441016333936, + "grad_norm": 0.5062456727027893, + "learning_rate": 0.0002, + "loss": 0.8444, + "step": 2880 + }, + { + "epoch": 2.098003629764065, + "grad_norm": 0.4862022399902344, + "learning_rate": 0.0002, + "loss": 0.7898, + "step": 2890 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 0.48264044523239136, + "learning_rate": 0.0002, + "loss": 0.8593, + "step": 2900 + }, + { + "epoch": 2.1125226860254083, + "grad_norm": 0.43744346499443054, + "learning_rate": 0.0002, + "loss": 0.8325, + "step": 2910 + }, + { + "epoch": 2.11978221415608, + "grad_norm": 0.5480492115020752, + "learning_rate": 0.0002, + "loss": 0.9099, + "step": 2920 + }, + { + "epoch": 2.1270417422867514, + "grad_norm": 0.5068560838699341, + "learning_rate": 0.0002, + "loss": 0.7727, + "step": 2930 + }, + { + "epoch": 2.134301270417423, + "grad_norm": 0.4650852680206299, + "learning_rate": 0.0002, + "loss": 0.8403, + "step": 2940 + }, + { + "epoch": 2.1415607985480944, + "grad_norm": 0.4929981231689453, + "learning_rate": 0.0002, + "loss": 0.8467, + "step": 2950 + }, + { + "epoch": 2.148820326678766, + "grad_norm": 0.6537389755249023, + "learning_rate": 0.0002, + "loss": 0.8747, + "step": 2960 + }, + { + "epoch": 2.1560798548094375, + "grad_norm": 0.8032940626144409, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 2970 + }, + { + "epoch": 2.163339382940109, + "grad_norm": 0.7131643891334534, + "learning_rate": 0.0002, + "loss": 0.7604, + "step": 2980 + }, + { + "epoch": 2.1705989110707806, + "grad_norm": 0.6034275889396667, + "learning_rate": 0.0002, + "loss": 0.8424, + "step": 2990 + }, + { + "epoch": 2.1778584392014517, + "grad_norm": 0.6081095933914185, + "learning_rate": 0.0002, + "loss": 0.8838, + "step": 3000 + }, + { + "epoch": 2.1851179673321233, + "grad_norm": 0.5706912875175476, + "learning_rate": 0.0002, + "loss": 0.9013, + "step": 3010 + }, + { + "epoch": 2.192377495462795, + "grad_norm": 0.6742380261421204, + "learning_rate": 0.0002, + "loss": 0.8473, + "step": 3020 + }, + { + "epoch": 2.1996370235934664, + "grad_norm": 0.4847496449947357, + "learning_rate": 0.0002, + "loss": 0.8372, + "step": 3030 + }, + { + "epoch": 2.206896551724138, + "grad_norm": 0.5529342889785767, + "learning_rate": 0.0002, + "loss": 0.8487, + "step": 3040 + }, + { + "epoch": 2.2141560798548094, + "grad_norm": 0.6108783483505249, + "learning_rate": 0.0002, + "loss": 0.9057, + "step": 3050 + }, + { + "epoch": 2.221415607985481, + "grad_norm": 0.8841571807861328, + "learning_rate": 0.0002, + "loss": 0.9659, + "step": 3060 + }, + { + "epoch": 2.2286751361161525, + "grad_norm": 0.4227530360221863, + "learning_rate": 0.0002, + "loss": 0.8538, + "step": 3070 + }, + { + "epoch": 2.235934664246824, + "grad_norm": 0.461935818195343, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 3080 + }, + { + "epoch": 2.2431941923774956, + "grad_norm": 0.5407412648200989, + "learning_rate": 0.0002, + "loss": 0.8542, + "step": 3090 + }, + { + "epoch": 2.250453720508167, + "grad_norm": 0.5057998895645142, + "learning_rate": 0.0002, + "loss": 0.8392, + "step": 3100 + }, + { + "epoch": 2.2577132486388383, + "grad_norm": 0.530057966709137, + "learning_rate": 0.0002, + "loss": 0.7898, + "step": 3110 + }, + { + "epoch": 2.26497277676951, + "grad_norm": 0.5066842436790466, + "learning_rate": 0.0002, + "loss": 0.8507, + "step": 3120 + }, + { + "epoch": 2.2722323049001814, + "grad_norm": 0.5069178342819214, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 3130 + }, + { + "epoch": 2.279491833030853, + "grad_norm": 0.6095499396324158, + "learning_rate": 0.0002, + "loss": 0.8458, + "step": 3140 + }, + { + "epoch": 2.2867513611615244, + "grad_norm": 0.49524766206741333, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 3150 + }, + { + "epoch": 2.294010889292196, + "grad_norm": 0.5334409475326538, + "learning_rate": 0.0002, + "loss": 0.7568, + "step": 3160 + }, + { + "epoch": 2.3012704174228675, + "grad_norm": 1.681748867034912, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 3170 + }, + { + "epoch": 2.308529945553539, + "grad_norm": 0.7225565314292908, + "learning_rate": 0.0002, + "loss": 0.8155, + "step": 3180 + }, + { + "epoch": 2.3157894736842106, + "grad_norm": 0.5379388928413391, + "learning_rate": 0.0002, + "loss": 0.8034, + "step": 3190 + }, + { + "epoch": 2.323049001814882, + "grad_norm": 0.45770326256752014, + "learning_rate": 0.0002, + "loss": 0.8833, + "step": 3200 + }, + { + "epoch": 2.3303085299455537, + "grad_norm": 0.6984533071517944, + "learning_rate": 0.0002, + "loss": 0.8419, + "step": 3210 + }, + { + "epoch": 2.337568058076225, + "grad_norm": 0.6725744605064392, + "learning_rate": 0.0002, + "loss": 0.7414, + "step": 3220 + }, + { + "epoch": 2.344827586206897, + "grad_norm": 1.1247471570968628, + "learning_rate": 0.0002, + "loss": 0.8657, + "step": 3230 + }, + { + "epoch": 2.352087114337568, + "grad_norm": 1.0240263938903809, + "learning_rate": 0.0002, + "loss": 0.7782, + "step": 3240 + }, + { + "epoch": 2.3593466424682394, + "grad_norm": 0.5608096122741699, + "learning_rate": 0.0002, + "loss": 0.826, + "step": 3250 + }, + { + "epoch": 2.366606170598911, + "grad_norm": 0.8294990062713623, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 3260 + }, + { + "epoch": 2.3738656987295825, + "grad_norm": 0.6734224557876587, + "learning_rate": 0.0002, + "loss": 0.9845, + "step": 3270 + }, + { + "epoch": 2.381125226860254, + "grad_norm": 0.6862800717353821, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 3280 + }, + { + "epoch": 2.3883847549909256, + "grad_norm": 0.5442930459976196, + "learning_rate": 0.0002, + "loss": 0.7752, + "step": 3290 + }, + { + "epoch": 2.395644283121597, + "grad_norm": 0.5745394229888916, + "learning_rate": 0.0002, + "loss": 0.8543, + "step": 3300 + }, + { + "epoch": 2.4029038112522687, + "grad_norm": 0.6257799863815308, + "learning_rate": 0.0002, + "loss": 0.833, + "step": 3310 + }, + { + "epoch": 2.4101633393829403, + "grad_norm": 0.5608420968055725, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 3320 + }, + { + "epoch": 2.417422867513612, + "grad_norm": 0.5512017011642456, + "learning_rate": 0.0002, + "loss": 0.8745, + "step": 3330 + }, + { + "epoch": 2.4246823956442833, + "grad_norm": 0.7452999353408813, + "learning_rate": 0.0002, + "loss": 0.8642, + "step": 3340 + }, + { + "epoch": 2.4319419237749544, + "grad_norm": 0.4604301452636719, + "learning_rate": 0.0002, + "loss": 0.8417, + "step": 3350 + }, + { + "epoch": 2.439201451905626, + "grad_norm": 0.8225823640823364, + "learning_rate": 0.0002, + "loss": 0.9148, + "step": 3360 + }, + { + "epoch": 2.4464609800362975, + "grad_norm": 0.8604981303215027, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 3370 + }, + { + "epoch": 2.453720508166969, + "grad_norm": 0.6620925664901733, + "learning_rate": 0.0002, + "loss": 0.9276, + "step": 3380 + }, + { + "epoch": 2.4609800362976406, + "grad_norm": 0.4750158488750458, + "learning_rate": 0.0002, + "loss": 0.8381, + "step": 3390 + }, + { + "epoch": 2.468239564428312, + "grad_norm": 0.6061418056488037, + "learning_rate": 0.0002, + "loss": 0.8249, + "step": 3400 + }, + { + "epoch": 2.4754990925589837, + "grad_norm": 0.5977247357368469, + "learning_rate": 0.0002, + "loss": 0.8944, + "step": 3410 + }, + { + "epoch": 2.4827586206896552, + "grad_norm": 0.5004227757453918, + "learning_rate": 0.0002, + "loss": 0.8366, + "step": 3420 + }, + { + "epoch": 2.490018148820327, + "grad_norm": 0.46428972482681274, + "learning_rate": 0.0002, + "loss": 0.9133, + "step": 3430 + }, + { + "epoch": 2.4972776769509983, + "grad_norm": 0.7658395171165466, + "learning_rate": 0.0002, + "loss": 0.8534, + "step": 3440 + }, + { + "epoch": 2.50453720508167, + "grad_norm": 0.621819794178009, + "learning_rate": 0.0002, + "loss": 0.9066, + "step": 3450 + }, + { + "epoch": 2.511796733212341, + "grad_norm": 0.4556088149547577, + "learning_rate": 0.0002, + "loss": 0.8481, + "step": 3460 + }, + { + "epoch": 2.519056261343013, + "grad_norm": 0.6124140024185181, + "learning_rate": 0.0002, + "loss": 0.8522, + "step": 3470 + }, + { + "epoch": 2.526315789473684, + "grad_norm": 0.6256231665611267, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 3480 + }, + { + "epoch": 2.5335753176043556, + "grad_norm": 0.5464141964912415, + "learning_rate": 0.0002, + "loss": 0.9097, + "step": 3490 + }, + { + "epoch": 2.540834845735027, + "grad_norm": 0.51471346616745, + "learning_rate": 0.0002, + "loss": 0.8708, + "step": 3500 + }, + { + "epoch": 2.5480943738656987, + "grad_norm": 0.5326165556907654, + "learning_rate": 0.0002, + "loss": 0.9042, + "step": 3510 + }, + { + "epoch": 2.5553539019963702, + "grad_norm": 0.4750378429889679, + "learning_rate": 0.0002, + "loss": 0.8749, + "step": 3520 + }, + { + "epoch": 2.562613430127042, + "grad_norm": 0.5292279124259949, + "learning_rate": 0.0002, + "loss": 0.8248, + "step": 3530 + }, + { + "epoch": 2.5698729582577133, + "grad_norm": 0.6145227551460266, + "learning_rate": 0.0002, + "loss": 0.8808, + "step": 3540 + }, + { + "epoch": 2.577132486388385, + "grad_norm": 0.8275189399719238, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 3550 + }, + { + "epoch": 2.5843920145190564, + "grad_norm": 0.5037438273429871, + "learning_rate": 0.0002, + "loss": 0.875, + "step": 3560 + }, + { + "epoch": 2.5916515426497275, + "grad_norm": 0.5838707685470581, + "learning_rate": 0.0002, + "loss": 0.8324, + "step": 3570 + }, + { + "epoch": 2.5989110707803995, + "grad_norm": 0.5398710370063782, + "learning_rate": 0.0002, + "loss": 0.8716, + "step": 3580 + }, + { + "epoch": 2.6061705989110706, + "grad_norm": 0.6115376949310303, + "learning_rate": 0.0002, + "loss": 0.7843, + "step": 3590 + }, + { + "epoch": 2.613430127041742, + "grad_norm": 1.118809461593628, + "learning_rate": 0.0002, + "loss": 0.8142, + "step": 3600 + }, + { + "epoch": 2.6206896551724137, + "grad_norm": 0.7811771631240845, + "learning_rate": 0.0002, + "loss": 0.8282, + "step": 3610 + }, + { + "epoch": 2.6279491833030852, + "grad_norm": 0.5753175020217896, + "learning_rate": 0.0002, + "loss": 0.7997, + "step": 3620 + }, + { + "epoch": 2.635208711433757, + "grad_norm": 0.550829291343689, + "learning_rate": 0.0002, + "loss": 0.7705, + "step": 3630 + }, + { + "epoch": 2.6424682395644283, + "grad_norm": 0.5360019207000732, + "learning_rate": 0.0002, + "loss": 0.8564, + "step": 3640 + }, + { + "epoch": 2.6497277676951, + "grad_norm": 0.63050377368927, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 3650 + }, + { + "epoch": 2.6569872958257714, + "grad_norm": 0.5833110213279724, + "learning_rate": 0.0002, + "loss": 0.9156, + "step": 3660 + }, + { + "epoch": 2.664246823956443, + "grad_norm": 0.5543047189712524, + "learning_rate": 0.0002, + "loss": 0.8033, + "step": 3670 + }, + { + "epoch": 2.671506352087114, + "grad_norm": 0.6842212080955505, + "learning_rate": 0.0002, + "loss": 0.8867, + "step": 3680 + }, + { + "epoch": 2.678765880217786, + "grad_norm": 0.6859333515167236, + "learning_rate": 0.0002, + "loss": 0.8569, + "step": 3690 + }, + { + "epoch": 2.686025408348457, + "grad_norm": 0.7038410902023315, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 3700 + }, + { + "epoch": 2.6932849364791287, + "grad_norm": 0.447233647108078, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 3710 + }, + { + "epoch": 2.7005444646098002, + "grad_norm": 0.6453872919082642, + "learning_rate": 0.0002, + "loss": 0.8951, + "step": 3720 + }, + { + "epoch": 2.707803992740472, + "grad_norm": 0.8025672435760498, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 3730 + }, + { + "epoch": 2.7150635208711433, + "grad_norm": 0.5997087955474854, + "learning_rate": 0.0002, + "loss": 0.8488, + "step": 3740 + }, + { + "epoch": 2.722323049001815, + "grad_norm": 0.6901142001152039, + "learning_rate": 0.0002, + "loss": 0.8308, + "step": 3750 + }, + { + "epoch": 2.7295825771324864, + "grad_norm": 1.036145567893982, + "learning_rate": 0.0002, + "loss": 0.8517, + "step": 3760 + }, + { + "epoch": 2.736842105263158, + "grad_norm": 0.7207072377204895, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 3770 + }, + { + "epoch": 2.7441016333938295, + "grad_norm": 1.0452989339828491, + "learning_rate": 0.0002, + "loss": 0.8009, + "step": 3780 + }, + { + "epoch": 2.751361161524501, + "grad_norm": 0.5615278482437134, + "learning_rate": 0.0002, + "loss": 0.8868, + "step": 3790 + }, + { + "epoch": 2.7586206896551726, + "grad_norm": 0.46439215540885925, + "learning_rate": 0.0002, + "loss": 0.9047, + "step": 3800 + }, + { + "epoch": 2.7658802177858437, + "grad_norm": 0.7134917974472046, + "learning_rate": 0.0002, + "loss": 0.7847, + "step": 3810 + }, + { + "epoch": 2.7731397459165157, + "grad_norm": 0.5139115452766418, + "learning_rate": 0.0002, + "loss": 0.8315, + "step": 3820 + }, + { + "epoch": 2.780399274047187, + "grad_norm": 0.8595781326293945, + "learning_rate": 0.0002, + "loss": 0.8995, + "step": 3830 + }, + { + "epoch": 2.7876588021778583, + "grad_norm": 0.544614851474762, + "learning_rate": 0.0002, + "loss": 0.9079, + "step": 3840 + }, + { + "epoch": 2.79491833030853, + "grad_norm": 0.6073850393295288, + "learning_rate": 0.0002, + "loss": 0.8322, + "step": 3850 + }, + { + "epoch": 2.8021778584392014, + "grad_norm": 0.8224069476127625, + "learning_rate": 0.0002, + "loss": 0.8277, + "step": 3860 + }, + { + "epoch": 2.809437386569873, + "grad_norm": 0.5347970128059387, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 3870 + }, + { + "epoch": 2.8166969147005445, + "grad_norm": 0.6799601912498474, + "learning_rate": 0.0002, + "loss": 0.8219, + "step": 3880 + }, + { + "epoch": 2.823956442831216, + "grad_norm": 0.5219197869300842, + "learning_rate": 0.0002, + "loss": 0.8796, + "step": 3890 + }, + { + "epoch": 2.8312159709618876, + "grad_norm": 0.5710130333900452, + "learning_rate": 0.0002, + "loss": 0.8649, + "step": 3900 + }, + { + "epoch": 2.838475499092559, + "grad_norm": 0.5857213139533997, + "learning_rate": 0.0002, + "loss": 0.8741, + "step": 3910 + }, + { + "epoch": 2.8457350272232302, + "grad_norm": 0.5206644535064697, + "learning_rate": 0.0002, + "loss": 0.8154, + "step": 3920 + }, + { + "epoch": 2.8529945553539022, + "grad_norm": 1.2902015447616577, + "learning_rate": 0.0002, + "loss": 0.9328, + "step": 3930 + }, + { + "epoch": 2.8602540834845733, + "grad_norm": 0.5252797603607178, + "learning_rate": 0.0002, + "loss": 0.8192, + "step": 3940 + }, + { + "epoch": 2.867513611615245, + "grad_norm": 0.5925108790397644, + "learning_rate": 0.0002, + "loss": 0.9047, + "step": 3950 + }, + { + "epoch": 2.8747731397459164, + "grad_norm": 0.9719610810279846, + "learning_rate": 0.0002, + "loss": 0.8611, + "step": 3960 + }, + { + "epoch": 2.882032667876588, + "grad_norm": 0.6834747195243835, + "learning_rate": 0.0002, + "loss": 0.8531, + "step": 3970 + }, + { + "epoch": 2.8892921960072595, + "grad_norm": 0.6952353119850159, + "learning_rate": 0.0002, + "loss": 0.8124, + "step": 3980 + }, + { + "epoch": 2.896551724137931, + "grad_norm": 0.49889910221099854, + "learning_rate": 0.0002, + "loss": 0.8365, + "step": 3990 + }, + { + "epoch": 2.9038112522686026, + "grad_norm": 0.5007026791572571, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 4000 + }, + { + "epoch": 2.911070780399274, + "grad_norm": 0.5474239587783813, + "learning_rate": 0.0002, + "loss": 0.7752, + "step": 4010 + }, + { + "epoch": 2.9183303085299457, + "grad_norm": 0.6617428660392761, + "learning_rate": 0.0002, + "loss": 0.8579, + "step": 4020 + }, + { + "epoch": 2.925589836660617, + "grad_norm": 0.6097776293754578, + "learning_rate": 0.0002, + "loss": 0.8583, + "step": 4030 + }, + { + "epoch": 2.9328493647912888, + "grad_norm": 0.5985828638076782, + "learning_rate": 0.0002, + "loss": 0.8296, + "step": 4040 + }, + { + "epoch": 2.94010889292196, + "grad_norm": 0.769488513469696, + "learning_rate": 0.0002, + "loss": 0.9134, + "step": 4050 + }, + { + "epoch": 2.9473684210526314, + "grad_norm": 0.5167055130004883, + "learning_rate": 0.0002, + "loss": 0.8148, + "step": 4060 + }, + { + "epoch": 2.954627949183303, + "grad_norm": 0.6265496015548706, + "learning_rate": 0.0002, + "loss": 0.8665, + "step": 4070 + }, + { + "epoch": 2.9618874773139745, + "grad_norm": 1.2644082307815552, + "learning_rate": 0.0002, + "loss": 0.9218, + "step": 4080 + }, + { + "epoch": 2.969147005444646, + "grad_norm": 0.6007561087608337, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 4090 + }, + { + "epoch": 2.9764065335753176, + "grad_norm": 0.47984927892684937, + "learning_rate": 0.0002, + "loss": 0.8262, + "step": 4100 + }, + { + "epoch": 2.983666061705989, + "grad_norm": 1.128198504447937, + "learning_rate": 0.0002, + "loss": 0.7897, + "step": 4110 + }, + { + "epoch": 2.9909255898366607, + "grad_norm": 0.526292085647583, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 4120 + }, + { + "epoch": 2.9981851179673322, + "grad_norm": 0.5462674498558044, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 4130 + }, + { + "epoch": 2.9996370235934666, + "eval_loss": 1.1093357801437378, + "eval_runtime": 46.2498, + "eval_samples_per_second": 9.427, + "eval_steps_per_second": 1.189, + "step": 4132 + }, + { + "epoch": 3.0054446460980038, + "grad_norm": 0.48288026452064514, + "learning_rate": 0.0002, + "loss": 0.721, + "step": 4140 + }, + { + "epoch": 3.0127041742286753, + "grad_norm": 1.0181782245635986, + "learning_rate": 0.0002, + "loss": 0.6769, + "step": 4150 + }, + { + "epoch": 3.019963702359347, + "grad_norm": 0.7718019485473633, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 4160 + }, + { + "epoch": 3.027223230490018, + "grad_norm": 0.7492219805717468, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 4170 + }, + { + "epoch": 3.0344827586206895, + "grad_norm": 0.9363632798194885, + "learning_rate": 0.0002, + "loss": 0.6678, + "step": 4180 + }, + { + "epoch": 3.041742286751361, + "grad_norm": 0.6888533234596252, + "learning_rate": 0.0002, + "loss": 0.7187, + "step": 4190 + }, + { + "epoch": 3.0490018148820326, + "grad_norm": 0.7072834968566895, + "learning_rate": 0.0002, + "loss": 0.6469, + "step": 4200 + }, + { + "epoch": 3.056261343012704, + "grad_norm": 0.7182047963142395, + "learning_rate": 0.0002, + "loss": 0.6387, + "step": 4210 + }, + { + "epoch": 3.0635208711433757, + "grad_norm": 0.7194355130195618, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 4220 + }, + { + "epoch": 3.0707803992740472, + "grad_norm": 0.9454023838043213, + "learning_rate": 0.0002, + "loss": 0.5812, + "step": 4230 + }, + { + "epoch": 3.0780399274047188, + "grad_norm": 0.838657557964325, + "learning_rate": 0.0002, + "loss": 0.6036, + "step": 4240 + }, + { + "epoch": 3.0852994555353903, + "grad_norm": 0.740113377571106, + "learning_rate": 0.0002, + "loss": 0.646, + "step": 4250 + }, + { + "epoch": 3.092558983666062, + "grad_norm": 0.6616561412811279, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 4260 + }, + { + "epoch": 3.0998185117967334, + "grad_norm": 0.8846506476402283, + "learning_rate": 0.0002, + "loss": 0.6462, + "step": 4270 + }, + { + "epoch": 3.107078039927405, + "grad_norm": 0.6322125792503357, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 4280 + }, + { + "epoch": 3.114337568058076, + "grad_norm": 0.7461467385292053, + "learning_rate": 0.0002, + "loss": 0.5953, + "step": 4290 + }, + { + "epoch": 3.1215970961887476, + "grad_norm": 0.8251287341117859, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 4300 + }, + { + "epoch": 3.128856624319419, + "grad_norm": 0.8767673373222351, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 4310 + }, + { + "epoch": 3.1361161524500907, + "grad_norm": 0.7758759260177612, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 4320 + }, + { + "epoch": 3.143375680580762, + "grad_norm": 1.1056879758834839, + "learning_rate": 0.0002, + "loss": 0.6624, + "step": 4330 + }, + { + "epoch": 3.1506352087114338, + "grad_norm": 0.8259835243225098, + "learning_rate": 0.0002, + "loss": 0.691, + "step": 4340 + }, + { + "epoch": 3.1578947368421053, + "grad_norm": 0.6607027053833008, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 4350 + }, + { + "epoch": 3.165154264972777, + "grad_norm": 0.7983301281929016, + "learning_rate": 0.0002, + "loss": 0.5911, + "step": 4360 + }, + { + "epoch": 3.1724137931034484, + "grad_norm": 0.6725239157676697, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 4370 + }, + { + "epoch": 3.17967332123412, + "grad_norm": 0.9052095413208008, + "learning_rate": 0.0002, + "loss": 0.5966, + "step": 4380 + }, + { + "epoch": 3.1869328493647915, + "grad_norm": 0.8131307363510132, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 4390 + }, + { + "epoch": 3.1941923774954626, + "grad_norm": 0.6435626149177551, + "learning_rate": 0.0002, + "loss": 0.6384, + "step": 4400 + }, + { + "epoch": 3.201451905626134, + "grad_norm": 0.84367436170578, + "learning_rate": 0.0002, + "loss": 0.5819, + "step": 4410 + }, + { + "epoch": 3.2087114337568057, + "grad_norm": 1.5018867254257202, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 4420 + }, + { + "epoch": 3.215970961887477, + "grad_norm": 0.7019091844558716, + "learning_rate": 0.0002, + "loss": 0.6838, + "step": 4430 + }, + { + "epoch": 3.2232304900181488, + "grad_norm": 0.9164197444915771, + "learning_rate": 0.0002, + "loss": 0.6153, + "step": 4440 + }, + { + "epoch": 3.2304900181488203, + "grad_norm": 0.7890861630439758, + "learning_rate": 0.0002, + "loss": 0.6618, + "step": 4450 + }, + { + "epoch": 3.237749546279492, + "grad_norm": 0.6517660617828369, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 4460 + }, + { + "epoch": 3.2450090744101634, + "grad_norm": 1.10188889503479, + "learning_rate": 0.0002, + "loss": 0.6699, + "step": 4470 + }, + { + "epoch": 3.252268602540835, + "grad_norm": 0.8158330917358398, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 4480 + }, + { + "epoch": 3.2595281306715065, + "grad_norm": 0.7663109302520752, + "learning_rate": 0.0002, + "loss": 0.7757, + "step": 4490 + }, + { + "epoch": 3.266787658802178, + "grad_norm": 0.8473444581031799, + "learning_rate": 0.0002, + "loss": 0.6539, + "step": 4500 + }, + { + "epoch": 3.274047186932849, + "grad_norm": 0.9724768996238708, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 4510 + }, + { + "epoch": 3.281306715063521, + "grad_norm": 0.8516759276390076, + "learning_rate": 0.0002, + "loss": 0.5464, + "step": 4520 + }, + { + "epoch": 3.288566243194192, + "grad_norm": 0.7543437480926514, + "learning_rate": 0.0002, + "loss": 0.6534, + "step": 4530 + }, + { + "epoch": 3.2958257713248638, + "grad_norm": 1.0472029447555542, + "learning_rate": 0.0002, + "loss": 0.6095, + "step": 4540 + }, + { + "epoch": 3.3030852994555353, + "grad_norm": 0.6240826845169067, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 4550 + }, + { + "epoch": 3.310344827586207, + "grad_norm": 0.9957774877548218, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 4560 + }, + { + "epoch": 3.3176043557168784, + "grad_norm": 0.6448912620544434, + "learning_rate": 0.0002, + "loss": 0.618, + "step": 4570 + }, + { + "epoch": 3.32486388384755, + "grad_norm": 0.7519692778587341, + "learning_rate": 0.0002, + "loss": 0.6188, + "step": 4580 + }, + { + "epoch": 3.3321234119782215, + "grad_norm": 0.7367453575134277, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 4590 + }, + { + "epoch": 3.339382940108893, + "grad_norm": 0.8064960837364197, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 4600 + }, + { + "epoch": 3.3466424682395646, + "grad_norm": 0.7664631009101868, + "learning_rate": 0.0002, + "loss": 0.6062, + "step": 4610 + }, + { + "epoch": 3.353901996370236, + "grad_norm": 0.7803396582603455, + "learning_rate": 0.0002, + "loss": 0.6834, + "step": 4620 + }, + { + "epoch": 3.3611615245009077, + "grad_norm": 0.9141599535942078, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 4630 + }, + { + "epoch": 3.3684210526315788, + "grad_norm": 0.9719856381416321, + "learning_rate": 0.0002, + "loss": 0.6889, + "step": 4640 + }, + { + "epoch": 3.3756805807622503, + "grad_norm": 0.9223218560218811, + "learning_rate": 0.0002, + "loss": 0.6914, + "step": 4650 + }, + { + "epoch": 3.382940108892922, + "grad_norm": 0.7289277911186218, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 4660 + }, + { + "epoch": 3.3901996370235934, + "grad_norm": 1.039724349975586, + "learning_rate": 0.0002, + "loss": 0.595, + "step": 4670 + }, + { + "epoch": 3.397459165154265, + "grad_norm": 1.397438883781433, + "learning_rate": 0.0002, + "loss": 0.8121, + "step": 4680 + }, + { + "epoch": 3.4047186932849365, + "grad_norm": 1.0069999694824219, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 4690 + }, + { + "epoch": 3.411978221415608, + "grad_norm": 0.816291332244873, + "learning_rate": 0.0002, + "loss": 0.6598, + "step": 4700 + }, + { + "epoch": 3.4192377495462796, + "grad_norm": 1.2831530570983887, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 4710 + }, + { + "epoch": 3.426497277676951, + "grad_norm": 0.9573889970779419, + "learning_rate": 0.0002, + "loss": 0.6625, + "step": 4720 + }, + { + "epoch": 3.4337568058076227, + "grad_norm": 0.7685632705688477, + "learning_rate": 0.0002, + "loss": 0.7279, + "step": 4730 + }, + { + "epoch": 3.441016333938294, + "grad_norm": 0.7019195556640625, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 4740 + }, + { + "epoch": 3.4482758620689653, + "grad_norm": 0.7244833707809448, + "learning_rate": 0.0002, + "loss": 0.7606, + "step": 4750 + }, + { + "epoch": 3.455535390199637, + "grad_norm": 1.3468551635742188, + "learning_rate": 0.0002, + "loss": 0.6951, + "step": 4760 + }, + { + "epoch": 3.4627949183303084, + "grad_norm": 0.822846531867981, + "learning_rate": 0.0002, + "loss": 0.6945, + "step": 4770 + }, + { + "epoch": 3.47005444646098, + "grad_norm": 0.7311608195304871, + "learning_rate": 0.0002, + "loss": 0.6431, + "step": 4780 + }, + { + "epoch": 3.4773139745916515, + "grad_norm": 0.9466770887374878, + "learning_rate": 0.0002, + "loss": 0.7019, + "step": 4790 + }, + { + "epoch": 3.484573502722323, + "grad_norm": 1.1527155637741089, + "learning_rate": 0.0002, + "loss": 0.7767, + "step": 4800 + }, + { + "epoch": 3.4918330308529946, + "grad_norm": 1.1288906335830688, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 4810 + }, + { + "epoch": 3.499092558983666, + "grad_norm": 0.9096164107322693, + "learning_rate": 0.0002, + "loss": 0.6564, + "step": 4820 + }, + { + "epoch": 3.5063520871143377, + "grad_norm": 0.7988565564155579, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 4830 + }, + { + "epoch": 3.513611615245009, + "grad_norm": 0.7183415293693542, + "learning_rate": 0.0002, + "loss": 0.7004, + "step": 4840 + }, + { + "epoch": 3.5208711433756807, + "grad_norm": 0.6614915132522583, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 4850 + }, + { + "epoch": 3.528130671506352, + "grad_norm": 0.8609521985054016, + "learning_rate": 0.0002, + "loss": 0.7271, + "step": 4860 + }, + { + "epoch": 3.535390199637024, + "grad_norm": 0.86552894115448, + "learning_rate": 0.0002, + "loss": 0.6664, + "step": 4870 + }, + { + "epoch": 3.542649727767695, + "grad_norm": 0.6926496028900146, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 4880 + }, + { + "epoch": 3.5499092558983665, + "grad_norm": 0.8157467246055603, + "learning_rate": 0.0002, + "loss": 0.7117, + "step": 4890 + }, + { + "epoch": 3.557168784029038, + "grad_norm": 0.9085357189178467, + "learning_rate": 0.0002, + "loss": 0.6201, + "step": 4900 + }, + { + "epoch": 3.5644283121597096, + "grad_norm": 0.6322644948959351, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 4910 + }, + { + "epoch": 3.571687840290381, + "grad_norm": 1.263205885887146, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 4920 + }, + { + "epoch": 3.5789473684210527, + "grad_norm": 0.8901070356369019, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 4930 + }, + { + "epoch": 3.586206896551724, + "grad_norm": 0.7983952164649963, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 4940 + }, + { + "epoch": 3.5934664246823957, + "grad_norm": 0.9887813925743103, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 4950 + }, + { + "epoch": 3.6007259528130673, + "grad_norm": 0.7895187735557556, + "learning_rate": 0.0002, + "loss": 0.6502, + "step": 4960 + }, + { + "epoch": 3.6079854809437384, + "grad_norm": 0.9685819745063782, + "learning_rate": 0.0002, + "loss": 0.7111, + "step": 4970 + }, + { + "epoch": 3.6152450090744104, + "grad_norm": 0.6576591730117798, + "learning_rate": 0.0002, + "loss": 0.6915, + "step": 4980 + }, + { + "epoch": 3.6225045372050815, + "grad_norm": 0.856985330581665, + "learning_rate": 0.0002, + "loss": 0.6195, + "step": 4990 + }, + { + "epoch": 3.629764065335753, + "grad_norm": 0.7230252623558044, + "learning_rate": 0.0002, + "loss": 0.6318, + "step": 5000 + }, + { + "epoch": 3.6370235934664246, + "grad_norm": 0.8260893821716309, + "learning_rate": 0.0002, + "loss": 0.742, + "step": 5010 + }, + { + "epoch": 3.644283121597096, + "grad_norm": 0.7635950446128845, + "learning_rate": 0.0002, + "loss": 0.7223, + "step": 5020 + }, + { + "epoch": 3.6515426497277677, + "grad_norm": 0.7060768604278564, + "learning_rate": 0.0002, + "loss": 0.6837, + "step": 5030 + }, + { + "epoch": 3.658802177858439, + "grad_norm": 0.8020303249359131, + "learning_rate": 0.0002, + "loss": 0.6921, + "step": 5040 + }, + { + "epoch": 3.6660617059891107, + "grad_norm": 0.8530341386795044, + "learning_rate": 0.0002, + "loss": 0.6446, + "step": 5050 + }, + { + "epoch": 3.6733212341197823, + "grad_norm": 0.6667101979255676, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 5060 + }, + { + "epoch": 3.680580762250454, + "grad_norm": 0.7385406494140625, + "learning_rate": 0.0002, + "loss": 0.7081, + "step": 5070 + }, + { + "epoch": 3.6878402903811254, + "grad_norm": 0.7753380537033081, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 5080 + }, + { + "epoch": 3.695099818511797, + "grad_norm": 0.7516207098960876, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 5090 + }, + { + "epoch": 3.702359346642468, + "grad_norm": 0.8171586394309998, + "learning_rate": 0.0002, + "loss": 0.672, + "step": 5100 + }, + { + "epoch": 3.70961887477314, + "grad_norm": 1.0796279907226562, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 5110 + }, + { + "epoch": 3.716878402903811, + "grad_norm": 0.6957688927650452, + "learning_rate": 0.0002, + "loss": 0.5948, + "step": 5120 + }, + { + "epoch": 3.7241379310344827, + "grad_norm": 0.8550161719322205, + "learning_rate": 0.0002, + "loss": 0.7515, + "step": 5130 + }, + { + "epoch": 3.731397459165154, + "grad_norm": 0.9396728277206421, + "learning_rate": 0.0002, + "loss": 0.7286, + "step": 5140 + }, + { + "epoch": 3.7386569872958257, + "grad_norm": 1.4264805316925049, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 5150 + }, + { + "epoch": 3.7459165154264973, + "grad_norm": 0.8725108504295349, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 5160 + }, + { + "epoch": 3.753176043557169, + "grad_norm": 1.0346195697784424, + "learning_rate": 0.0002, + "loss": 0.6778, + "step": 5170 + }, + { + "epoch": 3.7604355716878404, + "grad_norm": 0.5395554304122925, + "learning_rate": 0.0002, + "loss": 0.6371, + "step": 5180 + }, + { + "epoch": 3.767695099818512, + "grad_norm": 1.3153616189956665, + "learning_rate": 0.0002, + "loss": 0.7308, + "step": 5190 + }, + { + "epoch": 3.7749546279491835, + "grad_norm": 0.9879828691482544, + "learning_rate": 0.0002, + "loss": 0.78, + "step": 5200 + }, + { + "epoch": 3.7822141560798546, + "grad_norm": 0.8876672983169556, + "learning_rate": 0.0002, + "loss": 0.7068, + "step": 5210 + }, + { + "epoch": 3.7894736842105265, + "grad_norm": 0.8363267779350281, + "learning_rate": 0.0002, + "loss": 0.6283, + "step": 5220 + }, + { + "epoch": 3.7967332123411976, + "grad_norm": 0.637294590473175, + "learning_rate": 0.0002, + "loss": 0.6255, + "step": 5230 + }, + { + "epoch": 3.803992740471869, + "grad_norm": 1.1408970355987549, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 5240 + }, + { + "epoch": 3.8112522686025407, + "grad_norm": 1.0128360986709595, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 5250 + }, + { + "epoch": 3.8185117967332123, + "grad_norm": 0.8061144351959229, + "learning_rate": 0.0002, + "loss": 0.6764, + "step": 5260 + }, + { + "epoch": 3.825771324863884, + "grad_norm": 0.9626626968383789, + "learning_rate": 0.0002, + "loss": 0.7254, + "step": 5270 + }, + { + "epoch": 3.8330308529945554, + "grad_norm": 0.9013627171516418, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 5280 + }, + { + "epoch": 3.840290381125227, + "grad_norm": 0.8411344289779663, + "learning_rate": 0.0002, + "loss": 0.6806, + "step": 5290 + }, + { + "epoch": 3.8475499092558985, + "grad_norm": 0.7426059246063232, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 5300 + }, + { + "epoch": 3.85480943738657, + "grad_norm": 1.003413438796997, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 5310 + }, + { + "epoch": 3.862068965517241, + "grad_norm": 0.7527840733528137, + "learning_rate": 0.0002, + "loss": 0.8554, + "step": 5320 + }, + { + "epoch": 3.869328493647913, + "grad_norm": 0.738610565662384, + "learning_rate": 0.0002, + "loss": 0.7521, + "step": 5330 + }, + { + "epoch": 3.876588021778584, + "grad_norm": 0.7277999520301819, + "learning_rate": 0.0002, + "loss": 0.7266, + "step": 5340 + }, + { + "epoch": 3.8838475499092557, + "grad_norm": 0.5951359272003174, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 5350 + }, + { + "epoch": 3.8911070780399273, + "grad_norm": 1.043884038925171, + "learning_rate": 0.0002, + "loss": 0.7447, + "step": 5360 + }, + { + "epoch": 3.898366606170599, + "grad_norm": 0.8436498045921326, + "learning_rate": 0.0002, + "loss": 0.6862, + "step": 5370 + }, + { + "epoch": 3.9056261343012704, + "grad_norm": 0.5603365302085876, + "learning_rate": 0.0002, + "loss": 0.665, + "step": 5380 + }, + { + "epoch": 3.912885662431942, + "grad_norm": 1.0128886699676514, + "learning_rate": 0.0002, + "loss": 0.7098, + "step": 5390 + }, + { + "epoch": 3.9201451905626135, + "grad_norm": 0.7970930337905884, + "learning_rate": 0.0002, + "loss": 0.6707, + "step": 5400 + }, + { + "epoch": 3.927404718693285, + "grad_norm": 0.7699369192123413, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 5410 + }, + { + "epoch": 3.9346642468239565, + "grad_norm": 0.800561249256134, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 5420 + }, + { + "epoch": 3.941923774954628, + "grad_norm": 0.8020331859588623, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 5430 + }, + { + "epoch": 3.9491833030852996, + "grad_norm": 0.7461140155792236, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 5440 + }, + { + "epoch": 3.9564428312159707, + "grad_norm": 0.8346918821334839, + "learning_rate": 0.0002, + "loss": 0.7013, + "step": 5450 + }, + { + "epoch": 3.9637023593466427, + "grad_norm": 0.9723302125930786, + "learning_rate": 0.0002, + "loss": 0.6289, + "step": 5460 + }, + { + "epoch": 3.970961887477314, + "grad_norm": 0.6809740662574768, + "learning_rate": 0.0002, + "loss": 0.8029, + "step": 5470 + }, + { + "epoch": 3.9782214156079854, + "grad_norm": 0.7353498339653015, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 5480 + }, + { + "epoch": 3.985480943738657, + "grad_norm": 0.748009443283081, + "learning_rate": 0.0002, + "loss": 0.6722, + "step": 5490 + }, + { + "epoch": 3.9927404718693285, + "grad_norm": 1.3656195402145386, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 5500 + }, + { + "epoch": 4.0, + "grad_norm": 0.8402108550071716, + "learning_rate": 0.0002, + "loss": 0.7368, + "step": 5510 + }, + { + "epoch": 4.0, + "eval_loss": 1.17229425907135, + "eval_runtime": 46.2554, + "eval_samples_per_second": 9.426, + "eval_steps_per_second": 1.189, + "step": 5510 + } + ], + "logging_steps": 10, + "max_steps": 11016, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.830142384111616e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-5510/training_args.bin b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-5510/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..140bceabdec44fc473842cb9ace4a62a4dc9fe89 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-5510/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4abec5480aff960b5a7b03a48be1cdb1ae29ba8792469c451f5c2f269e4be93b +size 5560 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-6887/README.md b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-6887/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-6887/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-6887/adapter_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-6887/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-6887/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-6887/adapter_model.safetensors b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-6887/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7da450895fa1608a2cc58faccd6e26831db22107 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-6887/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68eaeba0cc63008c5bd7c3bc5aa7bdd8e28a511348c857e109ff24809f57d6ee +size 143153376 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-6887/optimizer.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-6887/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6ec6fee050830f7c9ff4a382547b40ff418b5b0e --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-6887/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0a833460c3dd7d23d953c8af07fb3bf2b05669c63cabaf6fa51acb9f4050be7 +size 72886650 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-6887/rng_state.pth b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-6887/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..ef2f7c31a7db238c7327dc554e587b7ae913704d --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-6887/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84484997bb1fd50f9a979febb3f3a5e33d8908d9d7d1ad2009936d364854484c +size 14244 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-6887/scheduler.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-6887/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..966c1a8c7cc28c297f81d9de5accd07e8d81a61d --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-6887/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c998afeb2f35200deae216475374398ad29855a63f049e215b018de796764f10 +size 1064 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-6887/special_tokens_map.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-6887/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-6887/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-6887/tokenizer.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-6887/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-6887/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-6887/tokenizer.model b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-6887/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-6887/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-6887/tokenizer_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-6887/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-6887/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-6887/trainer_state.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-6887/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1596f14ed1b03d8480a2e4640c4f7cd958261a5b --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-6887/trainer_state.json @@ -0,0 +1,4889 @@ +{ + "best_metric": 1.1086540222167969, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755", + "epoch": 4.999637023593467, + "eval_steps": 10, + "global_step": 6887, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007259528130671506, + "grad_norm": 0.46010470390319824, + "learning_rate": 0.0002, + "loss": 2.4936, + "step": 10 + }, + { + "epoch": 0.014519056261343012, + "grad_norm": 0.5103179216384888, + "learning_rate": 0.0002, + "loss": 1.8442, + "step": 20 + }, + { + "epoch": 0.021778584392014518, + "grad_norm": 0.6282716989517212, + "learning_rate": 0.0002, + "loss": 1.559, + "step": 30 + }, + { + "epoch": 0.029038112522686024, + "grad_norm": 1.2480497360229492, + "learning_rate": 0.0002, + "loss": 1.3618, + "step": 40 + }, + { + "epoch": 0.036297640653357534, + "grad_norm": 0.4114855229854584, + "learning_rate": 0.0002, + "loss": 1.3874, + "step": 50 + }, + { + "epoch": 0.043557168784029036, + "grad_norm": 0.49482840299606323, + "learning_rate": 0.0002, + "loss": 1.2836, + "step": 60 + }, + { + "epoch": 0.050816696914700546, + "grad_norm": 0.4536272883415222, + "learning_rate": 0.0002, + "loss": 1.1728, + "step": 70 + }, + { + "epoch": 0.05807622504537205, + "grad_norm": 0.32328274846076965, + "learning_rate": 0.0002, + "loss": 1.1125, + "step": 80 + }, + { + "epoch": 0.06533575317604355, + "grad_norm": 0.40990468859672546, + "learning_rate": 0.0002, + "loss": 1.2447, + "step": 90 + }, + { + "epoch": 0.07259528130671507, + "grad_norm": 0.37273502349853516, + "learning_rate": 0.0002, + "loss": 1.2216, + "step": 100 + }, + { + "epoch": 0.07985480943738657, + "grad_norm": 0.3903871476650238, + "learning_rate": 0.0002, + "loss": 1.2259, + "step": 110 + }, + { + "epoch": 0.08711433756805807, + "grad_norm": 0.3272787928581238, + "learning_rate": 0.0002, + "loss": 1.3718, + "step": 120 + }, + { + "epoch": 0.09437386569872959, + "grad_norm": 0.3622824251651764, + "learning_rate": 0.0002, + "loss": 1.1832, + "step": 130 + }, + { + "epoch": 0.10163339382940109, + "grad_norm": 0.3503916561603546, + "learning_rate": 0.0002, + "loss": 1.2966, + "step": 140 + }, + { + "epoch": 0.1088929219600726, + "grad_norm": 0.32787832617759705, + "learning_rate": 0.0002, + "loss": 1.0913, + "step": 150 + }, + { + "epoch": 0.1161524500907441, + "grad_norm": 0.5822657942771912, + "learning_rate": 0.0002, + "loss": 1.2143, + "step": 160 + }, + { + "epoch": 0.12341197822141561, + "grad_norm": 0.28028249740600586, + "learning_rate": 0.0002, + "loss": 1.2091, + "step": 170 + }, + { + "epoch": 0.1306715063520871, + "grad_norm": 0.33602750301361084, + "learning_rate": 0.0002, + "loss": 1.1731, + "step": 180 + }, + { + "epoch": 0.13793103448275862, + "grad_norm": 0.29106274247169495, + "learning_rate": 0.0002, + "loss": 1.127, + "step": 190 + }, + { + "epoch": 0.14519056261343014, + "grad_norm": 0.38753363490104675, + "learning_rate": 0.0002, + "loss": 1.3261, + "step": 200 + }, + { + "epoch": 0.15245009074410162, + "grad_norm": 0.361009418964386, + "learning_rate": 0.0002, + "loss": 1.1536, + "step": 210 + }, + { + "epoch": 0.15970961887477314, + "grad_norm": 0.6743836402893066, + "learning_rate": 0.0002, + "loss": 1.1483, + "step": 220 + }, + { + "epoch": 0.16696914700544466, + "grad_norm": 0.3923613727092743, + "learning_rate": 0.0002, + "loss": 1.264, + "step": 230 + }, + { + "epoch": 0.17422867513611615, + "grad_norm": 0.2809699773788452, + "learning_rate": 0.0002, + "loss": 1.0437, + "step": 240 + }, + { + "epoch": 0.18148820326678766, + "grad_norm": 0.3631494641304016, + "learning_rate": 0.0002, + "loss": 1.1826, + "step": 250 + }, + { + "epoch": 0.18874773139745918, + "grad_norm": 0.24658171832561493, + "learning_rate": 0.0002, + "loss": 1.1584, + "step": 260 + }, + { + "epoch": 0.19600725952813067, + "grad_norm": 0.5780664682388306, + "learning_rate": 0.0002, + "loss": 1.0065, + "step": 270 + }, + { + "epoch": 0.20326678765880218, + "grad_norm": 0.3056720495223999, + "learning_rate": 0.0002, + "loss": 1.1251, + "step": 280 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 0.2857084572315216, + "learning_rate": 0.0002, + "loss": 1.1503, + "step": 290 + }, + { + "epoch": 0.2177858439201452, + "grad_norm": 0.6645345687866211, + "learning_rate": 0.0002, + "loss": 1.1215, + "step": 300 + }, + { + "epoch": 0.2250453720508167, + "grad_norm": 0.5966078639030457, + "learning_rate": 0.0002, + "loss": 1.127, + "step": 310 + }, + { + "epoch": 0.2323049001814882, + "grad_norm": 0.40937140583992004, + "learning_rate": 0.0002, + "loss": 1.1974, + "step": 320 + }, + { + "epoch": 0.2395644283121597, + "grad_norm": 0.5642806887626648, + "learning_rate": 0.0002, + "loss": 1.2385, + "step": 330 + }, + { + "epoch": 0.24682395644283123, + "grad_norm": 0.2750748097896576, + "learning_rate": 0.0002, + "loss": 1.1703, + "step": 340 + }, + { + "epoch": 0.2540834845735027, + "grad_norm": 0.34350234270095825, + "learning_rate": 0.0002, + "loss": 1.2598, + "step": 350 + }, + { + "epoch": 0.2613430127041742, + "grad_norm": 0.6767239570617676, + "learning_rate": 0.0002, + "loss": 1.1942, + "step": 360 + }, + { + "epoch": 0.26860254083484575, + "grad_norm": 0.31006959080696106, + "learning_rate": 0.0002, + "loss": 1.1436, + "step": 370 + }, + { + "epoch": 0.27586206896551724, + "grad_norm": 0.3825474679470062, + "learning_rate": 0.0002, + "loss": 1.2084, + "step": 380 + }, + { + "epoch": 0.2831215970961887, + "grad_norm": 0.30241551995277405, + "learning_rate": 0.0002, + "loss": 1.1523, + "step": 390 + }, + { + "epoch": 0.29038112522686027, + "grad_norm": 0.2962397336959839, + "learning_rate": 0.0002, + "loss": 1.1298, + "step": 400 + }, + { + "epoch": 0.29764065335753176, + "grad_norm": 0.2600369155406952, + "learning_rate": 0.0002, + "loss": 1.1299, + "step": 410 + }, + { + "epoch": 0.30490018148820325, + "grad_norm": 0.3675060272216797, + "learning_rate": 0.0002, + "loss": 1.1366, + "step": 420 + }, + { + "epoch": 0.3121597096188748, + "grad_norm": 0.3429498076438904, + "learning_rate": 0.0002, + "loss": 1.156, + "step": 430 + }, + { + "epoch": 0.3194192377495463, + "grad_norm": 0.34311825037002563, + "learning_rate": 0.0002, + "loss": 1.2741, + "step": 440 + }, + { + "epoch": 0.32667876588021777, + "grad_norm": 0.37872210144996643, + "learning_rate": 0.0002, + "loss": 1.3523, + "step": 450 + }, + { + "epoch": 0.3339382940108893, + "grad_norm": 0.33271121978759766, + "learning_rate": 0.0002, + "loss": 1.1365, + "step": 460 + }, + { + "epoch": 0.3411978221415608, + "grad_norm": 0.34605276584625244, + "learning_rate": 0.0002, + "loss": 1.1572, + "step": 470 + }, + { + "epoch": 0.3484573502722323, + "grad_norm": 0.41050562262535095, + "learning_rate": 0.0002, + "loss": 1.2755, + "step": 480 + }, + { + "epoch": 0.35571687840290384, + "grad_norm": 0.2066836953163147, + "learning_rate": 0.0002, + "loss": 1.075, + "step": 490 + }, + { + "epoch": 0.3629764065335753, + "grad_norm": 0.2859014868736267, + "learning_rate": 0.0002, + "loss": 1.2072, + "step": 500 + }, + { + "epoch": 0.3702359346642468, + "grad_norm": 0.28763777017593384, + "learning_rate": 0.0002, + "loss": 1.1435, + "step": 510 + }, + { + "epoch": 0.37749546279491836, + "grad_norm": 0.2730471193790436, + "learning_rate": 0.0002, + "loss": 1.1341, + "step": 520 + }, + { + "epoch": 0.38475499092558985, + "grad_norm": 0.3968936800956726, + "learning_rate": 0.0002, + "loss": 1.348, + "step": 530 + }, + { + "epoch": 0.39201451905626133, + "grad_norm": 0.3624701201915741, + "learning_rate": 0.0002, + "loss": 1.2363, + "step": 540 + }, + { + "epoch": 0.3992740471869328, + "grad_norm": 0.3303608298301697, + "learning_rate": 0.0002, + "loss": 1.1206, + "step": 550 + }, + { + "epoch": 0.40653357531760437, + "grad_norm": 0.33507466316223145, + "learning_rate": 0.0002, + "loss": 1.203, + "step": 560 + }, + { + "epoch": 0.41379310344827586, + "grad_norm": 0.3297670781612396, + "learning_rate": 0.0002, + "loss": 1.2294, + "step": 570 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 0.32334890961647034, + "learning_rate": 0.0002, + "loss": 1.2, + "step": 580 + }, + { + "epoch": 0.4283121597096189, + "grad_norm": 0.30281195044517517, + "learning_rate": 0.0002, + "loss": 1.112, + "step": 590 + }, + { + "epoch": 0.4355716878402904, + "grad_norm": 0.5900027751922607, + "learning_rate": 0.0002, + "loss": 1.1352, + "step": 600 + }, + { + "epoch": 0.44283121597096187, + "grad_norm": 0.28437477350234985, + "learning_rate": 0.0002, + "loss": 1.1575, + "step": 610 + }, + { + "epoch": 0.4500907441016334, + "grad_norm": 0.39601704478263855, + "learning_rate": 0.0002, + "loss": 1.2009, + "step": 620 + }, + { + "epoch": 0.4573502722323049, + "grad_norm": 0.41971510648727417, + "learning_rate": 0.0002, + "loss": 1.2302, + "step": 630 + }, + { + "epoch": 0.4646098003629764, + "grad_norm": 0.33814409375190735, + "learning_rate": 0.0002, + "loss": 1.236, + "step": 640 + }, + { + "epoch": 0.47186932849364793, + "grad_norm": 0.575718104839325, + "learning_rate": 0.0002, + "loss": 1.2471, + "step": 650 + }, + { + "epoch": 0.4791288566243194, + "grad_norm": 0.37927401065826416, + "learning_rate": 0.0002, + "loss": 1.1012, + "step": 660 + }, + { + "epoch": 0.4863883847549909, + "grad_norm": 0.3224332630634308, + "learning_rate": 0.0002, + "loss": 1.1552, + "step": 670 + }, + { + "epoch": 0.49364791288566245, + "grad_norm": 0.32683515548706055, + "learning_rate": 0.0002, + "loss": 1.1354, + "step": 680 + }, + { + "epoch": 0.5009074410163339, + "grad_norm": 0.4316163659095764, + "learning_rate": 0.0002, + "loss": 1.207, + "step": 690 + }, + { + "epoch": 0.5081669691470054, + "grad_norm": 0.342602401971817, + "learning_rate": 0.0002, + "loss": 1.2116, + "step": 700 + }, + { + "epoch": 0.515426497277677, + "grad_norm": 0.2794898748397827, + "learning_rate": 0.0002, + "loss": 1.0433, + "step": 710 + }, + { + "epoch": 0.5226860254083484, + "grad_norm": 0.3322339951992035, + "learning_rate": 0.0002, + "loss": 1.2395, + "step": 720 + }, + { + "epoch": 0.52994555353902, + "grad_norm": 0.3088509142398834, + "learning_rate": 0.0002, + "loss": 1.0762, + "step": 730 + }, + { + "epoch": 0.5372050816696915, + "grad_norm": 0.24444378912448883, + "learning_rate": 0.0002, + "loss": 1.1495, + "step": 740 + }, + { + "epoch": 0.5444646098003629, + "grad_norm": 0.3483171761035919, + "learning_rate": 0.0002, + "loss": 1.2176, + "step": 750 + }, + { + "epoch": 0.5517241379310345, + "grad_norm": 0.3425690531730652, + "learning_rate": 0.0002, + "loss": 1.2248, + "step": 760 + }, + { + "epoch": 0.558983666061706, + "grad_norm": 0.31841927766799927, + "learning_rate": 0.0002, + "loss": 1.1127, + "step": 770 + }, + { + "epoch": 0.5662431941923775, + "grad_norm": 0.39423868060112, + "learning_rate": 0.0002, + "loss": 1.1841, + "step": 780 + }, + { + "epoch": 0.573502722323049, + "grad_norm": 0.30328479409217834, + "learning_rate": 0.0002, + "loss": 1.0583, + "step": 790 + }, + { + "epoch": 0.5807622504537205, + "grad_norm": 0.24475938081741333, + "learning_rate": 0.0002, + "loss": 1.0718, + "step": 800 + }, + { + "epoch": 0.588021778584392, + "grad_norm": 0.37132805585861206, + "learning_rate": 0.0002, + "loss": 1.2006, + "step": 810 + }, + { + "epoch": 0.5952813067150635, + "grad_norm": 0.32195979356765747, + "learning_rate": 0.0002, + "loss": 1.1544, + "step": 820 + }, + { + "epoch": 0.6025408348457351, + "grad_norm": 0.2848738729953766, + "learning_rate": 0.0002, + "loss": 0.9937, + "step": 830 + }, + { + "epoch": 0.6098003629764065, + "grad_norm": 0.28015264868736267, + "learning_rate": 0.0002, + "loss": 1.0937, + "step": 840 + }, + { + "epoch": 0.617059891107078, + "grad_norm": 0.37796008586883545, + "learning_rate": 0.0002, + "loss": 1.1234, + "step": 850 + }, + { + "epoch": 0.6243194192377496, + "grad_norm": 0.39311841130256653, + "learning_rate": 0.0002, + "loss": 1.1455, + "step": 860 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 0.2761685252189636, + "learning_rate": 0.0002, + "loss": 1.1066, + "step": 870 + }, + { + "epoch": 0.6388384754990926, + "grad_norm": 0.3826720118522644, + "learning_rate": 0.0002, + "loss": 1.0859, + "step": 880 + }, + { + "epoch": 0.6460980036297641, + "grad_norm": 0.30076679587364197, + "learning_rate": 0.0002, + "loss": 1.1182, + "step": 890 + }, + { + "epoch": 0.6533575317604355, + "grad_norm": 0.21997687220573425, + "learning_rate": 0.0002, + "loss": 1.0927, + "step": 900 + }, + { + "epoch": 0.6606170598911071, + "grad_norm": 0.32593777775764465, + "learning_rate": 0.0002, + "loss": 1.1603, + "step": 910 + }, + { + "epoch": 0.6678765880217786, + "grad_norm": 0.30347898602485657, + "learning_rate": 0.0002, + "loss": 1.1578, + "step": 920 + }, + { + "epoch": 0.6751361161524501, + "grad_norm": 0.44173774123191833, + "learning_rate": 0.0002, + "loss": 1.1642, + "step": 930 + }, + { + "epoch": 0.6823956442831216, + "grad_norm": 0.2507467269897461, + "learning_rate": 0.0002, + "loss": 1.1832, + "step": 940 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 0.29463833570480347, + "learning_rate": 0.0002, + "loss": 1.3001, + "step": 950 + }, + { + "epoch": 0.6969147005444646, + "grad_norm": 0.9363154172897339, + "learning_rate": 0.0002, + "loss": 1.1651, + "step": 960 + }, + { + "epoch": 0.7041742286751361, + "grad_norm": 0.3236212134361267, + "learning_rate": 0.0002, + "loss": 1.158, + "step": 970 + }, + { + "epoch": 0.7114337568058077, + "grad_norm": 0.3123254179954529, + "learning_rate": 0.0002, + "loss": 1.1078, + "step": 980 + }, + { + "epoch": 0.7186932849364791, + "grad_norm": 0.3395805060863495, + "learning_rate": 0.0002, + "loss": 1.1578, + "step": 990 + }, + { + "epoch": 0.7259528130671506, + "grad_norm": 0.3240964412689209, + "learning_rate": 0.0002, + "loss": 1.0754, + "step": 1000 + }, + { + "epoch": 0.7332123411978222, + "grad_norm": 0.31902948021888733, + "learning_rate": 0.0002, + "loss": 1.0313, + "step": 1010 + }, + { + "epoch": 0.7404718693284936, + "grad_norm": 0.4848408102989197, + "learning_rate": 0.0002, + "loss": 1.1664, + "step": 1020 + }, + { + "epoch": 0.7477313974591652, + "grad_norm": 0.33006033301353455, + "learning_rate": 0.0002, + "loss": 1.0838, + "step": 1030 + }, + { + "epoch": 0.7549909255898367, + "grad_norm": 0.2928730547428131, + "learning_rate": 0.0002, + "loss": 1.0979, + "step": 1040 + }, + { + "epoch": 0.7622504537205081, + "grad_norm": 0.3529164791107178, + "learning_rate": 0.0002, + "loss": 1.1351, + "step": 1050 + }, + { + "epoch": 0.7695099818511797, + "grad_norm": 0.2736213803291321, + "learning_rate": 0.0002, + "loss": 1.1274, + "step": 1060 + }, + { + "epoch": 0.7767695099818511, + "grad_norm": 0.7200686931610107, + "learning_rate": 0.0002, + "loss": 1.1169, + "step": 1070 + }, + { + "epoch": 0.7840290381125227, + "grad_norm": 0.33396708965301514, + "learning_rate": 0.0002, + "loss": 1.1171, + "step": 1080 + }, + { + "epoch": 0.7912885662431942, + "grad_norm": 1.5760449171066284, + "learning_rate": 0.0002, + "loss": 1.2447, + "step": 1090 + }, + { + "epoch": 0.7985480943738656, + "grad_norm": 0.28138381242752075, + "learning_rate": 0.0002, + "loss": 1.1126, + "step": 1100 + }, + { + "epoch": 0.8058076225045372, + "grad_norm": 0.2597472369670868, + "learning_rate": 0.0002, + "loss": 1.2012, + "step": 1110 + }, + { + "epoch": 0.8130671506352087, + "grad_norm": 0.3305445611476898, + "learning_rate": 0.0002, + "loss": 1.1177, + "step": 1120 + }, + { + "epoch": 0.8203266787658802, + "grad_norm": 0.3934599459171295, + "learning_rate": 0.0002, + "loss": 1.0849, + "step": 1130 + }, + { + "epoch": 0.8275862068965517, + "grad_norm": 0.3472191393375397, + "learning_rate": 0.0002, + "loss": 1.0889, + "step": 1140 + }, + { + "epoch": 0.8348457350272233, + "grad_norm": 0.2857365906238556, + "learning_rate": 0.0002, + "loss": 1.265, + "step": 1150 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.3207702934741974, + "learning_rate": 0.0002, + "loss": 1.03, + "step": 1160 + }, + { + "epoch": 0.8493647912885662, + "grad_norm": 0.3176484704017639, + "learning_rate": 0.0002, + "loss": 1.1669, + "step": 1170 + }, + { + "epoch": 0.8566243194192378, + "grad_norm": 0.40685558319091797, + "learning_rate": 0.0002, + "loss": 1.1386, + "step": 1180 + }, + { + "epoch": 0.8638838475499092, + "grad_norm": 0.31125199794769287, + "learning_rate": 0.0002, + "loss": 1.1383, + "step": 1190 + }, + { + "epoch": 0.8711433756805808, + "grad_norm": 0.7361181378364563, + "learning_rate": 0.0002, + "loss": 1.1512, + "step": 1200 + }, + { + "epoch": 0.8784029038112523, + "grad_norm": 0.33699527382850647, + "learning_rate": 0.0002, + "loss": 0.9983, + "step": 1210 + }, + { + "epoch": 0.8856624319419237, + "grad_norm": 0.3315220773220062, + "learning_rate": 0.0002, + "loss": 1.11, + "step": 1220 + }, + { + "epoch": 0.8929219600725953, + "grad_norm": 0.6256054043769836, + "learning_rate": 0.0002, + "loss": 1.0266, + "step": 1230 + }, + { + "epoch": 0.9001814882032668, + "grad_norm": 0.3692137897014618, + "learning_rate": 0.0002, + "loss": 1.1738, + "step": 1240 + }, + { + "epoch": 0.9074410163339383, + "grad_norm": 0.3538484573364258, + "learning_rate": 0.0002, + "loss": 1.1112, + "step": 1250 + }, + { + "epoch": 0.9147005444646098, + "grad_norm": 0.27863040566444397, + "learning_rate": 0.0002, + "loss": 1.0878, + "step": 1260 + }, + { + "epoch": 0.9219600725952813, + "grad_norm": 0.3322528302669525, + "learning_rate": 0.0002, + "loss": 1.0826, + "step": 1270 + }, + { + "epoch": 0.9292196007259528, + "grad_norm": 0.7553173303604126, + "learning_rate": 0.0002, + "loss": 1.0158, + "step": 1280 + }, + { + "epoch": 0.9364791288566243, + "grad_norm": 0.4856191575527191, + "learning_rate": 0.0002, + "loss": 1.198, + "step": 1290 + }, + { + "epoch": 0.9437386569872959, + "grad_norm": 0.3668074905872345, + "learning_rate": 0.0002, + "loss": 1.1678, + "step": 1300 + }, + { + "epoch": 0.9509981851179673, + "grad_norm": 0.29851067066192627, + "learning_rate": 0.0002, + "loss": 1.1809, + "step": 1310 + }, + { + "epoch": 0.9582577132486388, + "grad_norm": 0.276664674282074, + "learning_rate": 0.0002, + "loss": 1.1299, + "step": 1320 + }, + { + "epoch": 0.9655172413793104, + "grad_norm": 0.2941018342971802, + "learning_rate": 0.0002, + "loss": 1.0597, + "step": 1330 + }, + { + "epoch": 0.9727767695099818, + "grad_norm": 0.3505859076976776, + "learning_rate": 0.0002, + "loss": 1.1731, + "step": 1340 + }, + { + "epoch": 0.9800362976406534, + "grad_norm": 0.3067687451839447, + "learning_rate": 0.0002, + "loss": 1.1809, + "step": 1350 + }, + { + "epoch": 0.9872958257713249, + "grad_norm": 0.27151066064834595, + "learning_rate": 0.0002, + "loss": 1.1511, + "step": 1360 + }, + { + "epoch": 0.9945553539019963, + "grad_norm": 0.36370083689689636, + "learning_rate": 0.0002, + "loss": 1.1919, + "step": 1370 + }, + { + "epoch": 0.9996370235934664, + "eval_loss": 1.1381088495254517, + "eval_runtime": 96.0848, + "eval_samples_per_second": 4.538, + "eval_steps_per_second": 0.572, + "step": 1377 + }, + { + "epoch": 1.0018148820326678, + "grad_norm": 0.27980583906173706, + "learning_rate": 0.0002, + "loss": 1.0082, + "step": 1380 + }, + { + "epoch": 1.0090744101633393, + "grad_norm": 0.26713913679122925, + "learning_rate": 0.0002, + "loss": 1.0565, + "step": 1390 + }, + { + "epoch": 1.0163339382940109, + "grad_norm": 0.3089541494846344, + "learning_rate": 0.0002, + "loss": 1.0241, + "step": 1400 + }, + { + "epoch": 1.0235934664246824, + "grad_norm": 0.4188242256641388, + "learning_rate": 0.0002, + "loss": 0.9649, + "step": 1410 + }, + { + "epoch": 1.030852994555354, + "grad_norm": 0.5246463418006897, + "learning_rate": 0.0002, + "loss": 0.9821, + "step": 1420 + }, + { + "epoch": 1.0381125226860255, + "grad_norm": 0.2728777825832367, + "learning_rate": 0.0002, + "loss": 1.0143, + "step": 1430 + }, + { + "epoch": 1.0453720508166968, + "grad_norm": 0.38167616724967957, + "learning_rate": 0.0002, + "loss": 1.075, + "step": 1440 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 0.4439380168914795, + "learning_rate": 0.0002, + "loss": 1.0229, + "step": 1450 + }, + { + "epoch": 1.05989110707804, + "grad_norm": 0.30954182147979736, + "learning_rate": 0.0002, + "loss": 1.0451, + "step": 1460 + }, + { + "epoch": 1.0671506352087115, + "grad_norm": 0.4022280275821686, + "learning_rate": 0.0002, + "loss": 1.0762, + "step": 1470 + }, + { + "epoch": 1.074410163339383, + "grad_norm": 0.7390811443328857, + "learning_rate": 0.0002, + "loss": 1.0754, + "step": 1480 + }, + { + "epoch": 1.0816696914700545, + "grad_norm": 0.3885486423969269, + "learning_rate": 0.0002, + "loss": 1.1378, + "step": 1490 + }, + { + "epoch": 1.0889292196007259, + "grad_norm": 0.5275560617446899, + "learning_rate": 0.0002, + "loss": 1.005, + "step": 1500 + }, + { + "epoch": 1.0961887477313974, + "grad_norm": 0.35112282633781433, + "learning_rate": 0.0002, + "loss": 1.0147, + "step": 1510 + }, + { + "epoch": 1.103448275862069, + "grad_norm": 0.33714351058006287, + "learning_rate": 0.0002, + "loss": 1.0331, + "step": 1520 + }, + { + "epoch": 1.1107078039927405, + "grad_norm": 0.31221693754196167, + "learning_rate": 0.0002, + "loss": 0.8723, + "step": 1530 + }, + { + "epoch": 1.117967332123412, + "grad_norm": 0.27549654245376587, + "learning_rate": 0.0002, + "loss": 1.0701, + "step": 1540 + }, + { + "epoch": 1.1252268602540836, + "grad_norm": 0.8465521335601807, + "learning_rate": 0.0002, + "loss": 1.0114, + "step": 1550 + }, + { + "epoch": 1.132486388384755, + "grad_norm": 0.36125949025154114, + "learning_rate": 0.0002, + "loss": 0.9643, + "step": 1560 + }, + { + "epoch": 1.1397459165154264, + "grad_norm": 0.37420371174812317, + "learning_rate": 0.0002, + "loss": 1.0744, + "step": 1570 + }, + { + "epoch": 1.147005444646098, + "grad_norm": 0.3294760584831238, + "learning_rate": 0.0002, + "loss": 1.165, + "step": 1580 + }, + { + "epoch": 1.1542649727767695, + "grad_norm": 0.3881238102912903, + "learning_rate": 0.0002, + "loss": 0.9207, + "step": 1590 + }, + { + "epoch": 1.161524500907441, + "grad_norm": 0.4766491651535034, + "learning_rate": 0.0002, + "loss": 1.0983, + "step": 1600 + }, + { + "epoch": 1.1687840290381124, + "grad_norm": 0.395530104637146, + "learning_rate": 0.0002, + "loss": 1.0092, + "step": 1610 + }, + { + "epoch": 1.176043557168784, + "grad_norm": 0.3297106623649597, + "learning_rate": 0.0002, + "loss": 0.9895, + "step": 1620 + }, + { + "epoch": 1.1833030852994555, + "grad_norm": 0.39528271555900574, + "learning_rate": 0.0002, + "loss": 1.1121, + "step": 1630 + }, + { + "epoch": 1.190562613430127, + "grad_norm": 0.3370221257209778, + "learning_rate": 0.0002, + "loss": 0.9202, + "step": 1640 + }, + { + "epoch": 1.1978221415607986, + "grad_norm": 0.31922030448913574, + "learning_rate": 0.0002, + "loss": 1.102, + "step": 1650 + }, + { + "epoch": 1.2050816696914701, + "grad_norm": 0.6142027378082275, + "learning_rate": 0.0002, + "loss": 1.0565, + "step": 1660 + }, + { + "epoch": 1.2123411978221417, + "grad_norm": 0.44769710302352905, + "learning_rate": 0.0002, + "loss": 0.9541, + "step": 1670 + }, + { + "epoch": 1.219600725952813, + "grad_norm": 0.41233646869659424, + "learning_rate": 0.0002, + "loss": 1.2501, + "step": 1680 + }, + { + "epoch": 1.2268602540834845, + "grad_norm": 0.2928866147994995, + "learning_rate": 0.0002, + "loss": 1.0147, + "step": 1690 + }, + { + "epoch": 1.234119782214156, + "grad_norm": 0.36913734674453735, + "learning_rate": 0.0002, + "loss": 0.9074, + "step": 1700 + }, + { + "epoch": 1.2413793103448276, + "grad_norm": 0.5281891226768494, + "learning_rate": 0.0002, + "loss": 1.0847, + "step": 1710 + }, + { + "epoch": 1.2486388384754992, + "grad_norm": 0.3374697268009186, + "learning_rate": 0.0002, + "loss": 0.9482, + "step": 1720 + }, + { + "epoch": 1.2558983666061705, + "grad_norm": 0.3802020847797394, + "learning_rate": 0.0002, + "loss": 0.9826, + "step": 1730 + }, + { + "epoch": 1.263157894736842, + "grad_norm": 0.38048651814460754, + "learning_rate": 0.0002, + "loss": 1.0191, + "step": 1740 + }, + { + "epoch": 1.2704174228675136, + "grad_norm": 0.6676169633865356, + "learning_rate": 0.0002, + "loss": 1.0142, + "step": 1750 + }, + { + "epoch": 1.2776769509981851, + "grad_norm": 0.4075961410999298, + "learning_rate": 0.0002, + "loss": 0.992, + "step": 1760 + }, + { + "epoch": 1.2849364791288567, + "grad_norm": 0.4374721348285675, + "learning_rate": 0.0002, + "loss": 1.0301, + "step": 1770 + }, + { + "epoch": 1.2921960072595282, + "grad_norm": 0.4638824164867401, + "learning_rate": 0.0002, + "loss": 1.0601, + "step": 1780 + }, + { + "epoch": 1.2994555353901998, + "grad_norm": 0.38631564378738403, + "learning_rate": 0.0002, + "loss": 0.9593, + "step": 1790 + }, + { + "epoch": 1.306715063520871, + "grad_norm": 0.35873809456825256, + "learning_rate": 0.0002, + "loss": 1.0042, + "step": 1800 + }, + { + "epoch": 1.3139745916515426, + "grad_norm": 0.33602237701416016, + "learning_rate": 0.0002, + "loss": 0.9835, + "step": 1810 + }, + { + "epoch": 1.3212341197822142, + "grad_norm": 0.46696463227272034, + "learning_rate": 0.0002, + "loss": 1.1032, + "step": 1820 + }, + { + "epoch": 1.3284936479128857, + "grad_norm": 0.368958979845047, + "learning_rate": 0.0002, + "loss": 1.0123, + "step": 1830 + }, + { + "epoch": 1.335753176043557, + "grad_norm": 0.3160957396030426, + "learning_rate": 0.0002, + "loss": 1.0456, + "step": 1840 + }, + { + "epoch": 1.3430127041742286, + "grad_norm": 0.4511511027812958, + "learning_rate": 0.0002, + "loss": 1.0264, + "step": 1850 + }, + { + "epoch": 1.3502722323049001, + "grad_norm": 0.6769845485687256, + "learning_rate": 0.0002, + "loss": 1.0179, + "step": 1860 + }, + { + "epoch": 1.3575317604355717, + "grad_norm": 0.3749309480190277, + "learning_rate": 0.0002, + "loss": 1.011, + "step": 1870 + }, + { + "epoch": 1.3647912885662432, + "grad_norm": 0.553798496723175, + "learning_rate": 0.0002, + "loss": 1.0223, + "step": 1880 + }, + { + "epoch": 1.3720508166969148, + "grad_norm": 0.3538985550403595, + "learning_rate": 0.0002, + "loss": 0.9634, + "step": 1890 + }, + { + "epoch": 1.3793103448275863, + "grad_norm": 0.5501534938812256, + "learning_rate": 0.0002, + "loss": 1.0275, + "step": 1900 + }, + { + "epoch": 1.3865698729582578, + "grad_norm": 0.4432051181793213, + "learning_rate": 0.0002, + "loss": 1.0461, + "step": 1910 + }, + { + "epoch": 1.3938294010889292, + "grad_norm": 0.41755786538124084, + "learning_rate": 0.0002, + "loss": 0.9082, + "step": 1920 + }, + { + "epoch": 1.4010889292196007, + "grad_norm": 0.5626114010810852, + "learning_rate": 0.0002, + "loss": 1.0858, + "step": 1930 + }, + { + "epoch": 1.4083484573502723, + "grad_norm": 0.44038185477256775, + "learning_rate": 0.0002, + "loss": 1.0687, + "step": 1940 + }, + { + "epoch": 1.4156079854809438, + "grad_norm": 0.3397001624107361, + "learning_rate": 0.0002, + "loss": 0.9454, + "step": 1950 + }, + { + "epoch": 1.4228675136116151, + "grad_norm": 0.4325368106365204, + "learning_rate": 0.0002, + "loss": 0.943, + "step": 1960 + }, + { + "epoch": 1.4301270417422867, + "grad_norm": 0.3900907039642334, + "learning_rate": 0.0002, + "loss": 1.0488, + "step": 1970 + }, + { + "epoch": 1.4373865698729582, + "grad_norm": 0.369612455368042, + "learning_rate": 0.0002, + "loss": 0.9699, + "step": 1980 + }, + { + "epoch": 1.4446460980036298, + "grad_norm": 0.4389338791370392, + "learning_rate": 0.0002, + "loss": 1.0609, + "step": 1990 + }, + { + "epoch": 1.4519056261343013, + "grad_norm": 1.694450855255127, + "learning_rate": 0.0002, + "loss": 1.042, + "step": 2000 + }, + { + "epoch": 1.4591651542649728, + "grad_norm": 0.516957700252533, + "learning_rate": 0.0002, + "loss": 0.997, + "step": 2010 + }, + { + "epoch": 1.4664246823956444, + "grad_norm": 0.45515501499176025, + "learning_rate": 0.0002, + "loss": 1.0979, + "step": 2020 + }, + { + "epoch": 1.4736842105263157, + "grad_norm": 0.4153139591217041, + "learning_rate": 0.0002, + "loss": 1.0188, + "step": 2030 + }, + { + "epoch": 1.4809437386569873, + "grad_norm": 0.44353052973747253, + "learning_rate": 0.0002, + "loss": 1.1521, + "step": 2040 + }, + { + "epoch": 1.4882032667876588, + "grad_norm": 0.570554256439209, + "learning_rate": 0.0002, + "loss": 0.9653, + "step": 2050 + }, + { + "epoch": 1.4954627949183303, + "grad_norm": 0.5742740035057068, + "learning_rate": 0.0002, + "loss": 1.0352, + "step": 2060 + }, + { + "epoch": 1.5027223230490017, + "grad_norm": 0.5890517830848694, + "learning_rate": 0.0002, + "loss": 1.0963, + "step": 2070 + }, + { + "epoch": 1.5099818511796732, + "grad_norm": 0.4162650406360626, + "learning_rate": 0.0002, + "loss": 1.0639, + "step": 2080 + }, + { + "epoch": 1.5172413793103448, + "grad_norm": 0.7334717512130737, + "learning_rate": 0.0002, + "loss": 1.071, + "step": 2090 + }, + { + "epoch": 1.5245009074410163, + "grad_norm": 0.2963249683380127, + "learning_rate": 0.0002, + "loss": 0.8957, + "step": 2100 + }, + { + "epoch": 1.5317604355716878, + "grad_norm": 0.30676454305648804, + "learning_rate": 0.0002, + "loss": 1.0446, + "step": 2110 + }, + { + "epoch": 1.5390199637023594, + "grad_norm": 0.35984641313552856, + "learning_rate": 0.0002, + "loss": 0.9967, + "step": 2120 + }, + { + "epoch": 1.546279491833031, + "grad_norm": 0.3384549617767334, + "learning_rate": 0.0002, + "loss": 1.0352, + "step": 2130 + }, + { + "epoch": 1.5535390199637025, + "grad_norm": 0.4725518226623535, + "learning_rate": 0.0002, + "loss": 0.9874, + "step": 2140 + }, + { + "epoch": 1.560798548094374, + "grad_norm": 0.3252685070037842, + "learning_rate": 0.0002, + "loss": 1.1166, + "step": 2150 + }, + { + "epoch": 1.5680580762250453, + "grad_norm": 0.45043081045150757, + "learning_rate": 0.0002, + "loss": 0.9127, + "step": 2160 + }, + { + "epoch": 1.5753176043557169, + "grad_norm": 0.374208003282547, + "learning_rate": 0.0002, + "loss": 1.0767, + "step": 2170 + }, + { + "epoch": 1.5825771324863884, + "grad_norm": 0.5118404030799866, + "learning_rate": 0.0002, + "loss": 1.0255, + "step": 2180 + }, + { + "epoch": 1.5898366606170597, + "grad_norm": 0.482170969247818, + "learning_rate": 0.0002, + "loss": 1.0062, + "step": 2190 + }, + { + "epoch": 1.5970961887477313, + "grad_norm": 0.5337533950805664, + "learning_rate": 0.0002, + "loss": 1.0872, + "step": 2200 + }, + { + "epoch": 1.6043557168784028, + "grad_norm": 0.5195064544677734, + "learning_rate": 0.0002, + "loss": 1.0405, + "step": 2210 + }, + { + "epoch": 1.6116152450090744, + "grad_norm": 0.30807098746299744, + "learning_rate": 0.0002, + "loss": 1.0454, + "step": 2220 + }, + { + "epoch": 1.618874773139746, + "grad_norm": 0.3962925672531128, + "learning_rate": 0.0002, + "loss": 1.0293, + "step": 2230 + }, + { + "epoch": 1.6261343012704175, + "grad_norm": 0.7636962532997131, + "learning_rate": 0.0002, + "loss": 1.0137, + "step": 2240 + }, + { + "epoch": 1.633393829401089, + "grad_norm": 0.32380592823028564, + "learning_rate": 0.0002, + "loss": 0.991, + "step": 2250 + }, + { + "epoch": 1.6406533575317606, + "grad_norm": 0.5767741799354553, + "learning_rate": 0.0002, + "loss": 0.9471, + "step": 2260 + }, + { + "epoch": 1.647912885662432, + "grad_norm": 0.39964812994003296, + "learning_rate": 0.0002, + "loss": 0.9524, + "step": 2270 + }, + { + "epoch": 1.6551724137931034, + "grad_norm": 0.622629702091217, + "learning_rate": 0.0002, + "loss": 0.9866, + "step": 2280 + }, + { + "epoch": 1.662431941923775, + "grad_norm": 0.40202152729034424, + "learning_rate": 0.0002, + "loss": 0.9609, + "step": 2290 + }, + { + "epoch": 1.6696914700544465, + "grad_norm": 0.4467349052429199, + "learning_rate": 0.0002, + "loss": 0.9859, + "step": 2300 + }, + { + "epoch": 1.6769509981851178, + "grad_norm": 0.5026949048042297, + "learning_rate": 0.0002, + "loss": 1.0312, + "step": 2310 + }, + { + "epoch": 1.6842105263157894, + "grad_norm": 0.43754541873931885, + "learning_rate": 0.0002, + "loss": 1.0776, + "step": 2320 + }, + { + "epoch": 1.691470054446461, + "grad_norm": 0.42869430780410767, + "learning_rate": 0.0002, + "loss": 0.9786, + "step": 2330 + }, + { + "epoch": 1.6987295825771325, + "grad_norm": 0.4192679524421692, + "learning_rate": 0.0002, + "loss": 1.0747, + "step": 2340 + }, + { + "epoch": 1.705989110707804, + "grad_norm": 0.3243155777454376, + "learning_rate": 0.0002, + "loss": 1.0054, + "step": 2350 + }, + { + "epoch": 1.7132486388384756, + "grad_norm": 1.0514075756072998, + "learning_rate": 0.0002, + "loss": 0.9165, + "step": 2360 + }, + { + "epoch": 1.720508166969147, + "grad_norm": 0.4719122648239136, + "learning_rate": 0.0002, + "loss": 1.1353, + "step": 2370 + }, + { + "epoch": 1.7277676950998186, + "grad_norm": 0.3846144676208496, + "learning_rate": 0.0002, + "loss": 0.9913, + "step": 2380 + }, + { + "epoch": 1.73502722323049, + "grad_norm": 0.7266581058502197, + "learning_rate": 0.0002, + "loss": 0.9935, + "step": 2390 + }, + { + "epoch": 1.7422867513611615, + "grad_norm": 0.6183241605758667, + "learning_rate": 0.0002, + "loss": 0.9509, + "step": 2400 + }, + { + "epoch": 1.749546279491833, + "grad_norm": 0.3658260405063629, + "learning_rate": 0.0002, + "loss": 0.9713, + "step": 2410 + }, + { + "epoch": 1.7568058076225044, + "grad_norm": 0.6036322712898254, + "learning_rate": 0.0002, + "loss": 1.0033, + "step": 2420 + }, + { + "epoch": 1.764065335753176, + "grad_norm": 0.7872936129570007, + "learning_rate": 0.0002, + "loss": 1.0526, + "step": 2430 + }, + { + "epoch": 1.7713248638838475, + "grad_norm": 0.35946124792099, + "learning_rate": 0.0002, + "loss": 0.99, + "step": 2440 + }, + { + "epoch": 1.778584392014519, + "grad_norm": 0.3740338981151581, + "learning_rate": 0.0002, + "loss": 0.8845, + "step": 2450 + }, + { + "epoch": 1.7858439201451906, + "grad_norm": 0.6150230169296265, + "learning_rate": 0.0002, + "loss": 0.9333, + "step": 2460 + }, + { + "epoch": 1.793103448275862, + "grad_norm": 0.4726075530052185, + "learning_rate": 0.0002, + "loss": 1.0373, + "step": 2470 + }, + { + "epoch": 1.8003629764065336, + "grad_norm": 0.31292253732681274, + "learning_rate": 0.0002, + "loss": 0.974, + "step": 2480 + }, + { + "epoch": 1.8076225045372052, + "grad_norm": 0.4463104009628296, + "learning_rate": 0.0002, + "loss": 1.0205, + "step": 2490 + }, + { + "epoch": 1.8148820326678767, + "grad_norm": 0.7848200798034668, + "learning_rate": 0.0002, + "loss": 1.0771, + "step": 2500 + }, + { + "epoch": 1.822141560798548, + "grad_norm": 0.5562082529067993, + "learning_rate": 0.0002, + "loss": 0.9508, + "step": 2510 + }, + { + "epoch": 1.8294010889292196, + "grad_norm": 0.39892756938934326, + "learning_rate": 0.0002, + "loss": 0.9343, + "step": 2520 + }, + { + "epoch": 1.8366606170598911, + "grad_norm": 1.2923320531845093, + "learning_rate": 0.0002, + "loss": 1.0489, + "step": 2530 + }, + { + "epoch": 1.8439201451905625, + "grad_norm": 0.6316490769386292, + "learning_rate": 0.0002, + "loss": 1.1203, + "step": 2540 + }, + { + "epoch": 1.851179673321234, + "grad_norm": 0.46100497245788574, + "learning_rate": 0.0002, + "loss": 0.9131, + "step": 2550 + }, + { + "epoch": 1.8584392014519056, + "grad_norm": 0.7902987003326416, + "learning_rate": 0.0002, + "loss": 1.0977, + "step": 2560 + }, + { + "epoch": 1.865698729582577, + "grad_norm": 0.4596365690231323, + "learning_rate": 0.0002, + "loss": 0.9702, + "step": 2570 + }, + { + "epoch": 1.8729582577132486, + "grad_norm": 0.6592172384262085, + "learning_rate": 0.0002, + "loss": 0.9322, + "step": 2580 + }, + { + "epoch": 1.8802177858439202, + "grad_norm": 0.5764662027359009, + "learning_rate": 0.0002, + "loss": 0.9549, + "step": 2590 + }, + { + "epoch": 1.8874773139745917, + "grad_norm": 0.8421637415885925, + "learning_rate": 0.0002, + "loss": 0.9485, + "step": 2600 + }, + { + "epoch": 1.8947368421052633, + "grad_norm": 0.5635305047035217, + "learning_rate": 0.0002, + "loss": 1.012, + "step": 2610 + }, + { + "epoch": 1.9019963702359348, + "grad_norm": 0.46231237053871155, + "learning_rate": 0.0002, + "loss": 0.8907, + "step": 2620 + }, + { + "epoch": 1.9092558983666061, + "grad_norm": 0.3944607079029083, + "learning_rate": 0.0002, + "loss": 0.9543, + "step": 2630 + }, + { + "epoch": 1.9165154264972777, + "grad_norm": 0.4753907322883606, + "learning_rate": 0.0002, + "loss": 0.9964, + "step": 2640 + }, + { + "epoch": 1.9237749546279492, + "grad_norm": 0.4151090979576111, + "learning_rate": 0.0002, + "loss": 1.0217, + "step": 2650 + }, + { + "epoch": 1.9310344827586206, + "grad_norm": 0.6793725490570068, + "learning_rate": 0.0002, + "loss": 1.0397, + "step": 2660 + }, + { + "epoch": 1.938294010889292, + "grad_norm": 0.339755117893219, + "learning_rate": 0.0002, + "loss": 1.0401, + "step": 2670 + }, + { + "epoch": 1.9455535390199636, + "grad_norm": 0.40789374709129333, + "learning_rate": 0.0002, + "loss": 1.082, + "step": 2680 + }, + { + "epoch": 1.9528130671506352, + "grad_norm": 0.3750005066394806, + "learning_rate": 0.0002, + "loss": 0.9973, + "step": 2690 + }, + { + "epoch": 1.9600725952813067, + "grad_norm": 0.39684441685676575, + "learning_rate": 0.0002, + "loss": 0.9837, + "step": 2700 + }, + { + "epoch": 1.9673321234119783, + "grad_norm": 0.378287672996521, + "learning_rate": 0.0002, + "loss": 1.1204, + "step": 2710 + }, + { + "epoch": 1.9745916515426498, + "grad_norm": 0.3668482005596161, + "learning_rate": 0.0002, + "loss": 0.9284, + "step": 2720 + }, + { + "epoch": 1.9818511796733214, + "grad_norm": 0.49997565150260925, + "learning_rate": 0.0002, + "loss": 1.0512, + "step": 2730 + }, + { + "epoch": 1.989110707803993, + "grad_norm": 0.36852124333381653, + "learning_rate": 0.0002, + "loss": 0.9311, + "step": 2740 + }, + { + "epoch": 1.9963702359346642, + "grad_norm": 0.5203380584716797, + "learning_rate": 0.0002, + "loss": 1.0898, + "step": 2750 + }, + { + "epoch": 2.0, + "eval_loss": 1.1086540222167969, + "eval_runtime": 95.6866, + "eval_samples_per_second": 4.557, + "eval_steps_per_second": 0.575, + "step": 2755 + }, + { + "epoch": 2.0036297640653356, + "grad_norm": 0.35921573638916016, + "learning_rate": 0.0002, + "loss": 0.9791, + "step": 2760 + }, + { + "epoch": 2.010889292196007, + "grad_norm": 1.013861894607544, + "learning_rate": 0.0002, + "loss": 0.8968, + "step": 2770 + }, + { + "epoch": 2.0181488203266786, + "grad_norm": 0.4425240159034729, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 2780 + }, + { + "epoch": 2.02540834845735, + "grad_norm": 0.60174161195755, + "learning_rate": 0.0002, + "loss": 0.8758, + "step": 2790 + }, + { + "epoch": 2.0326678765880217, + "grad_norm": 0.47582098841667175, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 2800 + }, + { + "epoch": 2.0399274047186933, + "grad_norm": 0.6012811660766602, + "learning_rate": 0.0002, + "loss": 0.8898, + "step": 2810 + }, + { + "epoch": 2.047186932849365, + "grad_norm": 0.4444408118724823, + "learning_rate": 0.0002, + "loss": 0.8109, + "step": 2820 + }, + { + "epoch": 2.0544464609800364, + "grad_norm": 0.4864003360271454, + "learning_rate": 0.0002, + "loss": 0.8621, + "step": 2830 + }, + { + "epoch": 2.061705989110708, + "grad_norm": 0.5104215741157532, + "learning_rate": 0.0002, + "loss": 0.8817, + "step": 2840 + }, + { + "epoch": 2.0689655172413794, + "grad_norm": 0.6218489408493042, + "learning_rate": 0.0002, + "loss": 0.7789, + "step": 2850 + }, + { + "epoch": 2.076225045372051, + "grad_norm": 0.705784261226654, + "learning_rate": 0.0002, + "loss": 0.8778, + "step": 2860 + }, + { + "epoch": 2.0834845735027225, + "grad_norm": 0.48091503977775574, + "learning_rate": 0.0002, + "loss": 0.7851, + "step": 2870 + }, + { + "epoch": 2.0907441016333936, + "grad_norm": 0.5062456727027893, + "learning_rate": 0.0002, + "loss": 0.8444, + "step": 2880 + }, + { + "epoch": 2.098003629764065, + "grad_norm": 0.4862022399902344, + "learning_rate": 0.0002, + "loss": 0.7898, + "step": 2890 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 0.48264044523239136, + "learning_rate": 0.0002, + "loss": 0.8593, + "step": 2900 + }, + { + "epoch": 2.1125226860254083, + "grad_norm": 0.43744346499443054, + "learning_rate": 0.0002, + "loss": 0.8325, + "step": 2910 + }, + { + "epoch": 2.11978221415608, + "grad_norm": 0.5480492115020752, + "learning_rate": 0.0002, + "loss": 0.9099, + "step": 2920 + }, + { + "epoch": 2.1270417422867514, + "grad_norm": 0.5068560838699341, + "learning_rate": 0.0002, + "loss": 0.7727, + "step": 2930 + }, + { + "epoch": 2.134301270417423, + "grad_norm": 0.4650852680206299, + "learning_rate": 0.0002, + "loss": 0.8403, + "step": 2940 + }, + { + "epoch": 2.1415607985480944, + "grad_norm": 0.4929981231689453, + "learning_rate": 0.0002, + "loss": 0.8467, + "step": 2950 + }, + { + "epoch": 2.148820326678766, + "grad_norm": 0.6537389755249023, + "learning_rate": 0.0002, + "loss": 0.8747, + "step": 2960 + }, + { + "epoch": 2.1560798548094375, + "grad_norm": 0.8032940626144409, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 2970 + }, + { + "epoch": 2.163339382940109, + "grad_norm": 0.7131643891334534, + "learning_rate": 0.0002, + "loss": 0.7604, + "step": 2980 + }, + { + "epoch": 2.1705989110707806, + "grad_norm": 0.6034275889396667, + "learning_rate": 0.0002, + "loss": 0.8424, + "step": 2990 + }, + { + "epoch": 2.1778584392014517, + "grad_norm": 0.6081095933914185, + "learning_rate": 0.0002, + "loss": 0.8838, + "step": 3000 + }, + { + "epoch": 2.1851179673321233, + "grad_norm": 0.5706912875175476, + "learning_rate": 0.0002, + "loss": 0.9013, + "step": 3010 + }, + { + "epoch": 2.192377495462795, + "grad_norm": 0.6742380261421204, + "learning_rate": 0.0002, + "loss": 0.8473, + "step": 3020 + }, + { + "epoch": 2.1996370235934664, + "grad_norm": 0.4847496449947357, + "learning_rate": 0.0002, + "loss": 0.8372, + "step": 3030 + }, + { + "epoch": 2.206896551724138, + "grad_norm": 0.5529342889785767, + "learning_rate": 0.0002, + "loss": 0.8487, + "step": 3040 + }, + { + "epoch": 2.2141560798548094, + "grad_norm": 0.6108783483505249, + "learning_rate": 0.0002, + "loss": 0.9057, + "step": 3050 + }, + { + "epoch": 2.221415607985481, + "grad_norm": 0.8841571807861328, + "learning_rate": 0.0002, + "loss": 0.9659, + "step": 3060 + }, + { + "epoch": 2.2286751361161525, + "grad_norm": 0.4227530360221863, + "learning_rate": 0.0002, + "loss": 0.8538, + "step": 3070 + }, + { + "epoch": 2.235934664246824, + "grad_norm": 0.461935818195343, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 3080 + }, + { + "epoch": 2.2431941923774956, + "grad_norm": 0.5407412648200989, + "learning_rate": 0.0002, + "loss": 0.8542, + "step": 3090 + }, + { + "epoch": 2.250453720508167, + "grad_norm": 0.5057998895645142, + "learning_rate": 0.0002, + "loss": 0.8392, + "step": 3100 + }, + { + "epoch": 2.2577132486388383, + "grad_norm": 0.530057966709137, + "learning_rate": 0.0002, + "loss": 0.7898, + "step": 3110 + }, + { + "epoch": 2.26497277676951, + "grad_norm": 0.5066842436790466, + "learning_rate": 0.0002, + "loss": 0.8507, + "step": 3120 + }, + { + "epoch": 2.2722323049001814, + "grad_norm": 0.5069178342819214, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 3130 + }, + { + "epoch": 2.279491833030853, + "grad_norm": 0.6095499396324158, + "learning_rate": 0.0002, + "loss": 0.8458, + "step": 3140 + }, + { + "epoch": 2.2867513611615244, + "grad_norm": 0.49524766206741333, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 3150 + }, + { + "epoch": 2.294010889292196, + "grad_norm": 0.5334409475326538, + "learning_rate": 0.0002, + "loss": 0.7568, + "step": 3160 + }, + { + "epoch": 2.3012704174228675, + "grad_norm": 1.681748867034912, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 3170 + }, + { + "epoch": 2.308529945553539, + "grad_norm": 0.7225565314292908, + "learning_rate": 0.0002, + "loss": 0.8155, + "step": 3180 + }, + { + "epoch": 2.3157894736842106, + "grad_norm": 0.5379388928413391, + "learning_rate": 0.0002, + "loss": 0.8034, + "step": 3190 + }, + { + "epoch": 2.323049001814882, + "grad_norm": 0.45770326256752014, + "learning_rate": 0.0002, + "loss": 0.8833, + "step": 3200 + }, + { + "epoch": 2.3303085299455537, + "grad_norm": 0.6984533071517944, + "learning_rate": 0.0002, + "loss": 0.8419, + "step": 3210 + }, + { + "epoch": 2.337568058076225, + "grad_norm": 0.6725744605064392, + "learning_rate": 0.0002, + "loss": 0.7414, + "step": 3220 + }, + { + "epoch": 2.344827586206897, + "grad_norm": 1.1247471570968628, + "learning_rate": 0.0002, + "loss": 0.8657, + "step": 3230 + }, + { + "epoch": 2.352087114337568, + "grad_norm": 1.0240263938903809, + "learning_rate": 0.0002, + "loss": 0.7782, + "step": 3240 + }, + { + "epoch": 2.3593466424682394, + "grad_norm": 0.5608096122741699, + "learning_rate": 0.0002, + "loss": 0.826, + "step": 3250 + }, + { + "epoch": 2.366606170598911, + "grad_norm": 0.8294990062713623, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 3260 + }, + { + "epoch": 2.3738656987295825, + "grad_norm": 0.6734224557876587, + "learning_rate": 0.0002, + "loss": 0.9845, + "step": 3270 + }, + { + "epoch": 2.381125226860254, + "grad_norm": 0.6862800717353821, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 3280 + }, + { + "epoch": 2.3883847549909256, + "grad_norm": 0.5442930459976196, + "learning_rate": 0.0002, + "loss": 0.7752, + "step": 3290 + }, + { + "epoch": 2.395644283121597, + "grad_norm": 0.5745394229888916, + "learning_rate": 0.0002, + "loss": 0.8543, + "step": 3300 + }, + { + "epoch": 2.4029038112522687, + "grad_norm": 0.6257799863815308, + "learning_rate": 0.0002, + "loss": 0.833, + "step": 3310 + }, + { + "epoch": 2.4101633393829403, + "grad_norm": 0.5608420968055725, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 3320 + }, + { + "epoch": 2.417422867513612, + "grad_norm": 0.5512017011642456, + "learning_rate": 0.0002, + "loss": 0.8745, + "step": 3330 + }, + { + "epoch": 2.4246823956442833, + "grad_norm": 0.7452999353408813, + "learning_rate": 0.0002, + "loss": 0.8642, + "step": 3340 + }, + { + "epoch": 2.4319419237749544, + "grad_norm": 0.4604301452636719, + "learning_rate": 0.0002, + "loss": 0.8417, + "step": 3350 + }, + { + "epoch": 2.439201451905626, + "grad_norm": 0.8225823640823364, + "learning_rate": 0.0002, + "loss": 0.9148, + "step": 3360 + }, + { + "epoch": 2.4464609800362975, + "grad_norm": 0.8604981303215027, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 3370 + }, + { + "epoch": 2.453720508166969, + "grad_norm": 0.6620925664901733, + "learning_rate": 0.0002, + "loss": 0.9276, + "step": 3380 + }, + { + "epoch": 2.4609800362976406, + "grad_norm": 0.4750158488750458, + "learning_rate": 0.0002, + "loss": 0.8381, + "step": 3390 + }, + { + "epoch": 2.468239564428312, + "grad_norm": 0.6061418056488037, + "learning_rate": 0.0002, + "loss": 0.8249, + "step": 3400 + }, + { + "epoch": 2.4754990925589837, + "grad_norm": 0.5977247357368469, + "learning_rate": 0.0002, + "loss": 0.8944, + "step": 3410 + }, + { + "epoch": 2.4827586206896552, + "grad_norm": 0.5004227757453918, + "learning_rate": 0.0002, + "loss": 0.8366, + "step": 3420 + }, + { + "epoch": 2.490018148820327, + "grad_norm": 0.46428972482681274, + "learning_rate": 0.0002, + "loss": 0.9133, + "step": 3430 + }, + { + "epoch": 2.4972776769509983, + "grad_norm": 0.7658395171165466, + "learning_rate": 0.0002, + "loss": 0.8534, + "step": 3440 + }, + { + "epoch": 2.50453720508167, + "grad_norm": 0.621819794178009, + "learning_rate": 0.0002, + "loss": 0.9066, + "step": 3450 + }, + { + "epoch": 2.511796733212341, + "grad_norm": 0.4556088149547577, + "learning_rate": 0.0002, + "loss": 0.8481, + "step": 3460 + }, + { + "epoch": 2.519056261343013, + "grad_norm": 0.6124140024185181, + "learning_rate": 0.0002, + "loss": 0.8522, + "step": 3470 + }, + { + "epoch": 2.526315789473684, + "grad_norm": 0.6256231665611267, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 3480 + }, + { + "epoch": 2.5335753176043556, + "grad_norm": 0.5464141964912415, + "learning_rate": 0.0002, + "loss": 0.9097, + "step": 3490 + }, + { + "epoch": 2.540834845735027, + "grad_norm": 0.51471346616745, + "learning_rate": 0.0002, + "loss": 0.8708, + "step": 3500 + }, + { + "epoch": 2.5480943738656987, + "grad_norm": 0.5326165556907654, + "learning_rate": 0.0002, + "loss": 0.9042, + "step": 3510 + }, + { + "epoch": 2.5553539019963702, + "grad_norm": 0.4750378429889679, + "learning_rate": 0.0002, + "loss": 0.8749, + "step": 3520 + }, + { + "epoch": 2.562613430127042, + "grad_norm": 0.5292279124259949, + "learning_rate": 0.0002, + "loss": 0.8248, + "step": 3530 + }, + { + "epoch": 2.5698729582577133, + "grad_norm": 0.6145227551460266, + "learning_rate": 0.0002, + "loss": 0.8808, + "step": 3540 + }, + { + "epoch": 2.577132486388385, + "grad_norm": 0.8275189399719238, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 3550 + }, + { + "epoch": 2.5843920145190564, + "grad_norm": 0.5037438273429871, + "learning_rate": 0.0002, + "loss": 0.875, + "step": 3560 + }, + { + "epoch": 2.5916515426497275, + "grad_norm": 0.5838707685470581, + "learning_rate": 0.0002, + "loss": 0.8324, + "step": 3570 + }, + { + "epoch": 2.5989110707803995, + "grad_norm": 0.5398710370063782, + "learning_rate": 0.0002, + "loss": 0.8716, + "step": 3580 + }, + { + "epoch": 2.6061705989110706, + "grad_norm": 0.6115376949310303, + "learning_rate": 0.0002, + "loss": 0.7843, + "step": 3590 + }, + { + "epoch": 2.613430127041742, + "grad_norm": 1.118809461593628, + "learning_rate": 0.0002, + "loss": 0.8142, + "step": 3600 + }, + { + "epoch": 2.6206896551724137, + "grad_norm": 0.7811771631240845, + "learning_rate": 0.0002, + "loss": 0.8282, + "step": 3610 + }, + { + "epoch": 2.6279491833030852, + "grad_norm": 0.5753175020217896, + "learning_rate": 0.0002, + "loss": 0.7997, + "step": 3620 + }, + { + "epoch": 2.635208711433757, + "grad_norm": 0.550829291343689, + "learning_rate": 0.0002, + "loss": 0.7705, + "step": 3630 + }, + { + "epoch": 2.6424682395644283, + "grad_norm": 0.5360019207000732, + "learning_rate": 0.0002, + "loss": 0.8564, + "step": 3640 + }, + { + "epoch": 2.6497277676951, + "grad_norm": 0.63050377368927, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 3650 + }, + { + "epoch": 2.6569872958257714, + "grad_norm": 0.5833110213279724, + "learning_rate": 0.0002, + "loss": 0.9156, + "step": 3660 + }, + { + "epoch": 2.664246823956443, + "grad_norm": 0.5543047189712524, + "learning_rate": 0.0002, + "loss": 0.8033, + "step": 3670 + }, + { + "epoch": 2.671506352087114, + "grad_norm": 0.6842212080955505, + "learning_rate": 0.0002, + "loss": 0.8867, + "step": 3680 + }, + { + "epoch": 2.678765880217786, + "grad_norm": 0.6859333515167236, + "learning_rate": 0.0002, + "loss": 0.8569, + "step": 3690 + }, + { + "epoch": 2.686025408348457, + "grad_norm": 0.7038410902023315, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 3700 + }, + { + "epoch": 2.6932849364791287, + "grad_norm": 0.447233647108078, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 3710 + }, + { + "epoch": 2.7005444646098002, + "grad_norm": 0.6453872919082642, + "learning_rate": 0.0002, + "loss": 0.8951, + "step": 3720 + }, + { + "epoch": 2.707803992740472, + "grad_norm": 0.8025672435760498, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 3730 + }, + { + "epoch": 2.7150635208711433, + "grad_norm": 0.5997087955474854, + "learning_rate": 0.0002, + "loss": 0.8488, + "step": 3740 + }, + { + "epoch": 2.722323049001815, + "grad_norm": 0.6901142001152039, + "learning_rate": 0.0002, + "loss": 0.8308, + "step": 3750 + }, + { + "epoch": 2.7295825771324864, + "grad_norm": 1.036145567893982, + "learning_rate": 0.0002, + "loss": 0.8517, + "step": 3760 + }, + { + "epoch": 2.736842105263158, + "grad_norm": 0.7207072377204895, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 3770 + }, + { + "epoch": 2.7441016333938295, + "grad_norm": 1.0452989339828491, + "learning_rate": 0.0002, + "loss": 0.8009, + "step": 3780 + }, + { + "epoch": 2.751361161524501, + "grad_norm": 0.5615278482437134, + "learning_rate": 0.0002, + "loss": 0.8868, + "step": 3790 + }, + { + "epoch": 2.7586206896551726, + "grad_norm": 0.46439215540885925, + "learning_rate": 0.0002, + "loss": 0.9047, + "step": 3800 + }, + { + "epoch": 2.7658802177858437, + "grad_norm": 0.7134917974472046, + "learning_rate": 0.0002, + "loss": 0.7847, + "step": 3810 + }, + { + "epoch": 2.7731397459165157, + "grad_norm": 0.5139115452766418, + "learning_rate": 0.0002, + "loss": 0.8315, + "step": 3820 + }, + { + "epoch": 2.780399274047187, + "grad_norm": 0.8595781326293945, + "learning_rate": 0.0002, + "loss": 0.8995, + "step": 3830 + }, + { + "epoch": 2.7876588021778583, + "grad_norm": 0.544614851474762, + "learning_rate": 0.0002, + "loss": 0.9079, + "step": 3840 + }, + { + "epoch": 2.79491833030853, + "grad_norm": 0.6073850393295288, + "learning_rate": 0.0002, + "loss": 0.8322, + "step": 3850 + }, + { + "epoch": 2.8021778584392014, + "grad_norm": 0.8224069476127625, + "learning_rate": 0.0002, + "loss": 0.8277, + "step": 3860 + }, + { + "epoch": 2.809437386569873, + "grad_norm": 0.5347970128059387, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 3870 + }, + { + "epoch": 2.8166969147005445, + "grad_norm": 0.6799601912498474, + "learning_rate": 0.0002, + "loss": 0.8219, + "step": 3880 + }, + { + "epoch": 2.823956442831216, + "grad_norm": 0.5219197869300842, + "learning_rate": 0.0002, + "loss": 0.8796, + "step": 3890 + }, + { + "epoch": 2.8312159709618876, + "grad_norm": 0.5710130333900452, + "learning_rate": 0.0002, + "loss": 0.8649, + "step": 3900 + }, + { + "epoch": 2.838475499092559, + "grad_norm": 0.5857213139533997, + "learning_rate": 0.0002, + "loss": 0.8741, + "step": 3910 + }, + { + "epoch": 2.8457350272232302, + "grad_norm": 0.5206644535064697, + "learning_rate": 0.0002, + "loss": 0.8154, + "step": 3920 + }, + { + "epoch": 2.8529945553539022, + "grad_norm": 1.2902015447616577, + "learning_rate": 0.0002, + "loss": 0.9328, + "step": 3930 + }, + { + "epoch": 2.8602540834845733, + "grad_norm": 0.5252797603607178, + "learning_rate": 0.0002, + "loss": 0.8192, + "step": 3940 + }, + { + "epoch": 2.867513611615245, + "grad_norm": 0.5925108790397644, + "learning_rate": 0.0002, + "loss": 0.9047, + "step": 3950 + }, + { + "epoch": 2.8747731397459164, + "grad_norm": 0.9719610810279846, + "learning_rate": 0.0002, + "loss": 0.8611, + "step": 3960 + }, + { + "epoch": 2.882032667876588, + "grad_norm": 0.6834747195243835, + "learning_rate": 0.0002, + "loss": 0.8531, + "step": 3970 + }, + { + "epoch": 2.8892921960072595, + "grad_norm": 0.6952353119850159, + "learning_rate": 0.0002, + "loss": 0.8124, + "step": 3980 + }, + { + "epoch": 2.896551724137931, + "grad_norm": 0.49889910221099854, + "learning_rate": 0.0002, + "loss": 0.8365, + "step": 3990 + }, + { + "epoch": 2.9038112522686026, + "grad_norm": 0.5007026791572571, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 4000 + }, + { + "epoch": 2.911070780399274, + "grad_norm": 0.5474239587783813, + "learning_rate": 0.0002, + "loss": 0.7752, + "step": 4010 + }, + { + "epoch": 2.9183303085299457, + "grad_norm": 0.6617428660392761, + "learning_rate": 0.0002, + "loss": 0.8579, + "step": 4020 + }, + { + "epoch": 2.925589836660617, + "grad_norm": 0.6097776293754578, + "learning_rate": 0.0002, + "loss": 0.8583, + "step": 4030 + }, + { + "epoch": 2.9328493647912888, + "grad_norm": 0.5985828638076782, + "learning_rate": 0.0002, + "loss": 0.8296, + "step": 4040 + }, + { + "epoch": 2.94010889292196, + "grad_norm": 0.769488513469696, + "learning_rate": 0.0002, + "loss": 0.9134, + "step": 4050 + }, + { + "epoch": 2.9473684210526314, + "grad_norm": 0.5167055130004883, + "learning_rate": 0.0002, + "loss": 0.8148, + "step": 4060 + }, + { + "epoch": 2.954627949183303, + "grad_norm": 0.6265496015548706, + "learning_rate": 0.0002, + "loss": 0.8665, + "step": 4070 + }, + { + "epoch": 2.9618874773139745, + "grad_norm": 1.2644082307815552, + "learning_rate": 0.0002, + "loss": 0.9218, + "step": 4080 + }, + { + "epoch": 2.969147005444646, + "grad_norm": 0.6007561087608337, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 4090 + }, + { + "epoch": 2.9764065335753176, + "grad_norm": 0.47984927892684937, + "learning_rate": 0.0002, + "loss": 0.8262, + "step": 4100 + }, + { + "epoch": 2.983666061705989, + "grad_norm": 1.128198504447937, + "learning_rate": 0.0002, + "loss": 0.7897, + "step": 4110 + }, + { + "epoch": 2.9909255898366607, + "grad_norm": 0.526292085647583, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 4120 + }, + { + "epoch": 2.9981851179673322, + "grad_norm": 0.5462674498558044, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 4130 + }, + { + "epoch": 2.9996370235934666, + "eval_loss": 1.1093357801437378, + "eval_runtime": 46.2498, + "eval_samples_per_second": 9.427, + "eval_steps_per_second": 1.189, + "step": 4132 + }, + { + "epoch": 3.0054446460980038, + "grad_norm": 0.48288026452064514, + "learning_rate": 0.0002, + "loss": 0.721, + "step": 4140 + }, + { + "epoch": 3.0127041742286753, + "grad_norm": 1.0181782245635986, + "learning_rate": 0.0002, + "loss": 0.6769, + "step": 4150 + }, + { + "epoch": 3.019963702359347, + "grad_norm": 0.7718019485473633, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 4160 + }, + { + "epoch": 3.027223230490018, + "grad_norm": 0.7492219805717468, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 4170 + }, + { + "epoch": 3.0344827586206895, + "grad_norm": 0.9363632798194885, + "learning_rate": 0.0002, + "loss": 0.6678, + "step": 4180 + }, + { + "epoch": 3.041742286751361, + "grad_norm": 0.6888533234596252, + "learning_rate": 0.0002, + "loss": 0.7187, + "step": 4190 + }, + { + "epoch": 3.0490018148820326, + "grad_norm": 0.7072834968566895, + "learning_rate": 0.0002, + "loss": 0.6469, + "step": 4200 + }, + { + "epoch": 3.056261343012704, + "grad_norm": 0.7182047963142395, + "learning_rate": 0.0002, + "loss": 0.6387, + "step": 4210 + }, + { + "epoch": 3.0635208711433757, + "grad_norm": 0.7194355130195618, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 4220 + }, + { + "epoch": 3.0707803992740472, + "grad_norm": 0.9454023838043213, + "learning_rate": 0.0002, + "loss": 0.5812, + "step": 4230 + }, + { + "epoch": 3.0780399274047188, + "grad_norm": 0.838657557964325, + "learning_rate": 0.0002, + "loss": 0.6036, + "step": 4240 + }, + { + "epoch": 3.0852994555353903, + "grad_norm": 0.740113377571106, + "learning_rate": 0.0002, + "loss": 0.646, + "step": 4250 + }, + { + "epoch": 3.092558983666062, + "grad_norm": 0.6616561412811279, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 4260 + }, + { + "epoch": 3.0998185117967334, + "grad_norm": 0.8846506476402283, + "learning_rate": 0.0002, + "loss": 0.6462, + "step": 4270 + }, + { + "epoch": 3.107078039927405, + "grad_norm": 0.6322125792503357, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 4280 + }, + { + "epoch": 3.114337568058076, + "grad_norm": 0.7461467385292053, + "learning_rate": 0.0002, + "loss": 0.5953, + "step": 4290 + }, + { + "epoch": 3.1215970961887476, + "grad_norm": 0.8251287341117859, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 4300 + }, + { + "epoch": 3.128856624319419, + "grad_norm": 0.8767673373222351, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 4310 + }, + { + "epoch": 3.1361161524500907, + "grad_norm": 0.7758759260177612, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 4320 + }, + { + "epoch": 3.143375680580762, + "grad_norm": 1.1056879758834839, + "learning_rate": 0.0002, + "loss": 0.6624, + "step": 4330 + }, + { + "epoch": 3.1506352087114338, + "grad_norm": 0.8259835243225098, + "learning_rate": 0.0002, + "loss": 0.691, + "step": 4340 + }, + { + "epoch": 3.1578947368421053, + "grad_norm": 0.6607027053833008, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 4350 + }, + { + "epoch": 3.165154264972777, + "grad_norm": 0.7983301281929016, + "learning_rate": 0.0002, + "loss": 0.5911, + "step": 4360 + }, + { + "epoch": 3.1724137931034484, + "grad_norm": 0.6725239157676697, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 4370 + }, + { + "epoch": 3.17967332123412, + "grad_norm": 0.9052095413208008, + "learning_rate": 0.0002, + "loss": 0.5966, + "step": 4380 + }, + { + "epoch": 3.1869328493647915, + "grad_norm": 0.8131307363510132, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 4390 + }, + { + "epoch": 3.1941923774954626, + "grad_norm": 0.6435626149177551, + "learning_rate": 0.0002, + "loss": 0.6384, + "step": 4400 + }, + { + "epoch": 3.201451905626134, + "grad_norm": 0.84367436170578, + "learning_rate": 0.0002, + "loss": 0.5819, + "step": 4410 + }, + { + "epoch": 3.2087114337568057, + "grad_norm": 1.5018867254257202, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 4420 + }, + { + "epoch": 3.215970961887477, + "grad_norm": 0.7019091844558716, + "learning_rate": 0.0002, + "loss": 0.6838, + "step": 4430 + }, + { + "epoch": 3.2232304900181488, + "grad_norm": 0.9164197444915771, + "learning_rate": 0.0002, + "loss": 0.6153, + "step": 4440 + }, + { + "epoch": 3.2304900181488203, + "grad_norm": 0.7890861630439758, + "learning_rate": 0.0002, + "loss": 0.6618, + "step": 4450 + }, + { + "epoch": 3.237749546279492, + "grad_norm": 0.6517660617828369, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 4460 + }, + { + "epoch": 3.2450090744101634, + "grad_norm": 1.10188889503479, + "learning_rate": 0.0002, + "loss": 0.6699, + "step": 4470 + }, + { + "epoch": 3.252268602540835, + "grad_norm": 0.8158330917358398, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 4480 + }, + { + "epoch": 3.2595281306715065, + "grad_norm": 0.7663109302520752, + "learning_rate": 0.0002, + "loss": 0.7757, + "step": 4490 + }, + { + "epoch": 3.266787658802178, + "grad_norm": 0.8473444581031799, + "learning_rate": 0.0002, + "loss": 0.6539, + "step": 4500 + }, + { + "epoch": 3.274047186932849, + "grad_norm": 0.9724768996238708, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 4510 + }, + { + "epoch": 3.281306715063521, + "grad_norm": 0.8516759276390076, + "learning_rate": 0.0002, + "loss": 0.5464, + "step": 4520 + }, + { + "epoch": 3.288566243194192, + "grad_norm": 0.7543437480926514, + "learning_rate": 0.0002, + "loss": 0.6534, + "step": 4530 + }, + { + "epoch": 3.2958257713248638, + "grad_norm": 1.0472029447555542, + "learning_rate": 0.0002, + "loss": 0.6095, + "step": 4540 + }, + { + "epoch": 3.3030852994555353, + "grad_norm": 0.6240826845169067, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 4550 + }, + { + "epoch": 3.310344827586207, + "grad_norm": 0.9957774877548218, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 4560 + }, + { + "epoch": 3.3176043557168784, + "grad_norm": 0.6448912620544434, + "learning_rate": 0.0002, + "loss": 0.618, + "step": 4570 + }, + { + "epoch": 3.32486388384755, + "grad_norm": 0.7519692778587341, + "learning_rate": 0.0002, + "loss": 0.6188, + "step": 4580 + }, + { + "epoch": 3.3321234119782215, + "grad_norm": 0.7367453575134277, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 4590 + }, + { + "epoch": 3.339382940108893, + "grad_norm": 0.8064960837364197, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 4600 + }, + { + "epoch": 3.3466424682395646, + "grad_norm": 0.7664631009101868, + "learning_rate": 0.0002, + "loss": 0.6062, + "step": 4610 + }, + { + "epoch": 3.353901996370236, + "grad_norm": 0.7803396582603455, + "learning_rate": 0.0002, + "loss": 0.6834, + "step": 4620 + }, + { + "epoch": 3.3611615245009077, + "grad_norm": 0.9141599535942078, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 4630 + }, + { + "epoch": 3.3684210526315788, + "grad_norm": 0.9719856381416321, + "learning_rate": 0.0002, + "loss": 0.6889, + "step": 4640 + }, + { + "epoch": 3.3756805807622503, + "grad_norm": 0.9223218560218811, + "learning_rate": 0.0002, + "loss": 0.6914, + "step": 4650 + }, + { + "epoch": 3.382940108892922, + "grad_norm": 0.7289277911186218, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 4660 + }, + { + "epoch": 3.3901996370235934, + "grad_norm": 1.039724349975586, + "learning_rate": 0.0002, + "loss": 0.595, + "step": 4670 + }, + { + "epoch": 3.397459165154265, + "grad_norm": 1.397438883781433, + "learning_rate": 0.0002, + "loss": 0.8121, + "step": 4680 + }, + { + "epoch": 3.4047186932849365, + "grad_norm": 1.0069999694824219, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 4690 + }, + { + "epoch": 3.411978221415608, + "grad_norm": 0.816291332244873, + "learning_rate": 0.0002, + "loss": 0.6598, + "step": 4700 + }, + { + "epoch": 3.4192377495462796, + "grad_norm": 1.2831530570983887, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 4710 + }, + { + "epoch": 3.426497277676951, + "grad_norm": 0.9573889970779419, + "learning_rate": 0.0002, + "loss": 0.6625, + "step": 4720 + }, + { + "epoch": 3.4337568058076227, + "grad_norm": 0.7685632705688477, + "learning_rate": 0.0002, + "loss": 0.7279, + "step": 4730 + }, + { + "epoch": 3.441016333938294, + "grad_norm": 0.7019195556640625, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 4740 + }, + { + "epoch": 3.4482758620689653, + "grad_norm": 0.7244833707809448, + "learning_rate": 0.0002, + "loss": 0.7606, + "step": 4750 + }, + { + "epoch": 3.455535390199637, + "grad_norm": 1.3468551635742188, + "learning_rate": 0.0002, + "loss": 0.6951, + "step": 4760 + }, + { + "epoch": 3.4627949183303084, + "grad_norm": 0.822846531867981, + "learning_rate": 0.0002, + "loss": 0.6945, + "step": 4770 + }, + { + "epoch": 3.47005444646098, + "grad_norm": 0.7311608195304871, + "learning_rate": 0.0002, + "loss": 0.6431, + "step": 4780 + }, + { + "epoch": 3.4773139745916515, + "grad_norm": 0.9466770887374878, + "learning_rate": 0.0002, + "loss": 0.7019, + "step": 4790 + }, + { + "epoch": 3.484573502722323, + "grad_norm": 1.1527155637741089, + "learning_rate": 0.0002, + "loss": 0.7767, + "step": 4800 + }, + { + "epoch": 3.4918330308529946, + "grad_norm": 1.1288906335830688, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 4810 + }, + { + "epoch": 3.499092558983666, + "grad_norm": 0.9096164107322693, + "learning_rate": 0.0002, + "loss": 0.6564, + "step": 4820 + }, + { + "epoch": 3.5063520871143377, + "grad_norm": 0.7988565564155579, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 4830 + }, + { + "epoch": 3.513611615245009, + "grad_norm": 0.7183415293693542, + "learning_rate": 0.0002, + "loss": 0.7004, + "step": 4840 + }, + { + "epoch": 3.5208711433756807, + "grad_norm": 0.6614915132522583, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 4850 + }, + { + "epoch": 3.528130671506352, + "grad_norm": 0.8609521985054016, + "learning_rate": 0.0002, + "loss": 0.7271, + "step": 4860 + }, + { + "epoch": 3.535390199637024, + "grad_norm": 0.86552894115448, + "learning_rate": 0.0002, + "loss": 0.6664, + "step": 4870 + }, + { + "epoch": 3.542649727767695, + "grad_norm": 0.6926496028900146, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 4880 + }, + { + "epoch": 3.5499092558983665, + "grad_norm": 0.8157467246055603, + "learning_rate": 0.0002, + "loss": 0.7117, + "step": 4890 + }, + { + "epoch": 3.557168784029038, + "grad_norm": 0.9085357189178467, + "learning_rate": 0.0002, + "loss": 0.6201, + "step": 4900 + }, + { + "epoch": 3.5644283121597096, + "grad_norm": 0.6322644948959351, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 4910 + }, + { + "epoch": 3.571687840290381, + "grad_norm": 1.263205885887146, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 4920 + }, + { + "epoch": 3.5789473684210527, + "grad_norm": 0.8901070356369019, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 4930 + }, + { + "epoch": 3.586206896551724, + "grad_norm": 0.7983952164649963, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 4940 + }, + { + "epoch": 3.5934664246823957, + "grad_norm": 0.9887813925743103, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 4950 + }, + { + "epoch": 3.6007259528130673, + "grad_norm": 0.7895187735557556, + "learning_rate": 0.0002, + "loss": 0.6502, + "step": 4960 + }, + { + "epoch": 3.6079854809437384, + "grad_norm": 0.9685819745063782, + "learning_rate": 0.0002, + "loss": 0.7111, + "step": 4970 + }, + { + "epoch": 3.6152450090744104, + "grad_norm": 0.6576591730117798, + "learning_rate": 0.0002, + "loss": 0.6915, + "step": 4980 + }, + { + "epoch": 3.6225045372050815, + "grad_norm": 0.856985330581665, + "learning_rate": 0.0002, + "loss": 0.6195, + "step": 4990 + }, + { + "epoch": 3.629764065335753, + "grad_norm": 0.7230252623558044, + "learning_rate": 0.0002, + "loss": 0.6318, + "step": 5000 + }, + { + "epoch": 3.6370235934664246, + "grad_norm": 0.8260893821716309, + "learning_rate": 0.0002, + "loss": 0.742, + "step": 5010 + }, + { + "epoch": 3.644283121597096, + "grad_norm": 0.7635950446128845, + "learning_rate": 0.0002, + "loss": 0.7223, + "step": 5020 + }, + { + "epoch": 3.6515426497277677, + "grad_norm": 0.7060768604278564, + "learning_rate": 0.0002, + "loss": 0.6837, + "step": 5030 + }, + { + "epoch": 3.658802177858439, + "grad_norm": 0.8020303249359131, + "learning_rate": 0.0002, + "loss": 0.6921, + "step": 5040 + }, + { + "epoch": 3.6660617059891107, + "grad_norm": 0.8530341386795044, + "learning_rate": 0.0002, + "loss": 0.6446, + "step": 5050 + }, + { + "epoch": 3.6733212341197823, + "grad_norm": 0.6667101979255676, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 5060 + }, + { + "epoch": 3.680580762250454, + "grad_norm": 0.7385406494140625, + "learning_rate": 0.0002, + "loss": 0.7081, + "step": 5070 + }, + { + "epoch": 3.6878402903811254, + "grad_norm": 0.7753380537033081, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 5080 + }, + { + "epoch": 3.695099818511797, + "grad_norm": 0.7516207098960876, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 5090 + }, + { + "epoch": 3.702359346642468, + "grad_norm": 0.8171586394309998, + "learning_rate": 0.0002, + "loss": 0.672, + "step": 5100 + }, + { + "epoch": 3.70961887477314, + "grad_norm": 1.0796279907226562, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 5110 + }, + { + "epoch": 3.716878402903811, + "grad_norm": 0.6957688927650452, + "learning_rate": 0.0002, + "loss": 0.5948, + "step": 5120 + }, + { + "epoch": 3.7241379310344827, + "grad_norm": 0.8550161719322205, + "learning_rate": 0.0002, + "loss": 0.7515, + "step": 5130 + }, + { + "epoch": 3.731397459165154, + "grad_norm": 0.9396728277206421, + "learning_rate": 0.0002, + "loss": 0.7286, + "step": 5140 + }, + { + "epoch": 3.7386569872958257, + "grad_norm": 1.4264805316925049, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 5150 + }, + { + "epoch": 3.7459165154264973, + "grad_norm": 0.8725108504295349, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 5160 + }, + { + "epoch": 3.753176043557169, + "grad_norm": 1.0346195697784424, + "learning_rate": 0.0002, + "loss": 0.6778, + "step": 5170 + }, + { + "epoch": 3.7604355716878404, + "grad_norm": 0.5395554304122925, + "learning_rate": 0.0002, + "loss": 0.6371, + "step": 5180 + }, + { + "epoch": 3.767695099818512, + "grad_norm": 1.3153616189956665, + "learning_rate": 0.0002, + "loss": 0.7308, + "step": 5190 + }, + { + "epoch": 3.7749546279491835, + "grad_norm": 0.9879828691482544, + "learning_rate": 0.0002, + "loss": 0.78, + "step": 5200 + }, + { + "epoch": 3.7822141560798546, + "grad_norm": 0.8876672983169556, + "learning_rate": 0.0002, + "loss": 0.7068, + "step": 5210 + }, + { + "epoch": 3.7894736842105265, + "grad_norm": 0.8363267779350281, + "learning_rate": 0.0002, + "loss": 0.6283, + "step": 5220 + }, + { + "epoch": 3.7967332123411976, + "grad_norm": 0.637294590473175, + "learning_rate": 0.0002, + "loss": 0.6255, + "step": 5230 + }, + { + "epoch": 3.803992740471869, + "grad_norm": 1.1408970355987549, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 5240 + }, + { + "epoch": 3.8112522686025407, + "grad_norm": 1.0128360986709595, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 5250 + }, + { + "epoch": 3.8185117967332123, + "grad_norm": 0.8061144351959229, + "learning_rate": 0.0002, + "loss": 0.6764, + "step": 5260 + }, + { + "epoch": 3.825771324863884, + "grad_norm": 0.9626626968383789, + "learning_rate": 0.0002, + "loss": 0.7254, + "step": 5270 + }, + { + "epoch": 3.8330308529945554, + "grad_norm": 0.9013627171516418, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 5280 + }, + { + "epoch": 3.840290381125227, + "grad_norm": 0.8411344289779663, + "learning_rate": 0.0002, + "loss": 0.6806, + "step": 5290 + }, + { + "epoch": 3.8475499092558985, + "grad_norm": 0.7426059246063232, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 5300 + }, + { + "epoch": 3.85480943738657, + "grad_norm": 1.003413438796997, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 5310 + }, + { + "epoch": 3.862068965517241, + "grad_norm": 0.7527840733528137, + "learning_rate": 0.0002, + "loss": 0.8554, + "step": 5320 + }, + { + "epoch": 3.869328493647913, + "grad_norm": 0.738610565662384, + "learning_rate": 0.0002, + "loss": 0.7521, + "step": 5330 + }, + { + "epoch": 3.876588021778584, + "grad_norm": 0.7277999520301819, + "learning_rate": 0.0002, + "loss": 0.7266, + "step": 5340 + }, + { + "epoch": 3.8838475499092557, + "grad_norm": 0.5951359272003174, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 5350 + }, + { + "epoch": 3.8911070780399273, + "grad_norm": 1.043884038925171, + "learning_rate": 0.0002, + "loss": 0.7447, + "step": 5360 + }, + { + "epoch": 3.898366606170599, + "grad_norm": 0.8436498045921326, + "learning_rate": 0.0002, + "loss": 0.6862, + "step": 5370 + }, + { + "epoch": 3.9056261343012704, + "grad_norm": 0.5603365302085876, + "learning_rate": 0.0002, + "loss": 0.665, + "step": 5380 + }, + { + "epoch": 3.912885662431942, + "grad_norm": 1.0128886699676514, + "learning_rate": 0.0002, + "loss": 0.7098, + "step": 5390 + }, + { + "epoch": 3.9201451905626135, + "grad_norm": 0.7970930337905884, + "learning_rate": 0.0002, + "loss": 0.6707, + "step": 5400 + }, + { + "epoch": 3.927404718693285, + "grad_norm": 0.7699369192123413, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 5410 + }, + { + "epoch": 3.9346642468239565, + "grad_norm": 0.800561249256134, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 5420 + }, + { + "epoch": 3.941923774954628, + "grad_norm": 0.8020331859588623, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 5430 + }, + { + "epoch": 3.9491833030852996, + "grad_norm": 0.7461140155792236, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 5440 + }, + { + "epoch": 3.9564428312159707, + "grad_norm": 0.8346918821334839, + "learning_rate": 0.0002, + "loss": 0.7013, + "step": 5450 + }, + { + "epoch": 3.9637023593466427, + "grad_norm": 0.9723302125930786, + "learning_rate": 0.0002, + "loss": 0.6289, + "step": 5460 + }, + { + "epoch": 3.970961887477314, + "grad_norm": 0.6809740662574768, + "learning_rate": 0.0002, + "loss": 0.8029, + "step": 5470 + }, + { + "epoch": 3.9782214156079854, + "grad_norm": 0.7353498339653015, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 5480 + }, + { + "epoch": 3.985480943738657, + "grad_norm": 0.748009443283081, + "learning_rate": 0.0002, + "loss": 0.6722, + "step": 5490 + }, + { + "epoch": 3.9927404718693285, + "grad_norm": 1.3656195402145386, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 5500 + }, + { + "epoch": 4.0, + "grad_norm": 0.8402108550071716, + "learning_rate": 0.0002, + "loss": 0.7368, + "step": 5510 + }, + { + "epoch": 4.0, + "eval_loss": 1.17229425907135, + "eval_runtime": 46.2554, + "eval_samples_per_second": 9.426, + "eval_steps_per_second": 1.189, + "step": 5510 + }, + { + "epoch": 4.007259528130671, + "grad_norm": 0.8601235747337341, + "learning_rate": 0.0002, + "loss": 0.4637, + "step": 5520 + }, + { + "epoch": 4.014519056261343, + "grad_norm": 1.2635200023651123, + "learning_rate": 0.0002, + "loss": 0.4717, + "step": 5530 + }, + { + "epoch": 4.021778584392014, + "grad_norm": 1.0257477760314941, + "learning_rate": 0.0002, + "loss": 0.503, + "step": 5540 + }, + { + "epoch": 4.029038112522686, + "grad_norm": 0.9436745047569275, + "learning_rate": 0.0002, + "loss": 0.4547, + "step": 5550 + }, + { + "epoch": 4.036297640653357, + "grad_norm": 0.9443606734275818, + "learning_rate": 0.0002, + "loss": 0.459, + "step": 5560 + }, + { + "epoch": 4.043557168784029, + "grad_norm": 1.3965742588043213, + "learning_rate": 0.0002, + "loss": 0.5386, + "step": 5570 + }, + { + "epoch": 4.0508166969147, + "grad_norm": 0.8973520398139954, + "learning_rate": 0.0002, + "loss": 0.4248, + "step": 5580 + }, + { + "epoch": 4.058076225045372, + "grad_norm": 0.9998409748077393, + "learning_rate": 0.0002, + "loss": 0.4111, + "step": 5590 + }, + { + "epoch": 4.0653357531760435, + "grad_norm": 1.1213387250900269, + "learning_rate": 0.0002, + "loss": 0.4828, + "step": 5600 + }, + { + "epoch": 4.072595281306715, + "grad_norm": 0.7064558863639832, + "learning_rate": 0.0002, + "loss": 0.439, + "step": 5610 + }, + { + "epoch": 4.0798548094373865, + "grad_norm": 1.2390803098678589, + "learning_rate": 0.0002, + "loss": 0.4607, + "step": 5620 + }, + { + "epoch": 4.087114337568058, + "grad_norm": 1.123469591140747, + "learning_rate": 0.0002, + "loss": 0.5014, + "step": 5630 + }, + { + "epoch": 4.09437386569873, + "grad_norm": 1.229573369026184, + "learning_rate": 0.0002, + "loss": 0.513, + "step": 5640 + }, + { + "epoch": 4.101633393829401, + "grad_norm": 1.7182831764221191, + "learning_rate": 0.0002, + "loss": 0.5258, + "step": 5650 + }, + { + "epoch": 4.108892921960073, + "grad_norm": 0.894903302192688, + "learning_rate": 0.0002, + "loss": 0.5371, + "step": 5660 + }, + { + "epoch": 4.116152450090744, + "grad_norm": 0.8754552006721497, + "learning_rate": 0.0002, + "loss": 0.4813, + "step": 5670 + }, + { + "epoch": 4.123411978221416, + "grad_norm": 1.2401553392410278, + "learning_rate": 0.0002, + "loss": 0.491, + "step": 5680 + }, + { + "epoch": 4.130671506352087, + "grad_norm": 0.8631148934364319, + "learning_rate": 0.0002, + "loss": 0.4549, + "step": 5690 + }, + { + "epoch": 4.137931034482759, + "grad_norm": 1.1798022985458374, + "learning_rate": 0.0002, + "loss": 0.487, + "step": 5700 + }, + { + "epoch": 4.14519056261343, + "grad_norm": 0.8344549536705017, + "learning_rate": 0.0002, + "loss": 0.4522, + "step": 5710 + }, + { + "epoch": 4.152450090744102, + "grad_norm": 1.2342697381973267, + "learning_rate": 0.0002, + "loss": 0.4559, + "step": 5720 + }, + { + "epoch": 4.159709618874773, + "grad_norm": 1.1601094007492065, + "learning_rate": 0.0002, + "loss": 0.53, + "step": 5730 + }, + { + "epoch": 4.166969147005445, + "grad_norm": 1.2925703525543213, + "learning_rate": 0.0002, + "loss": 0.4755, + "step": 5740 + }, + { + "epoch": 4.174228675136116, + "grad_norm": 1.0870997905731201, + "learning_rate": 0.0002, + "loss": 0.4973, + "step": 5750 + }, + { + "epoch": 4.181488203266787, + "grad_norm": 0.9077792763710022, + "learning_rate": 0.0002, + "loss": 0.5184, + "step": 5760 + }, + { + "epoch": 4.188747731397459, + "grad_norm": 1.009273886680603, + "learning_rate": 0.0002, + "loss": 0.4606, + "step": 5770 + }, + { + "epoch": 4.19600725952813, + "grad_norm": 1.2465530633926392, + "learning_rate": 0.0002, + "loss": 0.5383, + "step": 5780 + }, + { + "epoch": 4.203266787658802, + "grad_norm": 1.2261253595352173, + "learning_rate": 0.0002, + "loss": 0.4938, + "step": 5790 + }, + { + "epoch": 4.2105263157894735, + "grad_norm": 1.1498041152954102, + "learning_rate": 0.0002, + "loss": 0.5255, + "step": 5800 + }, + { + "epoch": 4.217785843920145, + "grad_norm": 1.1966725587844849, + "learning_rate": 0.0002, + "loss": 0.5269, + "step": 5810 + }, + { + "epoch": 4.2250453720508165, + "grad_norm": 1.2651296854019165, + "learning_rate": 0.0002, + "loss": 0.5626, + "step": 5820 + }, + { + "epoch": 4.2323049001814885, + "grad_norm": 1.0388574600219727, + "learning_rate": 0.0002, + "loss": 0.5213, + "step": 5830 + }, + { + "epoch": 4.23956442831216, + "grad_norm": 1.3042771816253662, + "learning_rate": 0.0002, + "loss": 0.4965, + "step": 5840 + }, + { + "epoch": 4.246823956442832, + "grad_norm": 1.1127727031707764, + "learning_rate": 0.0002, + "loss": 0.5116, + "step": 5850 + }, + { + "epoch": 4.254083484573503, + "grad_norm": 0.9653958082199097, + "learning_rate": 0.0002, + "loss": 0.5197, + "step": 5860 + }, + { + "epoch": 4.261343012704174, + "grad_norm": 1.0500504970550537, + "learning_rate": 0.0002, + "loss": 0.4414, + "step": 5870 + }, + { + "epoch": 4.268602540834846, + "grad_norm": 1.1476165056228638, + "learning_rate": 0.0002, + "loss": 0.544, + "step": 5880 + }, + { + "epoch": 4.275862068965517, + "grad_norm": 0.9424414038658142, + "learning_rate": 0.0002, + "loss": 0.4667, + "step": 5890 + }, + { + "epoch": 4.283121597096189, + "grad_norm": 1.3309166431427002, + "learning_rate": 0.0002, + "loss": 0.5039, + "step": 5900 + }, + { + "epoch": 4.29038112522686, + "grad_norm": 1.3025873899459839, + "learning_rate": 0.0002, + "loss": 0.5472, + "step": 5910 + }, + { + "epoch": 4.297640653357532, + "grad_norm": 1.1442325115203857, + "learning_rate": 0.0002, + "loss": 0.4644, + "step": 5920 + }, + { + "epoch": 4.304900181488203, + "grad_norm": 0.9820859432220459, + "learning_rate": 0.0002, + "loss": 0.5066, + "step": 5930 + }, + { + "epoch": 4.312159709618875, + "grad_norm": 0.9615740180015564, + "learning_rate": 0.0002, + "loss": 0.5224, + "step": 5940 + }, + { + "epoch": 4.319419237749546, + "grad_norm": 1.1627109050750732, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 5950 + }, + { + "epoch": 4.326678765880218, + "grad_norm": 0.9381322860717773, + "learning_rate": 0.0002, + "loss": 0.4852, + "step": 5960 + }, + { + "epoch": 4.333938294010889, + "grad_norm": 0.8154335618019104, + "learning_rate": 0.0002, + "loss": 0.4532, + "step": 5970 + }, + { + "epoch": 4.341197822141561, + "grad_norm": 0.877671480178833, + "learning_rate": 0.0002, + "loss": 0.5237, + "step": 5980 + }, + { + "epoch": 4.348457350272232, + "grad_norm": 1.1742031574249268, + "learning_rate": 0.0002, + "loss": 0.6113, + "step": 5990 + }, + { + "epoch": 4.3557168784029034, + "grad_norm": 1.0352917909622192, + "learning_rate": 0.0002, + "loss": 0.5704, + "step": 6000 + }, + { + "epoch": 4.362976406533575, + "grad_norm": 0.9963878989219666, + "learning_rate": 0.0002, + "loss": 0.4996, + "step": 6010 + }, + { + "epoch": 4.3702359346642465, + "grad_norm": 1.1892237663269043, + "learning_rate": 0.0002, + "loss": 0.4813, + "step": 6020 + }, + { + "epoch": 4.3774954627949185, + "grad_norm": 1.2516111135482788, + "learning_rate": 0.0002, + "loss": 0.5224, + "step": 6030 + }, + { + "epoch": 4.38475499092559, + "grad_norm": 1.2111951112747192, + "learning_rate": 0.0002, + "loss": 0.5952, + "step": 6040 + }, + { + "epoch": 4.392014519056262, + "grad_norm": 1.0820083618164062, + "learning_rate": 0.0002, + "loss": 0.4275, + "step": 6050 + }, + { + "epoch": 4.399274047186933, + "grad_norm": 1.033915638923645, + "learning_rate": 0.0002, + "loss": 0.5117, + "step": 6060 + }, + { + "epoch": 4.406533575317605, + "grad_norm": 1.0635870695114136, + "learning_rate": 0.0002, + "loss": 0.5431, + "step": 6070 + }, + { + "epoch": 4.413793103448276, + "grad_norm": 1.0520414113998413, + "learning_rate": 0.0002, + "loss": 0.5341, + "step": 6080 + }, + { + "epoch": 4.421052631578947, + "grad_norm": 1.0821926593780518, + "learning_rate": 0.0002, + "loss": 0.512, + "step": 6090 + }, + { + "epoch": 4.428312159709619, + "grad_norm": 1.0533246994018555, + "learning_rate": 0.0002, + "loss": 0.5065, + "step": 6100 + }, + { + "epoch": 4.43557168784029, + "grad_norm": 0.9231932759284973, + "learning_rate": 0.0002, + "loss": 0.4577, + "step": 6110 + }, + { + "epoch": 4.442831215970962, + "grad_norm": 0.9910260438919067, + "learning_rate": 0.0002, + "loss": 0.583, + "step": 6120 + }, + { + "epoch": 4.450090744101633, + "grad_norm": 1.061949372291565, + "learning_rate": 0.0002, + "loss": 0.4717, + "step": 6130 + }, + { + "epoch": 4.457350272232305, + "grad_norm": 1.2927039861679077, + "learning_rate": 0.0002, + "loss": 0.5893, + "step": 6140 + }, + { + "epoch": 4.464609800362976, + "grad_norm": 1.3966081142425537, + "learning_rate": 0.0002, + "loss": 0.4684, + "step": 6150 + }, + { + "epoch": 4.471869328493648, + "grad_norm": 1.3835992813110352, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 6160 + }, + { + "epoch": 4.479128856624319, + "grad_norm": 1.0892692804336548, + "learning_rate": 0.0002, + "loss": 0.5911, + "step": 6170 + }, + { + "epoch": 4.486388384754991, + "grad_norm": 1.0318800210952759, + "learning_rate": 0.0002, + "loss": 0.478, + "step": 6180 + }, + { + "epoch": 4.493647912885662, + "grad_norm": 0.8174677491188049, + "learning_rate": 0.0002, + "loss": 0.5198, + "step": 6190 + }, + { + "epoch": 4.500907441016334, + "grad_norm": 1.4157509803771973, + "learning_rate": 0.0002, + "loss": 0.5387, + "step": 6200 + }, + { + "epoch": 4.508166969147005, + "grad_norm": 1.5244114398956299, + "learning_rate": 0.0002, + "loss": 0.5868, + "step": 6210 + }, + { + "epoch": 4.5154264972776765, + "grad_norm": 0.8164850473403931, + "learning_rate": 0.0002, + "loss": 0.4642, + "step": 6220 + }, + { + "epoch": 4.5226860254083485, + "grad_norm": 1.2904746532440186, + "learning_rate": 0.0002, + "loss": 0.522, + "step": 6230 + }, + { + "epoch": 4.52994555353902, + "grad_norm": 0.7987732887268066, + "learning_rate": 0.0002, + "loss": 0.5103, + "step": 6240 + }, + { + "epoch": 4.537205081669692, + "grad_norm": 0.831040620803833, + "learning_rate": 0.0002, + "loss": 0.4615, + "step": 6250 + }, + { + "epoch": 4.544464609800363, + "grad_norm": 0.9545485973358154, + "learning_rate": 0.0002, + "loss": 0.5065, + "step": 6260 + }, + { + "epoch": 4.551724137931035, + "grad_norm": 0.9291793704032898, + "learning_rate": 0.0002, + "loss": 0.5515, + "step": 6270 + }, + { + "epoch": 4.558983666061706, + "grad_norm": 0.8977208733558655, + "learning_rate": 0.0002, + "loss": 0.4535, + "step": 6280 + }, + { + "epoch": 4.566243194192378, + "grad_norm": 1.1768537759780884, + "learning_rate": 0.0002, + "loss": 0.544, + "step": 6290 + }, + { + "epoch": 4.573502722323049, + "grad_norm": 1.0688952207565308, + "learning_rate": 0.0002, + "loss": 0.5925, + "step": 6300 + }, + { + "epoch": 4.580762250453721, + "grad_norm": 0.8800966739654541, + "learning_rate": 0.0002, + "loss": 0.5207, + "step": 6310 + }, + { + "epoch": 4.588021778584392, + "grad_norm": 1.0911834239959717, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 6320 + }, + { + "epoch": 4.595281306715064, + "grad_norm": 1.1420872211456299, + "learning_rate": 0.0002, + "loss": 0.5109, + "step": 6330 + }, + { + "epoch": 4.602540834845735, + "grad_norm": 1.0215224027633667, + "learning_rate": 0.0002, + "loss": 0.5147, + "step": 6340 + }, + { + "epoch": 4.609800362976406, + "grad_norm": 0.9685489535331726, + "learning_rate": 0.0002, + "loss": 0.592, + "step": 6350 + }, + { + "epoch": 4.617059891107078, + "grad_norm": 1.12773597240448, + "learning_rate": 0.0002, + "loss": 0.5775, + "step": 6360 + }, + { + "epoch": 4.624319419237749, + "grad_norm": 1.0663973093032837, + "learning_rate": 0.0002, + "loss": 0.5966, + "step": 6370 + }, + { + "epoch": 4.631578947368421, + "grad_norm": 1.1707262992858887, + "learning_rate": 0.0002, + "loss": 0.512, + "step": 6380 + }, + { + "epoch": 4.638838475499092, + "grad_norm": 1.0672980546951294, + "learning_rate": 0.0002, + "loss": 0.5497, + "step": 6390 + }, + { + "epoch": 4.646098003629764, + "grad_norm": 1.1464333534240723, + "learning_rate": 0.0002, + "loss": 0.5699, + "step": 6400 + }, + { + "epoch": 4.653357531760435, + "grad_norm": 1.070230484008789, + "learning_rate": 0.0002, + "loss": 0.5514, + "step": 6410 + }, + { + "epoch": 4.660617059891107, + "grad_norm": 0.9673764109611511, + "learning_rate": 0.0002, + "loss": 0.5013, + "step": 6420 + }, + { + "epoch": 4.6678765880217785, + "grad_norm": 1.0189043283462524, + "learning_rate": 0.0002, + "loss": 0.5901, + "step": 6430 + }, + { + "epoch": 4.67513611615245, + "grad_norm": 1.185896396636963, + "learning_rate": 0.0002, + "loss": 0.5193, + "step": 6440 + }, + { + "epoch": 4.682395644283122, + "grad_norm": 1.0682812929153442, + "learning_rate": 0.0002, + "loss": 0.5318, + "step": 6450 + }, + { + "epoch": 4.689655172413794, + "grad_norm": 1.3586071729660034, + "learning_rate": 0.0002, + "loss": 0.5773, + "step": 6460 + }, + { + "epoch": 4.696914700544465, + "grad_norm": 0.6561792492866516, + "learning_rate": 0.0002, + "loss": 0.5482, + "step": 6470 + }, + { + "epoch": 4.704174228675136, + "grad_norm": 1.1394113302230835, + "learning_rate": 0.0002, + "loss": 0.5711, + "step": 6480 + }, + { + "epoch": 4.711433756805808, + "grad_norm": 0.9683151245117188, + "learning_rate": 0.0002, + "loss": 0.5325, + "step": 6490 + }, + { + "epoch": 4.718693284936479, + "grad_norm": 1.0247553586959839, + "learning_rate": 0.0002, + "loss": 0.5139, + "step": 6500 + }, + { + "epoch": 4.725952813067151, + "grad_norm": 0.8046169281005859, + "learning_rate": 0.0002, + "loss": 0.5794, + "step": 6510 + }, + { + "epoch": 4.733212341197822, + "grad_norm": 1.0710240602493286, + "learning_rate": 0.0002, + "loss": 0.5471, + "step": 6520 + }, + { + "epoch": 4.740471869328494, + "grad_norm": 0.9438924193382263, + "learning_rate": 0.0002, + "loss": 0.4805, + "step": 6530 + }, + { + "epoch": 4.747731397459165, + "grad_norm": 0.869162380695343, + "learning_rate": 0.0002, + "loss": 0.5404, + "step": 6540 + }, + { + "epoch": 4.754990925589837, + "grad_norm": 0.9776787161827087, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 6550 + }, + { + "epoch": 4.762250453720508, + "grad_norm": 1.1990505456924438, + "learning_rate": 0.0002, + "loss": 0.5288, + "step": 6560 + }, + { + "epoch": 4.769509981851179, + "grad_norm": 1.0582209825515747, + "learning_rate": 0.0002, + "loss": 0.5539, + "step": 6570 + }, + { + "epoch": 4.776769509981851, + "grad_norm": 0.9966367483139038, + "learning_rate": 0.0002, + "loss": 0.489, + "step": 6580 + }, + { + "epoch": 4.784029038112522, + "grad_norm": 0.9130612015724182, + "learning_rate": 0.0002, + "loss": 0.5514, + "step": 6590 + }, + { + "epoch": 4.791288566243194, + "grad_norm": 1.0950500965118408, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 6600 + }, + { + "epoch": 4.798548094373865, + "grad_norm": 1.108681321144104, + "learning_rate": 0.0002, + "loss": 0.5266, + "step": 6610 + }, + { + "epoch": 4.805807622504537, + "grad_norm": 1.1873763799667358, + "learning_rate": 0.0002, + "loss": 0.5875, + "step": 6620 + }, + { + "epoch": 4.8130671506352085, + "grad_norm": 1.305367112159729, + "learning_rate": 0.0002, + "loss": 0.5736, + "step": 6630 + }, + { + "epoch": 4.8203266787658805, + "grad_norm": 1.2801482677459717, + "learning_rate": 0.0002, + "loss": 0.5636, + "step": 6640 + }, + { + "epoch": 4.827586206896552, + "grad_norm": 1.26764976978302, + "learning_rate": 0.0002, + "loss": 0.582, + "step": 6650 + }, + { + "epoch": 4.834845735027224, + "grad_norm": 1.0018208026885986, + "learning_rate": 0.0002, + "loss": 0.5259, + "step": 6660 + }, + { + "epoch": 4.842105263157895, + "grad_norm": 1.2326326370239258, + "learning_rate": 0.0002, + "loss": 0.548, + "step": 6670 + }, + { + "epoch": 4.849364791288567, + "grad_norm": 0.9707282781600952, + "learning_rate": 0.0002, + "loss": 0.5933, + "step": 6680 + }, + { + "epoch": 4.856624319419238, + "grad_norm": 1.2772048711776733, + "learning_rate": 0.0002, + "loss": 0.5612, + "step": 6690 + }, + { + "epoch": 4.863883847549909, + "grad_norm": 2.6652262210845947, + "learning_rate": 0.0002, + "loss": 0.5346, + "step": 6700 + }, + { + "epoch": 4.871143375680581, + "grad_norm": 1.215828537940979, + "learning_rate": 0.0002, + "loss": 0.5428, + "step": 6710 + }, + { + "epoch": 4.878402903811252, + "grad_norm": 1.3704510927200317, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 6720 + }, + { + "epoch": 4.885662431941924, + "grad_norm": 0.7781757116317749, + "learning_rate": 0.0002, + "loss": 0.4963, + "step": 6730 + }, + { + "epoch": 4.892921960072595, + "grad_norm": 1.1883646249771118, + "learning_rate": 0.0002, + "loss": 0.5989, + "step": 6740 + }, + { + "epoch": 4.900181488203267, + "grad_norm": 0.9216066002845764, + "learning_rate": 0.0002, + "loss": 0.6067, + "step": 6750 + }, + { + "epoch": 4.907441016333938, + "grad_norm": 1.0558464527130127, + "learning_rate": 0.0002, + "loss": 0.5085, + "step": 6760 + }, + { + "epoch": 4.91470054446461, + "grad_norm": 1.032656192779541, + "learning_rate": 0.0002, + "loss": 0.5216, + "step": 6770 + }, + { + "epoch": 4.921960072595281, + "grad_norm": 1.1261441707611084, + "learning_rate": 0.0002, + "loss": 0.5426, + "step": 6780 + }, + { + "epoch": 4.929219600725952, + "grad_norm": 1.2178640365600586, + "learning_rate": 0.0002, + "loss": 0.5295, + "step": 6790 + }, + { + "epoch": 4.936479128856624, + "grad_norm": 1.5369361639022827, + "learning_rate": 0.0002, + "loss": 0.5476, + "step": 6800 + }, + { + "epoch": 4.943738656987296, + "grad_norm": 1.1188377141952515, + "learning_rate": 0.0002, + "loss": 0.5358, + "step": 6810 + }, + { + "epoch": 4.950998185117967, + "grad_norm": 1.2506113052368164, + "learning_rate": 0.0002, + "loss": 0.5483, + "step": 6820 + }, + { + "epoch": 4.9582577132486385, + "grad_norm": 0.8776047825813293, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 6830 + }, + { + "epoch": 4.9655172413793105, + "grad_norm": 0.9700555205345154, + "learning_rate": 0.0002, + "loss": 0.5764, + "step": 6840 + }, + { + "epoch": 4.972776769509982, + "grad_norm": 1.2713534832000732, + "learning_rate": 0.0002, + "loss": 0.5396, + "step": 6850 + }, + { + "epoch": 4.980036297640654, + "grad_norm": 0.9855955243110657, + "learning_rate": 0.0002, + "loss": 0.5451, + "step": 6860 + }, + { + "epoch": 4.987295825771325, + "grad_norm": 0.8734853863716125, + "learning_rate": 0.0002, + "loss": 0.5884, + "step": 6870 + }, + { + "epoch": 4.994555353901997, + "grad_norm": 0.8065403699874878, + "learning_rate": 0.0002, + "loss": 0.5189, + "step": 6880 + }, + { + "epoch": 4.999637023593467, + "eval_loss": 1.3302682638168335, + "eval_runtime": 46.2496, + "eval_samples_per_second": 9.427, + "eval_steps_per_second": 1.189, + "step": 6887 + } + ], + "logging_steps": 10, + "max_steps": 11016, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.53767798013952e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-6887/training_args.bin b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-6887/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..140bceabdec44fc473842cb9ace4a62a4dc9fe89 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-6887/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4abec5480aff960b5a7b03a48be1cdb1ae29ba8792469c451f5c2f269e4be93b +size 5560 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-8265/README.md b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-8265/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-8265/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-8265/adapter_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-8265/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-8265/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-8265/adapter_model.safetensors b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-8265/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..81a0479b82ca0455cfa8893b910606301b2664b2 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-8265/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f824b3a2fc6297fba39203f80f82c2375a85fa203c4cb7c7bd8aa052ebaf1169 +size 143153376 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-8265/optimizer.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-8265/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..eb7e77c552c41d69148dfb3f1cc9598c98a50c9a --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-8265/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:742dd61e226f0f53ee486c61d21a0b86f73e61db5c329460f2e81b9d4a3e9949 +size 72886650 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-8265/rng_state.pth b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-8265/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..78f4ce556e5485ee856bd0d1b9988dbbed1a951c --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-8265/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f815e186720e9ed960c1663c5ffe551ecf68125067fa5c89c0feb06e6c7a679 +size 14244 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-8265/scheduler.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-8265/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0cd3ff5a3b7962b8758e5d2affaa1b77d9549997 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-8265/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eacf0861f52de0c0eef50849d3c288c99c87afb2d949a9bcc3e67628c7cd429f +size 1064 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-8265/special_tokens_map.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-8265/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-8265/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-8265/tokenizer.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-8265/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-8265/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-8265/tokenizer.model b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-8265/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-8265/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-8265/tokenizer_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-8265/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-8265/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-8265/trainer_state.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-8265/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8ed0ed427b6385b871dfa427a1ca09547c14f461 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-8265/trainer_state.json @@ -0,0 +1,5863 @@ +{ + "best_metric": 1.1086540222167969, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755", + "epoch": 6.0, + "eval_steps": 10, + "global_step": 8265, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007259528130671506, + "grad_norm": 0.46010470390319824, + "learning_rate": 0.0002, + "loss": 2.4936, + "step": 10 + }, + { + "epoch": 0.014519056261343012, + "grad_norm": 0.5103179216384888, + "learning_rate": 0.0002, + "loss": 1.8442, + "step": 20 + }, + { + "epoch": 0.021778584392014518, + "grad_norm": 0.6282716989517212, + "learning_rate": 0.0002, + "loss": 1.559, + "step": 30 + }, + { + "epoch": 0.029038112522686024, + "grad_norm": 1.2480497360229492, + "learning_rate": 0.0002, + "loss": 1.3618, + "step": 40 + }, + { + "epoch": 0.036297640653357534, + "grad_norm": 0.4114855229854584, + "learning_rate": 0.0002, + "loss": 1.3874, + "step": 50 + }, + { + "epoch": 0.043557168784029036, + "grad_norm": 0.49482840299606323, + "learning_rate": 0.0002, + "loss": 1.2836, + "step": 60 + }, + { + "epoch": 0.050816696914700546, + "grad_norm": 0.4536272883415222, + "learning_rate": 0.0002, + "loss": 1.1728, + "step": 70 + }, + { + "epoch": 0.05807622504537205, + "grad_norm": 0.32328274846076965, + "learning_rate": 0.0002, + "loss": 1.1125, + "step": 80 + }, + { + "epoch": 0.06533575317604355, + "grad_norm": 0.40990468859672546, + "learning_rate": 0.0002, + "loss": 1.2447, + "step": 90 + }, + { + "epoch": 0.07259528130671507, + "grad_norm": 0.37273502349853516, + "learning_rate": 0.0002, + "loss": 1.2216, + "step": 100 + }, + { + "epoch": 0.07985480943738657, + "grad_norm": 0.3903871476650238, + "learning_rate": 0.0002, + "loss": 1.2259, + "step": 110 + }, + { + "epoch": 0.08711433756805807, + "grad_norm": 0.3272787928581238, + "learning_rate": 0.0002, + "loss": 1.3718, + "step": 120 + }, + { + "epoch": 0.09437386569872959, + "grad_norm": 0.3622824251651764, + "learning_rate": 0.0002, + "loss": 1.1832, + "step": 130 + }, + { + "epoch": 0.10163339382940109, + "grad_norm": 0.3503916561603546, + "learning_rate": 0.0002, + "loss": 1.2966, + "step": 140 + }, + { + "epoch": 0.1088929219600726, + "grad_norm": 0.32787832617759705, + "learning_rate": 0.0002, + "loss": 1.0913, + "step": 150 + }, + { + "epoch": 0.1161524500907441, + "grad_norm": 0.5822657942771912, + "learning_rate": 0.0002, + "loss": 1.2143, + "step": 160 + }, + { + "epoch": 0.12341197822141561, + "grad_norm": 0.28028249740600586, + "learning_rate": 0.0002, + "loss": 1.2091, + "step": 170 + }, + { + "epoch": 0.1306715063520871, + "grad_norm": 0.33602750301361084, + "learning_rate": 0.0002, + "loss": 1.1731, + "step": 180 + }, + { + "epoch": 0.13793103448275862, + "grad_norm": 0.29106274247169495, + "learning_rate": 0.0002, + "loss": 1.127, + "step": 190 + }, + { + "epoch": 0.14519056261343014, + "grad_norm": 0.38753363490104675, + "learning_rate": 0.0002, + "loss": 1.3261, + "step": 200 + }, + { + "epoch": 0.15245009074410162, + "grad_norm": 0.361009418964386, + "learning_rate": 0.0002, + "loss": 1.1536, + "step": 210 + }, + { + "epoch": 0.15970961887477314, + "grad_norm": 0.6743836402893066, + "learning_rate": 0.0002, + "loss": 1.1483, + "step": 220 + }, + { + "epoch": 0.16696914700544466, + "grad_norm": 0.3923613727092743, + "learning_rate": 0.0002, + "loss": 1.264, + "step": 230 + }, + { + "epoch": 0.17422867513611615, + "grad_norm": 0.2809699773788452, + "learning_rate": 0.0002, + "loss": 1.0437, + "step": 240 + }, + { + "epoch": 0.18148820326678766, + "grad_norm": 0.3631494641304016, + "learning_rate": 0.0002, + "loss": 1.1826, + "step": 250 + }, + { + "epoch": 0.18874773139745918, + "grad_norm": 0.24658171832561493, + "learning_rate": 0.0002, + "loss": 1.1584, + "step": 260 + }, + { + "epoch": 0.19600725952813067, + "grad_norm": 0.5780664682388306, + "learning_rate": 0.0002, + "loss": 1.0065, + "step": 270 + }, + { + "epoch": 0.20326678765880218, + "grad_norm": 0.3056720495223999, + "learning_rate": 0.0002, + "loss": 1.1251, + "step": 280 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 0.2857084572315216, + "learning_rate": 0.0002, + "loss": 1.1503, + "step": 290 + }, + { + "epoch": 0.2177858439201452, + "grad_norm": 0.6645345687866211, + "learning_rate": 0.0002, + "loss": 1.1215, + "step": 300 + }, + { + "epoch": 0.2250453720508167, + "grad_norm": 0.5966078639030457, + "learning_rate": 0.0002, + "loss": 1.127, + "step": 310 + }, + { + "epoch": 0.2323049001814882, + "grad_norm": 0.40937140583992004, + "learning_rate": 0.0002, + "loss": 1.1974, + "step": 320 + }, + { + "epoch": 0.2395644283121597, + "grad_norm": 0.5642806887626648, + "learning_rate": 0.0002, + "loss": 1.2385, + "step": 330 + }, + { + "epoch": 0.24682395644283123, + "grad_norm": 0.2750748097896576, + "learning_rate": 0.0002, + "loss": 1.1703, + "step": 340 + }, + { + "epoch": 0.2540834845735027, + "grad_norm": 0.34350234270095825, + "learning_rate": 0.0002, + "loss": 1.2598, + "step": 350 + }, + { + "epoch": 0.2613430127041742, + "grad_norm": 0.6767239570617676, + "learning_rate": 0.0002, + "loss": 1.1942, + "step": 360 + }, + { + "epoch": 0.26860254083484575, + "grad_norm": 0.31006959080696106, + "learning_rate": 0.0002, + "loss": 1.1436, + "step": 370 + }, + { + "epoch": 0.27586206896551724, + "grad_norm": 0.3825474679470062, + "learning_rate": 0.0002, + "loss": 1.2084, + "step": 380 + }, + { + "epoch": 0.2831215970961887, + "grad_norm": 0.30241551995277405, + "learning_rate": 0.0002, + "loss": 1.1523, + "step": 390 + }, + { + "epoch": 0.29038112522686027, + "grad_norm": 0.2962397336959839, + "learning_rate": 0.0002, + "loss": 1.1298, + "step": 400 + }, + { + "epoch": 0.29764065335753176, + "grad_norm": 0.2600369155406952, + "learning_rate": 0.0002, + "loss": 1.1299, + "step": 410 + }, + { + "epoch": 0.30490018148820325, + "grad_norm": 0.3675060272216797, + "learning_rate": 0.0002, + "loss": 1.1366, + "step": 420 + }, + { + "epoch": 0.3121597096188748, + "grad_norm": 0.3429498076438904, + "learning_rate": 0.0002, + "loss": 1.156, + "step": 430 + }, + { + "epoch": 0.3194192377495463, + "grad_norm": 0.34311825037002563, + "learning_rate": 0.0002, + "loss": 1.2741, + "step": 440 + }, + { + "epoch": 0.32667876588021777, + "grad_norm": 0.37872210144996643, + "learning_rate": 0.0002, + "loss": 1.3523, + "step": 450 + }, + { + "epoch": 0.3339382940108893, + "grad_norm": 0.33271121978759766, + "learning_rate": 0.0002, + "loss": 1.1365, + "step": 460 + }, + { + "epoch": 0.3411978221415608, + "grad_norm": 0.34605276584625244, + "learning_rate": 0.0002, + "loss": 1.1572, + "step": 470 + }, + { + "epoch": 0.3484573502722323, + "grad_norm": 0.41050562262535095, + "learning_rate": 0.0002, + "loss": 1.2755, + "step": 480 + }, + { + "epoch": 0.35571687840290384, + "grad_norm": 0.2066836953163147, + "learning_rate": 0.0002, + "loss": 1.075, + "step": 490 + }, + { + "epoch": 0.3629764065335753, + "grad_norm": 0.2859014868736267, + "learning_rate": 0.0002, + "loss": 1.2072, + "step": 500 + }, + { + "epoch": 0.3702359346642468, + "grad_norm": 0.28763777017593384, + "learning_rate": 0.0002, + "loss": 1.1435, + "step": 510 + }, + { + "epoch": 0.37749546279491836, + "grad_norm": 0.2730471193790436, + "learning_rate": 0.0002, + "loss": 1.1341, + "step": 520 + }, + { + "epoch": 0.38475499092558985, + "grad_norm": 0.3968936800956726, + "learning_rate": 0.0002, + "loss": 1.348, + "step": 530 + }, + { + "epoch": 0.39201451905626133, + "grad_norm": 0.3624701201915741, + "learning_rate": 0.0002, + "loss": 1.2363, + "step": 540 + }, + { + "epoch": 0.3992740471869328, + "grad_norm": 0.3303608298301697, + "learning_rate": 0.0002, + "loss": 1.1206, + "step": 550 + }, + { + "epoch": 0.40653357531760437, + "grad_norm": 0.33507466316223145, + "learning_rate": 0.0002, + "loss": 1.203, + "step": 560 + }, + { + "epoch": 0.41379310344827586, + "grad_norm": 0.3297670781612396, + "learning_rate": 0.0002, + "loss": 1.2294, + "step": 570 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 0.32334890961647034, + "learning_rate": 0.0002, + "loss": 1.2, + "step": 580 + }, + { + "epoch": 0.4283121597096189, + "grad_norm": 0.30281195044517517, + "learning_rate": 0.0002, + "loss": 1.112, + "step": 590 + }, + { + "epoch": 0.4355716878402904, + "grad_norm": 0.5900027751922607, + "learning_rate": 0.0002, + "loss": 1.1352, + "step": 600 + }, + { + "epoch": 0.44283121597096187, + "grad_norm": 0.28437477350234985, + "learning_rate": 0.0002, + "loss": 1.1575, + "step": 610 + }, + { + "epoch": 0.4500907441016334, + "grad_norm": 0.39601704478263855, + "learning_rate": 0.0002, + "loss": 1.2009, + "step": 620 + }, + { + "epoch": 0.4573502722323049, + "grad_norm": 0.41971510648727417, + "learning_rate": 0.0002, + "loss": 1.2302, + "step": 630 + }, + { + "epoch": 0.4646098003629764, + "grad_norm": 0.33814409375190735, + "learning_rate": 0.0002, + "loss": 1.236, + "step": 640 + }, + { + "epoch": 0.47186932849364793, + "grad_norm": 0.575718104839325, + "learning_rate": 0.0002, + "loss": 1.2471, + "step": 650 + }, + { + "epoch": 0.4791288566243194, + "grad_norm": 0.37927401065826416, + "learning_rate": 0.0002, + "loss": 1.1012, + "step": 660 + }, + { + "epoch": 0.4863883847549909, + "grad_norm": 0.3224332630634308, + "learning_rate": 0.0002, + "loss": 1.1552, + "step": 670 + }, + { + "epoch": 0.49364791288566245, + "grad_norm": 0.32683515548706055, + "learning_rate": 0.0002, + "loss": 1.1354, + "step": 680 + }, + { + "epoch": 0.5009074410163339, + "grad_norm": 0.4316163659095764, + "learning_rate": 0.0002, + "loss": 1.207, + "step": 690 + }, + { + "epoch": 0.5081669691470054, + "grad_norm": 0.342602401971817, + "learning_rate": 0.0002, + "loss": 1.2116, + "step": 700 + }, + { + "epoch": 0.515426497277677, + "grad_norm": 0.2794898748397827, + "learning_rate": 0.0002, + "loss": 1.0433, + "step": 710 + }, + { + "epoch": 0.5226860254083484, + "grad_norm": 0.3322339951992035, + "learning_rate": 0.0002, + "loss": 1.2395, + "step": 720 + }, + { + "epoch": 0.52994555353902, + "grad_norm": 0.3088509142398834, + "learning_rate": 0.0002, + "loss": 1.0762, + "step": 730 + }, + { + "epoch": 0.5372050816696915, + "grad_norm": 0.24444378912448883, + "learning_rate": 0.0002, + "loss": 1.1495, + "step": 740 + }, + { + "epoch": 0.5444646098003629, + "grad_norm": 0.3483171761035919, + "learning_rate": 0.0002, + "loss": 1.2176, + "step": 750 + }, + { + "epoch": 0.5517241379310345, + "grad_norm": 0.3425690531730652, + "learning_rate": 0.0002, + "loss": 1.2248, + "step": 760 + }, + { + "epoch": 0.558983666061706, + "grad_norm": 0.31841927766799927, + "learning_rate": 0.0002, + "loss": 1.1127, + "step": 770 + }, + { + "epoch": 0.5662431941923775, + "grad_norm": 0.39423868060112, + "learning_rate": 0.0002, + "loss": 1.1841, + "step": 780 + }, + { + "epoch": 0.573502722323049, + "grad_norm": 0.30328479409217834, + "learning_rate": 0.0002, + "loss": 1.0583, + "step": 790 + }, + { + "epoch": 0.5807622504537205, + "grad_norm": 0.24475938081741333, + "learning_rate": 0.0002, + "loss": 1.0718, + "step": 800 + }, + { + "epoch": 0.588021778584392, + "grad_norm": 0.37132805585861206, + "learning_rate": 0.0002, + "loss": 1.2006, + "step": 810 + }, + { + "epoch": 0.5952813067150635, + "grad_norm": 0.32195979356765747, + "learning_rate": 0.0002, + "loss": 1.1544, + "step": 820 + }, + { + "epoch": 0.6025408348457351, + "grad_norm": 0.2848738729953766, + "learning_rate": 0.0002, + "loss": 0.9937, + "step": 830 + }, + { + "epoch": 0.6098003629764065, + "grad_norm": 0.28015264868736267, + "learning_rate": 0.0002, + "loss": 1.0937, + "step": 840 + }, + { + "epoch": 0.617059891107078, + "grad_norm": 0.37796008586883545, + "learning_rate": 0.0002, + "loss": 1.1234, + "step": 850 + }, + { + "epoch": 0.6243194192377496, + "grad_norm": 0.39311841130256653, + "learning_rate": 0.0002, + "loss": 1.1455, + "step": 860 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 0.2761685252189636, + "learning_rate": 0.0002, + "loss": 1.1066, + "step": 870 + }, + { + "epoch": 0.6388384754990926, + "grad_norm": 0.3826720118522644, + "learning_rate": 0.0002, + "loss": 1.0859, + "step": 880 + }, + { + "epoch": 0.6460980036297641, + "grad_norm": 0.30076679587364197, + "learning_rate": 0.0002, + "loss": 1.1182, + "step": 890 + }, + { + "epoch": 0.6533575317604355, + "grad_norm": 0.21997687220573425, + "learning_rate": 0.0002, + "loss": 1.0927, + "step": 900 + }, + { + "epoch": 0.6606170598911071, + "grad_norm": 0.32593777775764465, + "learning_rate": 0.0002, + "loss": 1.1603, + "step": 910 + }, + { + "epoch": 0.6678765880217786, + "grad_norm": 0.30347898602485657, + "learning_rate": 0.0002, + "loss": 1.1578, + "step": 920 + }, + { + "epoch": 0.6751361161524501, + "grad_norm": 0.44173774123191833, + "learning_rate": 0.0002, + "loss": 1.1642, + "step": 930 + }, + { + "epoch": 0.6823956442831216, + "grad_norm": 0.2507467269897461, + "learning_rate": 0.0002, + "loss": 1.1832, + "step": 940 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 0.29463833570480347, + "learning_rate": 0.0002, + "loss": 1.3001, + "step": 950 + }, + { + "epoch": 0.6969147005444646, + "grad_norm": 0.9363154172897339, + "learning_rate": 0.0002, + "loss": 1.1651, + "step": 960 + }, + { + "epoch": 0.7041742286751361, + "grad_norm": 0.3236212134361267, + "learning_rate": 0.0002, + "loss": 1.158, + "step": 970 + }, + { + "epoch": 0.7114337568058077, + "grad_norm": 0.3123254179954529, + "learning_rate": 0.0002, + "loss": 1.1078, + "step": 980 + }, + { + "epoch": 0.7186932849364791, + "grad_norm": 0.3395805060863495, + "learning_rate": 0.0002, + "loss": 1.1578, + "step": 990 + }, + { + "epoch": 0.7259528130671506, + "grad_norm": 0.3240964412689209, + "learning_rate": 0.0002, + "loss": 1.0754, + "step": 1000 + }, + { + "epoch": 0.7332123411978222, + "grad_norm": 0.31902948021888733, + "learning_rate": 0.0002, + "loss": 1.0313, + "step": 1010 + }, + { + "epoch": 0.7404718693284936, + "grad_norm": 0.4848408102989197, + "learning_rate": 0.0002, + "loss": 1.1664, + "step": 1020 + }, + { + "epoch": 0.7477313974591652, + "grad_norm": 0.33006033301353455, + "learning_rate": 0.0002, + "loss": 1.0838, + "step": 1030 + }, + { + "epoch": 0.7549909255898367, + "grad_norm": 0.2928730547428131, + "learning_rate": 0.0002, + "loss": 1.0979, + "step": 1040 + }, + { + "epoch": 0.7622504537205081, + "grad_norm": 0.3529164791107178, + "learning_rate": 0.0002, + "loss": 1.1351, + "step": 1050 + }, + { + "epoch": 0.7695099818511797, + "grad_norm": 0.2736213803291321, + "learning_rate": 0.0002, + "loss": 1.1274, + "step": 1060 + }, + { + "epoch": 0.7767695099818511, + "grad_norm": 0.7200686931610107, + "learning_rate": 0.0002, + "loss": 1.1169, + "step": 1070 + }, + { + "epoch": 0.7840290381125227, + "grad_norm": 0.33396708965301514, + "learning_rate": 0.0002, + "loss": 1.1171, + "step": 1080 + }, + { + "epoch": 0.7912885662431942, + "grad_norm": 1.5760449171066284, + "learning_rate": 0.0002, + "loss": 1.2447, + "step": 1090 + }, + { + "epoch": 0.7985480943738656, + "grad_norm": 0.28138381242752075, + "learning_rate": 0.0002, + "loss": 1.1126, + "step": 1100 + }, + { + "epoch": 0.8058076225045372, + "grad_norm": 0.2597472369670868, + "learning_rate": 0.0002, + "loss": 1.2012, + "step": 1110 + }, + { + "epoch": 0.8130671506352087, + "grad_norm": 0.3305445611476898, + "learning_rate": 0.0002, + "loss": 1.1177, + "step": 1120 + }, + { + "epoch": 0.8203266787658802, + "grad_norm": 0.3934599459171295, + "learning_rate": 0.0002, + "loss": 1.0849, + "step": 1130 + }, + { + "epoch": 0.8275862068965517, + "grad_norm": 0.3472191393375397, + "learning_rate": 0.0002, + "loss": 1.0889, + "step": 1140 + }, + { + "epoch": 0.8348457350272233, + "grad_norm": 0.2857365906238556, + "learning_rate": 0.0002, + "loss": 1.265, + "step": 1150 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.3207702934741974, + "learning_rate": 0.0002, + "loss": 1.03, + "step": 1160 + }, + { + "epoch": 0.8493647912885662, + "grad_norm": 0.3176484704017639, + "learning_rate": 0.0002, + "loss": 1.1669, + "step": 1170 + }, + { + "epoch": 0.8566243194192378, + "grad_norm": 0.40685558319091797, + "learning_rate": 0.0002, + "loss": 1.1386, + "step": 1180 + }, + { + "epoch": 0.8638838475499092, + "grad_norm": 0.31125199794769287, + "learning_rate": 0.0002, + "loss": 1.1383, + "step": 1190 + }, + { + "epoch": 0.8711433756805808, + "grad_norm": 0.7361181378364563, + "learning_rate": 0.0002, + "loss": 1.1512, + "step": 1200 + }, + { + "epoch": 0.8784029038112523, + "grad_norm": 0.33699527382850647, + "learning_rate": 0.0002, + "loss": 0.9983, + "step": 1210 + }, + { + "epoch": 0.8856624319419237, + "grad_norm": 0.3315220773220062, + "learning_rate": 0.0002, + "loss": 1.11, + "step": 1220 + }, + { + "epoch": 0.8929219600725953, + "grad_norm": 0.6256054043769836, + "learning_rate": 0.0002, + "loss": 1.0266, + "step": 1230 + }, + { + "epoch": 0.9001814882032668, + "grad_norm": 0.3692137897014618, + "learning_rate": 0.0002, + "loss": 1.1738, + "step": 1240 + }, + { + "epoch": 0.9074410163339383, + "grad_norm": 0.3538484573364258, + "learning_rate": 0.0002, + "loss": 1.1112, + "step": 1250 + }, + { + "epoch": 0.9147005444646098, + "grad_norm": 0.27863040566444397, + "learning_rate": 0.0002, + "loss": 1.0878, + "step": 1260 + }, + { + "epoch": 0.9219600725952813, + "grad_norm": 0.3322528302669525, + "learning_rate": 0.0002, + "loss": 1.0826, + "step": 1270 + }, + { + "epoch": 0.9292196007259528, + "grad_norm": 0.7553173303604126, + "learning_rate": 0.0002, + "loss": 1.0158, + "step": 1280 + }, + { + "epoch": 0.9364791288566243, + "grad_norm": 0.4856191575527191, + "learning_rate": 0.0002, + "loss": 1.198, + "step": 1290 + }, + { + "epoch": 0.9437386569872959, + "grad_norm": 0.3668074905872345, + "learning_rate": 0.0002, + "loss": 1.1678, + "step": 1300 + }, + { + "epoch": 0.9509981851179673, + "grad_norm": 0.29851067066192627, + "learning_rate": 0.0002, + "loss": 1.1809, + "step": 1310 + }, + { + "epoch": 0.9582577132486388, + "grad_norm": 0.276664674282074, + "learning_rate": 0.0002, + "loss": 1.1299, + "step": 1320 + }, + { + "epoch": 0.9655172413793104, + "grad_norm": 0.2941018342971802, + "learning_rate": 0.0002, + "loss": 1.0597, + "step": 1330 + }, + { + "epoch": 0.9727767695099818, + "grad_norm": 0.3505859076976776, + "learning_rate": 0.0002, + "loss": 1.1731, + "step": 1340 + }, + { + "epoch": 0.9800362976406534, + "grad_norm": 0.3067687451839447, + "learning_rate": 0.0002, + "loss": 1.1809, + "step": 1350 + }, + { + "epoch": 0.9872958257713249, + "grad_norm": 0.27151066064834595, + "learning_rate": 0.0002, + "loss": 1.1511, + "step": 1360 + }, + { + "epoch": 0.9945553539019963, + "grad_norm": 0.36370083689689636, + "learning_rate": 0.0002, + "loss": 1.1919, + "step": 1370 + }, + { + "epoch": 0.9996370235934664, + "eval_loss": 1.1381088495254517, + "eval_runtime": 96.0848, + "eval_samples_per_second": 4.538, + "eval_steps_per_second": 0.572, + "step": 1377 + }, + { + "epoch": 1.0018148820326678, + "grad_norm": 0.27980583906173706, + "learning_rate": 0.0002, + "loss": 1.0082, + "step": 1380 + }, + { + "epoch": 1.0090744101633393, + "grad_norm": 0.26713913679122925, + "learning_rate": 0.0002, + "loss": 1.0565, + "step": 1390 + }, + { + "epoch": 1.0163339382940109, + "grad_norm": 0.3089541494846344, + "learning_rate": 0.0002, + "loss": 1.0241, + "step": 1400 + }, + { + "epoch": 1.0235934664246824, + "grad_norm": 0.4188242256641388, + "learning_rate": 0.0002, + "loss": 0.9649, + "step": 1410 + }, + { + "epoch": 1.030852994555354, + "grad_norm": 0.5246463418006897, + "learning_rate": 0.0002, + "loss": 0.9821, + "step": 1420 + }, + { + "epoch": 1.0381125226860255, + "grad_norm": 0.2728777825832367, + "learning_rate": 0.0002, + "loss": 1.0143, + "step": 1430 + }, + { + "epoch": 1.0453720508166968, + "grad_norm": 0.38167616724967957, + "learning_rate": 0.0002, + "loss": 1.075, + "step": 1440 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 0.4439380168914795, + "learning_rate": 0.0002, + "loss": 1.0229, + "step": 1450 + }, + { + "epoch": 1.05989110707804, + "grad_norm": 0.30954182147979736, + "learning_rate": 0.0002, + "loss": 1.0451, + "step": 1460 + }, + { + "epoch": 1.0671506352087115, + "grad_norm": 0.4022280275821686, + "learning_rate": 0.0002, + "loss": 1.0762, + "step": 1470 + }, + { + "epoch": 1.074410163339383, + "grad_norm": 0.7390811443328857, + "learning_rate": 0.0002, + "loss": 1.0754, + "step": 1480 + }, + { + "epoch": 1.0816696914700545, + "grad_norm": 0.3885486423969269, + "learning_rate": 0.0002, + "loss": 1.1378, + "step": 1490 + }, + { + "epoch": 1.0889292196007259, + "grad_norm": 0.5275560617446899, + "learning_rate": 0.0002, + "loss": 1.005, + "step": 1500 + }, + { + "epoch": 1.0961887477313974, + "grad_norm": 0.35112282633781433, + "learning_rate": 0.0002, + "loss": 1.0147, + "step": 1510 + }, + { + "epoch": 1.103448275862069, + "grad_norm": 0.33714351058006287, + "learning_rate": 0.0002, + "loss": 1.0331, + "step": 1520 + }, + { + "epoch": 1.1107078039927405, + "grad_norm": 0.31221693754196167, + "learning_rate": 0.0002, + "loss": 0.8723, + "step": 1530 + }, + { + "epoch": 1.117967332123412, + "grad_norm": 0.27549654245376587, + "learning_rate": 0.0002, + "loss": 1.0701, + "step": 1540 + }, + { + "epoch": 1.1252268602540836, + "grad_norm": 0.8465521335601807, + "learning_rate": 0.0002, + "loss": 1.0114, + "step": 1550 + }, + { + "epoch": 1.132486388384755, + "grad_norm": 0.36125949025154114, + "learning_rate": 0.0002, + "loss": 0.9643, + "step": 1560 + }, + { + "epoch": 1.1397459165154264, + "grad_norm": 0.37420371174812317, + "learning_rate": 0.0002, + "loss": 1.0744, + "step": 1570 + }, + { + "epoch": 1.147005444646098, + "grad_norm": 0.3294760584831238, + "learning_rate": 0.0002, + "loss": 1.165, + "step": 1580 + }, + { + "epoch": 1.1542649727767695, + "grad_norm": 0.3881238102912903, + "learning_rate": 0.0002, + "loss": 0.9207, + "step": 1590 + }, + { + "epoch": 1.161524500907441, + "grad_norm": 0.4766491651535034, + "learning_rate": 0.0002, + "loss": 1.0983, + "step": 1600 + }, + { + "epoch": 1.1687840290381124, + "grad_norm": 0.395530104637146, + "learning_rate": 0.0002, + "loss": 1.0092, + "step": 1610 + }, + { + "epoch": 1.176043557168784, + "grad_norm": 0.3297106623649597, + "learning_rate": 0.0002, + "loss": 0.9895, + "step": 1620 + }, + { + "epoch": 1.1833030852994555, + "grad_norm": 0.39528271555900574, + "learning_rate": 0.0002, + "loss": 1.1121, + "step": 1630 + }, + { + "epoch": 1.190562613430127, + "grad_norm": 0.3370221257209778, + "learning_rate": 0.0002, + "loss": 0.9202, + "step": 1640 + }, + { + "epoch": 1.1978221415607986, + "grad_norm": 0.31922030448913574, + "learning_rate": 0.0002, + "loss": 1.102, + "step": 1650 + }, + { + "epoch": 1.2050816696914701, + "grad_norm": 0.6142027378082275, + "learning_rate": 0.0002, + "loss": 1.0565, + "step": 1660 + }, + { + "epoch": 1.2123411978221417, + "grad_norm": 0.44769710302352905, + "learning_rate": 0.0002, + "loss": 0.9541, + "step": 1670 + }, + { + "epoch": 1.219600725952813, + "grad_norm": 0.41233646869659424, + "learning_rate": 0.0002, + "loss": 1.2501, + "step": 1680 + }, + { + "epoch": 1.2268602540834845, + "grad_norm": 0.2928866147994995, + "learning_rate": 0.0002, + "loss": 1.0147, + "step": 1690 + }, + { + "epoch": 1.234119782214156, + "grad_norm": 0.36913734674453735, + "learning_rate": 0.0002, + "loss": 0.9074, + "step": 1700 + }, + { + "epoch": 1.2413793103448276, + "grad_norm": 0.5281891226768494, + "learning_rate": 0.0002, + "loss": 1.0847, + "step": 1710 + }, + { + "epoch": 1.2486388384754992, + "grad_norm": 0.3374697268009186, + "learning_rate": 0.0002, + "loss": 0.9482, + "step": 1720 + }, + { + "epoch": 1.2558983666061705, + "grad_norm": 0.3802020847797394, + "learning_rate": 0.0002, + "loss": 0.9826, + "step": 1730 + }, + { + "epoch": 1.263157894736842, + "grad_norm": 0.38048651814460754, + "learning_rate": 0.0002, + "loss": 1.0191, + "step": 1740 + }, + { + "epoch": 1.2704174228675136, + "grad_norm": 0.6676169633865356, + "learning_rate": 0.0002, + "loss": 1.0142, + "step": 1750 + }, + { + "epoch": 1.2776769509981851, + "grad_norm": 0.4075961410999298, + "learning_rate": 0.0002, + "loss": 0.992, + "step": 1760 + }, + { + "epoch": 1.2849364791288567, + "grad_norm": 0.4374721348285675, + "learning_rate": 0.0002, + "loss": 1.0301, + "step": 1770 + }, + { + "epoch": 1.2921960072595282, + "grad_norm": 0.4638824164867401, + "learning_rate": 0.0002, + "loss": 1.0601, + "step": 1780 + }, + { + "epoch": 1.2994555353901998, + "grad_norm": 0.38631564378738403, + "learning_rate": 0.0002, + "loss": 0.9593, + "step": 1790 + }, + { + "epoch": 1.306715063520871, + "grad_norm": 0.35873809456825256, + "learning_rate": 0.0002, + "loss": 1.0042, + "step": 1800 + }, + { + "epoch": 1.3139745916515426, + "grad_norm": 0.33602237701416016, + "learning_rate": 0.0002, + "loss": 0.9835, + "step": 1810 + }, + { + "epoch": 1.3212341197822142, + "grad_norm": 0.46696463227272034, + "learning_rate": 0.0002, + "loss": 1.1032, + "step": 1820 + }, + { + "epoch": 1.3284936479128857, + "grad_norm": 0.368958979845047, + "learning_rate": 0.0002, + "loss": 1.0123, + "step": 1830 + }, + { + "epoch": 1.335753176043557, + "grad_norm": 0.3160957396030426, + "learning_rate": 0.0002, + "loss": 1.0456, + "step": 1840 + }, + { + "epoch": 1.3430127041742286, + "grad_norm": 0.4511511027812958, + "learning_rate": 0.0002, + "loss": 1.0264, + "step": 1850 + }, + { + "epoch": 1.3502722323049001, + "grad_norm": 0.6769845485687256, + "learning_rate": 0.0002, + "loss": 1.0179, + "step": 1860 + }, + { + "epoch": 1.3575317604355717, + "grad_norm": 0.3749309480190277, + "learning_rate": 0.0002, + "loss": 1.011, + "step": 1870 + }, + { + "epoch": 1.3647912885662432, + "grad_norm": 0.553798496723175, + "learning_rate": 0.0002, + "loss": 1.0223, + "step": 1880 + }, + { + "epoch": 1.3720508166969148, + "grad_norm": 0.3538985550403595, + "learning_rate": 0.0002, + "loss": 0.9634, + "step": 1890 + }, + { + "epoch": 1.3793103448275863, + "grad_norm": 0.5501534938812256, + "learning_rate": 0.0002, + "loss": 1.0275, + "step": 1900 + }, + { + "epoch": 1.3865698729582578, + "grad_norm": 0.4432051181793213, + "learning_rate": 0.0002, + "loss": 1.0461, + "step": 1910 + }, + { + "epoch": 1.3938294010889292, + "grad_norm": 0.41755786538124084, + "learning_rate": 0.0002, + "loss": 0.9082, + "step": 1920 + }, + { + "epoch": 1.4010889292196007, + "grad_norm": 0.5626114010810852, + "learning_rate": 0.0002, + "loss": 1.0858, + "step": 1930 + }, + { + "epoch": 1.4083484573502723, + "grad_norm": 0.44038185477256775, + "learning_rate": 0.0002, + "loss": 1.0687, + "step": 1940 + }, + { + "epoch": 1.4156079854809438, + "grad_norm": 0.3397001624107361, + "learning_rate": 0.0002, + "loss": 0.9454, + "step": 1950 + }, + { + "epoch": 1.4228675136116151, + "grad_norm": 0.4325368106365204, + "learning_rate": 0.0002, + "loss": 0.943, + "step": 1960 + }, + { + "epoch": 1.4301270417422867, + "grad_norm": 0.3900907039642334, + "learning_rate": 0.0002, + "loss": 1.0488, + "step": 1970 + }, + { + "epoch": 1.4373865698729582, + "grad_norm": 0.369612455368042, + "learning_rate": 0.0002, + "loss": 0.9699, + "step": 1980 + }, + { + "epoch": 1.4446460980036298, + "grad_norm": 0.4389338791370392, + "learning_rate": 0.0002, + "loss": 1.0609, + "step": 1990 + }, + { + "epoch": 1.4519056261343013, + "grad_norm": 1.694450855255127, + "learning_rate": 0.0002, + "loss": 1.042, + "step": 2000 + }, + { + "epoch": 1.4591651542649728, + "grad_norm": 0.516957700252533, + "learning_rate": 0.0002, + "loss": 0.997, + "step": 2010 + }, + { + "epoch": 1.4664246823956444, + "grad_norm": 0.45515501499176025, + "learning_rate": 0.0002, + "loss": 1.0979, + "step": 2020 + }, + { + "epoch": 1.4736842105263157, + "grad_norm": 0.4153139591217041, + "learning_rate": 0.0002, + "loss": 1.0188, + "step": 2030 + }, + { + "epoch": 1.4809437386569873, + "grad_norm": 0.44353052973747253, + "learning_rate": 0.0002, + "loss": 1.1521, + "step": 2040 + }, + { + "epoch": 1.4882032667876588, + "grad_norm": 0.570554256439209, + "learning_rate": 0.0002, + "loss": 0.9653, + "step": 2050 + }, + { + "epoch": 1.4954627949183303, + "grad_norm": 0.5742740035057068, + "learning_rate": 0.0002, + "loss": 1.0352, + "step": 2060 + }, + { + "epoch": 1.5027223230490017, + "grad_norm": 0.5890517830848694, + "learning_rate": 0.0002, + "loss": 1.0963, + "step": 2070 + }, + { + "epoch": 1.5099818511796732, + "grad_norm": 0.4162650406360626, + "learning_rate": 0.0002, + "loss": 1.0639, + "step": 2080 + }, + { + "epoch": 1.5172413793103448, + "grad_norm": 0.7334717512130737, + "learning_rate": 0.0002, + "loss": 1.071, + "step": 2090 + }, + { + "epoch": 1.5245009074410163, + "grad_norm": 0.2963249683380127, + "learning_rate": 0.0002, + "loss": 0.8957, + "step": 2100 + }, + { + "epoch": 1.5317604355716878, + "grad_norm": 0.30676454305648804, + "learning_rate": 0.0002, + "loss": 1.0446, + "step": 2110 + }, + { + "epoch": 1.5390199637023594, + "grad_norm": 0.35984641313552856, + "learning_rate": 0.0002, + "loss": 0.9967, + "step": 2120 + }, + { + "epoch": 1.546279491833031, + "grad_norm": 0.3384549617767334, + "learning_rate": 0.0002, + "loss": 1.0352, + "step": 2130 + }, + { + "epoch": 1.5535390199637025, + "grad_norm": 0.4725518226623535, + "learning_rate": 0.0002, + "loss": 0.9874, + "step": 2140 + }, + { + "epoch": 1.560798548094374, + "grad_norm": 0.3252685070037842, + "learning_rate": 0.0002, + "loss": 1.1166, + "step": 2150 + }, + { + "epoch": 1.5680580762250453, + "grad_norm": 0.45043081045150757, + "learning_rate": 0.0002, + "loss": 0.9127, + "step": 2160 + }, + { + "epoch": 1.5753176043557169, + "grad_norm": 0.374208003282547, + "learning_rate": 0.0002, + "loss": 1.0767, + "step": 2170 + }, + { + "epoch": 1.5825771324863884, + "grad_norm": 0.5118404030799866, + "learning_rate": 0.0002, + "loss": 1.0255, + "step": 2180 + }, + { + "epoch": 1.5898366606170597, + "grad_norm": 0.482170969247818, + "learning_rate": 0.0002, + "loss": 1.0062, + "step": 2190 + }, + { + "epoch": 1.5970961887477313, + "grad_norm": 0.5337533950805664, + "learning_rate": 0.0002, + "loss": 1.0872, + "step": 2200 + }, + { + "epoch": 1.6043557168784028, + "grad_norm": 0.5195064544677734, + "learning_rate": 0.0002, + "loss": 1.0405, + "step": 2210 + }, + { + "epoch": 1.6116152450090744, + "grad_norm": 0.30807098746299744, + "learning_rate": 0.0002, + "loss": 1.0454, + "step": 2220 + }, + { + "epoch": 1.618874773139746, + "grad_norm": 0.3962925672531128, + "learning_rate": 0.0002, + "loss": 1.0293, + "step": 2230 + }, + { + "epoch": 1.6261343012704175, + "grad_norm": 0.7636962532997131, + "learning_rate": 0.0002, + "loss": 1.0137, + "step": 2240 + }, + { + "epoch": 1.633393829401089, + "grad_norm": 0.32380592823028564, + "learning_rate": 0.0002, + "loss": 0.991, + "step": 2250 + }, + { + "epoch": 1.6406533575317606, + "grad_norm": 0.5767741799354553, + "learning_rate": 0.0002, + "loss": 0.9471, + "step": 2260 + }, + { + "epoch": 1.647912885662432, + "grad_norm": 0.39964812994003296, + "learning_rate": 0.0002, + "loss": 0.9524, + "step": 2270 + }, + { + "epoch": 1.6551724137931034, + "grad_norm": 0.622629702091217, + "learning_rate": 0.0002, + "loss": 0.9866, + "step": 2280 + }, + { + "epoch": 1.662431941923775, + "grad_norm": 0.40202152729034424, + "learning_rate": 0.0002, + "loss": 0.9609, + "step": 2290 + }, + { + "epoch": 1.6696914700544465, + "grad_norm": 0.4467349052429199, + "learning_rate": 0.0002, + "loss": 0.9859, + "step": 2300 + }, + { + "epoch": 1.6769509981851178, + "grad_norm": 0.5026949048042297, + "learning_rate": 0.0002, + "loss": 1.0312, + "step": 2310 + }, + { + "epoch": 1.6842105263157894, + "grad_norm": 0.43754541873931885, + "learning_rate": 0.0002, + "loss": 1.0776, + "step": 2320 + }, + { + "epoch": 1.691470054446461, + "grad_norm": 0.42869430780410767, + "learning_rate": 0.0002, + "loss": 0.9786, + "step": 2330 + }, + { + "epoch": 1.6987295825771325, + "grad_norm": 0.4192679524421692, + "learning_rate": 0.0002, + "loss": 1.0747, + "step": 2340 + }, + { + "epoch": 1.705989110707804, + "grad_norm": 0.3243155777454376, + "learning_rate": 0.0002, + "loss": 1.0054, + "step": 2350 + }, + { + "epoch": 1.7132486388384756, + "grad_norm": 1.0514075756072998, + "learning_rate": 0.0002, + "loss": 0.9165, + "step": 2360 + }, + { + "epoch": 1.720508166969147, + "grad_norm": 0.4719122648239136, + "learning_rate": 0.0002, + "loss": 1.1353, + "step": 2370 + }, + { + "epoch": 1.7277676950998186, + "grad_norm": 0.3846144676208496, + "learning_rate": 0.0002, + "loss": 0.9913, + "step": 2380 + }, + { + "epoch": 1.73502722323049, + "grad_norm": 0.7266581058502197, + "learning_rate": 0.0002, + "loss": 0.9935, + "step": 2390 + }, + { + "epoch": 1.7422867513611615, + "grad_norm": 0.6183241605758667, + "learning_rate": 0.0002, + "loss": 0.9509, + "step": 2400 + }, + { + "epoch": 1.749546279491833, + "grad_norm": 0.3658260405063629, + "learning_rate": 0.0002, + "loss": 0.9713, + "step": 2410 + }, + { + "epoch": 1.7568058076225044, + "grad_norm": 0.6036322712898254, + "learning_rate": 0.0002, + "loss": 1.0033, + "step": 2420 + }, + { + "epoch": 1.764065335753176, + "grad_norm": 0.7872936129570007, + "learning_rate": 0.0002, + "loss": 1.0526, + "step": 2430 + }, + { + "epoch": 1.7713248638838475, + "grad_norm": 0.35946124792099, + "learning_rate": 0.0002, + "loss": 0.99, + "step": 2440 + }, + { + "epoch": 1.778584392014519, + "grad_norm": 0.3740338981151581, + "learning_rate": 0.0002, + "loss": 0.8845, + "step": 2450 + }, + { + "epoch": 1.7858439201451906, + "grad_norm": 0.6150230169296265, + "learning_rate": 0.0002, + "loss": 0.9333, + "step": 2460 + }, + { + "epoch": 1.793103448275862, + "grad_norm": 0.4726075530052185, + "learning_rate": 0.0002, + "loss": 1.0373, + "step": 2470 + }, + { + "epoch": 1.8003629764065336, + "grad_norm": 0.31292253732681274, + "learning_rate": 0.0002, + "loss": 0.974, + "step": 2480 + }, + { + "epoch": 1.8076225045372052, + "grad_norm": 0.4463104009628296, + "learning_rate": 0.0002, + "loss": 1.0205, + "step": 2490 + }, + { + "epoch": 1.8148820326678767, + "grad_norm": 0.7848200798034668, + "learning_rate": 0.0002, + "loss": 1.0771, + "step": 2500 + }, + { + "epoch": 1.822141560798548, + "grad_norm": 0.5562082529067993, + "learning_rate": 0.0002, + "loss": 0.9508, + "step": 2510 + }, + { + "epoch": 1.8294010889292196, + "grad_norm": 0.39892756938934326, + "learning_rate": 0.0002, + "loss": 0.9343, + "step": 2520 + }, + { + "epoch": 1.8366606170598911, + "grad_norm": 1.2923320531845093, + "learning_rate": 0.0002, + "loss": 1.0489, + "step": 2530 + }, + { + "epoch": 1.8439201451905625, + "grad_norm": 0.6316490769386292, + "learning_rate": 0.0002, + "loss": 1.1203, + "step": 2540 + }, + { + "epoch": 1.851179673321234, + "grad_norm": 0.46100497245788574, + "learning_rate": 0.0002, + "loss": 0.9131, + "step": 2550 + }, + { + "epoch": 1.8584392014519056, + "grad_norm": 0.7902987003326416, + "learning_rate": 0.0002, + "loss": 1.0977, + "step": 2560 + }, + { + "epoch": 1.865698729582577, + "grad_norm": 0.4596365690231323, + "learning_rate": 0.0002, + "loss": 0.9702, + "step": 2570 + }, + { + "epoch": 1.8729582577132486, + "grad_norm": 0.6592172384262085, + "learning_rate": 0.0002, + "loss": 0.9322, + "step": 2580 + }, + { + "epoch": 1.8802177858439202, + "grad_norm": 0.5764662027359009, + "learning_rate": 0.0002, + "loss": 0.9549, + "step": 2590 + }, + { + "epoch": 1.8874773139745917, + "grad_norm": 0.8421637415885925, + "learning_rate": 0.0002, + "loss": 0.9485, + "step": 2600 + }, + { + "epoch": 1.8947368421052633, + "grad_norm": 0.5635305047035217, + "learning_rate": 0.0002, + "loss": 1.012, + "step": 2610 + }, + { + "epoch": 1.9019963702359348, + "grad_norm": 0.46231237053871155, + "learning_rate": 0.0002, + "loss": 0.8907, + "step": 2620 + }, + { + "epoch": 1.9092558983666061, + "grad_norm": 0.3944607079029083, + "learning_rate": 0.0002, + "loss": 0.9543, + "step": 2630 + }, + { + "epoch": 1.9165154264972777, + "grad_norm": 0.4753907322883606, + "learning_rate": 0.0002, + "loss": 0.9964, + "step": 2640 + }, + { + "epoch": 1.9237749546279492, + "grad_norm": 0.4151090979576111, + "learning_rate": 0.0002, + "loss": 1.0217, + "step": 2650 + }, + { + "epoch": 1.9310344827586206, + "grad_norm": 0.6793725490570068, + "learning_rate": 0.0002, + "loss": 1.0397, + "step": 2660 + }, + { + "epoch": 1.938294010889292, + "grad_norm": 0.339755117893219, + "learning_rate": 0.0002, + "loss": 1.0401, + "step": 2670 + }, + { + "epoch": 1.9455535390199636, + "grad_norm": 0.40789374709129333, + "learning_rate": 0.0002, + "loss": 1.082, + "step": 2680 + }, + { + "epoch": 1.9528130671506352, + "grad_norm": 0.3750005066394806, + "learning_rate": 0.0002, + "loss": 0.9973, + "step": 2690 + }, + { + "epoch": 1.9600725952813067, + "grad_norm": 0.39684441685676575, + "learning_rate": 0.0002, + "loss": 0.9837, + "step": 2700 + }, + { + "epoch": 1.9673321234119783, + "grad_norm": 0.378287672996521, + "learning_rate": 0.0002, + "loss": 1.1204, + "step": 2710 + }, + { + "epoch": 1.9745916515426498, + "grad_norm": 0.3668482005596161, + "learning_rate": 0.0002, + "loss": 0.9284, + "step": 2720 + }, + { + "epoch": 1.9818511796733214, + "grad_norm": 0.49997565150260925, + "learning_rate": 0.0002, + "loss": 1.0512, + "step": 2730 + }, + { + "epoch": 1.989110707803993, + "grad_norm": 0.36852124333381653, + "learning_rate": 0.0002, + "loss": 0.9311, + "step": 2740 + }, + { + "epoch": 1.9963702359346642, + "grad_norm": 0.5203380584716797, + "learning_rate": 0.0002, + "loss": 1.0898, + "step": 2750 + }, + { + "epoch": 2.0, + "eval_loss": 1.1086540222167969, + "eval_runtime": 95.6866, + "eval_samples_per_second": 4.557, + "eval_steps_per_second": 0.575, + "step": 2755 + }, + { + "epoch": 2.0036297640653356, + "grad_norm": 0.35921573638916016, + "learning_rate": 0.0002, + "loss": 0.9791, + "step": 2760 + }, + { + "epoch": 2.010889292196007, + "grad_norm": 1.013861894607544, + "learning_rate": 0.0002, + "loss": 0.8968, + "step": 2770 + }, + { + "epoch": 2.0181488203266786, + "grad_norm": 0.4425240159034729, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 2780 + }, + { + "epoch": 2.02540834845735, + "grad_norm": 0.60174161195755, + "learning_rate": 0.0002, + "loss": 0.8758, + "step": 2790 + }, + { + "epoch": 2.0326678765880217, + "grad_norm": 0.47582098841667175, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 2800 + }, + { + "epoch": 2.0399274047186933, + "grad_norm": 0.6012811660766602, + "learning_rate": 0.0002, + "loss": 0.8898, + "step": 2810 + }, + { + "epoch": 2.047186932849365, + "grad_norm": 0.4444408118724823, + "learning_rate": 0.0002, + "loss": 0.8109, + "step": 2820 + }, + { + "epoch": 2.0544464609800364, + "grad_norm": 0.4864003360271454, + "learning_rate": 0.0002, + "loss": 0.8621, + "step": 2830 + }, + { + "epoch": 2.061705989110708, + "grad_norm": 0.5104215741157532, + "learning_rate": 0.0002, + "loss": 0.8817, + "step": 2840 + }, + { + "epoch": 2.0689655172413794, + "grad_norm": 0.6218489408493042, + "learning_rate": 0.0002, + "loss": 0.7789, + "step": 2850 + }, + { + "epoch": 2.076225045372051, + "grad_norm": 0.705784261226654, + "learning_rate": 0.0002, + "loss": 0.8778, + "step": 2860 + }, + { + "epoch": 2.0834845735027225, + "grad_norm": 0.48091503977775574, + "learning_rate": 0.0002, + "loss": 0.7851, + "step": 2870 + }, + { + "epoch": 2.0907441016333936, + "grad_norm": 0.5062456727027893, + "learning_rate": 0.0002, + "loss": 0.8444, + "step": 2880 + }, + { + "epoch": 2.098003629764065, + "grad_norm": 0.4862022399902344, + "learning_rate": 0.0002, + "loss": 0.7898, + "step": 2890 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 0.48264044523239136, + "learning_rate": 0.0002, + "loss": 0.8593, + "step": 2900 + }, + { + "epoch": 2.1125226860254083, + "grad_norm": 0.43744346499443054, + "learning_rate": 0.0002, + "loss": 0.8325, + "step": 2910 + }, + { + "epoch": 2.11978221415608, + "grad_norm": 0.5480492115020752, + "learning_rate": 0.0002, + "loss": 0.9099, + "step": 2920 + }, + { + "epoch": 2.1270417422867514, + "grad_norm": 0.5068560838699341, + "learning_rate": 0.0002, + "loss": 0.7727, + "step": 2930 + }, + { + "epoch": 2.134301270417423, + "grad_norm": 0.4650852680206299, + "learning_rate": 0.0002, + "loss": 0.8403, + "step": 2940 + }, + { + "epoch": 2.1415607985480944, + "grad_norm": 0.4929981231689453, + "learning_rate": 0.0002, + "loss": 0.8467, + "step": 2950 + }, + { + "epoch": 2.148820326678766, + "grad_norm": 0.6537389755249023, + "learning_rate": 0.0002, + "loss": 0.8747, + "step": 2960 + }, + { + "epoch": 2.1560798548094375, + "grad_norm": 0.8032940626144409, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 2970 + }, + { + "epoch": 2.163339382940109, + "grad_norm": 0.7131643891334534, + "learning_rate": 0.0002, + "loss": 0.7604, + "step": 2980 + }, + { + "epoch": 2.1705989110707806, + "grad_norm": 0.6034275889396667, + "learning_rate": 0.0002, + "loss": 0.8424, + "step": 2990 + }, + { + "epoch": 2.1778584392014517, + "grad_norm": 0.6081095933914185, + "learning_rate": 0.0002, + "loss": 0.8838, + "step": 3000 + }, + { + "epoch": 2.1851179673321233, + "grad_norm": 0.5706912875175476, + "learning_rate": 0.0002, + "loss": 0.9013, + "step": 3010 + }, + { + "epoch": 2.192377495462795, + "grad_norm": 0.6742380261421204, + "learning_rate": 0.0002, + "loss": 0.8473, + "step": 3020 + }, + { + "epoch": 2.1996370235934664, + "grad_norm": 0.4847496449947357, + "learning_rate": 0.0002, + "loss": 0.8372, + "step": 3030 + }, + { + "epoch": 2.206896551724138, + "grad_norm": 0.5529342889785767, + "learning_rate": 0.0002, + "loss": 0.8487, + "step": 3040 + }, + { + "epoch": 2.2141560798548094, + "grad_norm": 0.6108783483505249, + "learning_rate": 0.0002, + "loss": 0.9057, + "step": 3050 + }, + { + "epoch": 2.221415607985481, + "grad_norm": 0.8841571807861328, + "learning_rate": 0.0002, + "loss": 0.9659, + "step": 3060 + }, + { + "epoch": 2.2286751361161525, + "grad_norm": 0.4227530360221863, + "learning_rate": 0.0002, + "loss": 0.8538, + "step": 3070 + }, + { + "epoch": 2.235934664246824, + "grad_norm": 0.461935818195343, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 3080 + }, + { + "epoch": 2.2431941923774956, + "grad_norm": 0.5407412648200989, + "learning_rate": 0.0002, + "loss": 0.8542, + "step": 3090 + }, + { + "epoch": 2.250453720508167, + "grad_norm": 0.5057998895645142, + "learning_rate": 0.0002, + "loss": 0.8392, + "step": 3100 + }, + { + "epoch": 2.2577132486388383, + "grad_norm": 0.530057966709137, + "learning_rate": 0.0002, + "loss": 0.7898, + "step": 3110 + }, + { + "epoch": 2.26497277676951, + "grad_norm": 0.5066842436790466, + "learning_rate": 0.0002, + "loss": 0.8507, + "step": 3120 + }, + { + "epoch": 2.2722323049001814, + "grad_norm": 0.5069178342819214, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 3130 + }, + { + "epoch": 2.279491833030853, + "grad_norm": 0.6095499396324158, + "learning_rate": 0.0002, + "loss": 0.8458, + "step": 3140 + }, + { + "epoch": 2.2867513611615244, + "grad_norm": 0.49524766206741333, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 3150 + }, + { + "epoch": 2.294010889292196, + "grad_norm": 0.5334409475326538, + "learning_rate": 0.0002, + "loss": 0.7568, + "step": 3160 + }, + { + "epoch": 2.3012704174228675, + "grad_norm": 1.681748867034912, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 3170 + }, + { + "epoch": 2.308529945553539, + "grad_norm": 0.7225565314292908, + "learning_rate": 0.0002, + "loss": 0.8155, + "step": 3180 + }, + { + "epoch": 2.3157894736842106, + "grad_norm": 0.5379388928413391, + "learning_rate": 0.0002, + "loss": 0.8034, + "step": 3190 + }, + { + "epoch": 2.323049001814882, + "grad_norm": 0.45770326256752014, + "learning_rate": 0.0002, + "loss": 0.8833, + "step": 3200 + }, + { + "epoch": 2.3303085299455537, + "grad_norm": 0.6984533071517944, + "learning_rate": 0.0002, + "loss": 0.8419, + "step": 3210 + }, + { + "epoch": 2.337568058076225, + "grad_norm": 0.6725744605064392, + "learning_rate": 0.0002, + "loss": 0.7414, + "step": 3220 + }, + { + "epoch": 2.344827586206897, + "grad_norm": 1.1247471570968628, + "learning_rate": 0.0002, + "loss": 0.8657, + "step": 3230 + }, + { + "epoch": 2.352087114337568, + "grad_norm": 1.0240263938903809, + "learning_rate": 0.0002, + "loss": 0.7782, + "step": 3240 + }, + { + "epoch": 2.3593466424682394, + "grad_norm": 0.5608096122741699, + "learning_rate": 0.0002, + "loss": 0.826, + "step": 3250 + }, + { + "epoch": 2.366606170598911, + "grad_norm": 0.8294990062713623, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 3260 + }, + { + "epoch": 2.3738656987295825, + "grad_norm": 0.6734224557876587, + "learning_rate": 0.0002, + "loss": 0.9845, + "step": 3270 + }, + { + "epoch": 2.381125226860254, + "grad_norm": 0.6862800717353821, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 3280 + }, + { + "epoch": 2.3883847549909256, + "grad_norm": 0.5442930459976196, + "learning_rate": 0.0002, + "loss": 0.7752, + "step": 3290 + }, + { + "epoch": 2.395644283121597, + "grad_norm": 0.5745394229888916, + "learning_rate": 0.0002, + "loss": 0.8543, + "step": 3300 + }, + { + "epoch": 2.4029038112522687, + "grad_norm": 0.6257799863815308, + "learning_rate": 0.0002, + "loss": 0.833, + "step": 3310 + }, + { + "epoch": 2.4101633393829403, + "grad_norm": 0.5608420968055725, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 3320 + }, + { + "epoch": 2.417422867513612, + "grad_norm": 0.5512017011642456, + "learning_rate": 0.0002, + "loss": 0.8745, + "step": 3330 + }, + { + "epoch": 2.4246823956442833, + "grad_norm": 0.7452999353408813, + "learning_rate": 0.0002, + "loss": 0.8642, + "step": 3340 + }, + { + "epoch": 2.4319419237749544, + "grad_norm": 0.4604301452636719, + "learning_rate": 0.0002, + "loss": 0.8417, + "step": 3350 + }, + { + "epoch": 2.439201451905626, + "grad_norm": 0.8225823640823364, + "learning_rate": 0.0002, + "loss": 0.9148, + "step": 3360 + }, + { + "epoch": 2.4464609800362975, + "grad_norm": 0.8604981303215027, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 3370 + }, + { + "epoch": 2.453720508166969, + "grad_norm": 0.6620925664901733, + "learning_rate": 0.0002, + "loss": 0.9276, + "step": 3380 + }, + { + "epoch": 2.4609800362976406, + "grad_norm": 0.4750158488750458, + "learning_rate": 0.0002, + "loss": 0.8381, + "step": 3390 + }, + { + "epoch": 2.468239564428312, + "grad_norm": 0.6061418056488037, + "learning_rate": 0.0002, + "loss": 0.8249, + "step": 3400 + }, + { + "epoch": 2.4754990925589837, + "grad_norm": 0.5977247357368469, + "learning_rate": 0.0002, + "loss": 0.8944, + "step": 3410 + }, + { + "epoch": 2.4827586206896552, + "grad_norm": 0.5004227757453918, + "learning_rate": 0.0002, + "loss": 0.8366, + "step": 3420 + }, + { + "epoch": 2.490018148820327, + "grad_norm": 0.46428972482681274, + "learning_rate": 0.0002, + "loss": 0.9133, + "step": 3430 + }, + { + "epoch": 2.4972776769509983, + "grad_norm": 0.7658395171165466, + "learning_rate": 0.0002, + "loss": 0.8534, + "step": 3440 + }, + { + "epoch": 2.50453720508167, + "grad_norm": 0.621819794178009, + "learning_rate": 0.0002, + "loss": 0.9066, + "step": 3450 + }, + { + "epoch": 2.511796733212341, + "grad_norm": 0.4556088149547577, + "learning_rate": 0.0002, + "loss": 0.8481, + "step": 3460 + }, + { + "epoch": 2.519056261343013, + "grad_norm": 0.6124140024185181, + "learning_rate": 0.0002, + "loss": 0.8522, + "step": 3470 + }, + { + "epoch": 2.526315789473684, + "grad_norm": 0.6256231665611267, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 3480 + }, + { + "epoch": 2.5335753176043556, + "grad_norm": 0.5464141964912415, + "learning_rate": 0.0002, + "loss": 0.9097, + "step": 3490 + }, + { + "epoch": 2.540834845735027, + "grad_norm": 0.51471346616745, + "learning_rate": 0.0002, + "loss": 0.8708, + "step": 3500 + }, + { + "epoch": 2.5480943738656987, + "grad_norm": 0.5326165556907654, + "learning_rate": 0.0002, + "loss": 0.9042, + "step": 3510 + }, + { + "epoch": 2.5553539019963702, + "grad_norm": 0.4750378429889679, + "learning_rate": 0.0002, + "loss": 0.8749, + "step": 3520 + }, + { + "epoch": 2.562613430127042, + "grad_norm": 0.5292279124259949, + "learning_rate": 0.0002, + "loss": 0.8248, + "step": 3530 + }, + { + "epoch": 2.5698729582577133, + "grad_norm": 0.6145227551460266, + "learning_rate": 0.0002, + "loss": 0.8808, + "step": 3540 + }, + { + "epoch": 2.577132486388385, + "grad_norm": 0.8275189399719238, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 3550 + }, + { + "epoch": 2.5843920145190564, + "grad_norm": 0.5037438273429871, + "learning_rate": 0.0002, + "loss": 0.875, + "step": 3560 + }, + { + "epoch": 2.5916515426497275, + "grad_norm": 0.5838707685470581, + "learning_rate": 0.0002, + "loss": 0.8324, + "step": 3570 + }, + { + "epoch": 2.5989110707803995, + "grad_norm": 0.5398710370063782, + "learning_rate": 0.0002, + "loss": 0.8716, + "step": 3580 + }, + { + "epoch": 2.6061705989110706, + "grad_norm": 0.6115376949310303, + "learning_rate": 0.0002, + "loss": 0.7843, + "step": 3590 + }, + { + "epoch": 2.613430127041742, + "grad_norm": 1.118809461593628, + "learning_rate": 0.0002, + "loss": 0.8142, + "step": 3600 + }, + { + "epoch": 2.6206896551724137, + "grad_norm": 0.7811771631240845, + "learning_rate": 0.0002, + "loss": 0.8282, + "step": 3610 + }, + { + "epoch": 2.6279491833030852, + "grad_norm": 0.5753175020217896, + "learning_rate": 0.0002, + "loss": 0.7997, + "step": 3620 + }, + { + "epoch": 2.635208711433757, + "grad_norm": 0.550829291343689, + "learning_rate": 0.0002, + "loss": 0.7705, + "step": 3630 + }, + { + "epoch": 2.6424682395644283, + "grad_norm": 0.5360019207000732, + "learning_rate": 0.0002, + "loss": 0.8564, + "step": 3640 + }, + { + "epoch": 2.6497277676951, + "grad_norm": 0.63050377368927, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 3650 + }, + { + "epoch": 2.6569872958257714, + "grad_norm": 0.5833110213279724, + "learning_rate": 0.0002, + "loss": 0.9156, + "step": 3660 + }, + { + "epoch": 2.664246823956443, + "grad_norm": 0.5543047189712524, + "learning_rate": 0.0002, + "loss": 0.8033, + "step": 3670 + }, + { + "epoch": 2.671506352087114, + "grad_norm": 0.6842212080955505, + "learning_rate": 0.0002, + "loss": 0.8867, + "step": 3680 + }, + { + "epoch": 2.678765880217786, + "grad_norm": 0.6859333515167236, + "learning_rate": 0.0002, + "loss": 0.8569, + "step": 3690 + }, + { + "epoch": 2.686025408348457, + "grad_norm": 0.7038410902023315, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 3700 + }, + { + "epoch": 2.6932849364791287, + "grad_norm": 0.447233647108078, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 3710 + }, + { + "epoch": 2.7005444646098002, + "grad_norm": 0.6453872919082642, + "learning_rate": 0.0002, + "loss": 0.8951, + "step": 3720 + }, + { + "epoch": 2.707803992740472, + "grad_norm": 0.8025672435760498, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 3730 + }, + { + "epoch": 2.7150635208711433, + "grad_norm": 0.5997087955474854, + "learning_rate": 0.0002, + "loss": 0.8488, + "step": 3740 + }, + { + "epoch": 2.722323049001815, + "grad_norm": 0.6901142001152039, + "learning_rate": 0.0002, + "loss": 0.8308, + "step": 3750 + }, + { + "epoch": 2.7295825771324864, + "grad_norm": 1.036145567893982, + "learning_rate": 0.0002, + "loss": 0.8517, + "step": 3760 + }, + { + "epoch": 2.736842105263158, + "grad_norm": 0.7207072377204895, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 3770 + }, + { + "epoch": 2.7441016333938295, + "grad_norm": 1.0452989339828491, + "learning_rate": 0.0002, + "loss": 0.8009, + "step": 3780 + }, + { + "epoch": 2.751361161524501, + "grad_norm": 0.5615278482437134, + "learning_rate": 0.0002, + "loss": 0.8868, + "step": 3790 + }, + { + "epoch": 2.7586206896551726, + "grad_norm": 0.46439215540885925, + "learning_rate": 0.0002, + "loss": 0.9047, + "step": 3800 + }, + { + "epoch": 2.7658802177858437, + "grad_norm": 0.7134917974472046, + "learning_rate": 0.0002, + "loss": 0.7847, + "step": 3810 + }, + { + "epoch": 2.7731397459165157, + "grad_norm": 0.5139115452766418, + "learning_rate": 0.0002, + "loss": 0.8315, + "step": 3820 + }, + { + "epoch": 2.780399274047187, + "grad_norm": 0.8595781326293945, + "learning_rate": 0.0002, + "loss": 0.8995, + "step": 3830 + }, + { + "epoch": 2.7876588021778583, + "grad_norm": 0.544614851474762, + "learning_rate": 0.0002, + "loss": 0.9079, + "step": 3840 + }, + { + "epoch": 2.79491833030853, + "grad_norm": 0.6073850393295288, + "learning_rate": 0.0002, + "loss": 0.8322, + "step": 3850 + }, + { + "epoch": 2.8021778584392014, + "grad_norm": 0.8224069476127625, + "learning_rate": 0.0002, + "loss": 0.8277, + "step": 3860 + }, + { + "epoch": 2.809437386569873, + "grad_norm": 0.5347970128059387, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 3870 + }, + { + "epoch": 2.8166969147005445, + "grad_norm": 0.6799601912498474, + "learning_rate": 0.0002, + "loss": 0.8219, + "step": 3880 + }, + { + "epoch": 2.823956442831216, + "grad_norm": 0.5219197869300842, + "learning_rate": 0.0002, + "loss": 0.8796, + "step": 3890 + }, + { + "epoch": 2.8312159709618876, + "grad_norm": 0.5710130333900452, + "learning_rate": 0.0002, + "loss": 0.8649, + "step": 3900 + }, + { + "epoch": 2.838475499092559, + "grad_norm": 0.5857213139533997, + "learning_rate": 0.0002, + "loss": 0.8741, + "step": 3910 + }, + { + "epoch": 2.8457350272232302, + "grad_norm": 0.5206644535064697, + "learning_rate": 0.0002, + "loss": 0.8154, + "step": 3920 + }, + { + "epoch": 2.8529945553539022, + "grad_norm": 1.2902015447616577, + "learning_rate": 0.0002, + "loss": 0.9328, + "step": 3930 + }, + { + "epoch": 2.8602540834845733, + "grad_norm": 0.5252797603607178, + "learning_rate": 0.0002, + "loss": 0.8192, + "step": 3940 + }, + { + "epoch": 2.867513611615245, + "grad_norm": 0.5925108790397644, + "learning_rate": 0.0002, + "loss": 0.9047, + "step": 3950 + }, + { + "epoch": 2.8747731397459164, + "grad_norm": 0.9719610810279846, + "learning_rate": 0.0002, + "loss": 0.8611, + "step": 3960 + }, + { + "epoch": 2.882032667876588, + "grad_norm": 0.6834747195243835, + "learning_rate": 0.0002, + "loss": 0.8531, + "step": 3970 + }, + { + "epoch": 2.8892921960072595, + "grad_norm": 0.6952353119850159, + "learning_rate": 0.0002, + "loss": 0.8124, + "step": 3980 + }, + { + "epoch": 2.896551724137931, + "grad_norm": 0.49889910221099854, + "learning_rate": 0.0002, + "loss": 0.8365, + "step": 3990 + }, + { + "epoch": 2.9038112522686026, + "grad_norm": 0.5007026791572571, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 4000 + }, + { + "epoch": 2.911070780399274, + "grad_norm": 0.5474239587783813, + "learning_rate": 0.0002, + "loss": 0.7752, + "step": 4010 + }, + { + "epoch": 2.9183303085299457, + "grad_norm": 0.6617428660392761, + "learning_rate": 0.0002, + "loss": 0.8579, + "step": 4020 + }, + { + "epoch": 2.925589836660617, + "grad_norm": 0.6097776293754578, + "learning_rate": 0.0002, + "loss": 0.8583, + "step": 4030 + }, + { + "epoch": 2.9328493647912888, + "grad_norm": 0.5985828638076782, + "learning_rate": 0.0002, + "loss": 0.8296, + "step": 4040 + }, + { + "epoch": 2.94010889292196, + "grad_norm": 0.769488513469696, + "learning_rate": 0.0002, + "loss": 0.9134, + "step": 4050 + }, + { + "epoch": 2.9473684210526314, + "grad_norm": 0.5167055130004883, + "learning_rate": 0.0002, + "loss": 0.8148, + "step": 4060 + }, + { + "epoch": 2.954627949183303, + "grad_norm": 0.6265496015548706, + "learning_rate": 0.0002, + "loss": 0.8665, + "step": 4070 + }, + { + "epoch": 2.9618874773139745, + "grad_norm": 1.2644082307815552, + "learning_rate": 0.0002, + "loss": 0.9218, + "step": 4080 + }, + { + "epoch": 2.969147005444646, + "grad_norm": 0.6007561087608337, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 4090 + }, + { + "epoch": 2.9764065335753176, + "grad_norm": 0.47984927892684937, + "learning_rate": 0.0002, + "loss": 0.8262, + "step": 4100 + }, + { + "epoch": 2.983666061705989, + "grad_norm": 1.128198504447937, + "learning_rate": 0.0002, + "loss": 0.7897, + "step": 4110 + }, + { + "epoch": 2.9909255898366607, + "grad_norm": 0.526292085647583, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 4120 + }, + { + "epoch": 2.9981851179673322, + "grad_norm": 0.5462674498558044, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 4130 + }, + { + "epoch": 2.9996370235934666, + "eval_loss": 1.1093357801437378, + "eval_runtime": 46.2498, + "eval_samples_per_second": 9.427, + "eval_steps_per_second": 1.189, + "step": 4132 + }, + { + "epoch": 3.0054446460980038, + "grad_norm": 0.48288026452064514, + "learning_rate": 0.0002, + "loss": 0.721, + "step": 4140 + }, + { + "epoch": 3.0127041742286753, + "grad_norm": 1.0181782245635986, + "learning_rate": 0.0002, + "loss": 0.6769, + "step": 4150 + }, + { + "epoch": 3.019963702359347, + "grad_norm": 0.7718019485473633, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 4160 + }, + { + "epoch": 3.027223230490018, + "grad_norm": 0.7492219805717468, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 4170 + }, + { + "epoch": 3.0344827586206895, + "grad_norm": 0.9363632798194885, + "learning_rate": 0.0002, + "loss": 0.6678, + "step": 4180 + }, + { + "epoch": 3.041742286751361, + "grad_norm": 0.6888533234596252, + "learning_rate": 0.0002, + "loss": 0.7187, + "step": 4190 + }, + { + "epoch": 3.0490018148820326, + "grad_norm": 0.7072834968566895, + "learning_rate": 0.0002, + "loss": 0.6469, + "step": 4200 + }, + { + "epoch": 3.056261343012704, + "grad_norm": 0.7182047963142395, + "learning_rate": 0.0002, + "loss": 0.6387, + "step": 4210 + }, + { + "epoch": 3.0635208711433757, + "grad_norm": 0.7194355130195618, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 4220 + }, + { + "epoch": 3.0707803992740472, + "grad_norm": 0.9454023838043213, + "learning_rate": 0.0002, + "loss": 0.5812, + "step": 4230 + }, + { + "epoch": 3.0780399274047188, + "grad_norm": 0.838657557964325, + "learning_rate": 0.0002, + "loss": 0.6036, + "step": 4240 + }, + { + "epoch": 3.0852994555353903, + "grad_norm": 0.740113377571106, + "learning_rate": 0.0002, + "loss": 0.646, + "step": 4250 + }, + { + "epoch": 3.092558983666062, + "grad_norm": 0.6616561412811279, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 4260 + }, + { + "epoch": 3.0998185117967334, + "grad_norm": 0.8846506476402283, + "learning_rate": 0.0002, + "loss": 0.6462, + "step": 4270 + }, + { + "epoch": 3.107078039927405, + "grad_norm": 0.6322125792503357, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 4280 + }, + { + "epoch": 3.114337568058076, + "grad_norm": 0.7461467385292053, + "learning_rate": 0.0002, + "loss": 0.5953, + "step": 4290 + }, + { + "epoch": 3.1215970961887476, + "grad_norm": 0.8251287341117859, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 4300 + }, + { + "epoch": 3.128856624319419, + "grad_norm": 0.8767673373222351, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 4310 + }, + { + "epoch": 3.1361161524500907, + "grad_norm": 0.7758759260177612, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 4320 + }, + { + "epoch": 3.143375680580762, + "grad_norm": 1.1056879758834839, + "learning_rate": 0.0002, + "loss": 0.6624, + "step": 4330 + }, + { + "epoch": 3.1506352087114338, + "grad_norm": 0.8259835243225098, + "learning_rate": 0.0002, + "loss": 0.691, + "step": 4340 + }, + { + "epoch": 3.1578947368421053, + "grad_norm": 0.6607027053833008, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 4350 + }, + { + "epoch": 3.165154264972777, + "grad_norm": 0.7983301281929016, + "learning_rate": 0.0002, + "loss": 0.5911, + "step": 4360 + }, + { + "epoch": 3.1724137931034484, + "grad_norm": 0.6725239157676697, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 4370 + }, + { + "epoch": 3.17967332123412, + "grad_norm": 0.9052095413208008, + "learning_rate": 0.0002, + "loss": 0.5966, + "step": 4380 + }, + { + "epoch": 3.1869328493647915, + "grad_norm": 0.8131307363510132, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 4390 + }, + { + "epoch": 3.1941923774954626, + "grad_norm": 0.6435626149177551, + "learning_rate": 0.0002, + "loss": 0.6384, + "step": 4400 + }, + { + "epoch": 3.201451905626134, + "grad_norm": 0.84367436170578, + "learning_rate": 0.0002, + "loss": 0.5819, + "step": 4410 + }, + { + "epoch": 3.2087114337568057, + "grad_norm": 1.5018867254257202, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 4420 + }, + { + "epoch": 3.215970961887477, + "grad_norm": 0.7019091844558716, + "learning_rate": 0.0002, + "loss": 0.6838, + "step": 4430 + }, + { + "epoch": 3.2232304900181488, + "grad_norm": 0.9164197444915771, + "learning_rate": 0.0002, + "loss": 0.6153, + "step": 4440 + }, + { + "epoch": 3.2304900181488203, + "grad_norm": 0.7890861630439758, + "learning_rate": 0.0002, + "loss": 0.6618, + "step": 4450 + }, + { + "epoch": 3.237749546279492, + "grad_norm": 0.6517660617828369, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 4460 + }, + { + "epoch": 3.2450090744101634, + "grad_norm": 1.10188889503479, + "learning_rate": 0.0002, + "loss": 0.6699, + "step": 4470 + }, + { + "epoch": 3.252268602540835, + "grad_norm": 0.8158330917358398, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 4480 + }, + { + "epoch": 3.2595281306715065, + "grad_norm": 0.7663109302520752, + "learning_rate": 0.0002, + "loss": 0.7757, + "step": 4490 + }, + { + "epoch": 3.266787658802178, + "grad_norm": 0.8473444581031799, + "learning_rate": 0.0002, + "loss": 0.6539, + "step": 4500 + }, + { + "epoch": 3.274047186932849, + "grad_norm": 0.9724768996238708, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 4510 + }, + { + "epoch": 3.281306715063521, + "grad_norm": 0.8516759276390076, + "learning_rate": 0.0002, + "loss": 0.5464, + "step": 4520 + }, + { + "epoch": 3.288566243194192, + "grad_norm": 0.7543437480926514, + "learning_rate": 0.0002, + "loss": 0.6534, + "step": 4530 + }, + { + "epoch": 3.2958257713248638, + "grad_norm": 1.0472029447555542, + "learning_rate": 0.0002, + "loss": 0.6095, + "step": 4540 + }, + { + "epoch": 3.3030852994555353, + "grad_norm": 0.6240826845169067, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 4550 + }, + { + "epoch": 3.310344827586207, + "grad_norm": 0.9957774877548218, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 4560 + }, + { + "epoch": 3.3176043557168784, + "grad_norm": 0.6448912620544434, + "learning_rate": 0.0002, + "loss": 0.618, + "step": 4570 + }, + { + "epoch": 3.32486388384755, + "grad_norm": 0.7519692778587341, + "learning_rate": 0.0002, + "loss": 0.6188, + "step": 4580 + }, + { + "epoch": 3.3321234119782215, + "grad_norm": 0.7367453575134277, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 4590 + }, + { + "epoch": 3.339382940108893, + "grad_norm": 0.8064960837364197, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 4600 + }, + { + "epoch": 3.3466424682395646, + "grad_norm": 0.7664631009101868, + "learning_rate": 0.0002, + "loss": 0.6062, + "step": 4610 + }, + { + "epoch": 3.353901996370236, + "grad_norm": 0.7803396582603455, + "learning_rate": 0.0002, + "loss": 0.6834, + "step": 4620 + }, + { + "epoch": 3.3611615245009077, + "grad_norm": 0.9141599535942078, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 4630 + }, + { + "epoch": 3.3684210526315788, + "grad_norm": 0.9719856381416321, + "learning_rate": 0.0002, + "loss": 0.6889, + "step": 4640 + }, + { + "epoch": 3.3756805807622503, + "grad_norm": 0.9223218560218811, + "learning_rate": 0.0002, + "loss": 0.6914, + "step": 4650 + }, + { + "epoch": 3.382940108892922, + "grad_norm": 0.7289277911186218, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 4660 + }, + { + "epoch": 3.3901996370235934, + "grad_norm": 1.039724349975586, + "learning_rate": 0.0002, + "loss": 0.595, + "step": 4670 + }, + { + "epoch": 3.397459165154265, + "grad_norm": 1.397438883781433, + "learning_rate": 0.0002, + "loss": 0.8121, + "step": 4680 + }, + { + "epoch": 3.4047186932849365, + "grad_norm": 1.0069999694824219, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 4690 + }, + { + "epoch": 3.411978221415608, + "grad_norm": 0.816291332244873, + "learning_rate": 0.0002, + "loss": 0.6598, + "step": 4700 + }, + { + "epoch": 3.4192377495462796, + "grad_norm": 1.2831530570983887, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 4710 + }, + { + "epoch": 3.426497277676951, + "grad_norm": 0.9573889970779419, + "learning_rate": 0.0002, + "loss": 0.6625, + "step": 4720 + }, + { + "epoch": 3.4337568058076227, + "grad_norm": 0.7685632705688477, + "learning_rate": 0.0002, + "loss": 0.7279, + "step": 4730 + }, + { + "epoch": 3.441016333938294, + "grad_norm": 0.7019195556640625, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 4740 + }, + { + "epoch": 3.4482758620689653, + "grad_norm": 0.7244833707809448, + "learning_rate": 0.0002, + "loss": 0.7606, + "step": 4750 + }, + { + "epoch": 3.455535390199637, + "grad_norm": 1.3468551635742188, + "learning_rate": 0.0002, + "loss": 0.6951, + "step": 4760 + }, + { + "epoch": 3.4627949183303084, + "grad_norm": 0.822846531867981, + "learning_rate": 0.0002, + "loss": 0.6945, + "step": 4770 + }, + { + "epoch": 3.47005444646098, + "grad_norm": 0.7311608195304871, + "learning_rate": 0.0002, + "loss": 0.6431, + "step": 4780 + }, + { + "epoch": 3.4773139745916515, + "grad_norm": 0.9466770887374878, + "learning_rate": 0.0002, + "loss": 0.7019, + "step": 4790 + }, + { + "epoch": 3.484573502722323, + "grad_norm": 1.1527155637741089, + "learning_rate": 0.0002, + "loss": 0.7767, + "step": 4800 + }, + { + "epoch": 3.4918330308529946, + "grad_norm": 1.1288906335830688, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 4810 + }, + { + "epoch": 3.499092558983666, + "grad_norm": 0.9096164107322693, + "learning_rate": 0.0002, + "loss": 0.6564, + "step": 4820 + }, + { + "epoch": 3.5063520871143377, + "grad_norm": 0.7988565564155579, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 4830 + }, + { + "epoch": 3.513611615245009, + "grad_norm": 0.7183415293693542, + "learning_rate": 0.0002, + "loss": 0.7004, + "step": 4840 + }, + { + "epoch": 3.5208711433756807, + "grad_norm": 0.6614915132522583, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 4850 + }, + { + "epoch": 3.528130671506352, + "grad_norm": 0.8609521985054016, + "learning_rate": 0.0002, + "loss": 0.7271, + "step": 4860 + }, + { + "epoch": 3.535390199637024, + "grad_norm": 0.86552894115448, + "learning_rate": 0.0002, + "loss": 0.6664, + "step": 4870 + }, + { + "epoch": 3.542649727767695, + "grad_norm": 0.6926496028900146, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 4880 + }, + { + "epoch": 3.5499092558983665, + "grad_norm": 0.8157467246055603, + "learning_rate": 0.0002, + "loss": 0.7117, + "step": 4890 + }, + { + "epoch": 3.557168784029038, + "grad_norm": 0.9085357189178467, + "learning_rate": 0.0002, + "loss": 0.6201, + "step": 4900 + }, + { + "epoch": 3.5644283121597096, + "grad_norm": 0.6322644948959351, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 4910 + }, + { + "epoch": 3.571687840290381, + "grad_norm": 1.263205885887146, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 4920 + }, + { + "epoch": 3.5789473684210527, + "grad_norm": 0.8901070356369019, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 4930 + }, + { + "epoch": 3.586206896551724, + "grad_norm": 0.7983952164649963, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 4940 + }, + { + "epoch": 3.5934664246823957, + "grad_norm": 0.9887813925743103, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 4950 + }, + { + "epoch": 3.6007259528130673, + "grad_norm": 0.7895187735557556, + "learning_rate": 0.0002, + "loss": 0.6502, + "step": 4960 + }, + { + "epoch": 3.6079854809437384, + "grad_norm": 0.9685819745063782, + "learning_rate": 0.0002, + "loss": 0.7111, + "step": 4970 + }, + { + "epoch": 3.6152450090744104, + "grad_norm": 0.6576591730117798, + "learning_rate": 0.0002, + "loss": 0.6915, + "step": 4980 + }, + { + "epoch": 3.6225045372050815, + "grad_norm": 0.856985330581665, + "learning_rate": 0.0002, + "loss": 0.6195, + "step": 4990 + }, + { + "epoch": 3.629764065335753, + "grad_norm": 0.7230252623558044, + "learning_rate": 0.0002, + "loss": 0.6318, + "step": 5000 + }, + { + "epoch": 3.6370235934664246, + "grad_norm": 0.8260893821716309, + "learning_rate": 0.0002, + "loss": 0.742, + "step": 5010 + }, + { + "epoch": 3.644283121597096, + "grad_norm": 0.7635950446128845, + "learning_rate": 0.0002, + "loss": 0.7223, + "step": 5020 + }, + { + "epoch": 3.6515426497277677, + "grad_norm": 0.7060768604278564, + "learning_rate": 0.0002, + "loss": 0.6837, + "step": 5030 + }, + { + "epoch": 3.658802177858439, + "grad_norm": 0.8020303249359131, + "learning_rate": 0.0002, + "loss": 0.6921, + "step": 5040 + }, + { + "epoch": 3.6660617059891107, + "grad_norm": 0.8530341386795044, + "learning_rate": 0.0002, + "loss": 0.6446, + "step": 5050 + }, + { + "epoch": 3.6733212341197823, + "grad_norm": 0.6667101979255676, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 5060 + }, + { + "epoch": 3.680580762250454, + "grad_norm": 0.7385406494140625, + "learning_rate": 0.0002, + "loss": 0.7081, + "step": 5070 + }, + { + "epoch": 3.6878402903811254, + "grad_norm": 0.7753380537033081, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 5080 + }, + { + "epoch": 3.695099818511797, + "grad_norm": 0.7516207098960876, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 5090 + }, + { + "epoch": 3.702359346642468, + "grad_norm": 0.8171586394309998, + "learning_rate": 0.0002, + "loss": 0.672, + "step": 5100 + }, + { + "epoch": 3.70961887477314, + "grad_norm": 1.0796279907226562, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 5110 + }, + { + "epoch": 3.716878402903811, + "grad_norm": 0.6957688927650452, + "learning_rate": 0.0002, + "loss": 0.5948, + "step": 5120 + }, + { + "epoch": 3.7241379310344827, + "grad_norm": 0.8550161719322205, + "learning_rate": 0.0002, + "loss": 0.7515, + "step": 5130 + }, + { + "epoch": 3.731397459165154, + "grad_norm": 0.9396728277206421, + "learning_rate": 0.0002, + "loss": 0.7286, + "step": 5140 + }, + { + "epoch": 3.7386569872958257, + "grad_norm": 1.4264805316925049, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 5150 + }, + { + "epoch": 3.7459165154264973, + "grad_norm": 0.8725108504295349, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 5160 + }, + { + "epoch": 3.753176043557169, + "grad_norm": 1.0346195697784424, + "learning_rate": 0.0002, + "loss": 0.6778, + "step": 5170 + }, + { + "epoch": 3.7604355716878404, + "grad_norm": 0.5395554304122925, + "learning_rate": 0.0002, + "loss": 0.6371, + "step": 5180 + }, + { + "epoch": 3.767695099818512, + "grad_norm": 1.3153616189956665, + "learning_rate": 0.0002, + "loss": 0.7308, + "step": 5190 + }, + { + "epoch": 3.7749546279491835, + "grad_norm": 0.9879828691482544, + "learning_rate": 0.0002, + "loss": 0.78, + "step": 5200 + }, + { + "epoch": 3.7822141560798546, + "grad_norm": 0.8876672983169556, + "learning_rate": 0.0002, + "loss": 0.7068, + "step": 5210 + }, + { + "epoch": 3.7894736842105265, + "grad_norm": 0.8363267779350281, + "learning_rate": 0.0002, + "loss": 0.6283, + "step": 5220 + }, + { + "epoch": 3.7967332123411976, + "grad_norm": 0.637294590473175, + "learning_rate": 0.0002, + "loss": 0.6255, + "step": 5230 + }, + { + "epoch": 3.803992740471869, + "grad_norm": 1.1408970355987549, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 5240 + }, + { + "epoch": 3.8112522686025407, + "grad_norm": 1.0128360986709595, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 5250 + }, + { + "epoch": 3.8185117967332123, + "grad_norm": 0.8061144351959229, + "learning_rate": 0.0002, + "loss": 0.6764, + "step": 5260 + }, + { + "epoch": 3.825771324863884, + "grad_norm": 0.9626626968383789, + "learning_rate": 0.0002, + "loss": 0.7254, + "step": 5270 + }, + { + "epoch": 3.8330308529945554, + "grad_norm": 0.9013627171516418, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 5280 + }, + { + "epoch": 3.840290381125227, + "grad_norm": 0.8411344289779663, + "learning_rate": 0.0002, + "loss": 0.6806, + "step": 5290 + }, + { + "epoch": 3.8475499092558985, + "grad_norm": 0.7426059246063232, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 5300 + }, + { + "epoch": 3.85480943738657, + "grad_norm": 1.003413438796997, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 5310 + }, + { + "epoch": 3.862068965517241, + "grad_norm": 0.7527840733528137, + "learning_rate": 0.0002, + "loss": 0.8554, + "step": 5320 + }, + { + "epoch": 3.869328493647913, + "grad_norm": 0.738610565662384, + "learning_rate": 0.0002, + "loss": 0.7521, + "step": 5330 + }, + { + "epoch": 3.876588021778584, + "grad_norm": 0.7277999520301819, + "learning_rate": 0.0002, + "loss": 0.7266, + "step": 5340 + }, + { + "epoch": 3.8838475499092557, + "grad_norm": 0.5951359272003174, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 5350 + }, + { + "epoch": 3.8911070780399273, + "grad_norm": 1.043884038925171, + "learning_rate": 0.0002, + "loss": 0.7447, + "step": 5360 + }, + { + "epoch": 3.898366606170599, + "grad_norm": 0.8436498045921326, + "learning_rate": 0.0002, + "loss": 0.6862, + "step": 5370 + }, + { + "epoch": 3.9056261343012704, + "grad_norm": 0.5603365302085876, + "learning_rate": 0.0002, + "loss": 0.665, + "step": 5380 + }, + { + "epoch": 3.912885662431942, + "grad_norm": 1.0128886699676514, + "learning_rate": 0.0002, + "loss": 0.7098, + "step": 5390 + }, + { + "epoch": 3.9201451905626135, + "grad_norm": 0.7970930337905884, + "learning_rate": 0.0002, + "loss": 0.6707, + "step": 5400 + }, + { + "epoch": 3.927404718693285, + "grad_norm": 0.7699369192123413, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 5410 + }, + { + "epoch": 3.9346642468239565, + "grad_norm": 0.800561249256134, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 5420 + }, + { + "epoch": 3.941923774954628, + "grad_norm": 0.8020331859588623, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 5430 + }, + { + "epoch": 3.9491833030852996, + "grad_norm": 0.7461140155792236, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 5440 + }, + { + "epoch": 3.9564428312159707, + "grad_norm": 0.8346918821334839, + "learning_rate": 0.0002, + "loss": 0.7013, + "step": 5450 + }, + { + "epoch": 3.9637023593466427, + "grad_norm": 0.9723302125930786, + "learning_rate": 0.0002, + "loss": 0.6289, + "step": 5460 + }, + { + "epoch": 3.970961887477314, + "grad_norm": 0.6809740662574768, + "learning_rate": 0.0002, + "loss": 0.8029, + "step": 5470 + }, + { + "epoch": 3.9782214156079854, + "grad_norm": 0.7353498339653015, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 5480 + }, + { + "epoch": 3.985480943738657, + "grad_norm": 0.748009443283081, + "learning_rate": 0.0002, + "loss": 0.6722, + "step": 5490 + }, + { + "epoch": 3.9927404718693285, + "grad_norm": 1.3656195402145386, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 5500 + }, + { + "epoch": 4.0, + "grad_norm": 0.8402108550071716, + "learning_rate": 0.0002, + "loss": 0.7368, + "step": 5510 + }, + { + "epoch": 4.0, + "eval_loss": 1.17229425907135, + "eval_runtime": 46.2554, + "eval_samples_per_second": 9.426, + "eval_steps_per_second": 1.189, + "step": 5510 + }, + { + "epoch": 4.007259528130671, + "grad_norm": 0.8601235747337341, + "learning_rate": 0.0002, + "loss": 0.4637, + "step": 5520 + }, + { + "epoch": 4.014519056261343, + "grad_norm": 1.2635200023651123, + "learning_rate": 0.0002, + "loss": 0.4717, + "step": 5530 + }, + { + "epoch": 4.021778584392014, + "grad_norm": 1.0257477760314941, + "learning_rate": 0.0002, + "loss": 0.503, + "step": 5540 + }, + { + "epoch": 4.029038112522686, + "grad_norm": 0.9436745047569275, + "learning_rate": 0.0002, + "loss": 0.4547, + "step": 5550 + }, + { + "epoch": 4.036297640653357, + "grad_norm": 0.9443606734275818, + "learning_rate": 0.0002, + "loss": 0.459, + "step": 5560 + }, + { + "epoch": 4.043557168784029, + "grad_norm": 1.3965742588043213, + "learning_rate": 0.0002, + "loss": 0.5386, + "step": 5570 + }, + { + "epoch": 4.0508166969147, + "grad_norm": 0.8973520398139954, + "learning_rate": 0.0002, + "loss": 0.4248, + "step": 5580 + }, + { + "epoch": 4.058076225045372, + "grad_norm": 0.9998409748077393, + "learning_rate": 0.0002, + "loss": 0.4111, + "step": 5590 + }, + { + "epoch": 4.0653357531760435, + "grad_norm": 1.1213387250900269, + "learning_rate": 0.0002, + "loss": 0.4828, + "step": 5600 + }, + { + "epoch": 4.072595281306715, + "grad_norm": 0.7064558863639832, + "learning_rate": 0.0002, + "loss": 0.439, + "step": 5610 + }, + { + "epoch": 4.0798548094373865, + "grad_norm": 1.2390803098678589, + "learning_rate": 0.0002, + "loss": 0.4607, + "step": 5620 + }, + { + "epoch": 4.087114337568058, + "grad_norm": 1.123469591140747, + "learning_rate": 0.0002, + "loss": 0.5014, + "step": 5630 + }, + { + "epoch": 4.09437386569873, + "grad_norm": 1.229573369026184, + "learning_rate": 0.0002, + "loss": 0.513, + "step": 5640 + }, + { + "epoch": 4.101633393829401, + "grad_norm": 1.7182831764221191, + "learning_rate": 0.0002, + "loss": 0.5258, + "step": 5650 + }, + { + "epoch": 4.108892921960073, + "grad_norm": 0.894903302192688, + "learning_rate": 0.0002, + "loss": 0.5371, + "step": 5660 + }, + { + "epoch": 4.116152450090744, + "grad_norm": 0.8754552006721497, + "learning_rate": 0.0002, + "loss": 0.4813, + "step": 5670 + }, + { + "epoch": 4.123411978221416, + "grad_norm": 1.2401553392410278, + "learning_rate": 0.0002, + "loss": 0.491, + "step": 5680 + }, + { + "epoch": 4.130671506352087, + "grad_norm": 0.8631148934364319, + "learning_rate": 0.0002, + "loss": 0.4549, + "step": 5690 + }, + { + "epoch": 4.137931034482759, + "grad_norm": 1.1798022985458374, + "learning_rate": 0.0002, + "loss": 0.487, + "step": 5700 + }, + { + "epoch": 4.14519056261343, + "grad_norm": 0.8344549536705017, + "learning_rate": 0.0002, + "loss": 0.4522, + "step": 5710 + }, + { + "epoch": 4.152450090744102, + "grad_norm": 1.2342697381973267, + "learning_rate": 0.0002, + "loss": 0.4559, + "step": 5720 + }, + { + "epoch": 4.159709618874773, + "grad_norm": 1.1601094007492065, + "learning_rate": 0.0002, + "loss": 0.53, + "step": 5730 + }, + { + "epoch": 4.166969147005445, + "grad_norm": 1.2925703525543213, + "learning_rate": 0.0002, + "loss": 0.4755, + "step": 5740 + }, + { + "epoch": 4.174228675136116, + "grad_norm": 1.0870997905731201, + "learning_rate": 0.0002, + "loss": 0.4973, + "step": 5750 + }, + { + "epoch": 4.181488203266787, + "grad_norm": 0.9077792763710022, + "learning_rate": 0.0002, + "loss": 0.5184, + "step": 5760 + }, + { + "epoch": 4.188747731397459, + "grad_norm": 1.009273886680603, + "learning_rate": 0.0002, + "loss": 0.4606, + "step": 5770 + }, + { + "epoch": 4.19600725952813, + "grad_norm": 1.2465530633926392, + "learning_rate": 0.0002, + "loss": 0.5383, + "step": 5780 + }, + { + "epoch": 4.203266787658802, + "grad_norm": 1.2261253595352173, + "learning_rate": 0.0002, + "loss": 0.4938, + "step": 5790 + }, + { + "epoch": 4.2105263157894735, + "grad_norm": 1.1498041152954102, + "learning_rate": 0.0002, + "loss": 0.5255, + "step": 5800 + }, + { + "epoch": 4.217785843920145, + "grad_norm": 1.1966725587844849, + "learning_rate": 0.0002, + "loss": 0.5269, + "step": 5810 + }, + { + "epoch": 4.2250453720508165, + "grad_norm": 1.2651296854019165, + "learning_rate": 0.0002, + "loss": 0.5626, + "step": 5820 + }, + { + "epoch": 4.2323049001814885, + "grad_norm": 1.0388574600219727, + "learning_rate": 0.0002, + "loss": 0.5213, + "step": 5830 + }, + { + "epoch": 4.23956442831216, + "grad_norm": 1.3042771816253662, + "learning_rate": 0.0002, + "loss": 0.4965, + "step": 5840 + }, + { + "epoch": 4.246823956442832, + "grad_norm": 1.1127727031707764, + "learning_rate": 0.0002, + "loss": 0.5116, + "step": 5850 + }, + { + "epoch": 4.254083484573503, + "grad_norm": 0.9653958082199097, + "learning_rate": 0.0002, + "loss": 0.5197, + "step": 5860 + }, + { + "epoch": 4.261343012704174, + "grad_norm": 1.0500504970550537, + "learning_rate": 0.0002, + "loss": 0.4414, + "step": 5870 + }, + { + "epoch": 4.268602540834846, + "grad_norm": 1.1476165056228638, + "learning_rate": 0.0002, + "loss": 0.544, + "step": 5880 + }, + { + "epoch": 4.275862068965517, + "grad_norm": 0.9424414038658142, + "learning_rate": 0.0002, + "loss": 0.4667, + "step": 5890 + }, + { + "epoch": 4.283121597096189, + "grad_norm": 1.3309166431427002, + "learning_rate": 0.0002, + "loss": 0.5039, + "step": 5900 + }, + { + "epoch": 4.29038112522686, + "grad_norm": 1.3025873899459839, + "learning_rate": 0.0002, + "loss": 0.5472, + "step": 5910 + }, + { + "epoch": 4.297640653357532, + "grad_norm": 1.1442325115203857, + "learning_rate": 0.0002, + "loss": 0.4644, + "step": 5920 + }, + { + "epoch": 4.304900181488203, + "grad_norm": 0.9820859432220459, + "learning_rate": 0.0002, + "loss": 0.5066, + "step": 5930 + }, + { + "epoch": 4.312159709618875, + "grad_norm": 0.9615740180015564, + "learning_rate": 0.0002, + "loss": 0.5224, + "step": 5940 + }, + { + "epoch": 4.319419237749546, + "grad_norm": 1.1627109050750732, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 5950 + }, + { + "epoch": 4.326678765880218, + "grad_norm": 0.9381322860717773, + "learning_rate": 0.0002, + "loss": 0.4852, + "step": 5960 + }, + { + "epoch": 4.333938294010889, + "grad_norm": 0.8154335618019104, + "learning_rate": 0.0002, + "loss": 0.4532, + "step": 5970 + }, + { + "epoch": 4.341197822141561, + "grad_norm": 0.877671480178833, + "learning_rate": 0.0002, + "loss": 0.5237, + "step": 5980 + }, + { + "epoch": 4.348457350272232, + "grad_norm": 1.1742031574249268, + "learning_rate": 0.0002, + "loss": 0.6113, + "step": 5990 + }, + { + "epoch": 4.3557168784029034, + "grad_norm": 1.0352917909622192, + "learning_rate": 0.0002, + "loss": 0.5704, + "step": 6000 + }, + { + "epoch": 4.362976406533575, + "grad_norm": 0.9963878989219666, + "learning_rate": 0.0002, + "loss": 0.4996, + "step": 6010 + }, + { + "epoch": 4.3702359346642465, + "grad_norm": 1.1892237663269043, + "learning_rate": 0.0002, + "loss": 0.4813, + "step": 6020 + }, + { + "epoch": 4.3774954627949185, + "grad_norm": 1.2516111135482788, + "learning_rate": 0.0002, + "loss": 0.5224, + "step": 6030 + }, + { + "epoch": 4.38475499092559, + "grad_norm": 1.2111951112747192, + "learning_rate": 0.0002, + "loss": 0.5952, + "step": 6040 + }, + { + "epoch": 4.392014519056262, + "grad_norm": 1.0820083618164062, + "learning_rate": 0.0002, + "loss": 0.4275, + "step": 6050 + }, + { + "epoch": 4.399274047186933, + "grad_norm": 1.033915638923645, + "learning_rate": 0.0002, + "loss": 0.5117, + "step": 6060 + }, + { + "epoch": 4.406533575317605, + "grad_norm": 1.0635870695114136, + "learning_rate": 0.0002, + "loss": 0.5431, + "step": 6070 + }, + { + "epoch": 4.413793103448276, + "grad_norm": 1.0520414113998413, + "learning_rate": 0.0002, + "loss": 0.5341, + "step": 6080 + }, + { + "epoch": 4.421052631578947, + "grad_norm": 1.0821926593780518, + "learning_rate": 0.0002, + "loss": 0.512, + "step": 6090 + }, + { + "epoch": 4.428312159709619, + "grad_norm": 1.0533246994018555, + "learning_rate": 0.0002, + "loss": 0.5065, + "step": 6100 + }, + { + "epoch": 4.43557168784029, + "grad_norm": 0.9231932759284973, + "learning_rate": 0.0002, + "loss": 0.4577, + "step": 6110 + }, + { + "epoch": 4.442831215970962, + "grad_norm": 0.9910260438919067, + "learning_rate": 0.0002, + "loss": 0.583, + "step": 6120 + }, + { + "epoch": 4.450090744101633, + "grad_norm": 1.061949372291565, + "learning_rate": 0.0002, + "loss": 0.4717, + "step": 6130 + }, + { + "epoch": 4.457350272232305, + "grad_norm": 1.2927039861679077, + "learning_rate": 0.0002, + "loss": 0.5893, + "step": 6140 + }, + { + "epoch": 4.464609800362976, + "grad_norm": 1.3966081142425537, + "learning_rate": 0.0002, + "loss": 0.4684, + "step": 6150 + }, + { + "epoch": 4.471869328493648, + "grad_norm": 1.3835992813110352, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 6160 + }, + { + "epoch": 4.479128856624319, + "grad_norm": 1.0892692804336548, + "learning_rate": 0.0002, + "loss": 0.5911, + "step": 6170 + }, + { + "epoch": 4.486388384754991, + "grad_norm": 1.0318800210952759, + "learning_rate": 0.0002, + "loss": 0.478, + "step": 6180 + }, + { + "epoch": 4.493647912885662, + "grad_norm": 0.8174677491188049, + "learning_rate": 0.0002, + "loss": 0.5198, + "step": 6190 + }, + { + "epoch": 4.500907441016334, + "grad_norm": 1.4157509803771973, + "learning_rate": 0.0002, + "loss": 0.5387, + "step": 6200 + }, + { + "epoch": 4.508166969147005, + "grad_norm": 1.5244114398956299, + "learning_rate": 0.0002, + "loss": 0.5868, + "step": 6210 + }, + { + "epoch": 4.5154264972776765, + "grad_norm": 0.8164850473403931, + "learning_rate": 0.0002, + "loss": 0.4642, + "step": 6220 + }, + { + "epoch": 4.5226860254083485, + "grad_norm": 1.2904746532440186, + "learning_rate": 0.0002, + "loss": 0.522, + "step": 6230 + }, + { + "epoch": 4.52994555353902, + "grad_norm": 0.7987732887268066, + "learning_rate": 0.0002, + "loss": 0.5103, + "step": 6240 + }, + { + "epoch": 4.537205081669692, + "grad_norm": 0.831040620803833, + "learning_rate": 0.0002, + "loss": 0.4615, + "step": 6250 + }, + { + "epoch": 4.544464609800363, + "grad_norm": 0.9545485973358154, + "learning_rate": 0.0002, + "loss": 0.5065, + "step": 6260 + }, + { + "epoch": 4.551724137931035, + "grad_norm": 0.9291793704032898, + "learning_rate": 0.0002, + "loss": 0.5515, + "step": 6270 + }, + { + "epoch": 4.558983666061706, + "grad_norm": 0.8977208733558655, + "learning_rate": 0.0002, + "loss": 0.4535, + "step": 6280 + }, + { + "epoch": 4.566243194192378, + "grad_norm": 1.1768537759780884, + "learning_rate": 0.0002, + "loss": 0.544, + "step": 6290 + }, + { + "epoch": 4.573502722323049, + "grad_norm": 1.0688952207565308, + "learning_rate": 0.0002, + "loss": 0.5925, + "step": 6300 + }, + { + "epoch": 4.580762250453721, + "grad_norm": 0.8800966739654541, + "learning_rate": 0.0002, + "loss": 0.5207, + "step": 6310 + }, + { + "epoch": 4.588021778584392, + "grad_norm": 1.0911834239959717, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 6320 + }, + { + "epoch": 4.595281306715064, + "grad_norm": 1.1420872211456299, + "learning_rate": 0.0002, + "loss": 0.5109, + "step": 6330 + }, + { + "epoch": 4.602540834845735, + "grad_norm": 1.0215224027633667, + "learning_rate": 0.0002, + "loss": 0.5147, + "step": 6340 + }, + { + "epoch": 4.609800362976406, + "grad_norm": 0.9685489535331726, + "learning_rate": 0.0002, + "loss": 0.592, + "step": 6350 + }, + { + "epoch": 4.617059891107078, + "grad_norm": 1.12773597240448, + "learning_rate": 0.0002, + "loss": 0.5775, + "step": 6360 + }, + { + "epoch": 4.624319419237749, + "grad_norm": 1.0663973093032837, + "learning_rate": 0.0002, + "loss": 0.5966, + "step": 6370 + }, + { + "epoch": 4.631578947368421, + "grad_norm": 1.1707262992858887, + "learning_rate": 0.0002, + "loss": 0.512, + "step": 6380 + }, + { + "epoch": 4.638838475499092, + "grad_norm": 1.0672980546951294, + "learning_rate": 0.0002, + "loss": 0.5497, + "step": 6390 + }, + { + "epoch": 4.646098003629764, + "grad_norm": 1.1464333534240723, + "learning_rate": 0.0002, + "loss": 0.5699, + "step": 6400 + }, + { + "epoch": 4.653357531760435, + "grad_norm": 1.070230484008789, + "learning_rate": 0.0002, + "loss": 0.5514, + "step": 6410 + }, + { + "epoch": 4.660617059891107, + "grad_norm": 0.9673764109611511, + "learning_rate": 0.0002, + "loss": 0.5013, + "step": 6420 + }, + { + "epoch": 4.6678765880217785, + "grad_norm": 1.0189043283462524, + "learning_rate": 0.0002, + "loss": 0.5901, + "step": 6430 + }, + { + "epoch": 4.67513611615245, + "grad_norm": 1.185896396636963, + "learning_rate": 0.0002, + "loss": 0.5193, + "step": 6440 + }, + { + "epoch": 4.682395644283122, + "grad_norm": 1.0682812929153442, + "learning_rate": 0.0002, + "loss": 0.5318, + "step": 6450 + }, + { + "epoch": 4.689655172413794, + "grad_norm": 1.3586071729660034, + "learning_rate": 0.0002, + "loss": 0.5773, + "step": 6460 + }, + { + "epoch": 4.696914700544465, + "grad_norm": 0.6561792492866516, + "learning_rate": 0.0002, + "loss": 0.5482, + "step": 6470 + }, + { + "epoch": 4.704174228675136, + "grad_norm": 1.1394113302230835, + "learning_rate": 0.0002, + "loss": 0.5711, + "step": 6480 + }, + { + "epoch": 4.711433756805808, + "grad_norm": 0.9683151245117188, + "learning_rate": 0.0002, + "loss": 0.5325, + "step": 6490 + }, + { + "epoch": 4.718693284936479, + "grad_norm": 1.0247553586959839, + "learning_rate": 0.0002, + "loss": 0.5139, + "step": 6500 + }, + { + "epoch": 4.725952813067151, + "grad_norm": 0.8046169281005859, + "learning_rate": 0.0002, + "loss": 0.5794, + "step": 6510 + }, + { + "epoch": 4.733212341197822, + "grad_norm": 1.0710240602493286, + "learning_rate": 0.0002, + "loss": 0.5471, + "step": 6520 + }, + { + "epoch": 4.740471869328494, + "grad_norm": 0.9438924193382263, + "learning_rate": 0.0002, + "loss": 0.4805, + "step": 6530 + }, + { + "epoch": 4.747731397459165, + "grad_norm": 0.869162380695343, + "learning_rate": 0.0002, + "loss": 0.5404, + "step": 6540 + }, + { + "epoch": 4.754990925589837, + "grad_norm": 0.9776787161827087, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 6550 + }, + { + "epoch": 4.762250453720508, + "grad_norm": 1.1990505456924438, + "learning_rate": 0.0002, + "loss": 0.5288, + "step": 6560 + }, + { + "epoch": 4.769509981851179, + "grad_norm": 1.0582209825515747, + "learning_rate": 0.0002, + "loss": 0.5539, + "step": 6570 + }, + { + "epoch": 4.776769509981851, + "grad_norm": 0.9966367483139038, + "learning_rate": 0.0002, + "loss": 0.489, + "step": 6580 + }, + { + "epoch": 4.784029038112522, + "grad_norm": 0.9130612015724182, + "learning_rate": 0.0002, + "loss": 0.5514, + "step": 6590 + }, + { + "epoch": 4.791288566243194, + "grad_norm": 1.0950500965118408, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 6600 + }, + { + "epoch": 4.798548094373865, + "grad_norm": 1.108681321144104, + "learning_rate": 0.0002, + "loss": 0.5266, + "step": 6610 + }, + { + "epoch": 4.805807622504537, + "grad_norm": 1.1873763799667358, + "learning_rate": 0.0002, + "loss": 0.5875, + "step": 6620 + }, + { + "epoch": 4.8130671506352085, + "grad_norm": 1.305367112159729, + "learning_rate": 0.0002, + "loss": 0.5736, + "step": 6630 + }, + { + "epoch": 4.8203266787658805, + "grad_norm": 1.2801482677459717, + "learning_rate": 0.0002, + "loss": 0.5636, + "step": 6640 + }, + { + "epoch": 4.827586206896552, + "grad_norm": 1.26764976978302, + "learning_rate": 0.0002, + "loss": 0.582, + "step": 6650 + }, + { + "epoch": 4.834845735027224, + "grad_norm": 1.0018208026885986, + "learning_rate": 0.0002, + "loss": 0.5259, + "step": 6660 + }, + { + "epoch": 4.842105263157895, + "grad_norm": 1.2326326370239258, + "learning_rate": 0.0002, + "loss": 0.548, + "step": 6670 + }, + { + "epoch": 4.849364791288567, + "grad_norm": 0.9707282781600952, + "learning_rate": 0.0002, + "loss": 0.5933, + "step": 6680 + }, + { + "epoch": 4.856624319419238, + "grad_norm": 1.2772048711776733, + "learning_rate": 0.0002, + "loss": 0.5612, + "step": 6690 + }, + { + "epoch": 4.863883847549909, + "grad_norm": 2.6652262210845947, + "learning_rate": 0.0002, + "loss": 0.5346, + "step": 6700 + }, + { + "epoch": 4.871143375680581, + "grad_norm": 1.215828537940979, + "learning_rate": 0.0002, + "loss": 0.5428, + "step": 6710 + }, + { + "epoch": 4.878402903811252, + "grad_norm": 1.3704510927200317, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 6720 + }, + { + "epoch": 4.885662431941924, + "grad_norm": 0.7781757116317749, + "learning_rate": 0.0002, + "loss": 0.4963, + "step": 6730 + }, + { + "epoch": 4.892921960072595, + "grad_norm": 1.1883646249771118, + "learning_rate": 0.0002, + "loss": 0.5989, + "step": 6740 + }, + { + "epoch": 4.900181488203267, + "grad_norm": 0.9216066002845764, + "learning_rate": 0.0002, + "loss": 0.6067, + "step": 6750 + }, + { + "epoch": 4.907441016333938, + "grad_norm": 1.0558464527130127, + "learning_rate": 0.0002, + "loss": 0.5085, + "step": 6760 + }, + { + "epoch": 4.91470054446461, + "grad_norm": 1.032656192779541, + "learning_rate": 0.0002, + "loss": 0.5216, + "step": 6770 + }, + { + "epoch": 4.921960072595281, + "grad_norm": 1.1261441707611084, + "learning_rate": 0.0002, + "loss": 0.5426, + "step": 6780 + }, + { + "epoch": 4.929219600725952, + "grad_norm": 1.2178640365600586, + "learning_rate": 0.0002, + "loss": 0.5295, + "step": 6790 + }, + { + "epoch": 4.936479128856624, + "grad_norm": 1.5369361639022827, + "learning_rate": 0.0002, + "loss": 0.5476, + "step": 6800 + }, + { + "epoch": 4.943738656987296, + "grad_norm": 1.1188377141952515, + "learning_rate": 0.0002, + "loss": 0.5358, + "step": 6810 + }, + { + "epoch": 4.950998185117967, + "grad_norm": 1.2506113052368164, + "learning_rate": 0.0002, + "loss": 0.5483, + "step": 6820 + }, + { + "epoch": 4.9582577132486385, + "grad_norm": 0.8776047825813293, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 6830 + }, + { + "epoch": 4.9655172413793105, + "grad_norm": 0.9700555205345154, + "learning_rate": 0.0002, + "loss": 0.5764, + "step": 6840 + }, + { + "epoch": 4.972776769509982, + "grad_norm": 1.2713534832000732, + "learning_rate": 0.0002, + "loss": 0.5396, + "step": 6850 + }, + { + "epoch": 4.980036297640654, + "grad_norm": 0.9855955243110657, + "learning_rate": 0.0002, + "loss": 0.5451, + "step": 6860 + }, + { + "epoch": 4.987295825771325, + "grad_norm": 0.8734853863716125, + "learning_rate": 0.0002, + "loss": 0.5884, + "step": 6870 + }, + { + "epoch": 4.994555353901997, + "grad_norm": 0.8065403699874878, + "learning_rate": 0.0002, + "loss": 0.5189, + "step": 6880 + }, + { + "epoch": 4.999637023593467, + "eval_loss": 1.3302682638168335, + "eval_runtime": 46.2496, + "eval_samples_per_second": 9.427, + "eval_steps_per_second": 1.189, + "step": 6887 + }, + { + "epoch": 5.001814882032668, + "grad_norm": 0.5163813829421997, + "learning_rate": 0.0002, + "loss": 0.4889, + "step": 6890 + }, + { + "epoch": 5.00907441016334, + "grad_norm": 1.1496137380599976, + "learning_rate": 0.0002, + "loss": 0.3545, + "step": 6900 + }, + { + "epoch": 5.016333938294011, + "grad_norm": 1.0133885145187378, + "learning_rate": 0.0002, + "loss": 0.39, + "step": 6910 + }, + { + "epoch": 5.023593466424682, + "grad_norm": 0.9479621052742004, + "learning_rate": 0.0002, + "loss": 0.3693, + "step": 6920 + }, + { + "epoch": 5.030852994555354, + "grad_norm": 0.8587583303451538, + "learning_rate": 0.0002, + "loss": 0.4012, + "step": 6930 + }, + { + "epoch": 5.038112522686025, + "grad_norm": 1.3314697742462158, + "learning_rate": 0.0002, + "loss": 0.3428, + "step": 6940 + }, + { + "epoch": 5.045372050816697, + "grad_norm": 1.195448875427246, + "learning_rate": 0.0002, + "loss": 0.3909, + "step": 6950 + }, + { + "epoch": 5.052631578947368, + "grad_norm": 1.2482256889343262, + "learning_rate": 0.0002, + "loss": 0.3322, + "step": 6960 + }, + { + "epoch": 5.05989110707804, + "grad_norm": 1.2011528015136719, + "learning_rate": 0.0002, + "loss": 0.3893, + "step": 6970 + }, + { + "epoch": 5.067150635208711, + "grad_norm": 1.3997188806533813, + "learning_rate": 0.0002, + "loss": 0.3265, + "step": 6980 + }, + { + "epoch": 5.074410163339383, + "grad_norm": 1.2147513628005981, + "learning_rate": 0.0002, + "loss": 0.3716, + "step": 6990 + }, + { + "epoch": 5.081669691470054, + "grad_norm": 1.6030137538909912, + "learning_rate": 0.0002, + "loss": 0.4053, + "step": 7000 + }, + { + "epoch": 5.088929219600726, + "grad_norm": 0.9466970562934875, + "learning_rate": 0.0002, + "loss": 0.3665, + "step": 7010 + }, + { + "epoch": 5.096188747731397, + "grad_norm": 1.4593411684036255, + "learning_rate": 0.0002, + "loss": 0.3451, + "step": 7020 + }, + { + "epoch": 5.103448275862069, + "grad_norm": 1.2196033000946045, + "learning_rate": 0.0002, + "loss": 0.3843, + "step": 7030 + }, + { + "epoch": 5.1107078039927405, + "grad_norm": 1.1341328620910645, + "learning_rate": 0.0002, + "loss": 0.3896, + "step": 7040 + }, + { + "epoch": 5.117967332123412, + "grad_norm": 1.2248976230621338, + "learning_rate": 0.0002, + "loss": 0.3627, + "step": 7050 + }, + { + "epoch": 5.125226860254084, + "grad_norm": 1.1620593070983887, + "learning_rate": 0.0002, + "loss": 0.3784, + "step": 7060 + }, + { + "epoch": 5.132486388384755, + "grad_norm": 0.9300723671913147, + "learning_rate": 0.0002, + "loss": 0.3678, + "step": 7070 + }, + { + "epoch": 5.139745916515427, + "grad_norm": 1.2265169620513916, + "learning_rate": 0.0002, + "loss": 0.3756, + "step": 7080 + }, + { + "epoch": 5.147005444646098, + "grad_norm": 1.4430373907089233, + "learning_rate": 0.0002, + "loss": 0.3595, + "step": 7090 + }, + { + "epoch": 5.15426497277677, + "grad_norm": 1.0821576118469238, + "learning_rate": 0.0002, + "loss": 0.3788, + "step": 7100 + }, + { + "epoch": 5.161524500907441, + "grad_norm": 1.2574739456176758, + "learning_rate": 0.0002, + "loss": 0.383, + "step": 7110 + }, + { + "epoch": 5.168784029038113, + "grad_norm": 1.1806069612503052, + "learning_rate": 0.0002, + "loss": 0.3692, + "step": 7120 + }, + { + "epoch": 5.176043557168784, + "grad_norm": 0.9900956153869629, + "learning_rate": 0.0002, + "loss": 0.3978, + "step": 7130 + }, + { + "epoch": 5.183303085299456, + "grad_norm": 1.2414425611495972, + "learning_rate": 0.0002, + "loss": 0.4358, + "step": 7140 + }, + { + "epoch": 5.190562613430127, + "grad_norm": 0.8220699429512024, + "learning_rate": 0.0002, + "loss": 0.3485, + "step": 7150 + }, + { + "epoch": 5.197822141560798, + "grad_norm": 1.29408860206604, + "learning_rate": 0.0002, + "loss": 0.3517, + "step": 7160 + }, + { + "epoch": 5.20508166969147, + "grad_norm": 0.8510639071464539, + "learning_rate": 0.0002, + "loss": 0.3405, + "step": 7170 + }, + { + "epoch": 5.212341197822141, + "grad_norm": 1.3448902368545532, + "learning_rate": 0.0002, + "loss": 0.4233, + "step": 7180 + }, + { + "epoch": 5.219600725952813, + "grad_norm": 1.054451584815979, + "learning_rate": 0.0002, + "loss": 0.3808, + "step": 7190 + }, + { + "epoch": 5.226860254083484, + "grad_norm": 1.3752713203430176, + "learning_rate": 0.0002, + "loss": 0.368, + "step": 7200 + }, + { + "epoch": 5.234119782214156, + "grad_norm": 1.4848095178604126, + "learning_rate": 0.0002, + "loss": 0.3844, + "step": 7210 + }, + { + "epoch": 5.241379310344827, + "grad_norm": 1.428842544555664, + "learning_rate": 0.0002, + "loss": 0.4187, + "step": 7220 + }, + { + "epoch": 5.248638838475499, + "grad_norm": 1.1703591346740723, + "learning_rate": 0.0002, + "loss": 0.3778, + "step": 7230 + }, + { + "epoch": 5.2558983666061705, + "grad_norm": 1.2180451154708862, + "learning_rate": 0.0002, + "loss": 0.417, + "step": 7240 + }, + { + "epoch": 5.2631578947368425, + "grad_norm": 1.094045877456665, + "learning_rate": 0.0002, + "loss": 0.3656, + "step": 7250 + }, + { + "epoch": 5.270417422867514, + "grad_norm": 0.9545766115188599, + "learning_rate": 0.0002, + "loss": 0.4331, + "step": 7260 + }, + { + "epoch": 5.277676950998185, + "grad_norm": 0.8356652855873108, + "learning_rate": 0.0002, + "loss": 0.3642, + "step": 7270 + }, + { + "epoch": 5.284936479128857, + "grad_norm": 1.148160457611084, + "learning_rate": 0.0002, + "loss": 0.3576, + "step": 7280 + }, + { + "epoch": 5.292196007259528, + "grad_norm": 1.2009977102279663, + "learning_rate": 0.0002, + "loss": 0.4178, + "step": 7290 + }, + { + "epoch": 5.2994555353902, + "grad_norm": 1.3283873796463013, + "learning_rate": 0.0002, + "loss": 0.3977, + "step": 7300 + }, + { + "epoch": 5.306715063520871, + "grad_norm": 0.9850481748580933, + "learning_rate": 0.0002, + "loss": 0.3853, + "step": 7310 + }, + { + "epoch": 5.313974591651543, + "grad_norm": 1.367550015449524, + "learning_rate": 0.0002, + "loss": 0.3645, + "step": 7320 + }, + { + "epoch": 5.321234119782214, + "grad_norm": 0.8602936863899231, + "learning_rate": 0.0002, + "loss": 0.3898, + "step": 7330 + }, + { + "epoch": 5.328493647912886, + "grad_norm": 1.1130679845809937, + "learning_rate": 0.0002, + "loss": 0.4173, + "step": 7340 + }, + { + "epoch": 5.335753176043557, + "grad_norm": 1.3002253770828247, + "learning_rate": 0.0002, + "loss": 0.3642, + "step": 7350 + }, + { + "epoch": 5.343012704174229, + "grad_norm": 1.6235289573669434, + "learning_rate": 0.0002, + "loss": 0.4138, + "step": 7360 + }, + { + "epoch": 5.3502722323049, + "grad_norm": 1.156379222869873, + "learning_rate": 0.0002, + "loss": 0.4779, + "step": 7370 + }, + { + "epoch": 5.357531760435572, + "grad_norm": 1.0569308996200562, + "learning_rate": 0.0002, + "loss": 0.3222, + "step": 7380 + }, + { + "epoch": 5.364791288566243, + "grad_norm": 1.6674021482467651, + "learning_rate": 0.0002, + "loss": 0.3573, + "step": 7390 + }, + { + "epoch": 5.372050816696914, + "grad_norm": 1.2962018251419067, + "learning_rate": 0.0002, + "loss": 0.4325, + "step": 7400 + }, + { + "epoch": 5.379310344827586, + "grad_norm": 1.1904195547103882, + "learning_rate": 0.0002, + "loss": 0.3809, + "step": 7410 + }, + { + "epoch": 5.386569872958257, + "grad_norm": 1.316245675086975, + "learning_rate": 0.0002, + "loss": 0.3728, + "step": 7420 + }, + { + "epoch": 5.393829401088929, + "grad_norm": 1.127570390701294, + "learning_rate": 0.0002, + "loss": 0.4096, + "step": 7430 + }, + { + "epoch": 5.4010889292196005, + "grad_norm": 1.3895777463912964, + "learning_rate": 0.0002, + "loss": 0.3933, + "step": 7440 + }, + { + "epoch": 5.4083484573502725, + "grad_norm": 1.626830816268921, + "learning_rate": 0.0002, + "loss": 0.4085, + "step": 7450 + }, + { + "epoch": 5.415607985480944, + "grad_norm": 1.3703926801681519, + "learning_rate": 0.0002, + "loss": 0.4186, + "step": 7460 + }, + { + "epoch": 5.422867513611616, + "grad_norm": 1.3854840993881226, + "learning_rate": 0.0002, + "loss": 0.3517, + "step": 7470 + }, + { + "epoch": 5.430127041742287, + "grad_norm": 1.107065200805664, + "learning_rate": 0.0002, + "loss": 0.3714, + "step": 7480 + }, + { + "epoch": 5.437386569872959, + "grad_norm": 0.7843456268310547, + "learning_rate": 0.0002, + "loss": 0.3855, + "step": 7490 + }, + { + "epoch": 5.44464609800363, + "grad_norm": 1.6692372560501099, + "learning_rate": 0.0002, + "loss": 0.4159, + "step": 7500 + }, + { + "epoch": 5.451905626134302, + "grad_norm": 1.2583858966827393, + "learning_rate": 0.0002, + "loss": 0.4185, + "step": 7510 + }, + { + "epoch": 5.459165154264973, + "grad_norm": 1.6827000379562378, + "learning_rate": 0.0002, + "loss": 0.4401, + "step": 7520 + }, + { + "epoch": 5.466424682395644, + "grad_norm": 1.6680560111999512, + "learning_rate": 0.0002, + "loss": 0.397, + "step": 7530 + }, + { + "epoch": 5.473684210526316, + "grad_norm": 1.3696072101593018, + "learning_rate": 0.0002, + "loss": 0.4193, + "step": 7540 + }, + { + "epoch": 5.480943738656987, + "grad_norm": 1.4523496627807617, + "learning_rate": 0.0002, + "loss": 0.4244, + "step": 7550 + }, + { + "epoch": 5.488203266787659, + "grad_norm": 1.3432692289352417, + "learning_rate": 0.0002, + "loss": 0.3609, + "step": 7560 + }, + { + "epoch": 5.49546279491833, + "grad_norm": 1.363818645477295, + "learning_rate": 0.0002, + "loss": 0.3675, + "step": 7570 + }, + { + "epoch": 5.502722323049002, + "grad_norm": 1.0176721811294556, + "learning_rate": 0.0002, + "loss": 0.3726, + "step": 7580 + }, + { + "epoch": 5.509981851179673, + "grad_norm": 1.1625547409057617, + "learning_rate": 0.0002, + "loss": 0.3751, + "step": 7590 + }, + { + "epoch": 5.517241379310345, + "grad_norm": 1.2480388879776, + "learning_rate": 0.0002, + "loss": 0.433, + "step": 7600 + }, + { + "epoch": 5.524500907441016, + "grad_norm": 1.341509222984314, + "learning_rate": 0.0002, + "loss": 0.4511, + "step": 7610 + }, + { + "epoch": 5.531760435571687, + "grad_norm": 1.7048436403274536, + "learning_rate": 0.0002, + "loss": 0.4642, + "step": 7620 + }, + { + "epoch": 5.539019963702359, + "grad_norm": 1.1435480117797852, + "learning_rate": 0.0002, + "loss": 0.4509, + "step": 7630 + }, + { + "epoch": 5.5462794918330305, + "grad_norm": 1.2381842136383057, + "learning_rate": 0.0002, + "loss": 0.4528, + "step": 7640 + }, + { + "epoch": 5.5535390199637025, + "grad_norm": 1.50786292552948, + "learning_rate": 0.0002, + "loss": 0.4496, + "step": 7650 + }, + { + "epoch": 5.560798548094374, + "grad_norm": 1.2263519763946533, + "learning_rate": 0.0002, + "loss": 0.4242, + "step": 7660 + }, + { + "epoch": 5.568058076225046, + "grad_norm": 1.2864696979522705, + "learning_rate": 0.0002, + "loss": 0.418, + "step": 7670 + }, + { + "epoch": 5.575317604355717, + "grad_norm": 1.4443191289901733, + "learning_rate": 0.0002, + "loss": 0.3832, + "step": 7680 + }, + { + "epoch": 5.582577132486389, + "grad_norm": 1.3360971212387085, + "learning_rate": 0.0002, + "loss": 0.3964, + "step": 7690 + }, + { + "epoch": 5.58983666061706, + "grad_norm": 1.391828179359436, + "learning_rate": 0.0002, + "loss": 0.4639, + "step": 7700 + }, + { + "epoch": 5.597096188747732, + "grad_norm": 1.3699384927749634, + "learning_rate": 0.0002, + "loss": 0.4722, + "step": 7710 + }, + { + "epoch": 5.604355716878403, + "grad_norm": 1.3778468370437622, + "learning_rate": 0.0002, + "loss": 0.4302, + "step": 7720 + }, + { + "epoch": 5.611615245009075, + "grad_norm": 1.1009501218795776, + "learning_rate": 0.0002, + "loss": 0.4179, + "step": 7730 + }, + { + "epoch": 5.618874773139746, + "grad_norm": 1.0410021543502808, + "learning_rate": 0.0002, + "loss": 0.4104, + "step": 7740 + }, + { + "epoch": 5.626134301270417, + "grad_norm": 1.1012226343154907, + "learning_rate": 0.0002, + "loss": 0.4489, + "step": 7750 + }, + { + "epoch": 5.633393829401089, + "grad_norm": 1.3246384859085083, + "learning_rate": 0.0002, + "loss": 0.4544, + "step": 7760 + }, + { + "epoch": 5.64065335753176, + "grad_norm": 1.4301716089248657, + "learning_rate": 0.0002, + "loss": 0.4381, + "step": 7770 + }, + { + "epoch": 5.647912885662432, + "grad_norm": 1.1368978023529053, + "learning_rate": 0.0002, + "loss": 0.4297, + "step": 7780 + }, + { + "epoch": 5.655172413793103, + "grad_norm": 1.3493064641952515, + "learning_rate": 0.0002, + "loss": 0.4063, + "step": 7790 + }, + { + "epoch": 5.662431941923775, + "grad_norm": 1.3328721523284912, + "learning_rate": 0.0002, + "loss": 0.4562, + "step": 7800 + }, + { + "epoch": 5.669691470054446, + "grad_norm": 1.3235671520233154, + "learning_rate": 0.0002, + "loss": 0.4075, + "step": 7810 + }, + { + "epoch": 5.676950998185118, + "grad_norm": 1.1961841583251953, + "learning_rate": 0.0002, + "loss": 0.4589, + "step": 7820 + }, + { + "epoch": 5.684210526315789, + "grad_norm": 1.4189636707305908, + "learning_rate": 0.0002, + "loss": 0.4503, + "step": 7830 + }, + { + "epoch": 5.691470054446461, + "grad_norm": 1.3551312685012817, + "learning_rate": 0.0002, + "loss": 0.4452, + "step": 7840 + }, + { + "epoch": 5.6987295825771325, + "grad_norm": 1.449987769126892, + "learning_rate": 0.0002, + "loss": 0.4268, + "step": 7850 + }, + { + "epoch": 5.7059891107078045, + "grad_norm": 1.1225156784057617, + "learning_rate": 0.0002, + "loss": 0.4141, + "step": 7860 + }, + { + "epoch": 5.713248638838476, + "grad_norm": 1.4734594821929932, + "learning_rate": 0.0002, + "loss": 0.41, + "step": 7870 + }, + { + "epoch": 5.720508166969147, + "grad_norm": 1.3793359994888306, + "learning_rate": 0.0002, + "loss": 0.4013, + "step": 7880 + }, + { + "epoch": 5.727767695099819, + "grad_norm": 1.2431834936141968, + "learning_rate": 0.0002, + "loss": 0.4065, + "step": 7890 + }, + { + "epoch": 5.73502722323049, + "grad_norm": 1.1158313751220703, + "learning_rate": 0.0002, + "loss": 0.4595, + "step": 7900 + }, + { + "epoch": 5.742286751361162, + "grad_norm": 1.212248682975769, + "learning_rate": 0.0002, + "loss": 0.4342, + "step": 7910 + }, + { + "epoch": 5.749546279491833, + "grad_norm": 1.5259995460510254, + "learning_rate": 0.0002, + "loss": 0.4611, + "step": 7920 + }, + { + "epoch": 5.756805807622505, + "grad_norm": 1.3909121751785278, + "learning_rate": 0.0002, + "loss": 0.4483, + "step": 7930 + }, + { + "epoch": 5.764065335753176, + "grad_norm": 1.2511249780654907, + "learning_rate": 0.0002, + "loss": 0.4325, + "step": 7940 + }, + { + "epoch": 5.771324863883848, + "grad_norm": 1.2511906623840332, + "learning_rate": 0.0002, + "loss": 0.4048, + "step": 7950 + }, + { + "epoch": 5.778584392014519, + "grad_norm": 1.1489921808242798, + "learning_rate": 0.0002, + "loss": 0.3715, + "step": 7960 + }, + { + "epoch": 5.78584392014519, + "grad_norm": 1.028943419456482, + "learning_rate": 0.0002, + "loss": 0.4196, + "step": 7970 + }, + { + "epoch": 5.793103448275862, + "grad_norm": 1.0820423364639282, + "learning_rate": 0.0002, + "loss": 0.4334, + "step": 7980 + }, + { + "epoch": 5.800362976406533, + "grad_norm": 1.296520471572876, + "learning_rate": 0.0002, + "loss": 0.3917, + "step": 7990 + }, + { + "epoch": 5.807622504537205, + "grad_norm": 1.3597749471664429, + "learning_rate": 0.0002, + "loss": 0.4509, + "step": 8000 + }, + { + "epoch": 5.814882032667876, + "grad_norm": 0.8741790652275085, + "learning_rate": 0.0002, + "loss": 0.4535, + "step": 8010 + }, + { + "epoch": 5.822141560798548, + "grad_norm": 1.1471822261810303, + "learning_rate": 0.0002, + "loss": 0.4239, + "step": 8020 + }, + { + "epoch": 5.829401088929219, + "grad_norm": 1.2997334003448486, + "learning_rate": 0.0002, + "loss": 0.5042, + "step": 8030 + }, + { + "epoch": 5.836660617059891, + "grad_norm": 1.1027175188064575, + "learning_rate": 0.0002, + "loss": 0.4758, + "step": 8040 + }, + { + "epoch": 5.8439201451905625, + "grad_norm": 1.2695307731628418, + "learning_rate": 0.0002, + "loss": 0.4192, + "step": 8050 + }, + { + "epoch": 5.8511796733212345, + "grad_norm": 1.5275461673736572, + "learning_rate": 0.0002, + "loss": 0.5173, + "step": 8060 + }, + { + "epoch": 5.8584392014519056, + "grad_norm": 1.3059501647949219, + "learning_rate": 0.0002, + "loss": 0.5012, + "step": 8070 + }, + { + "epoch": 5.8656987295825775, + "grad_norm": 1.57442045211792, + "learning_rate": 0.0002, + "loss": 0.4425, + "step": 8080 + }, + { + "epoch": 5.872958257713249, + "grad_norm": 1.119564414024353, + "learning_rate": 0.0002, + "loss": 0.4261, + "step": 8090 + }, + { + "epoch": 5.88021778584392, + "grad_norm": 1.6517373323440552, + "learning_rate": 0.0002, + "loss": 0.465, + "step": 8100 + }, + { + "epoch": 5.887477313974592, + "grad_norm": 1.4093554019927979, + "learning_rate": 0.0002, + "loss": 0.4406, + "step": 8110 + }, + { + "epoch": 5.894736842105263, + "grad_norm": 1.278843641281128, + "learning_rate": 0.0002, + "loss": 0.4433, + "step": 8120 + }, + { + "epoch": 5.901996370235935, + "grad_norm": 1.2042944431304932, + "learning_rate": 0.0002, + "loss": 0.4007, + "step": 8130 + }, + { + "epoch": 5.909255898366606, + "grad_norm": 1.1788326501846313, + "learning_rate": 0.0002, + "loss": 0.3972, + "step": 8140 + }, + { + "epoch": 5.916515426497278, + "grad_norm": 1.4364569187164307, + "learning_rate": 0.0002, + "loss": 0.4506, + "step": 8150 + }, + { + "epoch": 5.923774954627949, + "grad_norm": 1.1704283952713013, + "learning_rate": 0.0002, + "loss": 0.4651, + "step": 8160 + }, + { + "epoch": 5.931034482758621, + "grad_norm": 1.040814995765686, + "learning_rate": 0.0002, + "loss": 0.3972, + "step": 8170 + }, + { + "epoch": 5.938294010889292, + "grad_norm": 1.1367416381835938, + "learning_rate": 0.0002, + "loss": 0.4038, + "step": 8180 + }, + { + "epoch": 5.945553539019964, + "grad_norm": 1.3401511907577515, + "learning_rate": 0.0002, + "loss": 0.4387, + "step": 8190 + }, + { + "epoch": 5.952813067150635, + "grad_norm": 1.1154041290283203, + "learning_rate": 0.0002, + "loss": 0.4396, + "step": 8200 + }, + { + "epoch": 5.960072595281307, + "grad_norm": 1.426089882850647, + "learning_rate": 0.0002, + "loss": 0.4744, + "step": 8210 + }, + { + "epoch": 5.967332123411978, + "grad_norm": 1.3170222043991089, + "learning_rate": 0.0002, + "loss": 0.4105, + "step": 8220 + }, + { + "epoch": 5.974591651542649, + "grad_norm": 1.1960029602050781, + "learning_rate": 0.0002, + "loss": 0.4137, + "step": 8230 + }, + { + "epoch": 5.981851179673321, + "grad_norm": 1.0843931436538696, + "learning_rate": 0.0002, + "loss": 0.423, + "step": 8240 + }, + { + "epoch": 5.9891107078039925, + "grad_norm": 1.050421118736267, + "learning_rate": 0.0002, + "loss": 0.459, + "step": 8250 + }, + { + "epoch": 5.9963702359346644, + "grad_norm": 1.0183138847351074, + "learning_rate": 0.0002, + "loss": 0.3993, + "step": 8260 + }, + { + "epoch": 6.0, + "eval_loss": 1.4677470922470093, + "eval_runtime": 46.2504, + "eval_samples_per_second": 9.427, + "eval_steps_per_second": 1.189, + "step": 8265 + } + ], + "logging_steps": 10, + "max_steps": 11016, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.245213576167424e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-8265/training_args.bin b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-8265/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..140bceabdec44fc473842cb9ace4a62a4dc9fe89 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-8265/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4abec5480aff960b5a7b03a48be1cdb1ae29ba8792469c451f5c2f269e4be93b +size 5560 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-9642/README.md b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-9642/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-9642/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-9642/adapter_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-9642/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-9642/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-9642/adapter_model.safetensors b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-9642/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a29550672985a02772104c9364a326af8516392f --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-9642/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbfa96848843eee83f689b4aa8c81d3536f5fda32736d2f1bc88c7aaf8d8d581 +size 143153376 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-9642/optimizer.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-9642/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..9015ff4f0ebdd6af207e0f9bedb65376cb721252 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-9642/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45bf5cb7fa854fc4154de94b62b2a2f6065395ab838e15396207bc7140a60ac0 +size 72886650 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-9642/rng_state.pth b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-9642/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..3ebd19de3bde8f3039b2d46160e1963c74508e88 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-9642/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bfe69a522f129b0b868fa3f22242a0f3a02106525c851adceef61abf24ab6ab +size 14244 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-9642/scheduler.pt b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-9642/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f5a3b6fa20bce38a9ee756d031b747473d48d900 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-9642/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fafd2ca8d1fb4578cad77d5f424303ecbe66d43ed15127356f9414a5f5e0997e +size 1064 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-9642/special_tokens_map.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-9642/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-9642/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-9642/tokenizer.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-9642/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-9642/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-9642/tokenizer.model b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-9642/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-9642/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-9642/tokenizer_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-9642/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-9642/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-9642/trainer_state.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-9642/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..34cc6d54c103f47037a381a3332231f4248cb873 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-9642/trainer_state.json @@ -0,0 +1,6837 @@ +{ + "best_metric": 1.1086540222167969, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755", + "epoch": 6.999637023593467, + "eval_steps": 10, + "global_step": 9642, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.007259528130671506, + "grad_norm": 0.46010470390319824, + "learning_rate": 0.0002, + "loss": 2.4936, + "step": 10 + }, + { + "epoch": 0.014519056261343012, + "grad_norm": 0.5103179216384888, + "learning_rate": 0.0002, + "loss": 1.8442, + "step": 20 + }, + { + "epoch": 0.021778584392014518, + "grad_norm": 0.6282716989517212, + "learning_rate": 0.0002, + "loss": 1.559, + "step": 30 + }, + { + "epoch": 0.029038112522686024, + "grad_norm": 1.2480497360229492, + "learning_rate": 0.0002, + "loss": 1.3618, + "step": 40 + }, + { + "epoch": 0.036297640653357534, + "grad_norm": 0.4114855229854584, + "learning_rate": 0.0002, + "loss": 1.3874, + "step": 50 + }, + { + "epoch": 0.043557168784029036, + "grad_norm": 0.49482840299606323, + "learning_rate": 0.0002, + "loss": 1.2836, + "step": 60 + }, + { + "epoch": 0.050816696914700546, + "grad_norm": 0.4536272883415222, + "learning_rate": 0.0002, + "loss": 1.1728, + "step": 70 + }, + { + "epoch": 0.05807622504537205, + "grad_norm": 0.32328274846076965, + "learning_rate": 0.0002, + "loss": 1.1125, + "step": 80 + }, + { + "epoch": 0.06533575317604355, + "grad_norm": 0.40990468859672546, + "learning_rate": 0.0002, + "loss": 1.2447, + "step": 90 + }, + { + "epoch": 0.07259528130671507, + "grad_norm": 0.37273502349853516, + "learning_rate": 0.0002, + "loss": 1.2216, + "step": 100 + }, + { + "epoch": 0.07985480943738657, + "grad_norm": 0.3903871476650238, + "learning_rate": 0.0002, + "loss": 1.2259, + "step": 110 + }, + { + "epoch": 0.08711433756805807, + "grad_norm": 0.3272787928581238, + "learning_rate": 0.0002, + "loss": 1.3718, + "step": 120 + }, + { + "epoch": 0.09437386569872959, + "grad_norm": 0.3622824251651764, + "learning_rate": 0.0002, + "loss": 1.1832, + "step": 130 + }, + { + "epoch": 0.10163339382940109, + "grad_norm": 0.3503916561603546, + "learning_rate": 0.0002, + "loss": 1.2966, + "step": 140 + }, + { + "epoch": 0.1088929219600726, + "grad_norm": 0.32787832617759705, + "learning_rate": 0.0002, + "loss": 1.0913, + "step": 150 + }, + { + "epoch": 0.1161524500907441, + "grad_norm": 0.5822657942771912, + "learning_rate": 0.0002, + "loss": 1.2143, + "step": 160 + }, + { + "epoch": 0.12341197822141561, + "grad_norm": 0.28028249740600586, + "learning_rate": 0.0002, + "loss": 1.2091, + "step": 170 + }, + { + "epoch": 0.1306715063520871, + "grad_norm": 0.33602750301361084, + "learning_rate": 0.0002, + "loss": 1.1731, + "step": 180 + }, + { + "epoch": 0.13793103448275862, + "grad_norm": 0.29106274247169495, + "learning_rate": 0.0002, + "loss": 1.127, + "step": 190 + }, + { + "epoch": 0.14519056261343014, + "grad_norm": 0.38753363490104675, + "learning_rate": 0.0002, + "loss": 1.3261, + "step": 200 + }, + { + "epoch": 0.15245009074410162, + "grad_norm": 0.361009418964386, + "learning_rate": 0.0002, + "loss": 1.1536, + "step": 210 + }, + { + "epoch": 0.15970961887477314, + "grad_norm": 0.6743836402893066, + "learning_rate": 0.0002, + "loss": 1.1483, + "step": 220 + }, + { + "epoch": 0.16696914700544466, + "grad_norm": 0.3923613727092743, + "learning_rate": 0.0002, + "loss": 1.264, + "step": 230 + }, + { + "epoch": 0.17422867513611615, + "grad_norm": 0.2809699773788452, + "learning_rate": 0.0002, + "loss": 1.0437, + "step": 240 + }, + { + "epoch": 0.18148820326678766, + "grad_norm": 0.3631494641304016, + "learning_rate": 0.0002, + "loss": 1.1826, + "step": 250 + }, + { + "epoch": 0.18874773139745918, + "grad_norm": 0.24658171832561493, + "learning_rate": 0.0002, + "loss": 1.1584, + "step": 260 + }, + { + "epoch": 0.19600725952813067, + "grad_norm": 0.5780664682388306, + "learning_rate": 0.0002, + "loss": 1.0065, + "step": 270 + }, + { + "epoch": 0.20326678765880218, + "grad_norm": 0.3056720495223999, + "learning_rate": 0.0002, + "loss": 1.1251, + "step": 280 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 0.2857084572315216, + "learning_rate": 0.0002, + "loss": 1.1503, + "step": 290 + }, + { + "epoch": 0.2177858439201452, + "grad_norm": 0.6645345687866211, + "learning_rate": 0.0002, + "loss": 1.1215, + "step": 300 + }, + { + "epoch": 0.2250453720508167, + "grad_norm": 0.5966078639030457, + "learning_rate": 0.0002, + "loss": 1.127, + "step": 310 + }, + { + "epoch": 0.2323049001814882, + "grad_norm": 0.40937140583992004, + "learning_rate": 0.0002, + "loss": 1.1974, + "step": 320 + }, + { + "epoch": 0.2395644283121597, + "grad_norm": 0.5642806887626648, + "learning_rate": 0.0002, + "loss": 1.2385, + "step": 330 + }, + { + "epoch": 0.24682395644283123, + "grad_norm": 0.2750748097896576, + "learning_rate": 0.0002, + "loss": 1.1703, + "step": 340 + }, + { + "epoch": 0.2540834845735027, + "grad_norm": 0.34350234270095825, + "learning_rate": 0.0002, + "loss": 1.2598, + "step": 350 + }, + { + "epoch": 0.2613430127041742, + "grad_norm": 0.6767239570617676, + "learning_rate": 0.0002, + "loss": 1.1942, + "step": 360 + }, + { + "epoch": 0.26860254083484575, + "grad_norm": 0.31006959080696106, + "learning_rate": 0.0002, + "loss": 1.1436, + "step": 370 + }, + { + "epoch": 0.27586206896551724, + "grad_norm": 0.3825474679470062, + "learning_rate": 0.0002, + "loss": 1.2084, + "step": 380 + }, + { + "epoch": 0.2831215970961887, + "grad_norm": 0.30241551995277405, + "learning_rate": 0.0002, + "loss": 1.1523, + "step": 390 + }, + { + "epoch": 0.29038112522686027, + "grad_norm": 0.2962397336959839, + "learning_rate": 0.0002, + "loss": 1.1298, + "step": 400 + }, + { + "epoch": 0.29764065335753176, + "grad_norm": 0.2600369155406952, + "learning_rate": 0.0002, + "loss": 1.1299, + "step": 410 + }, + { + "epoch": 0.30490018148820325, + "grad_norm": 0.3675060272216797, + "learning_rate": 0.0002, + "loss": 1.1366, + "step": 420 + }, + { + "epoch": 0.3121597096188748, + "grad_norm": 0.3429498076438904, + "learning_rate": 0.0002, + "loss": 1.156, + "step": 430 + }, + { + "epoch": 0.3194192377495463, + "grad_norm": 0.34311825037002563, + "learning_rate": 0.0002, + "loss": 1.2741, + "step": 440 + }, + { + "epoch": 0.32667876588021777, + "grad_norm": 0.37872210144996643, + "learning_rate": 0.0002, + "loss": 1.3523, + "step": 450 + }, + { + "epoch": 0.3339382940108893, + "grad_norm": 0.33271121978759766, + "learning_rate": 0.0002, + "loss": 1.1365, + "step": 460 + }, + { + "epoch": 0.3411978221415608, + "grad_norm": 0.34605276584625244, + "learning_rate": 0.0002, + "loss": 1.1572, + "step": 470 + }, + { + "epoch": 0.3484573502722323, + "grad_norm": 0.41050562262535095, + "learning_rate": 0.0002, + "loss": 1.2755, + "step": 480 + }, + { + "epoch": 0.35571687840290384, + "grad_norm": 0.2066836953163147, + "learning_rate": 0.0002, + "loss": 1.075, + "step": 490 + }, + { + "epoch": 0.3629764065335753, + "grad_norm": 0.2859014868736267, + "learning_rate": 0.0002, + "loss": 1.2072, + "step": 500 + }, + { + "epoch": 0.3702359346642468, + "grad_norm": 0.28763777017593384, + "learning_rate": 0.0002, + "loss": 1.1435, + "step": 510 + }, + { + "epoch": 0.37749546279491836, + "grad_norm": 0.2730471193790436, + "learning_rate": 0.0002, + "loss": 1.1341, + "step": 520 + }, + { + "epoch": 0.38475499092558985, + "grad_norm": 0.3968936800956726, + "learning_rate": 0.0002, + "loss": 1.348, + "step": 530 + }, + { + "epoch": 0.39201451905626133, + "grad_norm": 0.3624701201915741, + "learning_rate": 0.0002, + "loss": 1.2363, + "step": 540 + }, + { + "epoch": 0.3992740471869328, + "grad_norm": 0.3303608298301697, + "learning_rate": 0.0002, + "loss": 1.1206, + "step": 550 + }, + { + "epoch": 0.40653357531760437, + "grad_norm": 0.33507466316223145, + "learning_rate": 0.0002, + "loss": 1.203, + "step": 560 + }, + { + "epoch": 0.41379310344827586, + "grad_norm": 0.3297670781612396, + "learning_rate": 0.0002, + "loss": 1.2294, + "step": 570 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 0.32334890961647034, + "learning_rate": 0.0002, + "loss": 1.2, + "step": 580 + }, + { + "epoch": 0.4283121597096189, + "grad_norm": 0.30281195044517517, + "learning_rate": 0.0002, + "loss": 1.112, + "step": 590 + }, + { + "epoch": 0.4355716878402904, + "grad_norm": 0.5900027751922607, + "learning_rate": 0.0002, + "loss": 1.1352, + "step": 600 + }, + { + "epoch": 0.44283121597096187, + "grad_norm": 0.28437477350234985, + "learning_rate": 0.0002, + "loss": 1.1575, + "step": 610 + }, + { + "epoch": 0.4500907441016334, + "grad_norm": 0.39601704478263855, + "learning_rate": 0.0002, + "loss": 1.2009, + "step": 620 + }, + { + "epoch": 0.4573502722323049, + "grad_norm": 0.41971510648727417, + "learning_rate": 0.0002, + "loss": 1.2302, + "step": 630 + }, + { + "epoch": 0.4646098003629764, + "grad_norm": 0.33814409375190735, + "learning_rate": 0.0002, + "loss": 1.236, + "step": 640 + }, + { + "epoch": 0.47186932849364793, + "grad_norm": 0.575718104839325, + "learning_rate": 0.0002, + "loss": 1.2471, + "step": 650 + }, + { + "epoch": 0.4791288566243194, + "grad_norm": 0.37927401065826416, + "learning_rate": 0.0002, + "loss": 1.1012, + "step": 660 + }, + { + "epoch": 0.4863883847549909, + "grad_norm": 0.3224332630634308, + "learning_rate": 0.0002, + "loss": 1.1552, + "step": 670 + }, + { + "epoch": 0.49364791288566245, + "grad_norm": 0.32683515548706055, + "learning_rate": 0.0002, + "loss": 1.1354, + "step": 680 + }, + { + "epoch": 0.5009074410163339, + "grad_norm": 0.4316163659095764, + "learning_rate": 0.0002, + "loss": 1.207, + "step": 690 + }, + { + "epoch": 0.5081669691470054, + "grad_norm": 0.342602401971817, + "learning_rate": 0.0002, + "loss": 1.2116, + "step": 700 + }, + { + "epoch": 0.515426497277677, + "grad_norm": 0.2794898748397827, + "learning_rate": 0.0002, + "loss": 1.0433, + "step": 710 + }, + { + "epoch": 0.5226860254083484, + "grad_norm": 0.3322339951992035, + "learning_rate": 0.0002, + "loss": 1.2395, + "step": 720 + }, + { + "epoch": 0.52994555353902, + "grad_norm": 0.3088509142398834, + "learning_rate": 0.0002, + "loss": 1.0762, + "step": 730 + }, + { + "epoch": 0.5372050816696915, + "grad_norm": 0.24444378912448883, + "learning_rate": 0.0002, + "loss": 1.1495, + "step": 740 + }, + { + "epoch": 0.5444646098003629, + "grad_norm": 0.3483171761035919, + "learning_rate": 0.0002, + "loss": 1.2176, + "step": 750 + }, + { + "epoch": 0.5517241379310345, + "grad_norm": 0.3425690531730652, + "learning_rate": 0.0002, + "loss": 1.2248, + "step": 760 + }, + { + "epoch": 0.558983666061706, + "grad_norm": 0.31841927766799927, + "learning_rate": 0.0002, + "loss": 1.1127, + "step": 770 + }, + { + "epoch": 0.5662431941923775, + "grad_norm": 0.39423868060112, + "learning_rate": 0.0002, + "loss": 1.1841, + "step": 780 + }, + { + "epoch": 0.573502722323049, + "grad_norm": 0.30328479409217834, + "learning_rate": 0.0002, + "loss": 1.0583, + "step": 790 + }, + { + "epoch": 0.5807622504537205, + "grad_norm": 0.24475938081741333, + "learning_rate": 0.0002, + "loss": 1.0718, + "step": 800 + }, + { + "epoch": 0.588021778584392, + "grad_norm": 0.37132805585861206, + "learning_rate": 0.0002, + "loss": 1.2006, + "step": 810 + }, + { + "epoch": 0.5952813067150635, + "grad_norm": 0.32195979356765747, + "learning_rate": 0.0002, + "loss": 1.1544, + "step": 820 + }, + { + "epoch": 0.6025408348457351, + "grad_norm": 0.2848738729953766, + "learning_rate": 0.0002, + "loss": 0.9937, + "step": 830 + }, + { + "epoch": 0.6098003629764065, + "grad_norm": 0.28015264868736267, + "learning_rate": 0.0002, + "loss": 1.0937, + "step": 840 + }, + { + "epoch": 0.617059891107078, + "grad_norm": 0.37796008586883545, + "learning_rate": 0.0002, + "loss": 1.1234, + "step": 850 + }, + { + "epoch": 0.6243194192377496, + "grad_norm": 0.39311841130256653, + "learning_rate": 0.0002, + "loss": 1.1455, + "step": 860 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 0.2761685252189636, + "learning_rate": 0.0002, + "loss": 1.1066, + "step": 870 + }, + { + "epoch": 0.6388384754990926, + "grad_norm": 0.3826720118522644, + "learning_rate": 0.0002, + "loss": 1.0859, + "step": 880 + }, + { + "epoch": 0.6460980036297641, + "grad_norm": 0.30076679587364197, + "learning_rate": 0.0002, + "loss": 1.1182, + "step": 890 + }, + { + "epoch": 0.6533575317604355, + "grad_norm": 0.21997687220573425, + "learning_rate": 0.0002, + "loss": 1.0927, + "step": 900 + }, + { + "epoch": 0.6606170598911071, + "grad_norm": 0.32593777775764465, + "learning_rate": 0.0002, + "loss": 1.1603, + "step": 910 + }, + { + "epoch": 0.6678765880217786, + "grad_norm": 0.30347898602485657, + "learning_rate": 0.0002, + "loss": 1.1578, + "step": 920 + }, + { + "epoch": 0.6751361161524501, + "grad_norm": 0.44173774123191833, + "learning_rate": 0.0002, + "loss": 1.1642, + "step": 930 + }, + { + "epoch": 0.6823956442831216, + "grad_norm": 0.2507467269897461, + "learning_rate": 0.0002, + "loss": 1.1832, + "step": 940 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 0.29463833570480347, + "learning_rate": 0.0002, + "loss": 1.3001, + "step": 950 + }, + { + "epoch": 0.6969147005444646, + "grad_norm": 0.9363154172897339, + "learning_rate": 0.0002, + "loss": 1.1651, + "step": 960 + }, + { + "epoch": 0.7041742286751361, + "grad_norm": 0.3236212134361267, + "learning_rate": 0.0002, + "loss": 1.158, + "step": 970 + }, + { + "epoch": 0.7114337568058077, + "grad_norm": 0.3123254179954529, + "learning_rate": 0.0002, + "loss": 1.1078, + "step": 980 + }, + { + "epoch": 0.7186932849364791, + "grad_norm": 0.3395805060863495, + "learning_rate": 0.0002, + "loss": 1.1578, + "step": 990 + }, + { + "epoch": 0.7259528130671506, + "grad_norm": 0.3240964412689209, + "learning_rate": 0.0002, + "loss": 1.0754, + "step": 1000 + }, + { + "epoch": 0.7332123411978222, + "grad_norm": 0.31902948021888733, + "learning_rate": 0.0002, + "loss": 1.0313, + "step": 1010 + }, + { + "epoch": 0.7404718693284936, + "grad_norm": 0.4848408102989197, + "learning_rate": 0.0002, + "loss": 1.1664, + "step": 1020 + }, + { + "epoch": 0.7477313974591652, + "grad_norm": 0.33006033301353455, + "learning_rate": 0.0002, + "loss": 1.0838, + "step": 1030 + }, + { + "epoch": 0.7549909255898367, + "grad_norm": 0.2928730547428131, + "learning_rate": 0.0002, + "loss": 1.0979, + "step": 1040 + }, + { + "epoch": 0.7622504537205081, + "grad_norm": 0.3529164791107178, + "learning_rate": 0.0002, + "loss": 1.1351, + "step": 1050 + }, + { + "epoch": 0.7695099818511797, + "grad_norm": 0.2736213803291321, + "learning_rate": 0.0002, + "loss": 1.1274, + "step": 1060 + }, + { + "epoch": 0.7767695099818511, + "grad_norm": 0.7200686931610107, + "learning_rate": 0.0002, + "loss": 1.1169, + "step": 1070 + }, + { + "epoch": 0.7840290381125227, + "grad_norm": 0.33396708965301514, + "learning_rate": 0.0002, + "loss": 1.1171, + "step": 1080 + }, + { + "epoch": 0.7912885662431942, + "grad_norm": 1.5760449171066284, + "learning_rate": 0.0002, + "loss": 1.2447, + "step": 1090 + }, + { + "epoch": 0.7985480943738656, + "grad_norm": 0.28138381242752075, + "learning_rate": 0.0002, + "loss": 1.1126, + "step": 1100 + }, + { + "epoch": 0.8058076225045372, + "grad_norm": 0.2597472369670868, + "learning_rate": 0.0002, + "loss": 1.2012, + "step": 1110 + }, + { + "epoch": 0.8130671506352087, + "grad_norm": 0.3305445611476898, + "learning_rate": 0.0002, + "loss": 1.1177, + "step": 1120 + }, + { + "epoch": 0.8203266787658802, + "grad_norm": 0.3934599459171295, + "learning_rate": 0.0002, + "loss": 1.0849, + "step": 1130 + }, + { + "epoch": 0.8275862068965517, + "grad_norm": 0.3472191393375397, + "learning_rate": 0.0002, + "loss": 1.0889, + "step": 1140 + }, + { + "epoch": 0.8348457350272233, + "grad_norm": 0.2857365906238556, + "learning_rate": 0.0002, + "loss": 1.265, + "step": 1150 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.3207702934741974, + "learning_rate": 0.0002, + "loss": 1.03, + "step": 1160 + }, + { + "epoch": 0.8493647912885662, + "grad_norm": 0.3176484704017639, + "learning_rate": 0.0002, + "loss": 1.1669, + "step": 1170 + }, + { + "epoch": 0.8566243194192378, + "grad_norm": 0.40685558319091797, + "learning_rate": 0.0002, + "loss": 1.1386, + "step": 1180 + }, + { + "epoch": 0.8638838475499092, + "grad_norm": 0.31125199794769287, + "learning_rate": 0.0002, + "loss": 1.1383, + "step": 1190 + }, + { + "epoch": 0.8711433756805808, + "grad_norm": 0.7361181378364563, + "learning_rate": 0.0002, + "loss": 1.1512, + "step": 1200 + }, + { + "epoch": 0.8784029038112523, + "grad_norm": 0.33699527382850647, + "learning_rate": 0.0002, + "loss": 0.9983, + "step": 1210 + }, + { + "epoch": 0.8856624319419237, + "grad_norm": 0.3315220773220062, + "learning_rate": 0.0002, + "loss": 1.11, + "step": 1220 + }, + { + "epoch": 0.8929219600725953, + "grad_norm": 0.6256054043769836, + "learning_rate": 0.0002, + "loss": 1.0266, + "step": 1230 + }, + { + "epoch": 0.9001814882032668, + "grad_norm": 0.3692137897014618, + "learning_rate": 0.0002, + "loss": 1.1738, + "step": 1240 + }, + { + "epoch": 0.9074410163339383, + "grad_norm": 0.3538484573364258, + "learning_rate": 0.0002, + "loss": 1.1112, + "step": 1250 + }, + { + "epoch": 0.9147005444646098, + "grad_norm": 0.27863040566444397, + "learning_rate": 0.0002, + "loss": 1.0878, + "step": 1260 + }, + { + "epoch": 0.9219600725952813, + "grad_norm": 0.3322528302669525, + "learning_rate": 0.0002, + "loss": 1.0826, + "step": 1270 + }, + { + "epoch": 0.9292196007259528, + "grad_norm": 0.7553173303604126, + "learning_rate": 0.0002, + "loss": 1.0158, + "step": 1280 + }, + { + "epoch": 0.9364791288566243, + "grad_norm": 0.4856191575527191, + "learning_rate": 0.0002, + "loss": 1.198, + "step": 1290 + }, + { + "epoch": 0.9437386569872959, + "grad_norm": 0.3668074905872345, + "learning_rate": 0.0002, + "loss": 1.1678, + "step": 1300 + }, + { + "epoch": 0.9509981851179673, + "grad_norm": 0.29851067066192627, + "learning_rate": 0.0002, + "loss": 1.1809, + "step": 1310 + }, + { + "epoch": 0.9582577132486388, + "grad_norm": 0.276664674282074, + "learning_rate": 0.0002, + "loss": 1.1299, + "step": 1320 + }, + { + "epoch": 0.9655172413793104, + "grad_norm": 0.2941018342971802, + "learning_rate": 0.0002, + "loss": 1.0597, + "step": 1330 + }, + { + "epoch": 0.9727767695099818, + "grad_norm": 0.3505859076976776, + "learning_rate": 0.0002, + "loss": 1.1731, + "step": 1340 + }, + { + "epoch": 0.9800362976406534, + "grad_norm": 0.3067687451839447, + "learning_rate": 0.0002, + "loss": 1.1809, + "step": 1350 + }, + { + "epoch": 0.9872958257713249, + "grad_norm": 0.27151066064834595, + "learning_rate": 0.0002, + "loss": 1.1511, + "step": 1360 + }, + { + "epoch": 0.9945553539019963, + "grad_norm": 0.36370083689689636, + "learning_rate": 0.0002, + "loss": 1.1919, + "step": 1370 + }, + { + "epoch": 0.9996370235934664, + "eval_loss": 1.1381088495254517, + "eval_runtime": 96.0848, + "eval_samples_per_second": 4.538, + "eval_steps_per_second": 0.572, + "step": 1377 + }, + { + "epoch": 1.0018148820326678, + "grad_norm": 0.27980583906173706, + "learning_rate": 0.0002, + "loss": 1.0082, + "step": 1380 + }, + { + "epoch": 1.0090744101633393, + "grad_norm": 0.26713913679122925, + "learning_rate": 0.0002, + "loss": 1.0565, + "step": 1390 + }, + { + "epoch": 1.0163339382940109, + "grad_norm": 0.3089541494846344, + "learning_rate": 0.0002, + "loss": 1.0241, + "step": 1400 + }, + { + "epoch": 1.0235934664246824, + "grad_norm": 0.4188242256641388, + "learning_rate": 0.0002, + "loss": 0.9649, + "step": 1410 + }, + { + "epoch": 1.030852994555354, + "grad_norm": 0.5246463418006897, + "learning_rate": 0.0002, + "loss": 0.9821, + "step": 1420 + }, + { + "epoch": 1.0381125226860255, + "grad_norm": 0.2728777825832367, + "learning_rate": 0.0002, + "loss": 1.0143, + "step": 1430 + }, + { + "epoch": 1.0453720508166968, + "grad_norm": 0.38167616724967957, + "learning_rate": 0.0002, + "loss": 1.075, + "step": 1440 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 0.4439380168914795, + "learning_rate": 0.0002, + "loss": 1.0229, + "step": 1450 + }, + { + "epoch": 1.05989110707804, + "grad_norm": 0.30954182147979736, + "learning_rate": 0.0002, + "loss": 1.0451, + "step": 1460 + }, + { + "epoch": 1.0671506352087115, + "grad_norm": 0.4022280275821686, + "learning_rate": 0.0002, + "loss": 1.0762, + "step": 1470 + }, + { + "epoch": 1.074410163339383, + "grad_norm": 0.7390811443328857, + "learning_rate": 0.0002, + "loss": 1.0754, + "step": 1480 + }, + { + "epoch": 1.0816696914700545, + "grad_norm": 0.3885486423969269, + "learning_rate": 0.0002, + "loss": 1.1378, + "step": 1490 + }, + { + "epoch": 1.0889292196007259, + "grad_norm": 0.5275560617446899, + "learning_rate": 0.0002, + "loss": 1.005, + "step": 1500 + }, + { + "epoch": 1.0961887477313974, + "grad_norm": 0.35112282633781433, + "learning_rate": 0.0002, + "loss": 1.0147, + "step": 1510 + }, + { + "epoch": 1.103448275862069, + "grad_norm": 0.33714351058006287, + "learning_rate": 0.0002, + "loss": 1.0331, + "step": 1520 + }, + { + "epoch": 1.1107078039927405, + "grad_norm": 0.31221693754196167, + "learning_rate": 0.0002, + "loss": 0.8723, + "step": 1530 + }, + { + "epoch": 1.117967332123412, + "grad_norm": 0.27549654245376587, + "learning_rate": 0.0002, + "loss": 1.0701, + "step": 1540 + }, + { + "epoch": 1.1252268602540836, + "grad_norm": 0.8465521335601807, + "learning_rate": 0.0002, + "loss": 1.0114, + "step": 1550 + }, + { + "epoch": 1.132486388384755, + "grad_norm": 0.36125949025154114, + "learning_rate": 0.0002, + "loss": 0.9643, + "step": 1560 + }, + { + "epoch": 1.1397459165154264, + "grad_norm": 0.37420371174812317, + "learning_rate": 0.0002, + "loss": 1.0744, + "step": 1570 + }, + { + "epoch": 1.147005444646098, + "grad_norm": 0.3294760584831238, + "learning_rate": 0.0002, + "loss": 1.165, + "step": 1580 + }, + { + "epoch": 1.1542649727767695, + "grad_norm": 0.3881238102912903, + "learning_rate": 0.0002, + "loss": 0.9207, + "step": 1590 + }, + { + "epoch": 1.161524500907441, + "grad_norm": 0.4766491651535034, + "learning_rate": 0.0002, + "loss": 1.0983, + "step": 1600 + }, + { + "epoch": 1.1687840290381124, + "grad_norm": 0.395530104637146, + "learning_rate": 0.0002, + "loss": 1.0092, + "step": 1610 + }, + { + "epoch": 1.176043557168784, + "grad_norm": 0.3297106623649597, + "learning_rate": 0.0002, + "loss": 0.9895, + "step": 1620 + }, + { + "epoch": 1.1833030852994555, + "grad_norm": 0.39528271555900574, + "learning_rate": 0.0002, + "loss": 1.1121, + "step": 1630 + }, + { + "epoch": 1.190562613430127, + "grad_norm": 0.3370221257209778, + "learning_rate": 0.0002, + "loss": 0.9202, + "step": 1640 + }, + { + "epoch": 1.1978221415607986, + "grad_norm": 0.31922030448913574, + "learning_rate": 0.0002, + "loss": 1.102, + "step": 1650 + }, + { + "epoch": 1.2050816696914701, + "grad_norm": 0.6142027378082275, + "learning_rate": 0.0002, + "loss": 1.0565, + "step": 1660 + }, + { + "epoch": 1.2123411978221417, + "grad_norm": 0.44769710302352905, + "learning_rate": 0.0002, + "loss": 0.9541, + "step": 1670 + }, + { + "epoch": 1.219600725952813, + "grad_norm": 0.41233646869659424, + "learning_rate": 0.0002, + "loss": 1.2501, + "step": 1680 + }, + { + "epoch": 1.2268602540834845, + "grad_norm": 0.2928866147994995, + "learning_rate": 0.0002, + "loss": 1.0147, + "step": 1690 + }, + { + "epoch": 1.234119782214156, + "grad_norm": 0.36913734674453735, + "learning_rate": 0.0002, + "loss": 0.9074, + "step": 1700 + }, + { + "epoch": 1.2413793103448276, + "grad_norm": 0.5281891226768494, + "learning_rate": 0.0002, + "loss": 1.0847, + "step": 1710 + }, + { + "epoch": 1.2486388384754992, + "grad_norm": 0.3374697268009186, + "learning_rate": 0.0002, + "loss": 0.9482, + "step": 1720 + }, + { + "epoch": 1.2558983666061705, + "grad_norm": 0.3802020847797394, + "learning_rate": 0.0002, + "loss": 0.9826, + "step": 1730 + }, + { + "epoch": 1.263157894736842, + "grad_norm": 0.38048651814460754, + "learning_rate": 0.0002, + "loss": 1.0191, + "step": 1740 + }, + { + "epoch": 1.2704174228675136, + "grad_norm": 0.6676169633865356, + "learning_rate": 0.0002, + "loss": 1.0142, + "step": 1750 + }, + { + "epoch": 1.2776769509981851, + "grad_norm": 0.4075961410999298, + "learning_rate": 0.0002, + "loss": 0.992, + "step": 1760 + }, + { + "epoch": 1.2849364791288567, + "grad_norm": 0.4374721348285675, + "learning_rate": 0.0002, + "loss": 1.0301, + "step": 1770 + }, + { + "epoch": 1.2921960072595282, + "grad_norm": 0.4638824164867401, + "learning_rate": 0.0002, + "loss": 1.0601, + "step": 1780 + }, + { + "epoch": 1.2994555353901998, + "grad_norm": 0.38631564378738403, + "learning_rate": 0.0002, + "loss": 0.9593, + "step": 1790 + }, + { + "epoch": 1.306715063520871, + "grad_norm": 0.35873809456825256, + "learning_rate": 0.0002, + "loss": 1.0042, + "step": 1800 + }, + { + "epoch": 1.3139745916515426, + "grad_norm": 0.33602237701416016, + "learning_rate": 0.0002, + "loss": 0.9835, + "step": 1810 + }, + { + "epoch": 1.3212341197822142, + "grad_norm": 0.46696463227272034, + "learning_rate": 0.0002, + "loss": 1.1032, + "step": 1820 + }, + { + "epoch": 1.3284936479128857, + "grad_norm": 0.368958979845047, + "learning_rate": 0.0002, + "loss": 1.0123, + "step": 1830 + }, + { + "epoch": 1.335753176043557, + "grad_norm": 0.3160957396030426, + "learning_rate": 0.0002, + "loss": 1.0456, + "step": 1840 + }, + { + "epoch": 1.3430127041742286, + "grad_norm": 0.4511511027812958, + "learning_rate": 0.0002, + "loss": 1.0264, + "step": 1850 + }, + { + "epoch": 1.3502722323049001, + "grad_norm": 0.6769845485687256, + "learning_rate": 0.0002, + "loss": 1.0179, + "step": 1860 + }, + { + "epoch": 1.3575317604355717, + "grad_norm": 0.3749309480190277, + "learning_rate": 0.0002, + "loss": 1.011, + "step": 1870 + }, + { + "epoch": 1.3647912885662432, + "grad_norm": 0.553798496723175, + "learning_rate": 0.0002, + "loss": 1.0223, + "step": 1880 + }, + { + "epoch": 1.3720508166969148, + "grad_norm": 0.3538985550403595, + "learning_rate": 0.0002, + "loss": 0.9634, + "step": 1890 + }, + { + "epoch": 1.3793103448275863, + "grad_norm": 0.5501534938812256, + "learning_rate": 0.0002, + "loss": 1.0275, + "step": 1900 + }, + { + "epoch": 1.3865698729582578, + "grad_norm": 0.4432051181793213, + "learning_rate": 0.0002, + "loss": 1.0461, + "step": 1910 + }, + { + "epoch": 1.3938294010889292, + "grad_norm": 0.41755786538124084, + "learning_rate": 0.0002, + "loss": 0.9082, + "step": 1920 + }, + { + "epoch": 1.4010889292196007, + "grad_norm": 0.5626114010810852, + "learning_rate": 0.0002, + "loss": 1.0858, + "step": 1930 + }, + { + "epoch": 1.4083484573502723, + "grad_norm": 0.44038185477256775, + "learning_rate": 0.0002, + "loss": 1.0687, + "step": 1940 + }, + { + "epoch": 1.4156079854809438, + "grad_norm": 0.3397001624107361, + "learning_rate": 0.0002, + "loss": 0.9454, + "step": 1950 + }, + { + "epoch": 1.4228675136116151, + "grad_norm": 0.4325368106365204, + "learning_rate": 0.0002, + "loss": 0.943, + "step": 1960 + }, + { + "epoch": 1.4301270417422867, + "grad_norm": 0.3900907039642334, + "learning_rate": 0.0002, + "loss": 1.0488, + "step": 1970 + }, + { + "epoch": 1.4373865698729582, + "grad_norm": 0.369612455368042, + "learning_rate": 0.0002, + "loss": 0.9699, + "step": 1980 + }, + { + "epoch": 1.4446460980036298, + "grad_norm": 0.4389338791370392, + "learning_rate": 0.0002, + "loss": 1.0609, + "step": 1990 + }, + { + "epoch": 1.4519056261343013, + "grad_norm": 1.694450855255127, + "learning_rate": 0.0002, + "loss": 1.042, + "step": 2000 + }, + { + "epoch": 1.4591651542649728, + "grad_norm": 0.516957700252533, + "learning_rate": 0.0002, + "loss": 0.997, + "step": 2010 + }, + { + "epoch": 1.4664246823956444, + "grad_norm": 0.45515501499176025, + "learning_rate": 0.0002, + "loss": 1.0979, + "step": 2020 + }, + { + "epoch": 1.4736842105263157, + "grad_norm": 0.4153139591217041, + "learning_rate": 0.0002, + "loss": 1.0188, + "step": 2030 + }, + { + "epoch": 1.4809437386569873, + "grad_norm": 0.44353052973747253, + "learning_rate": 0.0002, + "loss": 1.1521, + "step": 2040 + }, + { + "epoch": 1.4882032667876588, + "grad_norm": 0.570554256439209, + "learning_rate": 0.0002, + "loss": 0.9653, + "step": 2050 + }, + { + "epoch": 1.4954627949183303, + "grad_norm": 0.5742740035057068, + "learning_rate": 0.0002, + "loss": 1.0352, + "step": 2060 + }, + { + "epoch": 1.5027223230490017, + "grad_norm": 0.5890517830848694, + "learning_rate": 0.0002, + "loss": 1.0963, + "step": 2070 + }, + { + "epoch": 1.5099818511796732, + "grad_norm": 0.4162650406360626, + "learning_rate": 0.0002, + "loss": 1.0639, + "step": 2080 + }, + { + "epoch": 1.5172413793103448, + "grad_norm": 0.7334717512130737, + "learning_rate": 0.0002, + "loss": 1.071, + "step": 2090 + }, + { + "epoch": 1.5245009074410163, + "grad_norm": 0.2963249683380127, + "learning_rate": 0.0002, + "loss": 0.8957, + "step": 2100 + }, + { + "epoch": 1.5317604355716878, + "grad_norm": 0.30676454305648804, + "learning_rate": 0.0002, + "loss": 1.0446, + "step": 2110 + }, + { + "epoch": 1.5390199637023594, + "grad_norm": 0.35984641313552856, + "learning_rate": 0.0002, + "loss": 0.9967, + "step": 2120 + }, + { + "epoch": 1.546279491833031, + "grad_norm": 0.3384549617767334, + "learning_rate": 0.0002, + "loss": 1.0352, + "step": 2130 + }, + { + "epoch": 1.5535390199637025, + "grad_norm": 0.4725518226623535, + "learning_rate": 0.0002, + "loss": 0.9874, + "step": 2140 + }, + { + "epoch": 1.560798548094374, + "grad_norm": 0.3252685070037842, + "learning_rate": 0.0002, + "loss": 1.1166, + "step": 2150 + }, + { + "epoch": 1.5680580762250453, + "grad_norm": 0.45043081045150757, + "learning_rate": 0.0002, + "loss": 0.9127, + "step": 2160 + }, + { + "epoch": 1.5753176043557169, + "grad_norm": 0.374208003282547, + "learning_rate": 0.0002, + "loss": 1.0767, + "step": 2170 + }, + { + "epoch": 1.5825771324863884, + "grad_norm": 0.5118404030799866, + "learning_rate": 0.0002, + "loss": 1.0255, + "step": 2180 + }, + { + "epoch": 1.5898366606170597, + "grad_norm": 0.482170969247818, + "learning_rate": 0.0002, + "loss": 1.0062, + "step": 2190 + }, + { + "epoch": 1.5970961887477313, + "grad_norm": 0.5337533950805664, + "learning_rate": 0.0002, + "loss": 1.0872, + "step": 2200 + }, + { + "epoch": 1.6043557168784028, + "grad_norm": 0.5195064544677734, + "learning_rate": 0.0002, + "loss": 1.0405, + "step": 2210 + }, + { + "epoch": 1.6116152450090744, + "grad_norm": 0.30807098746299744, + "learning_rate": 0.0002, + "loss": 1.0454, + "step": 2220 + }, + { + "epoch": 1.618874773139746, + "grad_norm": 0.3962925672531128, + "learning_rate": 0.0002, + "loss": 1.0293, + "step": 2230 + }, + { + "epoch": 1.6261343012704175, + "grad_norm": 0.7636962532997131, + "learning_rate": 0.0002, + "loss": 1.0137, + "step": 2240 + }, + { + "epoch": 1.633393829401089, + "grad_norm": 0.32380592823028564, + "learning_rate": 0.0002, + "loss": 0.991, + "step": 2250 + }, + { + "epoch": 1.6406533575317606, + "grad_norm": 0.5767741799354553, + "learning_rate": 0.0002, + "loss": 0.9471, + "step": 2260 + }, + { + "epoch": 1.647912885662432, + "grad_norm": 0.39964812994003296, + "learning_rate": 0.0002, + "loss": 0.9524, + "step": 2270 + }, + { + "epoch": 1.6551724137931034, + "grad_norm": 0.622629702091217, + "learning_rate": 0.0002, + "loss": 0.9866, + "step": 2280 + }, + { + "epoch": 1.662431941923775, + "grad_norm": 0.40202152729034424, + "learning_rate": 0.0002, + "loss": 0.9609, + "step": 2290 + }, + { + "epoch": 1.6696914700544465, + "grad_norm": 0.4467349052429199, + "learning_rate": 0.0002, + "loss": 0.9859, + "step": 2300 + }, + { + "epoch": 1.6769509981851178, + "grad_norm": 0.5026949048042297, + "learning_rate": 0.0002, + "loss": 1.0312, + "step": 2310 + }, + { + "epoch": 1.6842105263157894, + "grad_norm": 0.43754541873931885, + "learning_rate": 0.0002, + "loss": 1.0776, + "step": 2320 + }, + { + "epoch": 1.691470054446461, + "grad_norm": 0.42869430780410767, + "learning_rate": 0.0002, + "loss": 0.9786, + "step": 2330 + }, + { + "epoch": 1.6987295825771325, + "grad_norm": 0.4192679524421692, + "learning_rate": 0.0002, + "loss": 1.0747, + "step": 2340 + }, + { + "epoch": 1.705989110707804, + "grad_norm": 0.3243155777454376, + "learning_rate": 0.0002, + "loss": 1.0054, + "step": 2350 + }, + { + "epoch": 1.7132486388384756, + "grad_norm": 1.0514075756072998, + "learning_rate": 0.0002, + "loss": 0.9165, + "step": 2360 + }, + { + "epoch": 1.720508166969147, + "grad_norm": 0.4719122648239136, + "learning_rate": 0.0002, + "loss": 1.1353, + "step": 2370 + }, + { + "epoch": 1.7277676950998186, + "grad_norm": 0.3846144676208496, + "learning_rate": 0.0002, + "loss": 0.9913, + "step": 2380 + }, + { + "epoch": 1.73502722323049, + "grad_norm": 0.7266581058502197, + "learning_rate": 0.0002, + "loss": 0.9935, + "step": 2390 + }, + { + "epoch": 1.7422867513611615, + "grad_norm": 0.6183241605758667, + "learning_rate": 0.0002, + "loss": 0.9509, + "step": 2400 + }, + { + "epoch": 1.749546279491833, + "grad_norm": 0.3658260405063629, + "learning_rate": 0.0002, + "loss": 0.9713, + "step": 2410 + }, + { + "epoch": 1.7568058076225044, + "grad_norm": 0.6036322712898254, + "learning_rate": 0.0002, + "loss": 1.0033, + "step": 2420 + }, + { + "epoch": 1.764065335753176, + "grad_norm": 0.7872936129570007, + "learning_rate": 0.0002, + "loss": 1.0526, + "step": 2430 + }, + { + "epoch": 1.7713248638838475, + "grad_norm": 0.35946124792099, + "learning_rate": 0.0002, + "loss": 0.99, + "step": 2440 + }, + { + "epoch": 1.778584392014519, + "grad_norm": 0.3740338981151581, + "learning_rate": 0.0002, + "loss": 0.8845, + "step": 2450 + }, + { + "epoch": 1.7858439201451906, + "grad_norm": 0.6150230169296265, + "learning_rate": 0.0002, + "loss": 0.9333, + "step": 2460 + }, + { + "epoch": 1.793103448275862, + "grad_norm": 0.4726075530052185, + "learning_rate": 0.0002, + "loss": 1.0373, + "step": 2470 + }, + { + "epoch": 1.8003629764065336, + "grad_norm": 0.31292253732681274, + "learning_rate": 0.0002, + "loss": 0.974, + "step": 2480 + }, + { + "epoch": 1.8076225045372052, + "grad_norm": 0.4463104009628296, + "learning_rate": 0.0002, + "loss": 1.0205, + "step": 2490 + }, + { + "epoch": 1.8148820326678767, + "grad_norm": 0.7848200798034668, + "learning_rate": 0.0002, + "loss": 1.0771, + "step": 2500 + }, + { + "epoch": 1.822141560798548, + "grad_norm": 0.5562082529067993, + "learning_rate": 0.0002, + "loss": 0.9508, + "step": 2510 + }, + { + "epoch": 1.8294010889292196, + "grad_norm": 0.39892756938934326, + "learning_rate": 0.0002, + "loss": 0.9343, + "step": 2520 + }, + { + "epoch": 1.8366606170598911, + "grad_norm": 1.2923320531845093, + "learning_rate": 0.0002, + "loss": 1.0489, + "step": 2530 + }, + { + "epoch": 1.8439201451905625, + "grad_norm": 0.6316490769386292, + "learning_rate": 0.0002, + "loss": 1.1203, + "step": 2540 + }, + { + "epoch": 1.851179673321234, + "grad_norm": 0.46100497245788574, + "learning_rate": 0.0002, + "loss": 0.9131, + "step": 2550 + }, + { + "epoch": 1.8584392014519056, + "grad_norm": 0.7902987003326416, + "learning_rate": 0.0002, + "loss": 1.0977, + "step": 2560 + }, + { + "epoch": 1.865698729582577, + "grad_norm": 0.4596365690231323, + "learning_rate": 0.0002, + "loss": 0.9702, + "step": 2570 + }, + { + "epoch": 1.8729582577132486, + "grad_norm": 0.6592172384262085, + "learning_rate": 0.0002, + "loss": 0.9322, + "step": 2580 + }, + { + "epoch": 1.8802177858439202, + "grad_norm": 0.5764662027359009, + "learning_rate": 0.0002, + "loss": 0.9549, + "step": 2590 + }, + { + "epoch": 1.8874773139745917, + "grad_norm": 0.8421637415885925, + "learning_rate": 0.0002, + "loss": 0.9485, + "step": 2600 + }, + { + "epoch": 1.8947368421052633, + "grad_norm": 0.5635305047035217, + "learning_rate": 0.0002, + "loss": 1.012, + "step": 2610 + }, + { + "epoch": 1.9019963702359348, + "grad_norm": 0.46231237053871155, + "learning_rate": 0.0002, + "loss": 0.8907, + "step": 2620 + }, + { + "epoch": 1.9092558983666061, + "grad_norm": 0.3944607079029083, + "learning_rate": 0.0002, + "loss": 0.9543, + "step": 2630 + }, + { + "epoch": 1.9165154264972777, + "grad_norm": 0.4753907322883606, + "learning_rate": 0.0002, + "loss": 0.9964, + "step": 2640 + }, + { + "epoch": 1.9237749546279492, + "grad_norm": 0.4151090979576111, + "learning_rate": 0.0002, + "loss": 1.0217, + "step": 2650 + }, + { + "epoch": 1.9310344827586206, + "grad_norm": 0.6793725490570068, + "learning_rate": 0.0002, + "loss": 1.0397, + "step": 2660 + }, + { + "epoch": 1.938294010889292, + "grad_norm": 0.339755117893219, + "learning_rate": 0.0002, + "loss": 1.0401, + "step": 2670 + }, + { + "epoch": 1.9455535390199636, + "grad_norm": 0.40789374709129333, + "learning_rate": 0.0002, + "loss": 1.082, + "step": 2680 + }, + { + "epoch": 1.9528130671506352, + "grad_norm": 0.3750005066394806, + "learning_rate": 0.0002, + "loss": 0.9973, + "step": 2690 + }, + { + "epoch": 1.9600725952813067, + "grad_norm": 0.39684441685676575, + "learning_rate": 0.0002, + "loss": 0.9837, + "step": 2700 + }, + { + "epoch": 1.9673321234119783, + "grad_norm": 0.378287672996521, + "learning_rate": 0.0002, + "loss": 1.1204, + "step": 2710 + }, + { + "epoch": 1.9745916515426498, + "grad_norm": 0.3668482005596161, + "learning_rate": 0.0002, + "loss": 0.9284, + "step": 2720 + }, + { + "epoch": 1.9818511796733214, + "grad_norm": 0.49997565150260925, + "learning_rate": 0.0002, + "loss": 1.0512, + "step": 2730 + }, + { + "epoch": 1.989110707803993, + "grad_norm": 0.36852124333381653, + "learning_rate": 0.0002, + "loss": 0.9311, + "step": 2740 + }, + { + "epoch": 1.9963702359346642, + "grad_norm": 0.5203380584716797, + "learning_rate": 0.0002, + "loss": 1.0898, + "step": 2750 + }, + { + "epoch": 2.0, + "eval_loss": 1.1086540222167969, + "eval_runtime": 95.6866, + "eval_samples_per_second": 4.557, + "eval_steps_per_second": 0.575, + "step": 2755 + }, + { + "epoch": 2.0036297640653356, + "grad_norm": 0.35921573638916016, + "learning_rate": 0.0002, + "loss": 0.9791, + "step": 2760 + }, + { + "epoch": 2.010889292196007, + "grad_norm": 1.013861894607544, + "learning_rate": 0.0002, + "loss": 0.8968, + "step": 2770 + }, + { + "epoch": 2.0181488203266786, + "grad_norm": 0.4425240159034729, + "learning_rate": 0.0002, + "loss": 0.7873, + "step": 2780 + }, + { + "epoch": 2.02540834845735, + "grad_norm": 0.60174161195755, + "learning_rate": 0.0002, + "loss": 0.8758, + "step": 2790 + }, + { + "epoch": 2.0326678765880217, + "grad_norm": 0.47582098841667175, + "learning_rate": 0.0002, + "loss": 0.7989, + "step": 2800 + }, + { + "epoch": 2.0399274047186933, + "grad_norm": 0.6012811660766602, + "learning_rate": 0.0002, + "loss": 0.8898, + "step": 2810 + }, + { + "epoch": 2.047186932849365, + "grad_norm": 0.4444408118724823, + "learning_rate": 0.0002, + "loss": 0.8109, + "step": 2820 + }, + { + "epoch": 2.0544464609800364, + "grad_norm": 0.4864003360271454, + "learning_rate": 0.0002, + "loss": 0.8621, + "step": 2830 + }, + { + "epoch": 2.061705989110708, + "grad_norm": 0.5104215741157532, + "learning_rate": 0.0002, + "loss": 0.8817, + "step": 2840 + }, + { + "epoch": 2.0689655172413794, + "grad_norm": 0.6218489408493042, + "learning_rate": 0.0002, + "loss": 0.7789, + "step": 2850 + }, + { + "epoch": 2.076225045372051, + "grad_norm": 0.705784261226654, + "learning_rate": 0.0002, + "loss": 0.8778, + "step": 2860 + }, + { + "epoch": 2.0834845735027225, + "grad_norm": 0.48091503977775574, + "learning_rate": 0.0002, + "loss": 0.7851, + "step": 2870 + }, + { + "epoch": 2.0907441016333936, + "grad_norm": 0.5062456727027893, + "learning_rate": 0.0002, + "loss": 0.8444, + "step": 2880 + }, + { + "epoch": 2.098003629764065, + "grad_norm": 0.4862022399902344, + "learning_rate": 0.0002, + "loss": 0.7898, + "step": 2890 + }, + { + "epoch": 2.1052631578947367, + "grad_norm": 0.48264044523239136, + "learning_rate": 0.0002, + "loss": 0.8593, + "step": 2900 + }, + { + "epoch": 2.1125226860254083, + "grad_norm": 0.43744346499443054, + "learning_rate": 0.0002, + "loss": 0.8325, + "step": 2910 + }, + { + "epoch": 2.11978221415608, + "grad_norm": 0.5480492115020752, + "learning_rate": 0.0002, + "loss": 0.9099, + "step": 2920 + }, + { + "epoch": 2.1270417422867514, + "grad_norm": 0.5068560838699341, + "learning_rate": 0.0002, + "loss": 0.7727, + "step": 2930 + }, + { + "epoch": 2.134301270417423, + "grad_norm": 0.4650852680206299, + "learning_rate": 0.0002, + "loss": 0.8403, + "step": 2940 + }, + { + "epoch": 2.1415607985480944, + "grad_norm": 0.4929981231689453, + "learning_rate": 0.0002, + "loss": 0.8467, + "step": 2950 + }, + { + "epoch": 2.148820326678766, + "grad_norm": 0.6537389755249023, + "learning_rate": 0.0002, + "loss": 0.8747, + "step": 2960 + }, + { + "epoch": 2.1560798548094375, + "grad_norm": 0.8032940626144409, + "learning_rate": 0.0002, + "loss": 0.7663, + "step": 2970 + }, + { + "epoch": 2.163339382940109, + "grad_norm": 0.7131643891334534, + "learning_rate": 0.0002, + "loss": 0.7604, + "step": 2980 + }, + { + "epoch": 2.1705989110707806, + "grad_norm": 0.6034275889396667, + "learning_rate": 0.0002, + "loss": 0.8424, + "step": 2990 + }, + { + "epoch": 2.1778584392014517, + "grad_norm": 0.6081095933914185, + "learning_rate": 0.0002, + "loss": 0.8838, + "step": 3000 + }, + { + "epoch": 2.1851179673321233, + "grad_norm": 0.5706912875175476, + "learning_rate": 0.0002, + "loss": 0.9013, + "step": 3010 + }, + { + "epoch": 2.192377495462795, + "grad_norm": 0.6742380261421204, + "learning_rate": 0.0002, + "loss": 0.8473, + "step": 3020 + }, + { + "epoch": 2.1996370235934664, + "grad_norm": 0.4847496449947357, + "learning_rate": 0.0002, + "loss": 0.8372, + "step": 3030 + }, + { + "epoch": 2.206896551724138, + "grad_norm": 0.5529342889785767, + "learning_rate": 0.0002, + "loss": 0.8487, + "step": 3040 + }, + { + "epoch": 2.2141560798548094, + "grad_norm": 0.6108783483505249, + "learning_rate": 0.0002, + "loss": 0.9057, + "step": 3050 + }, + { + "epoch": 2.221415607985481, + "grad_norm": 0.8841571807861328, + "learning_rate": 0.0002, + "loss": 0.9659, + "step": 3060 + }, + { + "epoch": 2.2286751361161525, + "grad_norm": 0.4227530360221863, + "learning_rate": 0.0002, + "loss": 0.8538, + "step": 3070 + }, + { + "epoch": 2.235934664246824, + "grad_norm": 0.461935818195343, + "learning_rate": 0.0002, + "loss": 0.7925, + "step": 3080 + }, + { + "epoch": 2.2431941923774956, + "grad_norm": 0.5407412648200989, + "learning_rate": 0.0002, + "loss": 0.8542, + "step": 3090 + }, + { + "epoch": 2.250453720508167, + "grad_norm": 0.5057998895645142, + "learning_rate": 0.0002, + "loss": 0.8392, + "step": 3100 + }, + { + "epoch": 2.2577132486388383, + "grad_norm": 0.530057966709137, + "learning_rate": 0.0002, + "loss": 0.7898, + "step": 3110 + }, + { + "epoch": 2.26497277676951, + "grad_norm": 0.5066842436790466, + "learning_rate": 0.0002, + "loss": 0.8507, + "step": 3120 + }, + { + "epoch": 2.2722323049001814, + "grad_norm": 0.5069178342819214, + "learning_rate": 0.0002, + "loss": 0.7903, + "step": 3130 + }, + { + "epoch": 2.279491833030853, + "grad_norm": 0.6095499396324158, + "learning_rate": 0.0002, + "loss": 0.8458, + "step": 3140 + }, + { + "epoch": 2.2867513611615244, + "grad_norm": 0.49524766206741333, + "learning_rate": 0.0002, + "loss": 0.8171, + "step": 3150 + }, + { + "epoch": 2.294010889292196, + "grad_norm": 0.5334409475326538, + "learning_rate": 0.0002, + "loss": 0.7568, + "step": 3160 + }, + { + "epoch": 2.3012704174228675, + "grad_norm": 1.681748867034912, + "learning_rate": 0.0002, + "loss": 0.7533, + "step": 3170 + }, + { + "epoch": 2.308529945553539, + "grad_norm": 0.7225565314292908, + "learning_rate": 0.0002, + "loss": 0.8155, + "step": 3180 + }, + { + "epoch": 2.3157894736842106, + "grad_norm": 0.5379388928413391, + "learning_rate": 0.0002, + "loss": 0.8034, + "step": 3190 + }, + { + "epoch": 2.323049001814882, + "grad_norm": 0.45770326256752014, + "learning_rate": 0.0002, + "loss": 0.8833, + "step": 3200 + }, + { + "epoch": 2.3303085299455537, + "grad_norm": 0.6984533071517944, + "learning_rate": 0.0002, + "loss": 0.8419, + "step": 3210 + }, + { + "epoch": 2.337568058076225, + "grad_norm": 0.6725744605064392, + "learning_rate": 0.0002, + "loss": 0.7414, + "step": 3220 + }, + { + "epoch": 2.344827586206897, + "grad_norm": 1.1247471570968628, + "learning_rate": 0.0002, + "loss": 0.8657, + "step": 3230 + }, + { + "epoch": 2.352087114337568, + "grad_norm": 1.0240263938903809, + "learning_rate": 0.0002, + "loss": 0.7782, + "step": 3240 + }, + { + "epoch": 2.3593466424682394, + "grad_norm": 0.5608096122741699, + "learning_rate": 0.0002, + "loss": 0.826, + "step": 3250 + }, + { + "epoch": 2.366606170598911, + "grad_norm": 0.8294990062713623, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 3260 + }, + { + "epoch": 2.3738656987295825, + "grad_norm": 0.6734224557876587, + "learning_rate": 0.0002, + "loss": 0.9845, + "step": 3270 + }, + { + "epoch": 2.381125226860254, + "grad_norm": 0.6862800717353821, + "learning_rate": 0.0002, + "loss": 0.7921, + "step": 3280 + }, + { + "epoch": 2.3883847549909256, + "grad_norm": 0.5442930459976196, + "learning_rate": 0.0002, + "loss": 0.7752, + "step": 3290 + }, + { + "epoch": 2.395644283121597, + "grad_norm": 0.5745394229888916, + "learning_rate": 0.0002, + "loss": 0.8543, + "step": 3300 + }, + { + "epoch": 2.4029038112522687, + "grad_norm": 0.6257799863815308, + "learning_rate": 0.0002, + "loss": 0.833, + "step": 3310 + }, + { + "epoch": 2.4101633393829403, + "grad_norm": 0.5608420968055725, + "learning_rate": 0.0002, + "loss": 0.8191, + "step": 3320 + }, + { + "epoch": 2.417422867513612, + "grad_norm": 0.5512017011642456, + "learning_rate": 0.0002, + "loss": 0.8745, + "step": 3330 + }, + { + "epoch": 2.4246823956442833, + "grad_norm": 0.7452999353408813, + "learning_rate": 0.0002, + "loss": 0.8642, + "step": 3340 + }, + { + "epoch": 2.4319419237749544, + "grad_norm": 0.4604301452636719, + "learning_rate": 0.0002, + "loss": 0.8417, + "step": 3350 + }, + { + "epoch": 2.439201451905626, + "grad_norm": 0.8225823640823364, + "learning_rate": 0.0002, + "loss": 0.9148, + "step": 3360 + }, + { + "epoch": 2.4464609800362975, + "grad_norm": 0.8604981303215027, + "learning_rate": 0.0002, + "loss": 0.8144, + "step": 3370 + }, + { + "epoch": 2.453720508166969, + "grad_norm": 0.6620925664901733, + "learning_rate": 0.0002, + "loss": 0.9276, + "step": 3380 + }, + { + "epoch": 2.4609800362976406, + "grad_norm": 0.4750158488750458, + "learning_rate": 0.0002, + "loss": 0.8381, + "step": 3390 + }, + { + "epoch": 2.468239564428312, + "grad_norm": 0.6061418056488037, + "learning_rate": 0.0002, + "loss": 0.8249, + "step": 3400 + }, + { + "epoch": 2.4754990925589837, + "grad_norm": 0.5977247357368469, + "learning_rate": 0.0002, + "loss": 0.8944, + "step": 3410 + }, + { + "epoch": 2.4827586206896552, + "grad_norm": 0.5004227757453918, + "learning_rate": 0.0002, + "loss": 0.8366, + "step": 3420 + }, + { + "epoch": 2.490018148820327, + "grad_norm": 0.46428972482681274, + "learning_rate": 0.0002, + "loss": 0.9133, + "step": 3430 + }, + { + "epoch": 2.4972776769509983, + "grad_norm": 0.7658395171165466, + "learning_rate": 0.0002, + "loss": 0.8534, + "step": 3440 + }, + { + "epoch": 2.50453720508167, + "grad_norm": 0.621819794178009, + "learning_rate": 0.0002, + "loss": 0.9066, + "step": 3450 + }, + { + "epoch": 2.511796733212341, + "grad_norm": 0.4556088149547577, + "learning_rate": 0.0002, + "loss": 0.8481, + "step": 3460 + }, + { + "epoch": 2.519056261343013, + "grad_norm": 0.6124140024185181, + "learning_rate": 0.0002, + "loss": 0.8522, + "step": 3470 + }, + { + "epoch": 2.526315789473684, + "grad_norm": 0.6256231665611267, + "learning_rate": 0.0002, + "loss": 0.8158, + "step": 3480 + }, + { + "epoch": 2.5335753176043556, + "grad_norm": 0.5464141964912415, + "learning_rate": 0.0002, + "loss": 0.9097, + "step": 3490 + }, + { + "epoch": 2.540834845735027, + "grad_norm": 0.51471346616745, + "learning_rate": 0.0002, + "loss": 0.8708, + "step": 3500 + }, + { + "epoch": 2.5480943738656987, + "grad_norm": 0.5326165556907654, + "learning_rate": 0.0002, + "loss": 0.9042, + "step": 3510 + }, + { + "epoch": 2.5553539019963702, + "grad_norm": 0.4750378429889679, + "learning_rate": 0.0002, + "loss": 0.8749, + "step": 3520 + }, + { + "epoch": 2.562613430127042, + "grad_norm": 0.5292279124259949, + "learning_rate": 0.0002, + "loss": 0.8248, + "step": 3530 + }, + { + "epoch": 2.5698729582577133, + "grad_norm": 0.6145227551460266, + "learning_rate": 0.0002, + "loss": 0.8808, + "step": 3540 + }, + { + "epoch": 2.577132486388385, + "grad_norm": 0.8275189399719238, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 3550 + }, + { + "epoch": 2.5843920145190564, + "grad_norm": 0.5037438273429871, + "learning_rate": 0.0002, + "loss": 0.875, + "step": 3560 + }, + { + "epoch": 2.5916515426497275, + "grad_norm": 0.5838707685470581, + "learning_rate": 0.0002, + "loss": 0.8324, + "step": 3570 + }, + { + "epoch": 2.5989110707803995, + "grad_norm": 0.5398710370063782, + "learning_rate": 0.0002, + "loss": 0.8716, + "step": 3580 + }, + { + "epoch": 2.6061705989110706, + "grad_norm": 0.6115376949310303, + "learning_rate": 0.0002, + "loss": 0.7843, + "step": 3590 + }, + { + "epoch": 2.613430127041742, + "grad_norm": 1.118809461593628, + "learning_rate": 0.0002, + "loss": 0.8142, + "step": 3600 + }, + { + "epoch": 2.6206896551724137, + "grad_norm": 0.7811771631240845, + "learning_rate": 0.0002, + "loss": 0.8282, + "step": 3610 + }, + { + "epoch": 2.6279491833030852, + "grad_norm": 0.5753175020217896, + "learning_rate": 0.0002, + "loss": 0.7997, + "step": 3620 + }, + { + "epoch": 2.635208711433757, + "grad_norm": 0.550829291343689, + "learning_rate": 0.0002, + "loss": 0.7705, + "step": 3630 + }, + { + "epoch": 2.6424682395644283, + "grad_norm": 0.5360019207000732, + "learning_rate": 0.0002, + "loss": 0.8564, + "step": 3640 + }, + { + "epoch": 2.6497277676951, + "grad_norm": 0.63050377368927, + "learning_rate": 0.0002, + "loss": 0.7648, + "step": 3650 + }, + { + "epoch": 2.6569872958257714, + "grad_norm": 0.5833110213279724, + "learning_rate": 0.0002, + "loss": 0.9156, + "step": 3660 + }, + { + "epoch": 2.664246823956443, + "grad_norm": 0.5543047189712524, + "learning_rate": 0.0002, + "loss": 0.8033, + "step": 3670 + }, + { + "epoch": 2.671506352087114, + "grad_norm": 0.6842212080955505, + "learning_rate": 0.0002, + "loss": 0.8867, + "step": 3680 + }, + { + "epoch": 2.678765880217786, + "grad_norm": 0.6859333515167236, + "learning_rate": 0.0002, + "loss": 0.8569, + "step": 3690 + }, + { + "epoch": 2.686025408348457, + "grad_norm": 0.7038410902023315, + "learning_rate": 0.0002, + "loss": 0.8081, + "step": 3700 + }, + { + "epoch": 2.6932849364791287, + "grad_norm": 0.447233647108078, + "learning_rate": 0.0002, + "loss": 0.8686, + "step": 3710 + }, + { + "epoch": 2.7005444646098002, + "grad_norm": 0.6453872919082642, + "learning_rate": 0.0002, + "loss": 0.8951, + "step": 3720 + }, + { + "epoch": 2.707803992740472, + "grad_norm": 0.8025672435760498, + "learning_rate": 0.0002, + "loss": 0.8292, + "step": 3730 + }, + { + "epoch": 2.7150635208711433, + "grad_norm": 0.5997087955474854, + "learning_rate": 0.0002, + "loss": 0.8488, + "step": 3740 + }, + { + "epoch": 2.722323049001815, + "grad_norm": 0.6901142001152039, + "learning_rate": 0.0002, + "loss": 0.8308, + "step": 3750 + }, + { + "epoch": 2.7295825771324864, + "grad_norm": 1.036145567893982, + "learning_rate": 0.0002, + "loss": 0.8517, + "step": 3760 + }, + { + "epoch": 2.736842105263158, + "grad_norm": 0.7207072377204895, + "learning_rate": 0.0002, + "loss": 0.8455, + "step": 3770 + }, + { + "epoch": 2.7441016333938295, + "grad_norm": 1.0452989339828491, + "learning_rate": 0.0002, + "loss": 0.8009, + "step": 3780 + }, + { + "epoch": 2.751361161524501, + "grad_norm": 0.5615278482437134, + "learning_rate": 0.0002, + "loss": 0.8868, + "step": 3790 + }, + { + "epoch": 2.7586206896551726, + "grad_norm": 0.46439215540885925, + "learning_rate": 0.0002, + "loss": 0.9047, + "step": 3800 + }, + { + "epoch": 2.7658802177858437, + "grad_norm": 0.7134917974472046, + "learning_rate": 0.0002, + "loss": 0.7847, + "step": 3810 + }, + { + "epoch": 2.7731397459165157, + "grad_norm": 0.5139115452766418, + "learning_rate": 0.0002, + "loss": 0.8315, + "step": 3820 + }, + { + "epoch": 2.780399274047187, + "grad_norm": 0.8595781326293945, + "learning_rate": 0.0002, + "loss": 0.8995, + "step": 3830 + }, + { + "epoch": 2.7876588021778583, + "grad_norm": 0.544614851474762, + "learning_rate": 0.0002, + "loss": 0.9079, + "step": 3840 + }, + { + "epoch": 2.79491833030853, + "grad_norm": 0.6073850393295288, + "learning_rate": 0.0002, + "loss": 0.8322, + "step": 3850 + }, + { + "epoch": 2.8021778584392014, + "grad_norm": 0.8224069476127625, + "learning_rate": 0.0002, + "loss": 0.8277, + "step": 3860 + }, + { + "epoch": 2.809437386569873, + "grad_norm": 0.5347970128059387, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 3870 + }, + { + "epoch": 2.8166969147005445, + "grad_norm": 0.6799601912498474, + "learning_rate": 0.0002, + "loss": 0.8219, + "step": 3880 + }, + { + "epoch": 2.823956442831216, + "grad_norm": 0.5219197869300842, + "learning_rate": 0.0002, + "loss": 0.8796, + "step": 3890 + }, + { + "epoch": 2.8312159709618876, + "grad_norm": 0.5710130333900452, + "learning_rate": 0.0002, + "loss": 0.8649, + "step": 3900 + }, + { + "epoch": 2.838475499092559, + "grad_norm": 0.5857213139533997, + "learning_rate": 0.0002, + "loss": 0.8741, + "step": 3910 + }, + { + "epoch": 2.8457350272232302, + "grad_norm": 0.5206644535064697, + "learning_rate": 0.0002, + "loss": 0.8154, + "step": 3920 + }, + { + "epoch": 2.8529945553539022, + "grad_norm": 1.2902015447616577, + "learning_rate": 0.0002, + "loss": 0.9328, + "step": 3930 + }, + { + "epoch": 2.8602540834845733, + "grad_norm": 0.5252797603607178, + "learning_rate": 0.0002, + "loss": 0.8192, + "step": 3940 + }, + { + "epoch": 2.867513611615245, + "grad_norm": 0.5925108790397644, + "learning_rate": 0.0002, + "loss": 0.9047, + "step": 3950 + }, + { + "epoch": 2.8747731397459164, + "grad_norm": 0.9719610810279846, + "learning_rate": 0.0002, + "loss": 0.8611, + "step": 3960 + }, + { + "epoch": 2.882032667876588, + "grad_norm": 0.6834747195243835, + "learning_rate": 0.0002, + "loss": 0.8531, + "step": 3970 + }, + { + "epoch": 2.8892921960072595, + "grad_norm": 0.6952353119850159, + "learning_rate": 0.0002, + "loss": 0.8124, + "step": 3980 + }, + { + "epoch": 2.896551724137931, + "grad_norm": 0.49889910221099854, + "learning_rate": 0.0002, + "loss": 0.8365, + "step": 3990 + }, + { + "epoch": 2.9038112522686026, + "grad_norm": 0.5007026791572571, + "learning_rate": 0.0002, + "loss": 0.8197, + "step": 4000 + }, + { + "epoch": 2.911070780399274, + "grad_norm": 0.5474239587783813, + "learning_rate": 0.0002, + "loss": 0.7752, + "step": 4010 + }, + { + "epoch": 2.9183303085299457, + "grad_norm": 0.6617428660392761, + "learning_rate": 0.0002, + "loss": 0.8579, + "step": 4020 + }, + { + "epoch": 2.925589836660617, + "grad_norm": 0.6097776293754578, + "learning_rate": 0.0002, + "loss": 0.8583, + "step": 4030 + }, + { + "epoch": 2.9328493647912888, + "grad_norm": 0.5985828638076782, + "learning_rate": 0.0002, + "loss": 0.8296, + "step": 4040 + }, + { + "epoch": 2.94010889292196, + "grad_norm": 0.769488513469696, + "learning_rate": 0.0002, + "loss": 0.9134, + "step": 4050 + }, + { + "epoch": 2.9473684210526314, + "grad_norm": 0.5167055130004883, + "learning_rate": 0.0002, + "loss": 0.8148, + "step": 4060 + }, + { + "epoch": 2.954627949183303, + "grad_norm": 0.6265496015548706, + "learning_rate": 0.0002, + "loss": 0.8665, + "step": 4070 + }, + { + "epoch": 2.9618874773139745, + "grad_norm": 1.2644082307815552, + "learning_rate": 0.0002, + "loss": 0.9218, + "step": 4080 + }, + { + "epoch": 2.969147005444646, + "grad_norm": 0.6007561087608337, + "learning_rate": 0.0002, + "loss": 0.8026, + "step": 4090 + }, + { + "epoch": 2.9764065335753176, + "grad_norm": 0.47984927892684937, + "learning_rate": 0.0002, + "loss": 0.8262, + "step": 4100 + }, + { + "epoch": 2.983666061705989, + "grad_norm": 1.128198504447937, + "learning_rate": 0.0002, + "loss": 0.7897, + "step": 4110 + }, + { + "epoch": 2.9909255898366607, + "grad_norm": 0.526292085647583, + "learning_rate": 0.0002, + "loss": 0.7883, + "step": 4120 + }, + { + "epoch": 2.9981851179673322, + "grad_norm": 0.5462674498558044, + "learning_rate": 0.0002, + "loss": 0.7801, + "step": 4130 + }, + { + "epoch": 2.9996370235934666, + "eval_loss": 1.1093357801437378, + "eval_runtime": 46.2498, + "eval_samples_per_second": 9.427, + "eval_steps_per_second": 1.189, + "step": 4132 + }, + { + "epoch": 3.0054446460980038, + "grad_norm": 0.48288026452064514, + "learning_rate": 0.0002, + "loss": 0.721, + "step": 4140 + }, + { + "epoch": 3.0127041742286753, + "grad_norm": 1.0181782245635986, + "learning_rate": 0.0002, + "loss": 0.6769, + "step": 4150 + }, + { + "epoch": 3.019963702359347, + "grad_norm": 0.7718019485473633, + "learning_rate": 0.0002, + "loss": 0.7185, + "step": 4160 + }, + { + "epoch": 3.027223230490018, + "grad_norm": 0.7492219805717468, + "learning_rate": 0.0002, + "loss": 0.6552, + "step": 4170 + }, + { + "epoch": 3.0344827586206895, + "grad_norm": 0.9363632798194885, + "learning_rate": 0.0002, + "loss": 0.6678, + "step": 4180 + }, + { + "epoch": 3.041742286751361, + "grad_norm": 0.6888533234596252, + "learning_rate": 0.0002, + "loss": 0.7187, + "step": 4190 + }, + { + "epoch": 3.0490018148820326, + "grad_norm": 0.7072834968566895, + "learning_rate": 0.0002, + "loss": 0.6469, + "step": 4200 + }, + { + "epoch": 3.056261343012704, + "grad_norm": 0.7182047963142395, + "learning_rate": 0.0002, + "loss": 0.6387, + "step": 4210 + }, + { + "epoch": 3.0635208711433757, + "grad_norm": 0.7194355130195618, + "learning_rate": 0.0002, + "loss": 0.6385, + "step": 4220 + }, + { + "epoch": 3.0707803992740472, + "grad_norm": 0.9454023838043213, + "learning_rate": 0.0002, + "loss": 0.5812, + "step": 4230 + }, + { + "epoch": 3.0780399274047188, + "grad_norm": 0.838657557964325, + "learning_rate": 0.0002, + "loss": 0.6036, + "step": 4240 + }, + { + "epoch": 3.0852994555353903, + "grad_norm": 0.740113377571106, + "learning_rate": 0.0002, + "loss": 0.646, + "step": 4250 + }, + { + "epoch": 3.092558983666062, + "grad_norm": 0.6616561412811279, + "learning_rate": 0.0002, + "loss": 0.604, + "step": 4260 + }, + { + "epoch": 3.0998185117967334, + "grad_norm": 0.8846506476402283, + "learning_rate": 0.0002, + "loss": 0.6462, + "step": 4270 + }, + { + "epoch": 3.107078039927405, + "grad_norm": 0.6322125792503357, + "learning_rate": 0.0002, + "loss": 0.6037, + "step": 4280 + }, + { + "epoch": 3.114337568058076, + "grad_norm": 0.7461467385292053, + "learning_rate": 0.0002, + "loss": 0.5953, + "step": 4290 + }, + { + "epoch": 3.1215970961887476, + "grad_norm": 0.8251287341117859, + "learning_rate": 0.0002, + "loss": 0.6099, + "step": 4300 + }, + { + "epoch": 3.128856624319419, + "grad_norm": 0.8767673373222351, + "learning_rate": 0.0002, + "loss": 0.6284, + "step": 4310 + }, + { + "epoch": 3.1361161524500907, + "grad_norm": 0.7758759260177612, + "learning_rate": 0.0002, + "loss": 0.7535, + "step": 4320 + }, + { + "epoch": 3.143375680580762, + "grad_norm": 1.1056879758834839, + "learning_rate": 0.0002, + "loss": 0.6624, + "step": 4330 + }, + { + "epoch": 3.1506352087114338, + "grad_norm": 0.8259835243225098, + "learning_rate": 0.0002, + "loss": 0.691, + "step": 4340 + }, + { + "epoch": 3.1578947368421053, + "grad_norm": 0.6607027053833008, + "learning_rate": 0.0002, + "loss": 0.6635, + "step": 4350 + }, + { + "epoch": 3.165154264972777, + "grad_norm": 0.7983301281929016, + "learning_rate": 0.0002, + "loss": 0.5911, + "step": 4360 + }, + { + "epoch": 3.1724137931034484, + "grad_norm": 0.6725239157676697, + "learning_rate": 0.0002, + "loss": 0.6496, + "step": 4370 + }, + { + "epoch": 3.17967332123412, + "grad_norm": 0.9052095413208008, + "learning_rate": 0.0002, + "loss": 0.5966, + "step": 4380 + }, + { + "epoch": 3.1869328493647915, + "grad_norm": 0.8131307363510132, + "learning_rate": 0.0002, + "loss": 0.6877, + "step": 4390 + }, + { + "epoch": 3.1941923774954626, + "grad_norm": 0.6435626149177551, + "learning_rate": 0.0002, + "loss": 0.6384, + "step": 4400 + }, + { + "epoch": 3.201451905626134, + "grad_norm": 0.84367436170578, + "learning_rate": 0.0002, + "loss": 0.5819, + "step": 4410 + }, + { + "epoch": 3.2087114337568057, + "grad_norm": 1.5018867254257202, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 4420 + }, + { + "epoch": 3.215970961887477, + "grad_norm": 0.7019091844558716, + "learning_rate": 0.0002, + "loss": 0.6838, + "step": 4430 + }, + { + "epoch": 3.2232304900181488, + "grad_norm": 0.9164197444915771, + "learning_rate": 0.0002, + "loss": 0.6153, + "step": 4440 + }, + { + "epoch": 3.2304900181488203, + "grad_norm": 0.7890861630439758, + "learning_rate": 0.0002, + "loss": 0.6618, + "step": 4450 + }, + { + "epoch": 3.237749546279492, + "grad_norm": 0.6517660617828369, + "learning_rate": 0.0002, + "loss": 0.6401, + "step": 4460 + }, + { + "epoch": 3.2450090744101634, + "grad_norm": 1.10188889503479, + "learning_rate": 0.0002, + "loss": 0.6699, + "step": 4470 + }, + { + "epoch": 3.252268602540835, + "grad_norm": 0.8158330917358398, + "learning_rate": 0.0002, + "loss": 0.6356, + "step": 4480 + }, + { + "epoch": 3.2595281306715065, + "grad_norm": 0.7663109302520752, + "learning_rate": 0.0002, + "loss": 0.7757, + "step": 4490 + }, + { + "epoch": 3.266787658802178, + "grad_norm": 0.8473444581031799, + "learning_rate": 0.0002, + "loss": 0.6539, + "step": 4500 + }, + { + "epoch": 3.274047186932849, + "grad_norm": 0.9724768996238708, + "learning_rate": 0.0002, + "loss": 0.6511, + "step": 4510 + }, + { + "epoch": 3.281306715063521, + "grad_norm": 0.8516759276390076, + "learning_rate": 0.0002, + "loss": 0.5464, + "step": 4520 + }, + { + "epoch": 3.288566243194192, + "grad_norm": 0.7543437480926514, + "learning_rate": 0.0002, + "loss": 0.6534, + "step": 4530 + }, + { + "epoch": 3.2958257713248638, + "grad_norm": 1.0472029447555542, + "learning_rate": 0.0002, + "loss": 0.6095, + "step": 4540 + }, + { + "epoch": 3.3030852994555353, + "grad_norm": 0.6240826845169067, + "learning_rate": 0.0002, + "loss": 0.6216, + "step": 4550 + }, + { + "epoch": 3.310344827586207, + "grad_norm": 0.9957774877548218, + "learning_rate": 0.0002, + "loss": 0.6223, + "step": 4560 + }, + { + "epoch": 3.3176043557168784, + "grad_norm": 0.6448912620544434, + "learning_rate": 0.0002, + "loss": 0.618, + "step": 4570 + }, + { + "epoch": 3.32486388384755, + "grad_norm": 0.7519692778587341, + "learning_rate": 0.0002, + "loss": 0.6188, + "step": 4580 + }, + { + "epoch": 3.3321234119782215, + "grad_norm": 0.7367453575134277, + "learning_rate": 0.0002, + "loss": 0.6672, + "step": 4590 + }, + { + "epoch": 3.339382940108893, + "grad_norm": 0.8064960837364197, + "learning_rate": 0.0002, + "loss": 0.6517, + "step": 4600 + }, + { + "epoch": 3.3466424682395646, + "grad_norm": 0.7664631009101868, + "learning_rate": 0.0002, + "loss": 0.6062, + "step": 4610 + }, + { + "epoch": 3.353901996370236, + "grad_norm": 0.7803396582603455, + "learning_rate": 0.0002, + "loss": 0.6834, + "step": 4620 + }, + { + "epoch": 3.3611615245009077, + "grad_norm": 0.9141599535942078, + "learning_rate": 0.0002, + "loss": 0.6961, + "step": 4630 + }, + { + "epoch": 3.3684210526315788, + "grad_norm": 0.9719856381416321, + "learning_rate": 0.0002, + "loss": 0.6889, + "step": 4640 + }, + { + "epoch": 3.3756805807622503, + "grad_norm": 0.9223218560218811, + "learning_rate": 0.0002, + "loss": 0.6914, + "step": 4650 + }, + { + "epoch": 3.382940108892922, + "grad_norm": 0.7289277911186218, + "learning_rate": 0.0002, + "loss": 0.5981, + "step": 4660 + }, + { + "epoch": 3.3901996370235934, + "grad_norm": 1.039724349975586, + "learning_rate": 0.0002, + "loss": 0.595, + "step": 4670 + }, + { + "epoch": 3.397459165154265, + "grad_norm": 1.397438883781433, + "learning_rate": 0.0002, + "loss": 0.8121, + "step": 4680 + }, + { + "epoch": 3.4047186932849365, + "grad_norm": 1.0069999694824219, + "learning_rate": 0.0002, + "loss": 0.6334, + "step": 4690 + }, + { + "epoch": 3.411978221415608, + "grad_norm": 0.816291332244873, + "learning_rate": 0.0002, + "loss": 0.6598, + "step": 4700 + }, + { + "epoch": 3.4192377495462796, + "grad_norm": 1.2831530570983887, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 4710 + }, + { + "epoch": 3.426497277676951, + "grad_norm": 0.9573889970779419, + "learning_rate": 0.0002, + "loss": 0.6625, + "step": 4720 + }, + { + "epoch": 3.4337568058076227, + "grad_norm": 0.7685632705688477, + "learning_rate": 0.0002, + "loss": 0.7279, + "step": 4730 + }, + { + "epoch": 3.441016333938294, + "grad_norm": 0.7019195556640625, + "learning_rate": 0.0002, + "loss": 0.6104, + "step": 4740 + }, + { + "epoch": 3.4482758620689653, + "grad_norm": 0.7244833707809448, + "learning_rate": 0.0002, + "loss": 0.7606, + "step": 4750 + }, + { + "epoch": 3.455535390199637, + "grad_norm": 1.3468551635742188, + "learning_rate": 0.0002, + "loss": 0.6951, + "step": 4760 + }, + { + "epoch": 3.4627949183303084, + "grad_norm": 0.822846531867981, + "learning_rate": 0.0002, + "loss": 0.6945, + "step": 4770 + }, + { + "epoch": 3.47005444646098, + "grad_norm": 0.7311608195304871, + "learning_rate": 0.0002, + "loss": 0.6431, + "step": 4780 + }, + { + "epoch": 3.4773139745916515, + "grad_norm": 0.9466770887374878, + "learning_rate": 0.0002, + "loss": 0.7019, + "step": 4790 + }, + { + "epoch": 3.484573502722323, + "grad_norm": 1.1527155637741089, + "learning_rate": 0.0002, + "loss": 0.7767, + "step": 4800 + }, + { + "epoch": 3.4918330308529946, + "grad_norm": 1.1288906335830688, + "learning_rate": 0.0002, + "loss": 0.6882, + "step": 4810 + }, + { + "epoch": 3.499092558983666, + "grad_norm": 0.9096164107322693, + "learning_rate": 0.0002, + "loss": 0.6564, + "step": 4820 + }, + { + "epoch": 3.5063520871143377, + "grad_norm": 0.7988565564155579, + "learning_rate": 0.0002, + "loss": 0.6127, + "step": 4830 + }, + { + "epoch": 3.513611615245009, + "grad_norm": 0.7183415293693542, + "learning_rate": 0.0002, + "loss": 0.7004, + "step": 4840 + }, + { + "epoch": 3.5208711433756807, + "grad_norm": 0.6614915132522583, + "learning_rate": 0.0002, + "loss": 0.74, + "step": 4850 + }, + { + "epoch": 3.528130671506352, + "grad_norm": 0.8609521985054016, + "learning_rate": 0.0002, + "loss": 0.7271, + "step": 4860 + }, + { + "epoch": 3.535390199637024, + "grad_norm": 0.86552894115448, + "learning_rate": 0.0002, + "loss": 0.6664, + "step": 4870 + }, + { + "epoch": 3.542649727767695, + "grad_norm": 0.6926496028900146, + "learning_rate": 0.0002, + "loss": 0.6432, + "step": 4880 + }, + { + "epoch": 3.5499092558983665, + "grad_norm": 0.8157467246055603, + "learning_rate": 0.0002, + "loss": 0.7117, + "step": 4890 + }, + { + "epoch": 3.557168784029038, + "grad_norm": 0.9085357189178467, + "learning_rate": 0.0002, + "loss": 0.6201, + "step": 4900 + }, + { + "epoch": 3.5644283121597096, + "grad_norm": 0.6322644948959351, + "learning_rate": 0.0002, + "loss": 0.6521, + "step": 4910 + }, + { + "epoch": 3.571687840290381, + "grad_norm": 1.263205885887146, + "learning_rate": 0.0002, + "loss": 0.6607, + "step": 4920 + }, + { + "epoch": 3.5789473684210527, + "grad_norm": 0.8901070356369019, + "learning_rate": 0.0002, + "loss": 0.6657, + "step": 4930 + }, + { + "epoch": 3.586206896551724, + "grad_norm": 0.7983952164649963, + "learning_rate": 0.0002, + "loss": 0.6434, + "step": 4940 + }, + { + "epoch": 3.5934664246823957, + "grad_norm": 0.9887813925743103, + "learning_rate": 0.0002, + "loss": 0.6861, + "step": 4950 + }, + { + "epoch": 3.6007259528130673, + "grad_norm": 0.7895187735557556, + "learning_rate": 0.0002, + "loss": 0.6502, + "step": 4960 + }, + { + "epoch": 3.6079854809437384, + "grad_norm": 0.9685819745063782, + "learning_rate": 0.0002, + "loss": 0.7111, + "step": 4970 + }, + { + "epoch": 3.6152450090744104, + "grad_norm": 0.6576591730117798, + "learning_rate": 0.0002, + "loss": 0.6915, + "step": 4980 + }, + { + "epoch": 3.6225045372050815, + "grad_norm": 0.856985330581665, + "learning_rate": 0.0002, + "loss": 0.6195, + "step": 4990 + }, + { + "epoch": 3.629764065335753, + "grad_norm": 0.7230252623558044, + "learning_rate": 0.0002, + "loss": 0.6318, + "step": 5000 + }, + { + "epoch": 3.6370235934664246, + "grad_norm": 0.8260893821716309, + "learning_rate": 0.0002, + "loss": 0.742, + "step": 5010 + }, + { + "epoch": 3.644283121597096, + "grad_norm": 0.7635950446128845, + "learning_rate": 0.0002, + "loss": 0.7223, + "step": 5020 + }, + { + "epoch": 3.6515426497277677, + "grad_norm": 0.7060768604278564, + "learning_rate": 0.0002, + "loss": 0.6837, + "step": 5030 + }, + { + "epoch": 3.658802177858439, + "grad_norm": 0.8020303249359131, + "learning_rate": 0.0002, + "loss": 0.6921, + "step": 5040 + }, + { + "epoch": 3.6660617059891107, + "grad_norm": 0.8530341386795044, + "learning_rate": 0.0002, + "loss": 0.6446, + "step": 5050 + }, + { + "epoch": 3.6733212341197823, + "grad_norm": 0.6667101979255676, + "learning_rate": 0.0002, + "loss": 0.7222, + "step": 5060 + }, + { + "epoch": 3.680580762250454, + "grad_norm": 0.7385406494140625, + "learning_rate": 0.0002, + "loss": 0.7081, + "step": 5070 + }, + { + "epoch": 3.6878402903811254, + "grad_norm": 0.7753380537033081, + "learning_rate": 0.0002, + "loss": 0.7035, + "step": 5080 + }, + { + "epoch": 3.695099818511797, + "grad_norm": 0.7516207098960876, + "learning_rate": 0.0002, + "loss": 0.6491, + "step": 5090 + }, + { + "epoch": 3.702359346642468, + "grad_norm": 0.8171586394309998, + "learning_rate": 0.0002, + "loss": 0.672, + "step": 5100 + }, + { + "epoch": 3.70961887477314, + "grad_norm": 1.0796279907226562, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 5110 + }, + { + "epoch": 3.716878402903811, + "grad_norm": 0.6957688927650452, + "learning_rate": 0.0002, + "loss": 0.5948, + "step": 5120 + }, + { + "epoch": 3.7241379310344827, + "grad_norm": 0.8550161719322205, + "learning_rate": 0.0002, + "loss": 0.7515, + "step": 5130 + }, + { + "epoch": 3.731397459165154, + "grad_norm": 0.9396728277206421, + "learning_rate": 0.0002, + "loss": 0.7286, + "step": 5140 + }, + { + "epoch": 3.7386569872958257, + "grad_norm": 1.4264805316925049, + "learning_rate": 0.0002, + "loss": 0.7594, + "step": 5150 + }, + { + "epoch": 3.7459165154264973, + "grad_norm": 0.8725108504295349, + "learning_rate": 0.0002, + "loss": 0.6575, + "step": 5160 + }, + { + "epoch": 3.753176043557169, + "grad_norm": 1.0346195697784424, + "learning_rate": 0.0002, + "loss": 0.6778, + "step": 5170 + }, + { + "epoch": 3.7604355716878404, + "grad_norm": 0.5395554304122925, + "learning_rate": 0.0002, + "loss": 0.6371, + "step": 5180 + }, + { + "epoch": 3.767695099818512, + "grad_norm": 1.3153616189956665, + "learning_rate": 0.0002, + "loss": 0.7308, + "step": 5190 + }, + { + "epoch": 3.7749546279491835, + "grad_norm": 0.9879828691482544, + "learning_rate": 0.0002, + "loss": 0.78, + "step": 5200 + }, + { + "epoch": 3.7822141560798546, + "grad_norm": 0.8876672983169556, + "learning_rate": 0.0002, + "loss": 0.7068, + "step": 5210 + }, + { + "epoch": 3.7894736842105265, + "grad_norm": 0.8363267779350281, + "learning_rate": 0.0002, + "loss": 0.6283, + "step": 5220 + }, + { + "epoch": 3.7967332123411976, + "grad_norm": 0.637294590473175, + "learning_rate": 0.0002, + "loss": 0.6255, + "step": 5230 + }, + { + "epoch": 3.803992740471869, + "grad_norm": 1.1408970355987549, + "learning_rate": 0.0002, + "loss": 0.6685, + "step": 5240 + }, + { + "epoch": 3.8112522686025407, + "grad_norm": 1.0128360986709595, + "learning_rate": 0.0002, + "loss": 0.6761, + "step": 5250 + }, + { + "epoch": 3.8185117967332123, + "grad_norm": 0.8061144351959229, + "learning_rate": 0.0002, + "loss": 0.6764, + "step": 5260 + }, + { + "epoch": 3.825771324863884, + "grad_norm": 0.9626626968383789, + "learning_rate": 0.0002, + "loss": 0.7254, + "step": 5270 + }, + { + "epoch": 3.8330308529945554, + "grad_norm": 0.9013627171516418, + "learning_rate": 0.0002, + "loss": 0.7367, + "step": 5280 + }, + { + "epoch": 3.840290381125227, + "grad_norm": 0.8411344289779663, + "learning_rate": 0.0002, + "loss": 0.6806, + "step": 5290 + }, + { + "epoch": 3.8475499092558985, + "grad_norm": 0.7426059246063232, + "learning_rate": 0.0002, + "loss": 0.6818, + "step": 5300 + }, + { + "epoch": 3.85480943738657, + "grad_norm": 1.003413438796997, + "learning_rate": 0.0002, + "loss": 0.6748, + "step": 5310 + }, + { + "epoch": 3.862068965517241, + "grad_norm": 0.7527840733528137, + "learning_rate": 0.0002, + "loss": 0.8554, + "step": 5320 + }, + { + "epoch": 3.869328493647913, + "grad_norm": 0.738610565662384, + "learning_rate": 0.0002, + "loss": 0.7521, + "step": 5330 + }, + { + "epoch": 3.876588021778584, + "grad_norm": 0.7277999520301819, + "learning_rate": 0.0002, + "loss": 0.7266, + "step": 5340 + }, + { + "epoch": 3.8838475499092557, + "grad_norm": 0.5951359272003174, + "learning_rate": 0.0002, + "loss": 0.7503, + "step": 5350 + }, + { + "epoch": 3.8911070780399273, + "grad_norm": 1.043884038925171, + "learning_rate": 0.0002, + "loss": 0.7447, + "step": 5360 + }, + { + "epoch": 3.898366606170599, + "grad_norm": 0.8436498045921326, + "learning_rate": 0.0002, + "loss": 0.6862, + "step": 5370 + }, + { + "epoch": 3.9056261343012704, + "grad_norm": 0.5603365302085876, + "learning_rate": 0.0002, + "loss": 0.665, + "step": 5380 + }, + { + "epoch": 3.912885662431942, + "grad_norm": 1.0128886699676514, + "learning_rate": 0.0002, + "loss": 0.7098, + "step": 5390 + }, + { + "epoch": 3.9201451905626135, + "grad_norm": 0.7970930337905884, + "learning_rate": 0.0002, + "loss": 0.6707, + "step": 5400 + }, + { + "epoch": 3.927404718693285, + "grad_norm": 0.7699369192123413, + "learning_rate": 0.0002, + "loss": 0.637, + "step": 5410 + }, + { + "epoch": 3.9346642468239565, + "grad_norm": 0.800561249256134, + "learning_rate": 0.0002, + "loss": 0.6742, + "step": 5420 + }, + { + "epoch": 3.941923774954628, + "grad_norm": 0.8020331859588623, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 5430 + }, + { + "epoch": 3.9491833030852996, + "grad_norm": 0.7461140155792236, + "learning_rate": 0.0002, + "loss": 0.7294, + "step": 5440 + }, + { + "epoch": 3.9564428312159707, + "grad_norm": 0.8346918821334839, + "learning_rate": 0.0002, + "loss": 0.7013, + "step": 5450 + }, + { + "epoch": 3.9637023593466427, + "grad_norm": 0.9723302125930786, + "learning_rate": 0.0002, + "loss": 0.6289, + "step": 5460 + }, + { + "epoch": 3.970961887477314, + "grad_norm": 0.6809740662574768, + "learning_rate": 0.0002, + "loss": 0.8029, + "step": 5470 + }, + { + "epoch": 3.9782214156079854, + "grad_norm": 0.7353498339653015, + "learning_rate": 0.0002, + "loss": 0.6896, + "step": 5480 + }, + { + "epoch": 3.985480943738657, + "grad_norm": 0.748009443283081, + "learning_rate": 0.0002, + "loss": 0.6722, + "step": 5490 + }, + { + "epoch": 3.9927404718693285, + "grad_norm": 1.3656195402145386, + "learning_rate": 0.0002, + "loss": 0.6866, + "step": 5500 + }, + { + "epoch": 4.0, + "grad_norm": 0.8402108550071716, + "learning_rate": 0.0002, + "loss": 0.7368, + "step": 5510 + }, + { + "epoch": 4.0, + "eval_loss": 1.17229425907135, + "eval_runtime": 46.2554, + "eval_samples_per_second": 9.426, + "eval_steps_per_second": 1.189, + "step": 5510 + }, + { + "epoch": 4.007259528130671, + "grad_norm": 0.8601235747337341, + "learning_rate": 0.0002, + "loss": 0.4637, + "step": 5520 + }, + { + "epoch": 4.014519056261343, + "grad_norm": 1.2635200023651123, + "learning_rate": 0.0002, + "loss": 0.4717, + "step": 5530 + }, + { + "epoch": 4.021778584392014, + "grad_norm": 1.0257477760314941, + "learning_rate": 0.0002, + "loss": 0.503, + "step": 5540 + }, + { + "epoch": 4.029038112522686, + "grad_norm": 0.9436745047569275, + "learning_rate": 0.0002, + "loss": 0.4547, + "step": 5550 + }, + { + "epoch": 4.036297640653357, + "grad_norm": 0.9443606734275818, + "learning_rate": 0.0002, + "loss": 0.459, + "step": 5560 + }, + { + "epoch": 4.043557168784029, + "grad_norm": 1.3965742588043213, + "learning_rate": 0.0002, + "loss": 0.5386, + "step": 5570 + }, + { + "epoch": 4.0508166969147, + "grad_norm": 0.8973520398139954, + "learning_rate": 0.0002, + "loss": 0.4248, + "step": 5580 + }, + { + "epoch": 4.058076225045372, + "grad_norm": 0.9998409748077393, + "learning_rate": 0.0002, + "loss": 0.4111, + "step": 5590 + }, + { + "epoch": 4.0653357531760435, + "grad_norm": 1.1213387250900269, + "learning_rate": 0.0002, + "loss": 0.4828, + "step": 5600 + }, + { + "epoch": 4.072595281306715, + "grad_norm": 0.7064558863639832, + "learning_rate": 0.0002, + "loss": 0.439, + "step": 5610 + }, + { + "epoch": 4.0798548094373865, + "grad_norm": 1.2390803098678589, + "learning_rate": 0.0002, + "loss": 0.4607, + "step": 5620 + }, + { + "epoch": 4.087114337568058, + "grad_norm": 1.123469591140747, + "learning_rate": 0.0002, + "loss": 0.5014, + "step": 5630 + }, + { + "epoch": 4.09437386569873, + "grad_norm": 1.229573369026184, + "learning_rate": 0.0002, + "loss": 0.513, + "step": 5640 + }, + { + "epoch": 4.101633393829401, + "grad_norm": 1.7182831764221191, + "learning_rate": 0.0002, + "loss": 0.5258, + "step": 5650 + }, + { + "epoch": 4.108892921960073, + "grad_norm": 0.894903302192688, + "learning_rate": 0.0002, + "loss": 0.5371, + "step": 5660 + }, + { + "epoch": 4.116152450090744, + "grad_norm": 0.8754552006721497, + "learning_rate": 0.0002, + "loss": 0.4813, + "step": 5670 + }, + { + "epoch": 4.123411978221416, + "grad_norm": 1.2401553392410278, + "learning_rate": 0.0002, + "loss": 0.491, + "step": 5680 + }, + { + "epoch": 4.130671506352087, + "grad_norm": 0.8631148934364319, + "learning_rate": 0.0002, + "loss": 0.4549, + "step": 5690 + }, + { + "epoch": 4.137931034482759, + "grad_norm": 1.1798022985458374, + "learning_rate": 0.0002, + "loss": 0.487, + "step": 5700 + }, + { + "epoch": 4.14519056261343, + "grad_norm": 0.8344549536705017, + "learning_rate": 0.0002, + "loss": 0.4522, + "step": 5710 + }, + { + "epoch": 4.152450090744102, + "grad_norm": 1.2342697381973267, + "learning_rate": 0.0002, + "loss": 0.4559, + "step": 5720 + }, + { + "epoch": 4.159709618874773, + "grad_norm": 1.1601094007492065, + "learning_rate": 0.0002, + "loss": 0.53, + "step": 5730 + }, + { + "epoch": 4.166969147005445, + "grad_norm": 1.2925703525543213, + "learning_rate": 0.0002, + "loss": 0.4755, + "step": 5740 + }, + { + "epoch": 4.174228675136116, + "grad_norm": 1.0870997905731201, + "learning_rate": 0.0002, + "loss": 0.4973, + "step": 5750 + }, + { + "epoch": 4.181488203266787, + "grad_norm": 0.9077792763710022, + "learning_rate": 0.0002, + "loss": 0.5184, + "step": 5760 + }, + { + "epoch": 4.188747731397459, + "grad_norm": 1.009273886680603, + "learning_rate": 0.0002, + "loss": 0.4606, + "step": 5770 + }, + { + "epoch": 4.19600725952813, + "grad_norm": 1.2465530633926392, + "learning_rate": 0.0002, + "loss": 0.5383, + "step": 5780 + }, + { + "epoch": 4.203266787658802, + "grad_norm": 1.2261253595352173, + "learning_rate": 0.0002, + "loss": 0.4938, + "step": 5790 + }, + { + "epoch": 4.2105263157894735, + "grad_norm": 1.1498041152954102, + "learning_rate": 0.0002, + "loss": 0.5255, + "step": 5800 + }, + { + "epoch": 4.217785843920145, + "grad_norm": 1.1966725587844849, + "learning_rate": 0.0002, + "loss": 0.5269, + "step": 5810 + }, + { + "epoch": 4.2250453720508165, + "grad_norm": 1.2651296854019165, + "learning_rate": 0.0002, + "loss": 0.5626, + "step": 5820 + }, + { + "epoch": 4.2323049001814885, + "grad_norm": 1.0388574600219727, + "learning_rate": 0.0002, + "loss": 0.5213, + "step": 5830 + }, + { + "epoch": 4.23956442831216, + "grad_norm": 1.3042771816253662, + "learning_rate": 0.0002, + "loss": 0.4965, + "step": 5840 + }, + { + "epoch": 4.246823956442832, + "grad_norm": 1.1127727031707764, + "learning_rate": 0.0002, + "loss": 0.5116, + "step": 5850 + }, + { + "epoch": 4.254083484573503, + "grad_norm": 0.9653958082199097, + "learning_rate": 0.0002, + "loss": 0.5197, + "step": 5860 + }, + { + "epoch": 4.261343012704174, + "grad_norm": 1.0500504970550537, + "learning_rate": 0.0002, + "loss": 0.4414, + "step": 5870 + }, + { + "epoch": 4.268602540834846, + "grad_norm": 1.1476165056228638, + "learning_rate": 0.0002, + "loss": 0.544, + "step": 5880 + }, + { + "epoch": 4.275862068965517, + "grad_norm": 0.9424414038658142, + "learning_rate": 0.0002, + "loss": 0.4667, + "step": 5890 + }, + { + "epoch": 4.283121597096189, + "grad_norm": 1.3309166431427002, + "learning_rate": 0.0002, + "loss": 0.5039, + "step": 5900 + }, + { + "epoch": 4.29038112522686, + "grad_norm": 1.3025873899459839, + "learning_rate": 0.0002, + "loss": 0.5472, + "step": 5910 + }, + { + "epoch": 4.297640653357532, + "grad_norm": 1.1442325115203857, + "learning_rate": 0.0002, + "loss": 0.4644, + "step": 5920 + }, + { + "epoch": 4.304900181488203, + "grad_norm": 0.9820859432220459, + "learning_rate": 0.0002, + "loss": 0.5066, + "step": 5930 + }, + { + "epoch": 4.312159709618875, + "grad_norm": 0.9615740180015564, + "learning_rate": 0.0002, + "loss": 0.5224, + "step": 5940 + }, + { + "epoch": 4.319419237749546, + "grad_norm": 1.1627109050750732, + "learning_rate": 0.0002, + "loss": 0.5665, + "step": 5950 + }, + { + "epoch": 4.326678765880218, + "grad_norm": 0.9381322860717773, + "learning_rate": 0.0002, + "loss": 0.4852, + "step": 5960 + }, + { + "epoch": 4.333938294010889, + "grad_norm": 0.8154335618019104, + "learning_rate": 0.0002, + "loss": 0.4532, + "step": 5970 + }, + { + "epoch": 4.341197822141561, + "grad_norm": 0.877671480178833, + "learning_rate": 0.0002, + "loss": 0.5237, + "step": 5980 + }, + { + "epoch": 4.348457350272232, + "grad_norm": 1.1742031574249268, + "learning_rate": 0.0002, + "loss": 0.6113, + "step": 5990 + }, + { + "epoch": 4.3557168784029034, + "grad_norm": 1.0352917909622192, + "learning_rate": 0.0002, + "loss": 0.5704, + "step": 6000 + }, + { + "epoch": 4.362976406533575, + "grad_norm": 0.9963878989219666, + "learning_rate": 0.0002, + "loss": 0.4996, + "step": 6010 + }, + { + "epoch": 4.3702359346642465, + "grad_norm": 1.1892237663269043, + "learning_rate": 0.0002, + "loss": 0.4813, + "step": 6020 + }, + { + "epoch": 4.3774954627949185, + "grad_norm": 1.2516111135482788, + "learning_rate": 0.0002, + "loss": 0.5224, + "step": 6030 + }, + { + "epoch": 4.38475499092559, + "grad_norm": 1.2111951112747192, + "learning_rate": 0.0002, + "loss": 0.5952, + "step": 6040 + }, + { + "epoch": 4.392014519056262, + "grad_norm": 1.0820083618164062, + "learning_rate": 0.0002, + "loss": 0.4275, + "step": 6050 + }, + { + "epoch": 4.399274047186933, + "grad_norm": 1.033915638923645, + "learning_rate": 0.0002, + "loss": 0.5117, + "step": 6060 + }, + { + "epoch": 4.406533575317605, + "grad_norm": 1.0635870695114136, + "learning_rate": 0.0002, + "loss": 0.5431, + "step": 6070 + }, + { + "epoch": 4.413793103448276, + "grad_norm": 1.0520414113998413, + "learning_rate": 0.0002, + "loss": 0.5341, + "step": 6080 + }, + { + "epoch": 4.421052631578947, + "grad_norm": 1.0821926593780518, + "learning_rate": 0.0002, + "loss": 0.512, + "step": 6090 + }, + { + "epoch": 4.428312159709619, + "grad_norm": 1.0533246994018555, + "learning_rate": 0.0002, + "loss": 0.5065, + "step": 6100 + }, + { + "epoch": 4.43557168784029, + "grad_norm": 0.9231932759284973, + "learning_rate": 0.0002, + "loss": 0.4577, + "step": 6110 + }, + { + "epoch": 4.442831215970962, + "grad_norm": 0.9910260438919067, + "learning_rate": 0.0002, + "loss": 0.583, + "step": 6120 + }, + { + "epoch": 4.450090744101633, + "grad_norm": 1.061949372291565, + "learning_rate": 0.0002, + "loss": 0.4717, + "step": 6130 + }, + { + "epoch": 4.457350272232305, + "grad_norm": 1.2927039861679077, + "learning_rate": 0.0002, + "loss": 0.5893, + "step": 6140 + }, + { + "epoch": 4.464609800362976, + "grad_norm": 1.3966081142425537, + "learning_rate": 0.0002, + "loss": 0.4684, + "step": 6150 + }, + { + "epoch": 4.471869328493648, + "grad_norm": 1.3835992813110352, + "learning_rate": 0.0002, + "loss": 0.5507, + "step": 6160 + }, + { + "epoch": 4.479128856624319, + "grad_norm": 1.0892692804336548, + "learning_rate": 0.0002, + "loss": 0.5911, + "step": 6170 + }, + { + "epoch": 4.486388384754991, + "grad_norm": 1.0318800210952759, + "learning_rate": 0.0002, + "loss": 0.478, + "step": 6180 + }, + { + "epoch": 4.493647912885662, + "grad_norm": 0.8174677491188049, + "learning_rate": 0.0002, + "loss": 0.5198, + "step": 6190 + }, + { + "epoch": 4.500907441016334, + "grad_norm": 1.4157509803771973, + "learning_rate": 0.0002, + "loss": 0.5387, + "step": 6200 + }, + { + "epoch": 4.508166969147005, + "grad_norm": 1.5244114398956299, + "learning_rate": 0.0002, + "loss": 0.5868, + "step": 6210 + }, + { + "epoch": 4.5154264972776765, + "grad_norm": 0.8164850473403931, + "learning_rate": 0.0002, + "loss": 0.4642, + "step": 6220 + }, + { + "epoch": 4.5226860254083485, + "grad_norm": 1.2904746532440186, + "learning_rate": 0.0002, + "loss": 0.522, + "step": 6230 + }, + { + "epoch": 4.52994555353902, + "grad_norm": 0.7987732887268066, + "learning_rate": 0.0002, + "loss": 0.5103, + "step": 6240 + }, + { + "epoch": 4.537205081669692, + "grad_norm": 0.831040620803833, + "learning_rate": 0.0002, + "loss": 0.4615, + "step": 6250 + }, + { + "epoch": 4.544464609800363, + "grad_norm": 0.9545485973358154, + "learning_rate": 0.0002, + "loss": 0.5065, + "step": 6260 + }, + { + "epoch": 4.551724137931035, + "grad_norm": 0.9291793704032898, + "learning_rate": 0.0002, + "loss": 0.5515, + "step": 6270 + }, + { + "epoch": 4.558983666061706, + "grad_norm": 0.8977208733558655, + "learning_rate": 0.0002, + "loss": 0.4535, + "step": 6280 + }, + { + "epoch": 4.566243194192378, + "grad_norm": 1.1768537759780884, + "learning_rate": 0.0002, + "loss": 0.544, + "step": 6290 + }, + { + "epoch": 4.573502722323049, + "grad_norm": 1.0688952207565308, + "learning_rate": 0.0002, + "loss": 0.5925, + "step": 6300 + }, + { + "epoch": 4.580762250453721, + "grad_norm": 0.8800966739654541, + "learning_rate": 0.0002, + "loss": 0.5207, + "step": 6310 + }, + { + "epoch": 4.588021778584392, + "grad_norm": 1.0911834239959717, + "learning_rate": 0.0002, + "loss": 0.6106, + "step": 6320 + }, + { + "epoch": 4.595281306715064, + "grad_norm": 1.1420872211456299, + "learning_rate": 0.0002, + "loss": 0.5109, + "step": 6330 + }, + { + "epoch": 4.602540834845735, + "grad_norm": 1.0215224027633667, + "learning_rate": 0.0002, + "loss": 0.5147, + "step": 6340 + }, + { + "epoch": 4.609800362976406, + "grad_norm": 0.9685489535331726, + "learning_rate": 0.0002, + "loss": 0.592, + "step": 6350 + }, + { + "epoch": 4.617059891107078, + "grad_norm": 1.12773597240448, + "learning_rate": 0.0002, + "loss": 0.5775, + "step": 6360 + }, + { + "epoch": 4.624319419237749, + "grad_norm": 1.0663973093032837, + "learning_rate": 0.0002, + "loss": 0.5966, + "step": 6370 + }, + { + "epoch": 4.631578947368421, + "grad_norm": 1.1707262992858887, + "learning_rate": 0.0002, + "loss": 0.512, + "step": 6380 + }, + { + "epoch": 4.638838475499092, + "grad_norm": 1.0672980546951294, + "learning_rate": 0.0002, + "loss": 0.5497, + "step": 6390 + }, + { + "epoch": 4.646098003629764, + "grad_norm": 1.1464333534240723, + "learning_rate": 0.0002, + "loss": 0.5699, + "step": 6400 + }, + { + "epoch": 4.653357531760435, + "grad_norm": 1.070230484008789, + "learning_rate": 0.0002, + "loss": 0.5514, + "step": 6410 + }, + { + "epoch": 4.660617059891107, + "grad_norm": 0.9673764109611511, + "learning_rate": 0.0002, + "loss": 0.5013, + "step": 6420 + }, + { + "epoch": 4.6678765880217785, + "grad_norm": 1.0189043283462524, + "learning_rate": 0.0002, + "loss": 0.5901, + "step": 6430 + }, + { + "epoch": 4.67513611615245, + "grad_norm": 1.185896396636963, + "learning_rate": 0.0002, + "loss": 0.5193, + "step": 6440 + }, + { + "epoch": 4.682395644283122, + "grad_norm": 1.0682812929153442, + "learning_rate": 0.0002, + "loss": 0.5318, + "step": 6450 + }, + { + "epoch": 4.689655172413794, + "grad_norm": 1.3586071729660034, + "learning_rate": 0.0002, + "loss": 0.5773, + "step": 6460 + }, + { + "epoch": 4.696914700544465, + "grad_norm": 0.6561792492866516, + "learning_rate": 0.0002, + "loss": 0.5482, + "step": 6470 + }, + { + "epoch": 4.704174228675136, + "grad_norm": 1.1394113302230835, + "learning_rate": 0.0002, + "loss": 0.5711, + "step": 6480 + }, + { + "epoch": 4.711433756805808, + "grad_norm": 0.9683151245117188, + "learning_rate": 0.0002, + "loss": 0.5325, + "step": 6490 + }, + { + "epoch": 4.718693284936479, + "grad_norm": 1.0247553586959839, + "learning_rate": 0.0002, + "loss": 0.5139, + "step": 6500 + }, + { + "epoch": 4.725952813067151, + "grad_norm": 0.8046169281005859, + "learning_rate": 0.0002, + "loss": 0.5794, + "step": 6510 + }, + { + "epoch": 4.733212341197822, + "grad_norm": 1.0710240602493286, + "learning_rate": 0.0002, + "loss": 0.5471, + "step": 6520 + }, + { + "epoch": 4.740471869328494, + "grad_norm": 0.9438924193382263, + "learning_rate": 0.0002, + "loss": 0.4805, + "step": 6530 + }, + { + "epoch": 4.747731397459165, + "grad_norm": 0.869162380695343, + "learning_rate": 0.0002, + "loss": 0.5404, + "step": 6540 + }, + { + "epoch": 4.754990925589837, + "grad_norm": 0.9776787161827087, + "learning_rate": 0.0002, + "loss": 0.6379, + "step": 6550 + }, + { + "epoch": 4.762250453720508, + "grad_norm": 1.1990505456924438, + "learning_rate": 0.0002, + "loss": 0.5288, + "step": 6560 + }, + { + "epoch": 4.769509981851179, + "grad_norm": 1.0582209825515747, + "learning_rate": 0.0002, + "loss": 0.5539, + "step": 6570 + }, + { + "epoch": 4.776769509981851, + "grad_norm": 0.9966367483139038, + "learning_rate": 0.0002, + "loss": 0.489, + "step": 6580 + }, + { + "epoch": 4.784029038112522, + "grad_norm": 0.9130612015724182, + "learning_rate": 0.0002, + "loss": 0.5514, + "step": 6590 + }, + { + "epoch": 4.791288566243194, + "grad_norm": 1.0950500965118408, + "learning_rate": 0.0002, + "loss": 0.5864, + "step": 6600 + }, + { + "epoch": 4.798548094373865, + "grad_norm": 1.108681321144104, + "learning_rate": 0.0002, + "loss": 0.5266, + "step": 6610 + }, + { + "epoch": 4.805807622504537, + "grad_norm": 1.1873763799667358, + "learning_rate": 0.0002, + "loss": 0.5875, + "step": 6620 + }, + { + "epoch": 4.8130671506352085, + "grad_norm": 1.305367112159729, + "learning_rate": 0.0002, + "loss": 0.5736, + "step": 6630 + }, + { + "epoch": 4.8203266787658805, + "grad_norm": 1.2801482677459717, + "learning_rate": 0.0002, + "loss": 0.5636, + "step": 6640 + }, + { + "epoch": 4.827586206896552, + "grad_norm": 1.26764976978302, + "learning_rate": 0.0002, + "loss": 0.582, + "step": 6650 + }, + { + "epoch": 4.834845735027224, + "grad_norm": 1.0018208026885986, + "learning_rate": 0.0002, + "loss": 0.5259, + "step": 6660 + }, + { + "epoch": 4.842105263157895, + "grad_norm": 1.2326326370239258, + "learning_rate": 0.0002, + "loss": 0.548, + "step": 6670 + }, + { + "epoch": 4.849364791288567, + "grad_norm": 0.9707282781600952, + "learning_rate": 0.0002, + "loss": 0.5933, + "step": 6680 + }, + { + "epoch": 4.856624319419238, + "grad_norm": 1.2772048711776733, + "learning_rate": 0.0002, + "loss": 0.5612, + "step": 6690 + }, + { + "epoch": 4.863883847549909, + "grad_norm": 2.6652262210845947, + "learning_rate": 0.0002, + "loss": 0.5346, + "step": 6700 + }, + { + "epoch": 4.871143375680581, + "grad_norm": 1.215828537940979, + "learning_rate": 0.0002, + "loss": 0.5428, + "step": 6710 + }, + { + "epoch": 4.878402903811252, + "grad_norm": 1.3704510927200317, + "learning_rate": 0.0002, + "loss": 0.6571, + "step": 6720 + }, + { + "epoch": 4.885662431941924, + "grad_norm": 0.7781757116317749, + "learning_rate": 0.0002, + "loss": 0.4963, + "step": 6730 + }, + { + "epoch": 4.892921960072595, + "grad_norm": 1.1883646249771118, + "learning_rate": 0.0002, + "loss": 0.5989, + "step": 6740 + }, + { + "epoch": 4.900181488203267, + "grad_norm": 0.9216066002845764, + "learning_rate": 0.0002, + "loss": 0.6067, + "step": 6750 + }, + { + "epoch": 4.907441016333938, + "grad_norm": 1.0558464527130127, + "learning_rate": 0.0002, + "loss": 0.5085, + "step": 6760 + }, + { + "epoch": 4.91470054446461, + "grad_norm": 1.032656192779541, + "learning_rate": 0.0002, + "loss": 0.5216, + "step": 6770 + }, + { + "epoch": 4.921960072595281, + "grad_norm": 1.1261441707611084, + "learning_rate": 0.0002, + "loss": 0.5426, + "step": 6780 + }, + { + "epoch": 4.929219600725952, + "grad_norm": 1.2178640365600586, + "learning_rate": 0.0002, + "loss": 0.5295, + "step": 6790 + }, + { + "epoch": 4.936479128856624, + "grad_norm": 1.5369361639022827, + "learning_rate": 0.0002, + "loss": 0.5476, + "step": 6800 + }, + { + "epoch": 4.943738656987296, + "grad_norm": 1.1188377141952515, + "learning_rate": 0.0002, + "loss": 0.5358, + "step": 6810 + }, + { + "epoch": 4.950998185117967, + "grad_norm": 1.2506113052368164, + "learning_rate": 0.0002, + "loss": 0.5483, + "step": 6820 + }, + { + "epoch": 4.9582577132486385, + "grad_norm": 0.8776047825813293, + "learning_rate": 0.0002, + "loss": 0.567, + "step": 6830 + }, + { + "epoch": 4.9655172413793105, + "grad_norm": 0.9700555205345154, + "learning_rate": 0.0002, + "loss": 0.5764, + "step": 6840 + }, + { + "epoch": 4.972776769509982, + "grad_norm": 1.2713534832000732, + "learning_rate": 0.0002, + "loss": 0.5396, + "step": 6850 + }, + { + "epoch": 4.980036297640654, + "grad_norm": 0.9855955243110657, + "learning_rate": 0.0002, + "loss": 0.5451, + "step": 6860 + }, + { + "epoch": 4.987295825771325, + "grad_norm": 0.8734853863716125, + "learning_rate": 0.0002, + "loss": 0.5884, + "step": 6870 + }, + { + "epoch": 4.994555353901997, + "grad_norm": 0.8065403699874878, + "learning_rate": 0.0002, + "loss": 0.5189, + "step": 6880 + }, + { + "epoch": 4.999637023593467, + "eval_loss": 1.3302682638168335, + "eval_runtime": 46.2496, + "eval_samples_per_second": 9.427, + "eval_steps_per_second": 1.189, + "step": 6887 + }, + { + "epoch": 5.001814882032668, + "grad_norm": 0.5163813829421997, + "learning_rate": 0.0002, + "loss": 0.4889, + "step": 6890 + }, + { + "epoch": 5.00907441016334, + "grad_norm": 1.1496137380599976, + "learning_rate": 0.0002, + "loss": 0.3545, + "step": 6900 + }, + { + "epoch": 5.016333938294011, + "grad_norm": 1.0133885145187378, + "learning_rate": 0.0002, + "loss": 0.39, + "step": 6910 + }, + { + "epoch": 5.023593466424682, + "grad_norm": 0.9479621052742004, + "learning_rate": 0.0002, + "loss": 0.3693, + "step": 6920 + }, + { + "epoch": 5.030852994555354, + "grad_norm": 0.8587583303451538, + "learning_rate": 0.0002, + "loss": 0.4012, + "step": 6930 + }, + { + "epoch": 5.038112522686025, + "grad_norm": 1.3314697742462158, + "learning_rate": 0.0002, + "loss": 0.3428, + "step": 6940 + }, + { + "epoch": 5.045372050816697, + "grad_norm": 1.195448875427246, + "learning_rate": 0.0002, + "loss": 0.3909, + "step": 6950 + }, + { + "epoch": 5.052631578947368, + "grad_norm": 1.2482256889343262, + "learning_rate": 0.0002, + "loss": 0.3322, + "step": 6960 + }, + { + "epoch": 5.05989110707804, + "grad_norm": 1.2011528015136719, + "learning_rate": 0.0002, + "loss": 0.3893, + "step": 6970 + }, + { + "epoch": 5.067150635208711, + "grad_norm": 1.3997188806533813, + "learning_rate": 0.0002, + "loss": 0.3265, + "step": 6980 + }, + { + "epoch": 5.074410163339383, + "grad_norm": 1.2147513628005981, + "learning_rate": 0.0002, + "loss": 0.3716, + "step": 6990 + }, + { + "epoch": 5.081669691470054, + "grad_norm": 1.6030137538909912, + "learning_rate": 0.0002, + "loss": 0.4053, + "step": 7000 + }, + { + "epoch": 5.088929219600726, + "grad_norm": 0.9466970562934875, + "learning_rate": 0.0002, + "loss": 0.3665, + "step": 7010 + }, + { + "epoch": 5.096188747731397, + "grad_norm": 1.4593411684036255, + "learning_rate": 0.0002, + "loss": 0.3451, + "step": 7020 + }, + { + "epoch": 5.103448275862069, + "grad_norm": 1.2196033000946045, + "learning_rate": 0.0002, + "loss": 0.3843, + "step": 7030 + }, + { + "epoch": 5.1107078039927405, + "grad_norm": 1.1341328620910645, + "learning_rate": 0.0002, + "loss": 0.3896, + "step": 7040 + }, + { + "epoch": 5.117967332123412, + "grad_norm": 1.2248976230621338, + "learning_rate": 0.0002, + "loss": 0.3627, + "step": 7050 + }, + { + "epoch": 5.125226860254084, + "grad_norm": 1.1620593070983887, + "learning_rate": 0.0002, + "loss": 0.3784, + "step": 7060 + }, + { + "epoch": 5.132486388384755, + "grad_norm": 0.9300723671913147, + "learning_rate": 0.0002, + "loss": 0.3678, + "step": 7070 + }, + { + "epoch": 5.139745916515427, + "grad_norm": 1.2265169620513916, + "learning_rate": 0.0002, + "loss": 0.3756, + "step": 7080 + }, + { + "epoch": 5.147005444646098, + "grad_norm": 1.4430373907089233, + "learning_rate": 0.0002, + "loss": 0.3595, + "step": 7090 + }, + { + "epoch": 5.15426497277677, + "grad_norm": 1.0821576118469238, + "learning_rate": 0.0002, + "loss": 0.3788, + "step": 7100 + }, + { + "epoch": 5.161524500907441, + "grad_norm": 1.2574739456176758, + "learning_rate": 0.0002, + "loss": 0.383, + "step": 7110 + }, + { + "epoch": 5.168784029038113, + "grad_norm": 1.1806069612503052, + "learning_rate": 0.0002, + "loss": 0.3692, + "step": 7120 + }, + { + "epoch": 5.176043557168784, + "grad_norm": 0.9900956153869629, + "learning_rate": 0.0002, + "loss": 0.3978, + "step": 7130 + }, + { + "epoch": 5.183303085299456, + "grad_norm": 1.2414425611495972, + "learning_rate": 0.0002, + "loss": 0.4358, + "step": 7140 + }, + { + "epoch": 5.190562613430127, + "grad_norm": 0.8220699429512024, + "learning_rate": 0.0002, + "loss": 0.3485, + "step": 7150 + }, + { + "epoch": 5.197822141560798, + "grad_norm": 1.29408860206604, + "learning_rate": 0.0002, + "loss": 0.3517, + "step": 7160 + }, + { + "epoch": 5.20508166969147, + "grad_norm": 0.8510639071464539, + "learning_rate": 0.0002, + "loss": 0.3405, + "step": 7170 + }, + { + "epoch": 5.212341197822141, + "grad_norm": 1.3448902368545532, + "learning_rate": 0.0002, + "loss": 0.4233, + "step": 7180 + }, + { + "epoch": 5.219600725952813, + "grad_norm": 1.054451584815979, + "learning_rate": 0.0002, + "loss": 0.3808, + "step": 7190 + }, + { + "epoch": 5.226860254083484, + "grad_norm": 1.3752713203430176, + "learning_rate": 0.0002, + "loss": 0.368, + "step": 7200 + }, + { + "epoch": 5.234119782214156, + "grad_norm": 1.4848095178604126, + "learning_rate": 0.0002, + "loss": 0.3844, + "step": 7210 + }, + { + "epoch": 5.241379310344827, + "grad_norm": 1.428842544555664, + "learning_rate": 0.0002, + "loss": 0.4187, + "step": 7220 + }, + { + "epoch": 5.248638838475499, + "grad_norm": 1.1703591346740723, + "learning_rate": 0.0002, + "loss": 0.3778, + "step": 7230 + }, + { + "epoch": 5.2558983666061705, + "grad_norm": 1.2180451154708862, + "learning_rate": 0.0002, + "loss": 0.417, + "step": 7240 + }, + { + "epoch": 5.2631578947368425, + "grad_norm": 1.094045877456665, + "learning_rate": 0.0002, + "loss": 0.3656, + "step": 7250 + }, + { + "epoch": 5.270417422867514, + "grad_norm": 0.9545766115188599, + "learning_rate": 0.0002, + "loss": 0.4331, + "step": 7260 + }, + { + "epoch": 5.277676950998185, + "grad_norm": 0.8356652855873108, + "learning_rate": 0.0002, + "loss": 0.3642, + "step": 7270 + }, + { + "epoch": 5.284936479128857, + "grad_norm": 1.148160457611084, + "learning_rate": 0.0002, + "loss": 0.3576, + "step": 7280 + }, + { + "epoch": 5.292196007259528, + "grad_norm": 1.2009977102279663, + "learning_rate": 0.0002, + "loss": 0.4178, + "step": 7290 + }, + { + "epoch": 5.2994555353902, + "grad_norm": 1.3283873796463013, + "learning_rate": 0.0002, + "loss": 0.3977, + "step": 7300 + }, + { + "epoch": 5.306715063520871, + "grad_norm": 0.9850481748580933, + "learning_rate": 0.0002, + "loss": 0.3853, + "step": 7310 + }, + { + "epoch": 5.313974591651543, + "grad_norm": 1.367550015449524, + "learning_rate": 0.0002, + "loss": 0.3645, + "step": 7320 + }, + { + "epoch": 5.321234119782214, + "grad_norm": 0.8602936863899231, + "learning_rate": 0.0002, + "loss": 0.3898, + "step": 7330 + }, + { + "epoch": 5.328493647912886, + "grad_norm": 1.1130679845809937, + "learning_rate": 0.0002, + "loss": 0.4173, + "step": 7340 + }, + { + "epoch": 5.335753176043557, + "grad_norm": 1.3002253770828247, + "learning_rate": 0.0002, + "loss": 0.3642, + "step": 7350 + }, + { + "epoch": 5.343012704174229, + "grad_norm": 1.6235289573669434, + "learning_rate": 0.0002, + "loss": 0.4138, + "step": 7360 + }, + { + "epoch": 5.3502722323049, + "grad_norm": 1.156379222869873, + "learning_rate": 0.0002, + "loss": 0.4779, + "step": 7370 + }, + { + "epoch": 5.357531760435572, + "grad_norm": 1.0569308996200562, + "learning_rate": 0.0002, + "loss": 0.3222, + "step": 7380 + }, + { + "epoch": 5.364791288566243, + "grad_norm": 1.6674021482467651, + "learning_rate": 0.0002, + "loss": 0.3573, + "step": 7390 + }, + { + "epoch": 5.372050816696914, + "grad_norm": 1.2962018251419067, + "learning_rate": 0.0002, + "loss": 0.4325, + "step": 7400 + }, + { + "epoch": 5.379310344827586, + "grad_norm": 1.1904195547103882, + "learning_rate": 0.0002, + "loss": 0.3809, + "step": 7410 + }, + { + "epoch": 5.386569872958257, + "grad_norm": 1.316245675086975, + "learning_rate": 0.0002, + "loss": 0.3728, + "step": 7420 + }, + { + "epoch": 5.393829401088929, + "grad_norm": 1.127570390701294, + "learning_rate": 0.0002, + "loss": 0.4096, + "step": 7430 + }, + { + "epoch": 5.4010889292196005, + "grad_norm": 1.3895777463912964, + "learning_rate": 0.0002, + "loss": 0.3933, + "step": 7440 + }, + { + "epoch": 5.4083484573502725, + "grad_norm": 1.626830816268921, + "learning_rate": 0.0002, + "loss": 0.4085, + "step": 7450 + }, + { + "epoch": 5.415607985480944, + "grad_norm": 1.3703926801681519, + "learning_rate": 0.0002, + "loss": 0.4186, + "step": 7460 + }, + { + "epoch": 5.422867513611616, + "grad_norm": 1.3854840993881226, + "learning_rate": 0.0002, + "loss": 0.3517, + "step": 7470 + }, + { + "epoch": 5.430127041742287, + "grad_norm": 1.107065200805664, + "learning_rate": 0.0002, + "loss": 0.3714, + "step": 7480 + }, + { + "epoch": 5.437386569872959, + "grad_norm": 0.7843456268310547, + "learning_rate": 0.0002, + "loss": 0.3855, + "step": 7490 + }, + { + "epoch": 5.44464609800363, + "grad_norm": 1.6692372560501099, + "learning_rate": 0.0002, + "loss": 0.4159, + "step": 7500 + }, + { + "epoch": 5.451905626134302, + "grad_norm": 1.2583858966827393, + "learning_rate": 0.0002, + "loss": 0.4185, + "step": 7510 + }, + { + "epoch": 5.459165154264973, + "grad_norm": 1.6827000379562378, + "learning_rate": 0.0002, + "loss": 0.4401, + "step": 7520 + }, + { + "epoch": 5.466424682395644, + "grad_norm": 1.6680560111999512, + "learning_rate": 0.0002, + "loss": 0.397, + "step": 7530 + }, + { + "epoch": 5.473684210526316, + "grad_norm": 1.3696072101593018, + "learning_rate": 0.0002, + "loss": 0.4193, + "step": 7540 + }, + { + "epoch": 5.480943738656987, + "grad_norm": 1.4523496627807617, + "learning_rate": 0.0002, + "loss": 0.4244, + "step": 7550 + }, + { + "epoch": 5.488203266787659, + "grad_norm": 1.3432692289352417, + "learning_rate": 0.0002, + "loss": 0.3609, + "step": 7560 + }, + { + "epoch": 5.49546279491833, + "grad_norm": 1.363818645477295, + "learning_rate": 0.0002, + "loss": 0.3675, + "step": 7570 + }, + { + "epoch": 5.502722323049002, + "grad_norm": 1.0176721811294556, + "learning_rate": 0.0002, + "loss": 0.3726, + "step": 7580 + }, + { + "epoch": 5.509981851179673, + "grad_norm": 1.1625547409057617, + "learning_rate": 0.0002, + "loss": 0.3751, + "step": 7590 + }, + { + "epoch": 5.517241379310345, + "grad_norm": 1.2480388879776, + "learning_rate": 0.0002, + "loss": 0.433, + "step": 7600 + }, + { + "epoch": 5.524500907441016, + "grad_norm": 1.341509222984314, + "learning_rate": 0.0002, + "loss": 0.4511, + "step": 7610 + }, + { + "epoch": 5.531760435571687, + "grad_norm": 1.7048436403274536, + "learning_rate": 0.0002, + "loss": 0.4642, + "step": 7620 + }, + { + "epoch": 5.539019963702359, + "grad_norm": 1.1435480117797852, + "learning_rate": 0.0002, + "loss": 0.4509, + "step": 7630 + }, + { + "epoch": 5.5462794918330305, + "grad_norm": 1.2381842136383057, + "learning_rate": 0.0002, + "loss": 0.4528, + "step": 7640 + }, + { + "epoch": 5.5535390199637025, + "grad_norm": 1.50786292552948, + "learning_rate": 0.0002, + "loss": 0.4496, + "step": 7650 + }, + { + "epoch": 5.560798548094374, + "grad_norm": 1.2263519763946533, + "learning_rate": 0.0002, + "loss": 0.4242, + "step": 7660 + }, + { + "epoch": 5.568058076225046, + "grad_norm": 1.2864696979522705, + "learning_rate": 0.0002, + "loss": 0.418, + "step": 7670 + }, + { + "epoch": 5.575317604355717, + "grad_norm": 1.4443191289901733, + "learning_rate": 0.0002, + "loss": 0.3832, + "step": 7680 + }, + { + "epoch": 5.582577132486389, + "grad_norm": 1.3360971212387085, + "learning_rate": 0.0002, + "loss": 0.3964, + "step": 7690 + }, + { + "epoch": 5.58983666061706, + "grad_norm": 1.391828179359436, + "learning_rate": 0.0002, + "loss": 0.4639, + "step": 7700 + }, + { + "epoch": 5.597096188747732, + "grad_norm": 1.3699384927749634, + "learning_rate": 0.0002, + "loss": 0.4722, + "step": 7710 + }, + { + "epoch": 5.604355716878403, + "grad_norm": 1.3778468370437622, + "learning_rate": 0.0002, + "loss": 0.4302, + "step": 7720 + }, + { + "epoch": 5.611615245009075, + "grad_norm": 1.1009501218795776, + "learning_rate": 0.0002, + "loss": 0.4179, + "step": 7730 + }, + { + "epoch": 5.618874773139746, + "grad_norm": 1.0410021543502808, + "learning_rate": 0.0002, + "loss": 0.4104, + "step": 7740 + }, + { + "epoch": 5.626134301270417, + "grad_norm": 1.1012226343154907, + "learning_rate": 0.0002, + "loss": 0.4489, + "step": 7750 + }, + { + "epoch": 5.633393829401089, + "grad_norm": 1.3246384859085083, + "learning_rate": 0.0002, + "loss": 0.4544, + "step": 7760 + }, + { + "epoch": 5.64065335753176, + "grad_norm": 1.4301716089248657, + "learning_rate": 0.0002, + "loss": 0.4381, + "step": 7770 + }, + { + "epoch": 5.647912885662432, + "grad_norm": 1.1368978023529053, + "learning_rate": 0.0002, + "loss": 0.4297, + "step": 7780 + }, + { + "epoch": 5.655172413793103, + "grad_norm": 1.3493064641952515, + "learning_rate": 0.0002, + "loss": 0.4063, + "step": 7790 + }, + { + "epoch": 5.662431941923775, + "grad_norm": 1.3328721523284912, + "learning_rate": 0.0002, + "loss": 0.4562, + "step": 7800 + }, + { + "epoch": 5.669691470054446, + "grad_norm": 1.3235671520233154, + "learning_rate": 0.0002, + "loss": 0.4075, + "step": 7810 + }, + { + "epoch": 5.676950998185118, + "grad_norm": 1.1961841583251953, + "learning_rate": 0.0002, + "loss": 0.4589, + "step": 7820 + }, + { + "epoch": 5.684210526315789, + "grad_norm": 1.4189636707305908, + "learning_rate": 0.0002, + "loss": 0.4503, + "step": 7830 + }, + { + "epoch": 5.691470054446461, + "grad_norm": 1.3551312685012817, + "learning_rate": 0.0002, + "loss": 0.4452, + "step": 7840 + }, + { + "epoch": 5.6987295825771325, + "grad_norm": 1.449987769126892, + "learning_rate": 0.0002, + "loss": 0.4268, + "step": 7850 + }, + { + "epoch": 5.7059891107078045, + "grad_norm": 1.1225156784057617, + "learning_rate": 0.0002, + "loss": 0.4141, + "step": 7860 + }, + { + "epoch": 5.713248638838476, + "grad_norm": 1.4734594821929932, + "learning_rate": 0.0002, + "loss": 0.41, + "step": 7870 + }, + { + "epoch": 5.720508166969147, + "grad_norm": 1.3793359994888306, + "learning_rate": 0.0002, + "loss": 0.4013, + "step": 7880 + }, + { + "epoch": 5.727767695099819, + "grad_norm": 1.2431834936141968, + "learning_rate": 0.0002, + "loss": 0.4065, + "step": 7890 + }, + { + "epoch": 5.73502722323049, + "grad_norm": 1.1158313751220703, + "learning_rate": 0.0002, + "loss": 0.4595, + "step": 7900 + }, + { + "epoch": 5.742286751361162, + "grad_norm": 1.212248682975769, + "learning_rate": 0.0002, + "loss": 0.4342, + "step": 7910 + }, + { + "epoch": 5.749546279491833, + "grad_norm": 1.5259995460510254, + "learning_rate": 0.0002, + "loss": 0.4611, + "step": 7920 + }, + { + "epoch": 5.756805807622505, + "grad_norm": 1.3909121751785278, + "learning_rate": 0.0002, + "loss": 0.4483, + "step": 7930 + }, + { + "epoch": 5.764065335753176, + "grad_norm": 1.2511249780654907, + "learning_rate": 0.0002, + "loss": 0.4325, + "step": 7940 + }, + { + "epoch": 5.771324863883848, + "grad_norm": 1.2511906623840332, + "learning_rate": 0.0002, + "loss": 0.4048, + "step": 7950 + }, + { + "epoch": 5.778584392014519, + "grad_norm": 1.1489921808242798, + "learning_rate": 0.0002, + "loss": 0.3715, + "step": 7960 + }, + { + "epoch": 5.78584392014519, + "grad_norm": 1.028943419456482, + "learning_rate": 0.0002, + "loss": 0.4196, + "step": 7970 + }, + { + "epoch": 5.793103448275862, + "grad_norm": 1.0820423364639282, + "learning_rate": 0.0002, + "loss": 0.4334, + "step": 7980 + }, + { + "epoch": 5.800362976406533, + "grad_norm": 1.296520471572876, + "learning_rate": 0.0002, + "loss": 0.3917, + "step": 7990 + }, + { + "epoch": 5.807622504537205, + "grad_norm": 1.3597749471664429, + "learning_rate": 0.0002, + "loss": 0.4509, + "step": 8000 + }, + { + "epoch": 5.814882032667876, + "grad_norm": 0.8741790652275085, + "learning_rate": 0.0002, + "loss": 0.4535, + "step": 8010 + }, + { + "epoch": 5.822141560798548, + "grad_norm": 1.1471822261810303, + "learning_rate": 0.0002, + "loss": 0.4239, + "step": 8020 + }, + { + "epoch": 5.829401088929219, + "grad_norm": 1.2997334003448486, + "learning_rate": 0.0002, + "loss": 0.5042, + "step": 8030 + }, + { + "epoch": 5.836660617059891, + "grad_norm": 1.1027175188064575, + "learning_rate": 0.0002, + "loss": 0.4758, + "step": 8040 + }, + { + "epoch": 5.8439201451905625, + "grad_norm": 1.2695307731628418, + "learning_rate": 0.0002, + "loss": 0.4192, + "step": 8050 + }, + { + "epoch": 5.8511796733212345, + "grad_norm": 1.5275461673736572, + "learning_rate": 0.0002, + "loss": 0.5173, + "step": 8060 + }, + { + "epoch": 5.8584392014519056, + "grad_norm": 1.3059501647949219, + "learning_rate": 0.0002, + "loss": 0.5012, + "step": 8070 + }, + { + "epoch": 5.8656987295825775, + "grad_norm": 1.57442045211792, + "learning_rate": 0.0002, + "loss": 0.4425, + "step": 8080 + }, + { + "epoch": 5.872958257713249, + "grad_norm": 1.119564414024353, + "learning_rate": 0.0002, + "loss": 0.4261, + "step": 8090 + }, + { + "epoch": 5.88021778584392, + "grad_norm": 1.6517373323440552, + "learning_rate": 0.0002, + "loss": 0.465, + "step": 8100 + }, + { + "epoch": 5.887477313974592, + "grad_norm": 1.4093554019927979, + "learning_rate": 0.0002, + "loss": 0.4406, + "step": 8110 + }, + { + "epoch": 5.894736842105263, + "grad_norm": 1.278843641281128, + "learning_rate": 0.0002, + "loss": 0.4433, + "step": 8120 + }, + { + "epoch": 5.901996370235935, + "grad_norm": 1.2042944431304932, + "learning_rate": 0.0002, + "loss": 0.4007, + "step": 8130 + }, + { + "epoch": 5.909255898366606, + "grad_norm": 1.1788326501846313, + "learning_rate": 0.0002, + "loss": 0.3972, + "step": 8140 + }, + { + "epoch": 5.916515426497278, + "grad_norm": 1.4364569187164307, + "learning_rate": 0.0002, + "loss": 0.4506, + "step": 8150 + }, + { + "epoch": 5.923774954627949, + "grad_norm": 1.1704283952713013, + "learning_rate": 0.0002, + "loss": 0.4651, + "step": 8160 + }, + { + "epoch": 5.931034482758621, + "grad_norm": 1.040814995765686, + "learning_rate": 0.0002, + "loss": 0.3972, + "step": 8170 + }, + { + "epoch": 5.938294010889292, + "grad_norm": 1.1367416381835938, + "learning_rate": 0.0002, + "loss": 0.4038, + "step": 8180 + }, + { + "epoch": 5.945553539019964, + "grad_norm": 1.3401511907577515, + "learning_rate": 0.0002, + "loss": 0.4387, + "step": 8190 + }, + { + "epoch": 5.952813067150635, + "grad_norm": 1.1154041290283203, + "learning_rate": 0.0002, + "loss": 0.4396, + "step": 8200 + }, + { + "epoch": 5.960072595281307, + "grad_norm": 1.426089882850647, + "learning_rate": 0.0002, + "loss": 0.4744, + "step": 8210 + }, + { + "epoch": 5.967332123411978, + "grad_norm": 1.3170222043991089, + "learning_rate": 0.0002, + "loss": 0.4105, + "step": 8220 + }, + { + "epoch": 5.974591651542649, + "grad_norm": 1.1960029602050781, + "learning_rate": 0.0002, + "loss": 0.4137, + "step": 8230 + }, + { + "epoch": 5.981851179673321, + "grad_norm": 1.0843931436538696, + "learning_rate": 0.0002, + "loss": 0.423, + "step": 8240 + }, + { + "epoch": 5.9891107078039925, + "grad_norm": 1.050421118736267, + "learning_rate": 0.0002, + "loss": 0.459, + "step": 8250 + }, + { + "epoch": 5.9963702359346644, + "grad_norm": 1.0183138847351074, + "learning_rate": 0.0002, + "loss": 0.3993, + "step": 8260 + }, + { + "epoch": 6.0, + "eval_loss": 1.4677470922470093, + "eval_runtime": 46.2504, + "eval_samples_per_second": 9.427, + "eval_steps_per_second": 1.189, + "step": 8265 + }, + { + "epoch": 6.0036297640653356, + "grad_norm": 1.1702998876571655, + "learning_rate": 0.0002, + "loss": 0.3947, + "step": 8270 + }, + { + "epoch": 6.0108892921960075, + "grad_norm": 1.5389727354049683, + "learning_rate": 0.0002, + "loss": 0.2854, + "step": 8280 + }, + { + "epoch": 6.018148820326679, + "grad_norm": 1.502568244934082, + "learning_rate": 0.0002, + "loss": 0.2603, + "step": 8290 + }, + { + "epoch": 6.025408348457351, + "grad_norm": 1.3846043348312378, + "learning_rate": 0.0002, + "loss": 0.3329, + "step": 8300 + }, + { + "epoch": 6.032667876588022, + "grad_norm": 1.173553228378296, + "learning_rate": 0.0002, + "loss": 0.2651, + "step": 8310 + }, + { + "epoch": 6.039927404718694, + "grad_norm": 1.5325932502746582, + "learning_rate": 0.0002, + "loss": 0.3142, + "step": 8320 + }, + { + "epoch": 6.047186932849365, + "grad_norm": 1.303783655166626, + "learning_rate": 0.0002, + "loss": 0.2511, + "step": 8330 + }, + { + "epoch": 6.054446460980036, + "grad_norm": 0.9408994913101196, + "learning_rate": 0.0002, + "loss": 0.2352, + "step": 8340 + }, + { + "epoch": 6.061705989110708, + "grad_norm": 1.5430388450622559, + "learning_rate": 0.0002, + "loss": 0.2548, + "step": 8350 + }, + { + "epoch": 6.068965517241379, + "grad_norm": 0.8765342235565186, + "learning_rate": 0.0002, + "loss": 0.2682, + "step": 8360 + }, + { + "epoch": 6.076225045372051, + "grad_norm": 1.2363157272338867, + "learning_rate": 0.0002, + "loss": 0.2614, + "step": 8370 + }, + { + "epoch": 6.083484573502722, + "grad_norm": 1.21284818649292, + "learning_rate": 0.0002, + "loss": 0.294, + "step": 8380 + }, + { + "epoch": 6.090744101633394, + "grad_norm": 1.3261712789535522, + "learning_rate": 0.0002, + "loss": 0.2498, + "step": 8390 + }, + { + "epoch": 6.098003629764065, + "grad_norm": 1.077317714691162, + "learning_rate": 0.0002, + "loss": 0.2649, + "step": 8400 + }, + { + "epoch": 6.105263157894737, + "grad_norm": 0.9873808026313782, + "learning_rate": 0.0002, + "loss": 0.269, + "step": 8410 + }, + { + "epoch": 6.112522686025408, + "grad_norm": 1.032258152961731, + "learning_rate": 0.0002, + "loss": 0.2736, + "step": 8420 + }, + { + "epoch": 6.11978221415608, + "grad_norm": 1.1014811992645264, + "learning_rate": 0.0002, + "loss": 0.2854, + "step": 8430 + }, + { + "epoch": 6.127041742286751, + "grad_norm": 1.4264203310012817, + "learning_rate": 0.0002, + "loss": 0.2924, + "step": 8440 + }, + { + "epoch": 6.1343012704174225, + "grad_norm": 1.4086531400680542, + "learning_rate": 0.0002, + "loss": 0.3388, + "step": 8450 + }, + { + "epoch": 6.1415607985480944, + "grad_norm": 1.3842453956604004, + "learning_rate": 0.0002, + "loss": 0.2786, + "step": 8460 + }, + { + "epoch": 6.1488203266787655, + "grad_norm": 1.4356757402420044, + "learning_rate": 0.0002, + "loss": 0.3201, + "step": 8470 + }, + { + "epoch": 6.1560798548094375, + "grad_norm": 1.193315029144287, + "learning_rate": 0.0002, + "loss": 0.2908, + "step": 8480 + }, + { + "epoch": 6.163339382940109, + "grad_norm": 1.0623924732208252, + "learning_rate": 0.0002, + "loss": 0.342, + "step": 8490 + }, + { + "epoch": 6.170598911070781, + "grad_norm": 1.5484434366226196, + "learning_rate": 0.0002, + "loss": 0.3257, + "step": 8500 + }, + { + "epoch": 6.177858439201452, + "grad_norm": 1.3520029783248901, + "learning_rate": 0.0002, + "loss": 0.2861, + "step": 8510 + }, + { + "epoch": 6.185117967332124, + "grad_norm": 1.2773103713989258, + "learning_rate": 0.0002, + "loss": 0.3242, + "step": 8520 + }, + { + "epoch": 6.192377495462795, + "grad_norm": 1.4675105810165405, + "learning_rate": 0.0002, + "loss": 0.3108, + "step": 8530 + }, + { + "epoch": 6.199637023593467, + "grad_norm": 1.2118732929229736, + "learning_rate": 0.0002, + "loss": 0.3044, + "step": 8540 + }, + { + "epoch": 6.206896551724138, + "grad_norm": 1.264024257659912, + "learning_rate": 0.0002, + "loss": 0.2726, + "step": 8550 + }, + { + "epoch": 6.21415607985481, + "grad_norm": 1.406931757926941, + "learning_rate": 0.0002, + "loss": 0.306, + "step": 8560 + }, + { + "epoch": 6.221415607985481, + "grad_norm": 1.385459542274475, + "learning_rate": 0.0002, + "loss": 0.2904, + "step": 8570 + }, + { + "epoch": 6.228675136116152, + "grad_norm": 1.9336168766021729, + "learning_rate": 0.0002, + "loss": 0.3413, + "step": 8580 + }, + { + "epoch": 6.235934664246824, + "grad_norm": 0.9880136847496033, + "learning_rate": 0.0002, + "loss": 0.2769, + "step": 8590 + }, + { + "epoch": 6.243194192377495, + "grad_norm": 1.3870339393615723, + "learning_rate": 0.0002, + "loss": 0.3035, + "step": 8600 + }, + { + "epoch": 6.250453720508167, + "grad_norm": 1.2303647994995117, + "learning_rate": 0.0002, + "loss": 0.286, + "step": 8610 + }, + { + "epoch": 6.257713248638838, + "grad_norm": 1.5406211614608765, + "learning_rate": 0.0002, + "loss": 0.3113, + "step": 8620 + }, + { + "epoch": 6.26497277676951, + "grad_norm": 1.2436790466308594, + "learning_rate": 0.0002, + "loss": 0.292, + "step": 8630 + }, + { + "epoch": 6.272232304900181, + "grad_norm": 0.8844212293624878, + "learning_rate": 0.0002, + "loss": 0.3102, + "step": 8640 + }, + { + "epoch": 6.279491833030853, + "grad_norm": 1.2846336364746094, + "learning_rate": 0.0002, + "loss": 0.3373, + "step": 8650 + }, + { + "epoch": 6.286751361161524, + "grad_norm": 1.593814730644226, + "learning_rate": 0.0002, + "loss": 0.3535, + "step": 8660 + }, + { + "epoch": 6.2940108892921955, + "grad_norm": 1.2277469635009766, + "learning_rate": 0.0002, + "loss": 0.3413, + "step": 8670 + }, + { + "epoch": 6.3012704174228675, + "grad_norm": 1.2574384212493896, + "learning_rate": 0.0002, + "loss": 0.2958, + "step": 8680 + }, + { + "epoch": 6.308529945553539, + "grad_norm": 1.335150957107544, + "learning_rate": 0.0002, + "loss": 0.3251, + "step": 8690 + }, + { + "epoch": 6.315789473684211, + "grad_norm": 1.3140437602996826, + "learning_rate": 0.0002, + "loss": 0.3009, + "step": 8700 + }, + { + "epoch": 6.323049001814882, + "grad_norm": 1.1689209938049316, + "learning_rate": 0.0002, + "loss": 0.2783, + "step": 8710 + }, + { + "epoch": 6.330308529945554, + "grad_norm": 1.6448503732681274, + "learning_rate": 0.0002, + "loss": 0.3476, + "step": 8720 + }, + { + "epoch": 6.337568058076225, + "grad_norm": 0.9944145679473877, + "learning_rate": 0.0002, + "loss": 0.2934, + "step": 8730 + }, + { + "epoch": 6.344827586206897, + "grad_norm": 1.1775634288787842, + "learning_rate": 0.0002, + "loss": 0.3315, + "step": 8740 + }, + { + "epoch": 6.352087114337568, + "grad_norm": 1.8438639640808105, + "learning_rate": 0.0002, + "loss": 0.3514, + "step": 8750 + }, + { + "epoch": 6.35934664246824, + "grad_norm": 1.062495470046997, + "learning_rate": 0.0002, + "loss": 0.303, + "step": 8760 + }, + { + "epoch": 6.366606170598911, + "grad_norm": 1.3224315643310547, + "learning_rate": 0.0002, + "loss": 0.2737, + "step": 8770 + }, + { + "epoch": 6.373865698729583, + "grad_norm": 1.399844765663147, + "learning_rate": 0.0002, + "loss": 0.3445, + "step": 8780 + }, + { + "epoch": 6.381125226860254, + "grad_norm": 1.0409915447235107, + "learning_rate": 0.0002, + "loss": 0.3277, + "step": 8790 + }, + { + "epoch": 6.388384754990925, + "grad_norm": 1.5657726526260376, + "learning_rate": 0.0002, + "loss": 0.3218, + "step": 8800 + }, + { + "epoch": 6.395644283121597, + "grad_norm": 1.4098644256591797, + "learning_rate": 0.0002, + "loss": 0.3031, + "step": 8810 + }, + { + "epoch": 6.402903811252268, + "grad_norm": 1.5154732465744019, + "learning_rate": 0.0002, + "loss": 0.3133, + "step": 8820 + }, + { + "epoch": 6.41016333938294, + "grad_norm": 1.1139698028564453, + "learning_rate": 0.0002, + "loss": 0.3111, + "step": 8830 + }, + { + "epoch": 6.417422867513611, + "grad_norm": 1.4149729013442993, + "learning_rate": 0.0002, + "loss": 0.3553, + "step": 8840 + }, + { + "epoch": 6.424682395644283, + "grad_norm": 1.2632299661636353, + "learning_rate": 0.0002, + "loss": 0.287, + "step": 8850 + }, + { + "epoch": 6.431941923774954, + "grad_norm": 1.6636109352111816, + "learning_rate": 0.0002, + "loss": 0.3198, + "step": 8860 + }, + { + "epoch": 6.439201451905626, + "grad_norm": 1.4149386882781982, + "learning_rate": 0.0002, + "loss": 0.3749, + "step": 8870 + }, + { + "epoch": 6.4464609800362975, + "grad_norm": 1.1396206617355347, + "learning_rate": 0.0002, + "loss": 0.3504, + "step": 8880 + }, + { + "epoch": 6.4537205081669695, + "grad_norm": 1.2188775539398193, + "learning_rate": 0.0002, + "loss": 0.3328, + "step": 8890 + }, + { + "epoch": 6.460980036297641, + "grad_norm": 0.9740369319915771, + "learning_rate": 0.0002, + "loss": 0.3427, + "step": 8900 + }, + { + "epoch": 6.468239564428313, + "grad_norm": 1.228569746017456, + "learning_rate": 0.0002, + "loss": 0.3223, + "step": 8910 + }, + { + "epoch": 6.475499092558984, + "grad_norm": 1.5019789934158325, + "learning_rate": 0.0002, + "loss": 0.3151, + "step": 8920 + }, + { + "epoch": 6.482758620689655, + "grad_norm": 1.3320101499557495, + "learning_rate": 0.0002, + "loss": 0.2916, + "step": 8930 + }, + { + "epoch": 6.490018148820327, + "grad_norm": 1.5551502704620361, + "learning_rate": 0.0002, + "loss": 0.298, + "step": 8940 + }, + { + "epoch": 6.497277676950998, + "grad_norm": 1.470131754875183, + "learning_rate": 0.0002, + "loss": 0.3238, + "step": 8950 + }, + { + "epoch": 6.50453720508167, + "grad_norm": 1.1803025007247925, + "learning_rate": 0.0002, + "loss": 0.2808, + "step": 8960 + }, + { + "epoch": 6.511796733212341, + "grad_norm": 1.3505640029907227, + "learning_rate": 0.0002, + "loss": 0.3025, + "step": 8970 + }, + { + "epoch": 6.519056261343013, + "grad_norm": 1.13093900680542, + "learning_rate": 0.0002, + "loss": 0.3124, + "step": 8980 + }, + { + "epoch": 6.526315789473684, + "grad_norm": 1.347386121749878, + "learning_rate": 0.0002, + "loss": 0.3454, + "step": 8990 + }, + { + "epoch": 6.533575317604356, + "grad_norm": 1.7879165410995483, + "learning_rate": 0.0002, + "loss": 0.3532, + "step": 9000 + }, + { + "epoch": 6.540834845735027, + "grad_norm": 1.2168169021606445, + "learning_rate": 0.0002, + "loss": 0.3382, + "step": 9010 + }, + { + "epoch": 6.548094373865698, + "grad_norm": 1.1758877038955688, + "learning_rate": 0.0002, + "loss": 0.3413, + "step": 9020 + }, + { + "epoch": 6.55535390199637, + "grad_norm": 1.7366445064544678, + "learning_rate": 0.0002, + "loss": 0.2806, + "step": 9030 + }, + { + "epoch": 6.562613430127042, + "grad_norm": 1.5919222831726074, + "learning_rate": 0.0002, + "loss": 0.3437, + "step": 9040 + }, + { + "epoch": 6.569872958257713, + "grad_norm": 1.336863398551941, + "learning_rate": 0.0002, + "loss": 0.3261, + "step": 9050 + }, + { + "epoch": 6.577132486388384, + "grad_norm": 1.1769421100616455, + "learning_rate": 0.0002, + "loss": 0.3103, + "step": 9060 + }, + { + "epoch": 6.584392014519056, + "grad_norm": 1.0048751831054688, + "learning_rate": 0.0002, + "loss": 0.3295, + "step": 9070 + }, + { + "epoch": 6.5916515426497275, + "grad_norm": 1.5268515348434448, + "learning_rate": 0.0002, + "loss": 0.3156, + "step": 9080 + }, + { + "epoch": 6.5989110707803995, + "grad_norm": 1.434610366821289, + "learning_rate": 0.0002, + "loss": 0.3752, + "step": 9090 + }, + { + "epoch": 6.606170598911071, + "grad_norm": 1.1151410341262817, + "learning_rate": 0.0002, + "loss": 0.3375, + "step": 9100 + }, + { + "epoch": 6.613430127041743, + "grad_norm": 1.6690642833709717, + "learning_rate": 0.0002, + "loss": 0.363, + "step": 9110 + }, + { + "epoch": 6.620689655172414, + "grad_norm": 1.4495552778244019, + "learning_rate": 0.0002, + "loss": 0.3703, + "step": 9120 + }, + { + "epoch": 6.627949183303086, + "grad_norm": 1.377621054649353, + "learning_rate": 0.0002, + "loss": 0.3648, + "step": 9130 + }, + { + "epoch": 6.635208711433757, + "grad_norm": 1.5459434986114502, + "learning_rate": 0.0002, + "loss": 0.3766, + "step": 9140 + }, + { + "epoch": 6.642468239564428, + "grad_norm": 1.0920850038528442, + "learning_rate": 0.0002, + "loss": 0.3196, + "step": 9150 + }, + { + "epoch": 6.6497277676951, + "grad_norm": 1.6708381175994873, + "learning_rate": 0.0002, + "loss": 0.3505, + "step": 9160 + }, + { + "epoch": 6.656987295825771, + "grad_norm": 1.747514009475708, + "learning_rate": 0.0002, + "loss": 0.368, + "step": 9170 + }, + { + "epoch": 6.664246823956443, + "grad_norm": 1.133466362953186, + "learning_rate": 0.0002, + "loss": 0.3099, + "step": 9180 + }, + { + "epoch": 6.671506352087114, + "grad_norm": 1.394358515739441, + "learning_rate": 0.0002, + "loss": 0.3175, + "step": 9190 + }, + { + "epoch": 6.678765880217786, + "grad_norm": 0.9258374571800232, + "learning_rate": 0.0002, + "loss": 0.2981, + "step": 9200 + }, + { + "epoch": 6.686025408348457, + "grad_norm": 1.3750739097595215, + "learning_rate": 0.0002, + "loss": 0.3723, + "step": 9210 + }, + { + "epoch": 6.693284936479129, + "grad_norm": 0.8604967594146729, + "learning_rate": 0.0002, + "loss": 0.3441, + "step": 9220 + }, + { + "epoch": 6.7005444646098, + "grad_norm": 1.6074559688568115, + "learning_rate": 0.0002, + "loss": 0.3775, + "step": 9230 + }, + { + "epoch": 6.707803992740472, + "grad_norm": 0.9576877355575562, + "learning_rate": 0.0002, + "loss": 0.3139, + "step": 9240 + }, + { + "epoch": 6.715063520871143, + "grad_norm": 1.7193048000335693, + "learning_rate": 0.0002, + "loss": 0.3633, + "step": 9250 + }, + { + "epoch": 6.722323049001815, + "grad_norm": 1.3131844997406006, + "learning_rate": 0.0002, + "loss": 0.3139, + "step": 9260 + }, + { + "epoch": 6.729582577132486, + "grad_norm": 1.2978184223175049, + "learning_rate": 0.0002, + "loss": 0.3121, + "step": 9270 + }, + { + "epoch": 6.7368421052631575, + "grad_norm": 1.4792617559432983, + "learning_rate": 0.0002, + "loss": 0.3534, + "step": 9280 + }, + { + "epoch": 6.7441016333938295, + "grad_norm": 1.1265567541122437, + "learning_rate": 0.0002, + "loss": 0.3429, + "step": 9290 + }, + { + "epoch": 6.751361161524501, + "grad_norm": 1.8553377389907837, + "learning_rate": 0.0002, + "loss": 0.3526, + "step": 9300 + }, + { + "epoch": 6.758620689655173, + "grad_norm": 1.3602519035339355, + "learning_rate": 0.0002, + "loss": 0.3666, + "step": 9310 + }, + { + "epoch": 6.765880217785844, + "grad_norm": 1.2874794006347656, + "learning_rate": 0.0002, + "loss": 0.2922, + "step": 9320 + }, + { + "epoch": 6.773139745916516, + "grad_norm": 1.4834712743759155, + "learning_rate": 0.0002, + "loss": 0.3816, + "step": 9330 + }, + { + "epoch": 6.780399274047187, + "grad_norm": 2.0824034214019775, + "learning_rate": 0.0002, + "loss": 0.3557, + "step": 9340 + }, + { + "epoch": 6.787658802177859, + "grad_norm": 1.2267698049545288, + "learning_rate": 0.0002, + "loss": 0.3174, + "step": 9350 + }, + { + "epoch": 6.79491833030853, + "grad_norm": 1.4485498666763306, + "learning_rate": 0.0002, + "loss": 0.3665, + "step": 9360 + }, + { + "epoch": 6.802177858439201, + "grad_norm": 1.3199396133422852, + "learning_rate": 0.0002, + "loss": 0.3676, + "step": 9370 + }, + { + "epoch": 6.809437386569873, + "grad_norm": 1.2552456855773926, + "learning_rate": 0.0002, + "loss": 0.298, + "step": 9380 + }, + { + "epoch": 6.816696914700545, + "grad_norm": 1.3895127773284912, + "learning_rate": 0.0002, + "loss": 0.3152, + "step": 9390 + }, + { + "epoch": 6.823956442831216, + "grad_norm": 1.7637823820114136, + "learning_rate": 0.0002, + "loss": 0.3375, + "step": 9400 + }, + { + "epoch": 6.831215970961887, + "grad_norm": 1.6004475355148315, + "learning_rate": 0.0002, + "loss": 0.3234, + "step": 9410 + }, + { + "epoch": 6.838475499092559, + "grad_norm": 1.4133695363998413, + "learning_rate": 0.0002, + "loss": 0.3364, + "step": 9420 + }, + { + "epoch": 6.84573502722323, + "grad_norm": 1.1583502292633057, + "learning_rate": 0.0002, + "loss": 0.3656, + "step": 9430 + }, + { + "epoch": 6.852994555353902, + "grad_norm": 1.3769075870513916, + "learning_rate": 0.0002, + "loss": 0.3499, + "step": 9440 + }, + { + "epoch": 6.860254083484573, + "grad_norm": 1.1831218004226685, + "learning_rate": 0.0002, + "loss": 0.3333, + "step": 9450 + }, + { + "epoch": 6.867513611615245, + "grad_norm": 1.6092621088027954, + "learning_rate": 0.0002, + "loss": 0.3501, + "step": 9460 + }, + { + "epoch": 6.874773139745916, + "grad_norm": 1.3850210905075073, + "learning_rate": 0.0002, + "loss": 0.3933, + "step": 9470 + }, + { + "epoch": 6.882032667876588, + "grad_norm": 1.4119619131088257, + "learning_rate": 0.0002, + "loss": 0.3868, + "step": 9480 + }, + { + "epoch": 6.8892921960072595, + "grad_norm": 1.3494242429733276, + "learning_rate": 0.0002, + "loss": 0.3939, + "step": 9490 + }, + { + "epoch": 6.896551724137931, + "grad_norm": 1.3130041360855103, + "learning_rate": 0.0002, + "loss": 0.3217, + "step": 9500 + }, + { + "epoch": 6.903811252268603, + "grad_norm": 1.169256329536438, + "learning_rate": 0.0002, + "loss": 0.3738, + "step": 9510 + }, + { + "epoch": 6.911070780399274, + "grad_norm": 1.7475035190582275, + "learning_rate": 0.0002, + "loss": 0.408, + "step": 9520 + }, + { + "epoch": 6.918330308529946, + "grad_norm": 1.440434217453003, + "learning_rate": 0.0002, + "loss": 0.3407, + "step": 9530 + }, + { + "epoch": 6.925589836660617, + "grad_norm": 1.6768704652786255, + "learning_rate": 0.0002, + "loss": 0.3707, + "step": 9540 + }, + { + "epoch": 6.932849364791289, + "grad_norm": 1.3720577955245972, + "learning_rate": 0.0002, + "loss": 0.3283, + "step": 9550 + }, + { + "epoch": 6.94010889292196, + "grad_norm": 1.8140523433685303, + "learning_rate": 0.0002, + "loss": 0.3257, + "step": 9560 + }, + { + "epoch": 6.947368421052632, + "grad_norm": 1.1828241348266602, + "learning_rate": 0.0002, + "loss": 0.3308, + "step": 9570 + }, + { + "epoch": 6.954627949183303, + "grad_norm": 1.2755135297775269, + "learning_rate": 0.0002, + "loss": 0.3536, + "step": 9580 + }, + { + "epoch": 6.961887477313975, + "grad_norm": 1.622009038925171, + "learning_rate": 0.0002, + "loss": 0.3711, + "step": 9590 + }, + { + "epoch": 6.969147005444646, + "grad_norm": 1.1543664932250977, + "learning_rate": 0.0002, + "loss": 0.3529, + "step": 9600 + }, + { + "epoch": 6.976406533575318, + "grad_norm": 1.6755319833755493, + "learning_rate": 0.0002, + "loss": 0.416, + "step": 9610 + }, + { + "epoch": 6.983666061705989, + "grad_norm": 1.3726437091827393, + "learning_rate": 0.0002, + "loss": 0.339, + "step": 9620 + }, + { + "epoch": 6.99092558983666, + "grad_norm": 1.1605958938598633, + "learning_rate": 0.0002, + "loss": 0.3709, + "step": 9630 + }, + { + "epoch": 6.998185117967332, + "grad_norm": 1.5371781587600708, + "learning_rate": 0.0002, + "loss": 0.371, + "step": 9640 + }, + { + "epoch": 6.999637023593467, + "eval_loss": 1.6280181407928467, + "eval_runtime": 46.1964, + "eval_samples_per_second": 9.438, + "eval_steps_per_second": 1.191, + "step": 9642 + } + ], + "logging_steps": 10, + "max_steps": 11016, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.952749172195328e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-9642/training_args.bin b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-9642/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..140bceabdec44fc473842cb9ace4a62a4dc9fe89 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-9642/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4abec5480aff960b5a7b03a48be1cdb1ae29ba8792469c451f5c2f269e4be93b +size 5560 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/special_tokens_map.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/tokenizer.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/tokenizer.model b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/tokenizer_config.json b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/training_args.bin b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..140bceabdec44fc473842cb9ace4a62a4dc9fe89 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4abec5480aff960b5a7b03a48be1cdb1ae29ba8792469c451f5c2f269e4be93b +size 5560 diff --git a/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/training_log.jsonl b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/training_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..13bc1bb55cfbc6e4f997e42f0cd1a650ac7068f8 --- /dev/null +++ b/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/training_log.jsonl @@ -0,0 +1,8 @@ +{"epoch": 0.9996370235934664, "step": 1377, "epoch_duration": 4561.8241751194, "total_accumulated_duration": 4561.8241751194, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 11696.9921875}, "avg_memory_reserved": {"GPU_0": 12758.0}, "peak_memory_reserved": {"GPU_0": 12758.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.4936, "grad_norm": 0.46010470390319824, "learning_rate": 0.0002, "epoch": 0.007259528130671506, "step": 10}, {"loss": 1.8442, "grad_norm": 0.5103179216384888, "learning_rate": 0.0002, "epoch": 0.014519056261343012, "step": 20}, {"loss": 1.559, "grad_norm": 0.6282716989517212, "learning_rate": 0.0002, "epoch": 0.021778584392014518, "step": 30}, {"loss": 1.3618, "grad_norm": 1.2480497360229492, "learning_rate": 0.0002, "epoch": 0.029038112522686024, "step": 40}, {"loss": 1.3874, "grad_norm": 0.4114855229854584, "learning_rate": 0.0002, "epoch": 0.036297640653357534, "step": 50}, {"loss": 1.2836, "grad_norm": 0.49482840299606323, "learning_rate": 0.0002, "epoch": 0.043557168784029036, "step": 60}, {"loss": 1.1728, "grad_norm": 0.4536272883415222, "learning_rate": 0.0002, "epoch": 0.050816696914700546, "step": 70}, {"loss": 1.1125, "grad_norm": 0.32328274846076965, "learning_rate": 0.0002, "epoch": 0.05807622504537205, "step": 80}, {"loss": 1.2447, "grad_norm": 0.40990468859672546, "learning_rate": 0.0002, "epoch": 0.06533575317604355, "step": 90}, {"loss": 1.2216, "grad_norm": 0.37273502349853516, "learning_rate": 0.0002, "epoch": 0.07259528130671507, "step": 100}, {"loss": 1.2259, "grad_norm": 0.3903871476650238, "learning_rate": 0.0002, "epoch": 0.07985480943738657, "step": 110}, {"loss": 1.3718, "grad_norm": 0.3272787928581238, "learning_rate": 0.0002, "epoch": 0.08711433756805807, "step": 120}, {"loss": 1.1832, "grad_norm": 0.3622824251651764, "learning_rate": 0.0002, "epoch": 0.09437386569872959, "step": 130}, {"loss": 1.2966, "grad_norm": 0.3503916561603546, "learning_rate": 0.0002, "epoch": 0.10163339382940109, "step": 140}, {"loss": 1.0913, "grad_norm": 0.32787832617759705, "learning_rate": 0.0002, "epoch": 0.1088929219600726, "step": 150}, {"loss": 1.2143, "grad_norm": 0.5822657942771912, "learning_rate": 0.0002, "epoch": 0.1161524500907441, "step": 160}, {"loss": 1.2091, "grad_norm": 0.28028249740600586, "learning_rate": 0.0002, "epoch": 0.12341197822141561, "step": 170}, {"loss": 1.1731, "grad_norm": 0.33602750301361084, "learning_rate": 0.0002, "epoch": 0.1306715063520871, "step": 180}, {"loss": 1.127, "grad_norm": 0.29106274247169495, "learning_rate": 0.0002, "epoch": 0.13793103448275862, "step": 190}, {"loss": 1.3261, "grad_norm": 0.38753363490104675, "learning_rate": 0.0002, "epoch": 0.14519056261343014, "step": 200}, {"loss": 1.1536, "grad_norm": 0.361009418964386, "learning_rate": 0.0002, "epoch": 0.15245009074410162, "step": 210}, {"loss": 1.1483, "grad_norm": 0.6743836402893066, "learning_rate": 0.0002, "epoch": 0.15970961887477314, "step": 220}, {"loss": 1.264, "grad_norm": 0.3923613727092743, "learning_rate": 0.0002, "epoch": 0.16696914700544466, "step": 230}, {"loss": 1.0437, "grad_norm": 0.2809699773788452, "learning_rate": 0.0002, "epoch": 0.17422867513611615, "step": 240}, {"loss": 1.1826, "grad_norm": 0.3631494641304016, "learning_rate": 0.0002, "epoch": 0.18148820326678766, "step": 250}, {"loss": 1.1584, "grad_norm": 0.24658171832561493, "learning_rate": 0.0002, "epoch": 0.18874773139745918, "step": 260}, {"loss": 1.0065, "grad_norm": 0.5780664682388306, "learning_rate": 0.0002, "epoch": 0.19600725952813067, "step": 270}, {"loss": 1.1251, "grad_norm": 0.3056720495223999, "learning_rate": 0.0002, "epoch": 0.20326678765880218, "step": 280}, {"loss": 1.1503, "grad_norm": 0.2857084572315216, "learning_rate": 0.0002, "epoch": 0.21052631578947367, "step": 290}, {"loss": 1.1215, "grad_norm": 0.6645345687866211, "learning_rate": 0.0002, "epoch": 0.2177858439201452, "step": 300}, {"loss": 1.127, "grad_norm": 0.5966078639030457, "learning_rate": 0.0002, "epoch": 0.2250453720508167, "step": 310}, {"loss": 1.1974, "grad_norm": 0.40937140583992004, "learning_rate": 0.0002, "epoch": 0.2323049001814882, "step": 320}, {"loss": 1.2385, "grad_norm": 0.5642806887626648, "learning_rate": 0.0002, "epoch": 0.2395644283121597, "step": 330}, {"loss": 1.1703, "grad_norm": 0.2750748097896576, "learning_rate": 0.0002, "epoch": 0.24682395644283123, "step": 340}, {"loss": 1.2598, "grad_norm": 0.34350234270095825, "learning_rate": 0.0002, "epoch": 0.2540834845735027, "step": 350}, {"loss": 1.1942, "grad_norm": 0.6767239570617676, "learning_rate": 0.0002, "epoch": 0.2613430127041742, "step": 360}, {"loss": 1.1436, "grad_norm": 0.31006959080696106, "learning_rate": 0.0002, "epoch": 0.26860254083484575, "step": 370}, {"loss": 1.2084, "grad_norm": 0.3825474679470062, "learning_rate": 0.0002, "epoch": 0.27586206896551724, "step": 380}, {"loss": 1.1523, "grad_norm": 0.30241551995277405, "learning_rate": 0.0002, "epoch": 0.2831215970961887, "step": 390}, {"loss": 1.1298, "grad_norm": 0.2962397336959839, "learning_rate": 0.0002, "epoch": 0.29038112522686027, "step": 400}, {"loss": 1.1299, "grad_norm": 0.2600369155406952, "learning_rate": 0.0002, "epoch": 0.29764065335753176, "step": 410}, {"loss": 1.1366, "grad_norm": 0.3675060272216797, "learning_rate": 0.0002, "epoch": 0.30490018148820325, "step": 420}, {"loss": 1.156, "grad_norm": 0.3429498076438904, "learning_rate": 0.0002, "epoch": 0.3121597096188748, "step": 430}, {"loss": 1.2741, "grad_norm": 0.34311825037002563, "learning_rate": 0.0002, "epoch": 0.3194192377495463, "step": 440}, {"loss": 1.3523, "grad_norm": 0.37872210144996643, "learning_rate": 0.0002, "epoch": 0.32667876588021777, "step": 450}, {"loss": 1.1365, "grad_norm": 0.33271121978759766, "learning_rate": 0.0002, "epoch": 0.3339382940108893, "step": 460}, {"loss": 1.1572, "grad_norm": 0.34605276584625244, "learning_rate": 0.0002, "epoch": 0.3411978221415608, "step": 470}, {"loss": 1.2755, "grad_norm": 0.41050562262535095, "learning_rate": 0.0002, "epoch": 0.3484573502722323, "step": 480}, {"loss": 1.075, "grad_norm": 0.2066836953163147, "learning_rate": 0.0002, "epoch": 0.35571687840290384, "step": 490}, {"loss": 1.2072, "grad_norm": 0.2859014868736267, "learning_rate": 0.0002, "epoch": 0.3629764065335753, "step": 500}, {"loss": 1.1435, "grad_norm": 0.28763777017593384, "learning_rate": 0.0002, "epoch": 0.3702359346642468, "step": 510}, {"loss": 1.1341, "grad_norm": 0.2730471193790436, "learning_rate": 0.0002, "epoch": 0.37749546279491836, "step": 520}, {"loss": 1.348, "grad_norm": 0.3968936800956726, "learning_rate": 0.0002, "epoch": 0.38475499092558985, "step": 530}, {"loss": 1.2363, "grad_norm": 0.3624701201915741, "learning_rate": 0.0002, "epoch": 0.39201451905626133, "step": 540}, {"loss": 1.1206, "grad_norm": 0.3303608298301697, "learning_rate": 0.0002, "epoch": 0.3992740471869328, "step": 550}, {"loss": 1.203, "grad_norm": 0.33507466316223145, "learning_rate": 0.0002, "epoch": 0.40653357531760437, "step": 560}, {"loss": 1.2294, "grad_norm": 0.3297670781612396, "learning_rate": 0.0002, "epoch": 0.41379310344827586, "step": 570}, {"loss": 1.2, "grad_norm": 0.32334890961647034, "learning_rate": 0.0002, "epoch": 0.42105263157894735, "step": 580}, {"loss": 1.112, "grad_norm": 0.30281195044517517, "learning_rate": 0.0002, "epoch": 0.4283121597096189, "step": 590}, {"loss": 1.1352, "grad_norm": 0.5900027751922607, "learning_rate": 0.0002, "epoch": 0.4355716878402904, "step": 600}, {"loss": 1.1575, "grad_norm": 0.28437477350234985, "learning_rate": 0.0002, "epoch": 0.44283121597096187, "step": 610}, {"loss": 1.2009, "grad_norm": 0.39601704478263855, "learning_rate": 0.0002, "epoch": 0.4500907441016334, "step": 620}, {"loss": 1.2302, "grad_norm": 0.41971510648727417, "learning_rate": 0.0002, "epoch": 0.4573502722323049, "step": 630}, {"loss": 1.236, "grad_norm": 0.33814409375190735, "learning_rate": 0.0002, "epoch": 0.4646098003629764, "step": 640}, {"loss": 1.2471, "grad_norm": 0.575718104839325, "learning_rate": 0.0002, "epoch": 0.47186932849364793, "step": 650}, {"loss": 1.1012, "grad_norm": 0.37927401065826416, "learning_rate": 0.0002, "epoch": 0.4791288566243194, "step": 660}, {"loss": 1.1552, "grad_norm": 0.3224332630634308, "learning_rate": 0.0002, "epoch": 0.4863883847549909, "step": 670}, {"loss": 1.1354, "grad_norm": 0.32683515548706055, "learning_rate": 0.0002, "epoch": 0.49364791288566245, "step": 680}, {"loss": 1.207, "grad_norm": 0.4316163659095764, "learning_rate": 0.0002, "epoch": 0.5009074410163339, "step": 690}, {"loss": 1.2116, "grad_norm": 0.342602401971817, "learning_rate": 0.0002, "epoch": 0.5081669691470054, "step": 700}, {"loss": 1.0433, "grad_norm": 0.2794898748397827, "learning_rate": 0.0002, "epoch": 0.515426497277677, "step": 710}, {"loss": 1.2395, "grad_norm": 0.3322339951992035, "learning_rate": 0.0002, "epoch": 0.5226860254083484, "step": 720}, {"loss": 1.0762, "grad_norm": 0.3088509142398834, "learning_rate": 0.0002, "epoch": 0.52994555353902, "step": 730}, {"loss": 1.1495, "grad_norm": 0.24444378912448883, "learning_rate": 0.0002, "epoch": 0.5372050816696915, "step": 740}, {"loss": 1.2176, "grad_norm": 0.3483171761035919, "learning_rate": 0.0002, "epoch": 0.5444646098003629, "step": 750}, {"loss": 1.2248, "grad_norm": 0.3425690531730652, "learning_rate": 0.0002, "epoch": 0.5517241379310345, "step": 760}, {"loss": 1.1127, "grad_norm": 0.31841927766799927, "learning_rate": 0.0002, "epoch": 0.558983666061706, "step": 770}, {"loss": 1.1841, "grad_norm": 0.39423868060112, "learning_rate": 0.0002, "epoch": 0.5662431941923775, "step": 780}, {"loss": 1.0583, "grad_norm": 0.30328479409217834, "learning_rate": 0.0002, "epoch": 0.573502722323049, "step": 790}, {"loss": 1.0718, "grad_norm": 0.24475938081741333, "learning_rate": 0.0002, "epoch": 0.5807622504537205, "step": 800}, {"loss": 1.2006, "grad_norm": 0.37132805585861206, "learning_rate": 0.0002, "epoch": 0.588021778584392, "step": 810}, {"loss": 1.1544, "grad_norm": 0.32195979356765747, "learning_rate": 0.0002, "epoch": 0.5952813067150635, "step": 820}, {"loss": 0.9937, "grad_norm": 0.2848738729953766, "learning_rate": 0.0002, "epoch": 0.6025408348457351, "step": 830}, {"loss": 1.0937, "grad_norm": 0.28015264868736267, "learning_rate": 0.0002, "epoch": 0.6098003629764065, "step": 840}, {"loss": 1.1234, "grad_norm": 0.37796008586883545, "learning_rate": 0.0002, "epoch": 0.617059891107078, "step": 850}, {"loss": 1.1455, "grad_norm": 0.39311841130256653, "learning_rate": 0.0002, "epoch": 0.6243194192377496, "step": 860}, {"loss": 1.1066, "grad_norm": 0.2761685252189636, "learning_rate": 0.0002, "epoch": 0.631578947368421, "step": 870}, {"loss": 1.0859, "grad_norm": 0.3826720118522644, "learning_rate": 0.0002, "epoch": 0.6388384754990926, "step": 880}, {"loss": 1.1182, "grad_norm": 0.30076679587364197, "learning_rate": 0.0002, "epoch": 0.6460980036297641, "step": 890}, {"loss": 1.0927, "grad_norm": 0.21997687220573425, "learning_rate": 0.0002, "epoch": 0.6533575317604355, "step": 900}, {"loss": 1.1603, "grad_norm": 0.32593777775764465, "learning_rate": 0.0002, "epoch": 0.6606170598911071, "step": 910}, {"loss": 1.1578, "grad_norm": 0.30347898602485657, "learning_rate": 0.0002, "epoch": 0.6678765880217786, "step": 920}, {"loss": 1.1642, "grad_norm": 0.44173774123191833, "learning_rate": 0.0002, "epoch": 0.6751361161524501, "step": 930}, {"loss": 1.1832, "grad_norm": 0.2507467269897461, "learning_rate": 0.0002, "epoch": 0.6823956442831216, "step": 940}, {"loss": 1.3001, "grad_norm": 0.29463833570480347, "learning_rate": 0.0002, "epoch": 0.6896551724137931, "step": 950}, {"loss": 1.1651, "grad_norm": 0.9363154172897339, "learning_rate": 0.0002, "epoch": 0.6969147005444646, "step": 960}, {"loss": 1.158, "grad_norm": 0.3236212134361267, "learning_rate": 0.0002, "epoch": 0.7041742286751361, "step": 970}, {"loss": 1.1078, "grad_norm": 0.3123254179954529, "learning_rate": 0.0002, "epoch": 0.7114337568058077, "step": 980}, {"loss": 1.1578, "grad_norm": 0.3395805060863495, "learning_rate": 0.0002, "epoch": 0.7186932849364791, "step": 990}, {"loss": 1.0754, "grad_norm": 0.3240964412689209, "learning_rate": 0.0002, "epoch": 0.7259528130671506, "step": 1000}, {"loss": 1.0313, "grad_norm": 0.31902948021888733, "learning_rate": 0.0002, "epoch": 0.7332123411978222, "step": 1010}, {"loss": 1.1664, "grad_norm": 0.4848408102989197, "learning_rate": 0.0002, "epoch": 0.7404718693284936, "step": 1020}, {"loss": 1.0838, "grad_norm": 0.33006033301353455, "learning_rate": 0.0002, "epoch": 0.7477313974591652, "step": 1030}, {"loss": 1.0979, "grad_norm": 0.2928730547428131, "learning_rate": 0.0002, "epoch": 0.7549909255898367, "step": 1040}, {"loss": 1.1351, "grad_norm": 0.3529164791107178, "learning_rate": 0.0002, "epoch": 0.7622504537205081, "step": 1050}, {"loss": 1.1274, "grad_norm": 0.2736213803291321, "learning_rate": 0.0002, "epoch": 0.7695099818511797, "step": 1060}, {"loss": 1.1169, "grad_norm": 0.7200686931610107, "learning_rate": 0.0002, "epoch": 0.7767695099818511, "step": 1070}, {"loss": 1.1171, "grad_norm": 0.33396708965301514, "learning_rate": 0.0002, "epoch": 0.7840290381125227, "step": 1080}, {"loss": 1.2447, "grad_norm": 1.5760449171066284, "learning_rate": 0.0002, "epoch": 0.7912885662431942, "step": 1090}, {"loss": 1.1126, "grad_norm": 0.28138381242752075, "learning_rate": 0.0002, "epoch": 0.7985480943738656, "step": 1100}, {"loss": 1.2012, "grad_norm": 0.2597472369670868, "learning_rate": 0.0002, "epoch": 0.8058076225045372, "step": 1110}, {"loss": 1.1177, "grad_norm": 0.3305445611476898, "learning_rate": 0.0002, "epoch": 0.8130671506352087, "step": 1120}, {"loss": 1.0849, "grad_norm": 0.3934599459171295, "learning_rate": 0.0002, "epoch": 0.8203266787658802, "step": 1130}, {"loss": 1.0889, "grad_norm": 0.3472191393375397, "learning_rate": 0.0002, "epoch": 0.8275862068965517, "step": 1140}, {"loss": 1.265, "grad_norm": 0.2857365906238556, "learning_rate": 0.0002, "epoch": 0.8348457350272233, "step": 1150}, {"loss": 1.03, "grad_norm": 0.3207702934741974, "learning_rate": 0.0002, "epoch": 0.8421052631578947, "step": 1160}, {"loss": 1.1669, "grad_norm": 0.3176484704017639, "learning_rate": 0.0002, "epoch": 0.8493647912885662, "step": 1170}, {"loss": 1.1386, "grad_norm": 0.40685558319091797, "learning_rate": 0.0002, "epoch": 0.8566243194192378, "step": 1180}, {"loss": 1.1383, "grad_norm": 0.31125199794769287, "learning_rate": 0.0002, "epoch": 0.8638838475499092, "step": 1190}, {"loss": 1.1512, "grad_norm": 0.7361181378364563, "learning_rate": 0.0002, "epoch": 0.8711433756805808, "step": 1200}, {"loss": 0.9983, "grad_norm": 0.33699527382850647, "learning_rate": 0.0002, "epoch": 0.8784029038112523, "step": 1210}, {"loss": 1.11, "grad_norm": 0.3315220773220062, "learning_rate": 0.0002, "epoch": 0.8856624319419237, "step": 1220}, {"loss": 1.0266, "grad_norm": 0.6256054043769836, "learning_rate": 0.0002, "epoch": 0.8929219600725953, "step": 1230}, {"loss": 1.1738, "grad_norm": 0.3692137897014618, "learning_rate": 0.0002, "epoch": 0.9001814882032668, "step": 1240}, {"loss": 1.1112, "grad_norm": 0.3538484573364258, "learning_rate": 0.0002, "epoch": 0.9074410163339383, "step": 1250}, {"loss": 1.0878, "grad_norm": 0.27863040566444397, "learning_rate": 0.0002, "epoch": 0.9147005444646098, "step": 1260}, {"loss": 1.0826, "grad_norm": 0.3322528302669525, "learning_rate": 0.0002, "epoch": 0.9219600725952813, "step": 1270}, {"loss": 1.0158, "grad_norm": 0.7553173303604126, "learning_rate": 0.0002, "epoch": 0.9292196007259528, "step": 1280}, {"loss": 1.198, "grad_norm": 0.4856191575527191, "learning_rate": 0.0002, "epoch": 0.9364791288566243, "step": 1290}, {"loss": 1.1678, "grad_norm": 0.3668074905872345, "learning_rate": 0.0002, "epoch": 0.9437386569872959, "step": 1300}, {"loss": 1.1809, "grad_norm": 0.29851067066192627, "learning_rate": 0.0002, "epoch": 0.9509981851179673, "step": 1310}, {"loss": 1.1299, "grad_norm": 0.276664674282074, "learning_rate": 0.0002, "epoch": 0.9582577132486388, "step": 1320}, {"loss": 1.0597, "grad_norm": 0.2941018342971802, "learning_rate": 0.0002, "epoch": 0.9655172413793104, "step": 1330}, {"loss": 1.1731, "grad_norm": 0.3505859076976776, "learning_rate": 0.0002, "epoch": 0.9727767695099818, "step": 1340}, {"loss": 1.1809, "grad_norm": 0.3067687451839447, "learning_rate": 0.0002, "epoch": 0.9800362976406534, "step": 1350}, {"loss": 1.1511, "grad_norm": 0.27151066064834595, "learning_rate": 0.0002, "epoch": 0.9872958257713249, "step": 1360}, {"loss": 1.1919, "grad_norm": 0.36370083689689636, "learning_rate": 0.0002, "epoch": 0.9945553539019963, "step": 1370}]} +{"epoch": 2.0, "step": 2755, "epoch_duration": 4702.509291410446, "total_accumulated_duration": 9264.333466529846, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7751.47119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 24486.0}, "peak_memory_reserved": {"GPU_0": 24752.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-1377", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.4936, "grad_norm": 0.46010470390319824, "learning_rate": 0.0002, "epoch": 0.007259528130671506, "step": 10}, {"loss": 1.8442, "grad_norm": 0.5103179216384888, "learning_rate": 0.0002, "epoch": 0.014519056261343012, "step": 20}, {"loss": 1.559, "grad_norm": 0.6282716989517212, "learning_rate": 0.0002, "epoch": 0.021778584392014518, "step": 30}, {"loss": 1.3618, "grad_norm": 1.2480497360229492, "learning_rate": 0.0002, "epoch": 0.029038112522686024, "step": 40}, {"loss": 1.3874, "grad_norm": 0.4114855229854584, "learning_rate": 0.0002, "epoch": 0.036297640653357534, "step": 50}, {"loss": 1.2836, "grad_norm": 0.49482840299606323, "learning_rate": 0.0002, "epoch": 0.043557168784029036, "step": 60}, {"loss": 1.1728, "grad_norm": 0.4536272883415222, "learning_rate": 0.0002, "epoch": 0.050816696914700546, "step": 70}, {"loss": 1.1125, "grad_norm": 0.32328274846076965, "learning_rate": 0.0002, "epoch": 0.05807622504537205, "step": 80}, {"loss": 1.2447, "grad_norm": 0.40990468859672546, "learning_rate": 0.0002, "epoch": 0.06533575317604355, "step": 90}, {"loss": 1.2216, "grad_norm": 0.37273502349853516, "learning_rate": 0.0002, "epoch": 0.07259528130671507, "step": 100}, {"loss": 1.2259, "grad_norm": 0.3903871476650238, "learning_rate": 0.0002, "epoch": 0.07985480943738657, "step": 110}, {"loss": 1.3718, "grad_norm": 0.3272787928581238, "learning_rate": 0.0002, "epoch": 0.08711433756805807, "step": 120}, {"loss": 1.1832, "grad_norm": 0.3622824251651764, "learning_rate": 0.0002, "epoch": 0.09437386569872959, "step": 130}, {"loss": 1.2966, "grad_norm": 0.3503916561603546, "learning_rate": 0.0002, "epoch": 0.10163339382940109, "step": 140}, {"loss": 1.0913, "grad_norm": 0.32787832617759705, "learning_rate": 0.0002, "epoch": 0.1088929219600726, "step": 150}, {"loss": 1.2143, "grad_norm": 0.5822657942771912, "learning_rate": 0.0002, "epoch": 0.1161524500907441, "step": 160}, {"loss": 1.2091, "grad_norm": 0.28028249740600586, "learning_rate": 0.0002, "epoch": 0.12341197822141561, "step": 170}, {"loss": 1.1731, "grad_norm": 0.33602750301361084, "learning_rate": 0.0002, "epoch": 0.1306715063520871, "step": 180}, {"loss": 1.127, "grad_norm": 0.29106274247169495, "learning_rate": 0.0002, "epoch": 0.13793103448275862, "step": 190}, {"loss": 1.3261, "grad_norm": 0.38753363490104675, "learning_rate": 0.0002, "epoch": 0.14519056261343014, "step": 200}, {"loss": 1.1536, "grad_norm": 0.361009418964386, "learning_rate": 0.0002, "epoch": 0.15245009074410162, "step": 210}, {"loss": 1.1483, "grad_norm": 0.6743836402893066, "learning_rate": 0.0002, "epoch": 0.15970961887477314, "step": 220}, {"loss": 1.264, "grad_norm": 0.3923613727092743, "learning_rate": 0.0002, "epoch": 0.16696914700544466, "step": 230}, {"loss": 1.0437, "grad_norm": 0.2809699773788452, "learning_rate": 0.0002, "epoch": 0.17422867513611615, "step": 240}, {"loss": 1.1826, "grad_norm": 0.3631494641304016, "learning_rate": 0.0002, "epoch": 0.18148820326678766, "step": 250}, {"loss": 1.1584, "grad_norm": 0.24658171832561493, "learning_rate": 0.0002, "epoch": 0.18874773139745918, "step": 260}, {"loss": 1.0065, "grad_norm": 0.5780664682388306, "learning_rate": 0.0002, "epoch": 0.19600725952813067, "step": 270}, {"loss": 1.1251, "grad_norm": 0.3056720495223999, "learning_rate": 0.0002, "epoch": 0.20326678765880218, "step": 280}, {"loss": 1.1503, "grad_norm": 0.2857084572315216, "learning_rate": 0.0002, "epoch": 0.21052631578947367, "step": 290}, {"loss": 1.1215, "grad_norm": 0.6645345687866211, "learning_rate": 0.0002, "epoch": 0.2177858439201452, "step": 300}, {"loss": 1.127, "grad_norm": 0.5966078639030457, "learning_rate": 0.0002, "epoch": 0.2250453720508167, "step": 310}, {"loss": 1.1974, "grad_norm": 0.40937140583992004, "learning_rate": 0.0002, "epoch": 0.2323049001814882, "step": 320}, {"loss": 1.2385, "grad_norm": 0.5642806887626648, "learning_rate": 0.0002, "epoch": 0.2395644283121597, "step": 330}, {"loss": 1.1703, "grad_norm": 0.2750748097896576, "learning_rate": 0.0002, "epoch": 0.24682395644283123, "step": 340}, {"loss": 1.2598, "grad_norm": 0.34350234270095825, "learning_rate": 0.0002, "epoch": 0.2540834845735027, "step": 350}, {"loss": 1.1942, "grad_norm": 0.6767239570617676, "learning_rate": 0.0002, "epoch": 0.2613430127041742, "step": 360}, {"loss": 1.1436, "grad_norm": 0.31006959080696106, "learning_rate": 0.0002, "epoch": 0.26860254083484575, "step": 370}, {"loss": 1.2084, "grad_norm": 0.3825474679470062, "learning_rate": 0.0002, "epoch": 0.27586206896551724, "step": 380}, {"loss": 1.1523, "grad_norm": 0.30241551995277405, "learning_rate": 0.0002, "epoch": 0.2831215970961887, "step": 390}, {"loss": 1.1298, "grad_norm": 0.2962397336959839, "learning_rate": 0.0002, "epoch": 0.29038112522686027, "step": 400}, {"loss": 1.1299, "grad_norm": 0.2600369155406952, "learning_rate": 0.0002, "epoch": 0.29764065335753176, "step": 410}, {"loss": 1.1366, "grad_norm": 0.3675060272216797, "learning_rate": 0.0002, "epoch": 0.30490018148820325, "step": 420}, {"loss": 1.156, "grad_norm": 0.3429498076438904, "learning_rate": 0.0002, "epoch": 0.3121597096188748, "step": 430}, {"loss": 1.2741, "grad_norm": 0.34311825037002563, "learning_rate": 0.0002, "epoch": 0.3194192377495463, "step": 440}, {"loss": 1.3523, "grad_norm": 0.37872210144996643, "learning_rate": 0.0002, "epoch": 0.32667876588021777, "step": 450}, {"loss": 1.1365, "grad_norm": 0.33271121978759766, "learning_rate": 0.0002, "epoch": 0.3339382940108893, "step": 460}, {"loss": 1.1572, "grad_norm": 0.34605276584625244, "learning_rate": 0.0002, "epoch": 0.3411978221415608, "step": 470}, {"loss": 1.2755, "grad_norm": 0.41050562262535095, "learning_rate": 0.0002, "epoch": 0.3484573502722323, "step": 480}, {"loss": 1.075, "grad_norm": 0.2066836953163147, "learning_rate": 0.0002, "epoch": 0.35571687840290384, "step": 490}, {"loss": 1.2072, "grad_norm": 0.2859014868736267, "learning_rate": 0.0002, "epoch": 0.3629764065335753, "step": 500}, {"loss": 1.1435, "grad_norm": 0.28763777017593384, "learning_rate": 0.0002, "epoch": 0.3702359346642468, "step": 510}, {"loss": 1.1341, "grad_norm": 0.2730471193790436, "learning_rate": 0.0002, "epoch": 0.37749546279491836, "step": 520}, {"loss": 1.348, "grad_norm": 0.3968936800956726, "learning_rate": 0.0002, "epoch": 0.38475499092558985, "step": 530}, {"loss": 1.2363, "grad_norm": 0.3624701201915741, "learning_rate": 0.0002, "epoch": 0.39201451905626133, "step": 540}, {"loss": 1.1206, "grad_norm": 0.3303608298301697, "learning_rate": 0.0002, "epoch": 0.3992740471869328, "step": 550}, {"loss": 1.203, "grad_norm": 0.33507466316223145, "learning_rate": 0.0002, "epoch": 0.40653357531760437, "step": 560}, {"loss": 1.2294, "grad_norm": 0.3297670781612396, "learning_rate": 0.0002, "epoch": 0.41379310344827586, "step": 570}, {"loss": 1.2, "grad_norm": 0.32334890961647034, "learning_rate": 0.0002, "epoch": 0.42105263157894735, "step": 580}, {"loss": 1.112, "grad_norm": 0.30281195044517517, "learning_rate": 0.0002, "epoch": 0.4283121597096189, "step": 590}, {"loss": 1.1352, "grad_norm": 0.5900027751922607, "learning_rate": 0.0002, "epoch": 0.4355716878402904, "step": 600}, {"loss": 1.1575, "grad_norm": 0.28437477350234985, "learning_rate": 0.0002, "epoch": 0.44283121597096187, "step": 610}, {"loss": 1.2009, "grad_norm": 0.39601704478263855, "learning_rate": 0.0002, "epoch": 0.4500907441016334, "step": 620}, {"loss": 1.2302, "grad_norm": 0.41971510648727417, "learning_rate": 0.0002, "epoch": 0.4573502722323049, "step": 630}, {"loss": 1.236, "grad_norm": 0.33814409375190735, "learning_rate": 0.0002, "epoch": 0.4646098003629764, "step": 640}, {"loss": 1.2471, "grad_norm": 0.575718104839325, "learning_rate": 0.0002, "epoch": 0.47186932849364793, "step": 650}, {"loss": 1.1012, "grad_norm": 0.37927401065826416, "learning_rate": 0.0002, "epoch": 0.4791288566243194, "step": 660}, {"loss": 1.1552, "grad_norm": 0.3224332630634308, "learning_rate": 0.0002, "epoch": 0.4863883847549909, "step": 670}, {"loss": 1.1354, "grad_norm": 0.32683515548706055, "learning_rate": 0.0002, "epoch": 0.49364791288566245, "step": 680}, {"loss": 1.207, "grad_norm": 0.4316163659095764, "learning_rate": 0.0002, "epoch": 0.5009074410163339, "step": 690}, {"loss": 1.2116, "grad_norm": 0.342602401971817, "learning_rate": 0.0002, "epoch": 0.5081669691470054, "step": 700}, {"loss": 1.0433, "grad_norm": 0.2794898748397827, "learning_rate": 0.0002, "epoch": 0.515426497277677, "step": 710}, {"loss": 1.2395, "grad_norm": 0.3322339951992035, "learning_rate": 0.0002, "epoch": 0.5226860254083484, "step": 720}, {"loss": 1.0762, "grad_norm": 0.3088509142398834, "learning_rate": 0.0002, "epoch": 0.52994555353902, "step": 730}, {"loss": 1.1495, "grad_norm": 0.24444378912448883, "learning_rate": 0.0002, "epoch": 0.5372050816696915, "step": 740}, {"loss": 1.2176, "grad_norm": 0.3483171761035919, "learning_rate": 0.0002, "epoch": 0.5444646098003629, "step": 750}, {"loss": 1.2248, "grad_norm": 0.3425690531730652, "learning_rate": 0.0002, "epoch": 0.5517241379310345, "step": 760}, {"loss": 1.1127, "grad_norm": 0.31841927766799927, "learning_rate": 0.0002, "epoch": 0.558983666061706, "step": 770}, {"loss": 1.1841, "grad_norm": 0.39423868060112, "learning_rate": 0.0002, "epoch": 0.5662431941923775, "step": 780}, {"loss": 1.0583, "grad_norm": 0.30328479409217834, "learning_rate": 0.0002, "epoch": 0.573502722323049, "step": 790}, {"loss": 1.0718, "grad_norm": 0.24475938081741333, "learning_rate": 0.0002, "epoch": 0.5807622504537205, "step": 800}, {"loss": 1.2006, "grad_norm": 0.37132805585861206, "learning_rate": 0.0002, "epoch": 0.588021778584392, "step": 810}, {"loss": 1.1544, "grad_norm": 0.32195979356765747, "learning_rate": 0.0002, "epoch": 0.5952813067150635, "step": 820}, {"loss": 0.9937, "grad_norm": 0.2848738729953766, "learning_rate": 0.0002, "epoch": 0.6025408348457351, "step": 830}, {"loss": 1.0937, "grad_norm": 0.28015264868736267, "learning_rate": 0.0002, "epoch": 0.6098003629764065, "step": 840}, {"loss": 1.1234, "grad_norm": 0.37796008586883545, "learning_rate": 0.0002, "epoch": 0.617059891107078, "step": 850}, {"loss": 1.1455, "grad_norm": 0.39311841130256653, "learning_rate": 0.0002, "epoch": 0.6243194192377496, "step": 860}, {"loss": 1.1066, "grad_norm": 0.2761685252189636, "learning_rate": 0.0002, "epoch": 0.631578947368421, "step": 870}, {"loss": 1.0859, "grad_norm": 0.3826720118522644, "learning_rate": 0.0002, "epoch": 0.6388384754990926, "step": 880}, {"loss": 1.1182, "grad_norm": 0.30076679587364197, "learning_rate": 0.0002, "epoch": 0.6460980036297641, "step": 890}, {"loss": 1.0927, "grad_norm": 0.21997687220573425, "learning_rate": 0.0002, "epoch": 0.6533575317604355, "step": 900}, {"loss": 1.1603, "grad_norm": 0.32593777775764465, "learning_rate": 0.0002, "epoch": 0.6606170598911071, "step": 910}, {"loss": 1.1578, "grad_norm": 0.30347898602485657, "learning_rate": 0.0002, "epoch": 0.6678765880217786, "step": 920}, {"loss": 1.1642, "grad_norm": 0.44173774123191833, "learning_rate": 0.0002, "epoch": 0.6751361161524501, "step": 930}, {"loss": 1.1832, "grad_norm": 0.2507467269897461, "learning_rate": 0.0002, "epoch": 0.6823956442831216, "step": 940}, {"loss": 1.3001, "grad_norm": 0.29463833570480347, "learning_rate": 0.0002, "epoch": 0.6896551724137931, "step": 950}, {"loss": 1.1651, "grad_norm": 0.9363154172897339, "learning_rate": 0.0002, "epoch": 0.6969147005444646, "step": 960}, {"loss": 1.158, "grad_norm": 0.3236212134361267, "learning_rate": 0.0002, "epoch": 0.7041742286751361, "step": 970}, {"loss": 1.1078, "grad_norm": 0.3123254179954529, "learning_rate": 0.0002, "epoch": 0.7114337568058077, "step": 980}, {"loss": 1.1578, "grad_norm": 0.3395805060863495, "learning_rate": 0.0002, "epoch": 0.7186932849364791, "step": 990}, {"loss": 1.0754, "grad_norm": 0.3240964412689209, "learning_rate": 0.0002, "epoch": 0.7259528130671506, "step": 1000}, {"loss": 1.0313, "grad_norm": 0.31902948021888733, "learning_rate": 0.0002, "epoch": 0.7332123411978222, "step": 1010}, {"loss": 1.1664, "grad_norm": 0.4848408102989197, "learning_rate": 0.0002, "epoch": 0.7404718693284936, "step": 1020}, {"loss": 1.0838, "grad_norm": 0.33006033301353455, "learning_rate": 0.0002, "epoch": 0.7477313974591652, "step": 1030}, {"loss": 1.0979, "grad_norm": 0.2928730547428131, "learning_rate": 0.0002, "epoch": 0.7549909255898367, "step": 1040}, {"loss": 1.1351, "grad_norm": 0.3529164791107178, "learning_rate": 0.0002, "epoch": 0.7622504537205081, "step": 1050}, {"loss": 1.1274, "grad_norm": 0.2736213803291321, "learning_rate": 0.0002, "epoch": 0.7695099818511797, "step": 1060}, {"loss": 1.1169, "grad_norm": 0.7200686931610107, "learning_rate": 0.0002, "epoch": 0.7767695099818511, "step": 1070}, {"loss": 1.1171, "grad_norm": 0.33396708965301514, "learning_rate": 0.0002, "epoch": 0.7840290381125227, "step": 1080}, {"loss": 1.2447, "grad_norm": 1.5760449171066284, "learning_rate": 0.0002, "epoch": 0.7912885662431942, "step": 1090}, {"loss": 1.1126, "grad_norm": 0.28138381242752075, "learning_rate": 0.0002, "epoch": 0.7985480943738656, "step": 1100}, {"loss": 1.2012, "grad_norm": 0.2597472369670868, "learning_rate": 0.0002, "epoch": 0.8058076225045372, "step": 1110}, {"loss": 1.1177, "grad_norm": 0.3305445611476898, "learning_rate": 0.0002, "epoch": 0.8130671506352087, "step": 1120}, {"loss": 1.0849, "grad_norm": 0.3934599459171295, "learning_rate": 0.0002, "epoch": 0.8203266787658802, "step": 1130}, {"loss": 1.0889, "grad_norm": 0.3472191393375397, "learning_rate": 0.0002, "epoch": 0.8275862068965517, "step": 1140}, {"loss": 1.265, "grad_norm": 0.2857365906238556, "learning_rate": 0.0002, "epoch": 0.8348457350272233, "step": 1150}, {"loss": 1.03, "grad_norm": 0.3207702934741974, "learning_rate": 0.0002, "epoch": 0.8421052631578947, "step": 1160}, {"loss": 1.1669, "grad_norm": 0.3176484704017639, "learning_rate": 0.0002, "epoch": 0.8493647912885662, "step": 1170}, {"loss": 1.1386, "grad_norm": 0.40685558319091797, "learning_rate": 0.0002, "epoch": 0.8566243194192378, "step": 1180}, {"loss": 1.1383, "grad_norm": 0.31125199794769287, "learning_rate": 0.0002, "epoch": 0.8638838475499092, "step": 1190}, {"loss": 1.1512, "grad_norm": 0.7361181378364563, "learning_rate": 0.0002, "epoch": 0.8711433756805808, "step": 1200}, {"loss": 0.9983, "grad_norm": 0.33699527382850647, "learning_rate": 0.0002, "epoch": 0.8784029038112523, "step": 1210}, {"loss": 1.11, "grad_norm": 0.3315220773220062, "learning_rate": 0.0002, "epoch": 0.8856624319419237, "step": 1220}, {"loss": 1.0266, "grad_norm": 0.6256054043769836, "learning_rate": 0.0002, "epoch": 0.8929219600725953, "step": 1230}, {"loss": 1.1738, "grad_norm": 0.3692137897014618, "learning_rate": 0.0002, "epoch": 0.9001814882032668, "step": 1240}, {"loss": 1.1112, "grad_norm": 0.3538484573364258, "learning_rate": 0.0002, "epoch": 0.9074410163339383, "step": 1250}, {"loss": 1.0878, "grad_norm": 0.27863040566444397, "learning_rate": 0.0002, "epoch": 0.9147005444646098, "step": 1260}, {"loss": 1.0826, "grad_norm": 0.3322528302669525, "learning_rate": 0.0002, "epoch": 0.9219600725952813, "step": 1270}, {"loss": 1.0158, "grad_norm": 0.7553173303604126, "learning_rate": 0.0002, "epoch": 0.9292196007259528, "step": 1280}, {"loss": 1.198, "grad_norm": 0.4856191575527191, "learning_rate": 0.0002, "epoch": 0.9364791288566243, "step": 1290}, {"loss": 1.1678, "grad_norm": 0.3668074905872345, "learning_rate": 0.0002, "epoch": 0.9437386569872959, "step": 1300}, {"loss": 1.1809, "grad_norm": 0.29851067066192627, "learning_rate": 0.0002, "epoch": 0.9509981851179673, "step": 1310}, {"loss": 1.1299, "grad_norm": 0.276664674282074, "learning_rate": 0.0002, "epoch": 0.9582577132486388, "step": 1320}, {"loss": 1.0597, "grad_norm": 0.2941018342971802, "learning_rate": 0.0002, "epoch": 0.9655172413793104, "step": 1330}, {"loss": 1.1731, "grad_norm": 0.3505859076976776, "learning_rate": 0.0002, "epoch": 0.9727767695099818, "step": 1340}, {"loss": 1.1809, "grad_norm": 0.3067687451839447, "learning_rate": 0.0002, "epoch": 0.9800362976406534, "step": 1350}, {"loss": 1.1511, "grad_norm": 0.27151066064834595, "learning_rate": 0.0002, "epoch": 0.9872958257713249, "step": 1360}, {"loss": 1.1919, "grad_norm": 0.36370083689689636, "learning_rate": 0.0002, "epoch": 0.9945553539019963, "step": 1370}, {"eval_loss": 1.1381088495254517, "eval_runtime": 96.0848, "eval_samples_per_second": 4.538, "eval_steps_per_second": 0.572, "epoch": 0.9996370235934664, "step": 1377}, {"loss": 1.0082, "grad_norm": 0.27980583906173706, "learning_rate": 0.0002, "epoch": 1.0018148820326678, "step": 1380}, {"loss": 1.0565, "grad_norm": 0.26713913679122925, "learning_rate": 0.0002, "epoch": 1.0090744101633393, "step": 1390}, {"loss": 1.0241, "grad_norm": 0.3089541494846344, "learning_rate": 0.0002, "epoch": 1.0163339382940109, "step": 1400}, {"loss": 0.9649, "grad_norm": 0.4188242256641388, "learning_rate": 0.0002, "epoch": 1.0235934664246824, "step": 1410}, {"loss": 0.9821, "grad_norm": 0.5246463418006897, "learning_rate": 0.0002, "epoch": 1.030852994555354, "step": 1420}, {"loss": 1.0143, "grad_norm": 0.2728777825832367, "learning_rate": 0.0002, "epoch": 1.0381125226860255, "step": 1430}, {"loss": 1.075, "grad_norm": 0.38167616724967957, "learning_rate": 0.0002, "epoch": 1.0453720508166968, "step": 1440}, {"loss": 1.0229, "grad_norm": 0.4439380168914795, "learning_rate": 0.0002, "epoch": 1.0526315789473684, "step": 1450}, {"loss": 1.0451, "grad_norm": 0.30954182147979736, "learning_rate": 0.0002, "epoch": 1.05989110707804, "step": 1460}, {"loss": 1.0762, "grad_norm": 0.4022280275821686, "learning_rate": 0.0002, "epoch": 1.0671506352087115, "step": 1470}, {"loss": 1.0754, "grad_norm": 0.7390811443328857, "learning_rate": 0.0002, "epoch": 1.074410163339383, "step": 1480}, {"loss": 1.1378, "grad_norm": 0.3885486423969269, "learning_rate": 0.0002, "epoch": 1.0816696914700545, "step": 1490}, {"loss": 1.005, "grad_norm": 0.5275560617446899, "learning_rate": 0.0002, "epoch": 1.0889292196007259, "step": 1500}, {"loss": 1.0147, "grad_norm": 0.35112282633781433, "learning_rate": 0.0002, "epoch": 1.0961887477313974, "step": 1510}, {"loss": 1.0331, "grad_norm": 0.33714351058006287, "learning_rate": 0.0002, "epoch": 1.103448275862069, "step": 1520}, {"loss": 0.8723, "grad_norm": 0.31221693754196167, "learning_rate": 0.0002, "epoch": 1.1107078039927405, "step": 1530}, {"loss": 1.0701, "grad_norm": 0.27549654245376587, "learning_rate": 0.0002, "epoch": 1.117967332123412, "step": 1540}, {"loss": 1.0114, "grad_norm": 0.8465521335601807, "learning_rate": 0.0002, "epoch": 1.1252268602540836, "step": 1550}, {"loss": 0.9643, "grad_norm": 0.36125949025154114, "learning_rate": 0.0002, "epoch": 1.132486388384755, "step": 1560}, {"loss": 1.0744, "grad_norm": 0.37420371174812317, "learning_rate": 0.0002, "epoch": 1.1397459165154264, "step": 1570}, {"loss": 1.165, "grad_norm": 0.3294760584831238, "learning_rate": 0.0002, "epoch": 1.147005444646098, "step": 1580}, {"loss": 0.9207, "grad_norm": 0.3881238102912903, "learning_rate": 0.0002, "epoch": 1.1542649727767695, "step": 1590}, {"loss": 1.0983, "grad_norm": 0.4766491651535034, "learning_rate": 0.0002, "epoch": 1.161524500907441, "step": 1600}, {"loss": 1.0092, "grad_norm": 0.395530104637146, "learning_rate": 0.0002, "epoch": 1.1687840290381124, "step": 1610}, {"loss": 0.9895, "grad_norm": 0.3297106623649597, "learning_rate": 0.0002, "epoch": 1.176043557168784, "step": 1620}, {"loss": 1.1121, "grad_norm": 0.39528271555900574, "learning_rate": 0.0002, "epoch": 1.1833030852994555, "step": 1630}, {"loss": 0.9202, "grad_norm": 0.3370221257209778, "learning_rate": 0.0002, "epoch": 1.190562613430127, "step": 1640}, {"loss": 1.102, "grad_norm": 0.31922030448913574, "learning_rate": 0.0002, "epoch": 1.1978221415607986, "step": 1650}, {"loss": 1.0565, "grad_norm": 0.6142027378082275, "learning_rate": 0.0002, "epoch": 1.2050816696914701, "step": 1660}, {"loss": 0.9541, "grad_norm": 0.44769710302352905, "learning_rate": 0.0002, "epoch": 1.2123411978221417, "step": 1670}, {"loss": 1.2501, "grad_norm": 0.41233646869659424, "learning_rate": 0.0002, "epoch": 1.219600725952813, "step": 1680}, {"loss": 1.0147, "grad_norm": 0.2928866147994995, "learning_rate": 0.0002, "epoch": 1.2268602540834845, "step": 1690}, {"loss": 0.9074, "grad_norm": 0.36913734674453735, "learning_rate": 0.0002, "epoch": 1.234119782214156, "step": 1700}, {"loss": 1.0847, "grad_norm": 0.5281891226768494, "learning_rate": 0.0002, "epoch": 1.2413793103448276, "step": 1710}, {"loss": 0.9482, "grad_norm": 0.3374697268009186, "learning_rate": 0.0002, "epoch": 1.2486388384754992, "step": 1720}, {"loss": 0.9826, "grad_norm": 0.3802020847797394, "learning_rate": 0.0002, "epoch": 1.2558983666061705, "step": 1730}, {"loss": 1.0191, "grad_norm": 0.38048651814460754, "learning_rate": 0.0002, "epoch": 1.263157894736842, "step": 1740}, {"loss": 1.0142, "grad_norm": 0.6676169633865356, "learning_rate": 0.0002, "epoch": 1.2704174228675136, "step": 1750}, {"loss": 0.992, "grad_norm": 0.4075961410999298, "learning_rate": 0.0002, "epoch": 1.2776769509981851, "step": 1760}, {"loss": 1.0301, "grad_norm": 0.4374721348285675, "learning_rate": 0.0002, "epoch": 1.2849364791288567, "step": 1770}, {"loss": 1.0601, "grad_norm": 0.4638824164867401, "learning_rate": 0.0002, "epoch": 1.2921960072595282, "step": 1780}, {"loss": 0.9593, "grad_norm": 0.38631564378738403, "learning_rate": 0.0002, "epoch": 1.2994555353901998, "step": 1790}, {"loss": 1.0042, "grad_norm": 0.35873809456825256, "learning_rate": 0.0002, "epoch": 1.306715063520871, "step": 1800}, {"loss": 0.9835, "grad_norm": 0.33602237701416016, "learning_rate": 0.0002, "epoch": 1.3139745916515426, "step": 1810}, {"loss": 1.1032, "grad_norm": 0.46696463227272034, "learning_rate": 0.0002, "epoch": 1.3212341197822142, "step": 1820}, {"loss": 1.0123, "grad_norm": 0.368958979845047, "learning_rate": 0.0002, "epoch": 1.3284936479128857, "step": 1830}, {"loss": 1.0456, "grad_norm": 0.3160957396030426, "learning_rate": 0.0002, "epoch": 1.335753176043557, "step": 1840}, {"loss": 1.0264, "grad_norm": 0.4511511027812958, "learning_rate": 0.0002, "epoch": 1.3430127041742286, "step": 1850}, {"loss": 1.0179, "grad_norm": 0.6769845485687256, "learning_rate": 0.0002, "epoch": 1.3502722323049001, "step": 1860}, {"loss": 1.011, "grad_norm": 0.3749309480190277, "learning_rate": 0.0002, "epoch": 1.3575317604355717, "step": 1870}, {"loss": 1.0223, "grad_norm": 0.553798496723175, "learning_rate": 0.0002, "epoch": 1.3647912885662432, "step": 1880}, {"loss": 0.9634, "grad_norm": 0.3538985550403595, "learning_rate": 0.0002, "epoch": 1.3720508166969148, "step": 1890}, {"loss": 1.0275, "grad_norm": 0.5501534938812256, "learning_rate": 0.0002, "epoch": 1.3793103448275863, "step": 1900}, {"loss": 1.0461, "grad_norm": 0.4432051181793213, "learning_rate": 0.0002, "epoch": 1.3865698729582578, "step": 1910}, {"loss": 0.9082, "grad_norm": 0.41755786538124084, "learning_rate": 0.0002, "epoch": 1.3938294010889292, "step": 1920}, {"loss": 1.0858, "grad_norm": 0.5626114010810852, "learning_rate": 0.0002, "epoch": 1.4010889292196007, "step": 1930}, {"loss": 1.0687, "grad_norm": 0.44038185477256775, "learning_rate": 0.0002, "epoch": 1.4083484573502723, "step": 1940}, {"loss": 0.9454, "grad_norm": 0.3397001624107361, "learning_rate": 0.0002, "epoch": 1.4156079854809438, "step": 1950}, {"loss": 0.943, "grad_norm": 0.4325368106365204, "learning_rate": 0.0002, "epoch": 1.4228675136116151, "step": 1960}, {"loss": 1.0488, "grad_norm": 0.3900907039642334, "learning_rate": 0.0002, "epoch": 1.4301270417422867, "step": 1970}, {"loss": 0.9699, "grad_norm": 0.369612455368042, "learning_rate": 0.0002, "epoch": 1.4373865698729582, "step": 1980}, {"loss": 1.0609, "grad_norm": 0.4389338791370392, "learning_rate": 0.0002, "epoch": 1.4446460980036298, "step": 1990}, {"loss": 1.042, "grad_norm": 1.694450855255127, "learning_rate": 0.0002, "epoch": 1.4519056261343013, "step": 2000}, {"loss": 0.997, "grad_norm": 0.516957700252533, "learning_rate": 0.0002, "epoch": 1.4591651542649728, "step": 2010}, {"loss": 1.0979, "grad_norm": 0.45515501499176025, "learning_rate": 0.0002, "epoch": 1.4664246823956444, "step": 2020}, {"loss": 1.0188, "grad_norm": 0.4153139591217041, "learning_rate": 0.0002, "epoch": 1.4736842105263157, "step": 2030}, {"loss": 1.1521, "grad_norm": 0.44353052973747253, "learning_rate": 0.0002, "epoch": 1.4809437386569873, "step": 2040}, {"loss": 0.9653, "grad_norm": 0.570554256439209, "learning_rate": 0.0002, "epoch": 1.4882032667876588, "step": 2050}, {"loss": 1.0352, "grad_norm": 0.5742740035057068, "learning_rate": 0.0002, "epoch": 1.4954627949183303, "step": 2060}, {"loss": 1.0963, "grad_norm": 0.5890517830848694, "learning_rate": 0.0002, "epoch": 1.5027223230490017, "step": 2070}, {"loss": 1.0639, "grad_norm": 0.4162650406360626, "learning_rate": 0.0002, "epoch": 1.5099818511796732, "step": 2080}, {"loss": 1.071, "grad_norm": 0.7334717512130737, "learning_rate": 0.0002, "epoch": 1.5172413793103448, "step": 2090}, {"loss": 0.8957, "grad_norm": 0.2963249683380127, "learning_rate": 0.0002, "epoch": 1.5245009074410163, "step": 2100}, {"loss": 1.0446, "grad_norm": 0.30676454305648804, "learning_rate": 0.0002, "epoch": 1.5317604355716878, "step": 2110}, {"loss": 0.9967, "grad_norm": 0.35984641313552856, "learning_rate": 0.0002, "epoch": 1.5390199637023594, "step": 2120}, {"loss": 1.0352, "grad_norm": 0.3384549617767334, "learning_rate": 0.0002, "epoch": 1.546279491833031, "step": 2130}, {"loss": 0.9874, "grad_norm": 0.4725518226623535, "learning_rate": 0.0002, "epoch": 1.5535390199637025, "step": 2140}, {"loss": 1.1166, "grad_norm": 0.3252685070037842, "learning_rate": 0.0002, "epoch": 1.560798548094374, "step": 2150}, {"loss": 0.9127, "grad_norm": 0.45043081045150757, "learning_rate": 0.0002, "epoch": 1.5680580762250453, "step": 2160}, {"loss": 1.0767, "grad_norm": 0.374208003282547, "learning_rate": 0.0002, "epoch": 1.5753176043557169, "step": 2170}, {"loss": 1.0255, "grad_norm": 0.5118404030799866, "learning_rate": 0.0002, "epoch": 1.5825771324863884, "step": 2180}, {"loss": 1.0062, "grad_norm": 0.482170969247818, "learning_rate": 0.0002, "epoch": 1.5898366606170597, "step": 2190}, {"loss": 1.0872, "grad_norm": 0.5337533950805664, "learning_rate": 0.0002, "epoch": 1.5970961887477313, "step": 2200}, {"loss": 1.0405, "grad_norm": 0.5195064544677734, "learning_rate": 0.0002, "epoch": 1.6043557168784028, "step": 2210}, {"loss": 1.0454, "grad_norm": 0.30807098746299744, "learning_rate": 0.0002, "epoch": 1.6116152450090744, "step": 2220}, {"loss": 1.0293, "grad_norm": 0.3962925672531128, "learning_rate": 0.0002, "epoch": 1.618874773139746, "step": 2230}, {"loss": 1.0137, "grad_norm": 0.7636962532997131, "learning_rate": 0.0002, "epoch": 1.6261343012704175, "step": 2240}, {"loss": 0.991, "grad_norm": 0.32380592823028564, "learning_rate": 0.0002, "epoch": 1.633393829401089, "step": 2250}, {"loss": 0.9471, "grad_norm": 0.5767741799354553, "learning_rate": 0.0002, "epoch": 1.6406533575317606, "step": 2260}, {"loss": 0.9524, "grad_norm": 0.39964812994003296, "learning_rate": 0.0002, "epoch": 1.647912885662432, "step": 2270}, {"loss": 0.9866, "grad_norm": 0.622629702091217, "learning_rate": 0.0002, "epoch": 1.6551724137931034, "step": 2280}, {"loss": 0.9609, "grad_norm": 0.40202152729034424, "learning_rate": 0.0002, "epoch": 1.662431941923775, "step": 2290}, {"loss": 0.9859, "grad_norm": 0.4467349052429199, "learning_rate": 0.0002, "epoch": 1.6696914700544465, "step": 2300}, {"loss": 1.0312, "grad_norm": 0.5026949048042297, "learning_rate": 0.0002, "epoch": 1.6769509981851178, "step": 2310}, {"loss": 1.0776, "grad_norm": 0.43754541873931885, "learning_rate": 0.0002, "epoch": 1.6842105263157894, "step": 2320}, {"loss": 0.9786, "grad_norm": 0.42869430780410767, "learning_rate": 0.0002, "epoch": 1.691470054446461, "step": 2330}, {"loss": 1.0747, "grad_norm": 0.4192679524421692, "learning_rate": 0.0002, "epoch": 1.6987295825771325, "step": 2340}, {"loss": 1.0054, "grad_norm": 0.3243155777454376, "learning_rate": 0.0002, "epoch": 1.705989110707804, "step": 2350}, {"loss": 0.9165, "grad_norm": 1.0514075756072998, "learning_rate": 0.0002, "epoch": 1.7132486388384756, "step": 2360}, {"loss": 1.1353, "grad_norm": 0.4719122648239136, "learning_rate": 0.0002, "epoch": 1.720508166969147, "step": 2370}, {"loss": 0.9913, "grad_norm": 0.3846144676208496, "learning_rate": 0.0002, "epoch": 1.7277676950998186, "step": 2380}, {"loss": 0.9935, "grad_norm": 0.7266581058502197, "learning_rate": 0.0002, "epoch": 1.73502722323049, "step": 2390}, {"loss": 0.9509, "grad_norm": 0.6183241605758667, "learning_rate": 0.0002, "epoch": 1.7422867513611615, "step": 2400}, {"loss": 0.9713, "grad_norm": 0.3658260405063629, "learning_rate": 0.0002, "epoch": 1.749546279491833, "step": 2410}, {"loss": 1.0033, "grad_norm": 0.6036322712898254, "learning_rate": 0.0002, "epoch": 1.7568058076225044, "step": 2420}, {"loss": 1.0526, "grad_norm": 0.7872936129570007, "learning_rate": 0.0002, "epoch": 1.764065335753176, "step": 2430}, {"loss": 0.99, "grad_norm": 0.35946124792099, "learning_rate": 0.0002, "epoch": 1.7713248638838475, "step": 2440}, {"loss": 0.8845, "grad_norm": 0.3740338981151581, "learning_rate": 0.0002, "epoch": 1.778584392014519, "step": 2450}, {"loss": 0.9333, "grad_norm": 0.6150230169296265, "learning_rate": 0.0002, "epoch": 1.7858439201451906, "step": 2460}, {"loss": 1.0373, "grad_norm": 0.4726075530052185, "learning_rate": 0.0002, "epoch": 1.793103448275862, "step": 2470}, {"loss": 0.974, "grad_norm": 0.31292253732681274, "learning_rate": 0.0002, "epoch": 1.8003629764065336, "step": 2480}, {"loss": 1.0205, "grad_norm": 0.4463104009628296, "learning_rate": 0.0002, "epoch": 1.8076225045372052, "step": 2490}, {"loss": 1.0771, "grad_norm": 0.7848200798034668, "learning_rate": 0.0002, "epoch": 1.8148820326678767, "step": 2500}, {"loss": 0.9508, "grad_norm": 0.5562082529067993, "learning_rate": 0.0002, "epoch": 1.822141560798548, "step": 2510}, {"loss": 0.9343, "grad_norm": 0.39892756938934326, "learning_rate": 0.0002, "epoch": 1.8294010889292196, "step": 2520}, {"loss": 1.0489, "grad_norm": 1.2923320531845093, "learning_rate": 0.0002, "epoch": 1.8366606170598911, "step": 2530}, {"loss": 1.1203, "grad_norm": 0.6316490769386292, "learning_rate": 0.0002, "epoch": 1.8439201451905625, "step": 2540}, {"loss": 0.9131, "grad_norm": 0.46100497245788574, "learning_rate": 0.0002, "epoch": 1.851179673321234, "step": 2550}, {"loss": 1.0977, "grad_norm": 0.7902987003326416, "learning_rate": 0.0002, "epoch": 1.8584392014519056, "step": 2560}, {"loss": 0.9702, "grad_norm": 0.4596365690231323, "learning_rate": 0.0002, "epoch": 1.865698729582577, "step": 2570}, {"loss": 0.9322, "grad_norm": 0.6592172384262085, "learning_rate": 0.0002, "epoch": 1.8729582577132486, "step": 2580}, {"loss": 0.9549, "grad_norm": 0.5764662027359009, "learning_rate": 0.0002, "epoch": 1.8802177858439202, "step": 2590}, {"loss": 0.9485, "grad_norm": 0.8421637415885925, "learning_rate": 0.0002, "epoch": 1.8874773139745917, "step": 2600}, {"loss": 1.012, "grad_norm": 0.5635305047035217, "learning_rate": 0.0002, "epoch": 1.8947368421052633, "step": 2610}, {"loss": 0.8907, "grad_norm": 0.46231237053871155, "learning_rate": 0.0002, "epoch": 1.9019963702359348, "step": 2620}, {"loss": 0.9543, "grad_norm": 0.3944607079029083, "learning_rate": 0.0002, "epoch": 1.9092558983666061, "step": 2630}, {"loss": 0.9964, "grad_norm": 0.4753907322883606, "learning_rate": 0.0002, "epoch": 1.9165154264972777, "step": 2640}, {"loss": 1.0217, "grad_norm": 0.4151090979576111, "learning_rate": 0.0002, "epoch": 1.9237749546279492, "step": 2650}, {"loss": 1.0397, "grad_norm": 0.6793725490570068, "learning_rate": 0.0002, "epoch": 1.9310344827586206, "step": 2660}, {"loss": 1.0401, "grad_norm": 0.339755117893219, "learning_rate": 0.0002, "epoch": 1.938294010889292, "step": 2670}, {"loss": 1.082, "grad_norm": 0.40789374709129333, "learning_rate": 0.0002, "epoch": 1.9455535390199636, "step": 2680}, {"loss": 0.9973, "grad_norm": 0.3750005066394806, "learning_rate": 0.0002, "epoch": 1.9528130671506352, "step": 2690}, {"loss": 0.9837, "grad_norm": 0.39684441685676575, "learning_rate": 0.0002, "epoch": 1.9600725952813067, "step": 2700}, {"loss": 1.1204, "grad_norm": 0.378287672996521, "learning_rate": 0.0002, "epoch": 1.9673321234119783, "step": 2710}, {"loss": 0.9284, "grad_norm": 0.3668482005596161, "learning_rate": 0.0002, "epoch": 1.9745916515426498, "step": 2720}, {"loss": 1.0512, "grad_norm": 0.49997565150260925, "learning_rate": 0.0002, "epoch": 1.9818511796733214, "step": 2730}, {"loss": 0.9311, "grad_norm": 0.36852124333381653, "learning_rate": 0.0002, "epoch": 1.989110707803993, "step": 2740}, {"loss": 1.0898, "grad_norm": 0.5203380584716797, "learning_rate": 0.0002, "epoch": 1.9963702359346642, "step": 2750}]} +{"epoch": 2.9996370235934666, "step": 4132, "epoch_duration": 2965.006893157959, "total_accumulated_duration": 12229.340359687805, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 24486.0}, "peak_memory_reserved": {"GPU_0": 24752.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.4936, "grad_norm": 0.46010470390319824, "learning_rate": 0.0002, "epoch": 0.007259528130671506, "step": 10}, {"loss": 1.8442, "grad_norm": 0.5103179216384888, "learning_rate": 0.0002, "epoch": 0.014519056261343012, "step": 20}, {"loss": 1.559, "grad_norm": 0.6282716989517212, "learning_rate": 0.0002, "epoch": 0.021778584392014518, "step": 30}, {"loss": 1.3618, "grad_norm": 1.2480497360229492, "learning_rate": 0.0002, "epoch": 0.029038112522686024, "step": 40}, {"loss": 1.3874, "grad_norm": 0.4114855229854584, "learning_rate": 0.0002, "epoch": 0.036297640653357534, "step": 50}, {"loss": 1.2836, "grad_norm": 0.49482840299606323, "learning_rate": 0.0002, "epoch": 0.043557168784029036, "step": 60}, {"loss": 1.1728, "grad_norm": 0.4536272883415222, "learning_rate": 0.0002, "epoch": 0.050816696914700546, "step": 70}, {"loss": 1.1125, "grad_norm": 0.32328274846076965, "learning_rate": 0.0002, "epoch": 0.05807622504537205, "step": 80}, {"loss": 1.2447, "grad_norm": 0.40990468859672546, "learning_rate": 0.0002, "epoch": 0.06533575317604355, "step": 90}, {"loss": 1.2216, "grad_norm": 0.37273502349853516, "learning_rate": 0.0002, "epoch": 0.07259528130671507, "step": 100}, {"loss": 1.2259, "grad_norm": 0.3903871476650238, "learning_rate": 0.0002, "epoch": 0.07985480943738657, "step": 110}, {"loss": 1.3718, "grad_norm": 0.3272787928581238, "learning_rate": 0.0002, "epoch": 0.08711433756805807, "step": 120}, {"loss": 1.1832, "grad_norm": 0.3622824251651764, "learning_rate": 0.0002, "epoch": 0.09437386569872959, "step": 130}, {"loss": 1.2966, "grad_norm": 0.3503916561603546, "learning_rate": 0.0002, "epoch": 0.10163339382940109, "step": 140}, {"loss": 1.0913, "grad_norm": 0.32787832617759705, "learning_rate": 0.0002, "epoch": 0.1088929219600726, "step": 150}, {"loss": 1.2143, "grad_norm": 0.5822657942771912, "learning_rate": 0.0002, "epoch": 0.1161524500907441, "step": 160}, {"loss": 1.2091, "grad_norm": 0.28028249740600586, "learning_rate": 0.0002, "epoch": 0.12341197822141561, "step": 170}, {"loss": 1.1731, "grad_norm": 0.33602750301361084, "learning_rate": 0.0002, "epoch": 0.1306715063520871, "step": 180}, {"loss": 1.127, "grad_norm": 0.29106274247169495, "learning_rate": 0.0002, "epoch": 0.13793103448275862, "step": 190}, {"loss": 1.3261, "grad_norm": 0.38753363490104675, "learning_rate": 0.0002, "epoch": 0.14519056261343014, "step": 200}, {"loss": 1.1536, "grad_norm": 0.361009418964386, "learning_rate": 0.0002, "epoch": 0.15245009074410162, "step": 210}, {"loss": 1.1483, "grad_norm": 0.6743836402893066, "learning_rate": 0.0002, "epoch": 0.15970961887477314, "step": 220}, {"loss": 1.264, "grad_norm": 0.3923613727092743, "learning_rate": 0.0002, "epoch": 0.16696914700544466, "step": 230}, {"loss": 1.0437, "grad_norm": 0.2809699773788452, "learning_rate": 0.0002, "epoch": 0.17422867513611615, "step": 240}, {"loss": 1.1826, "grad_norm": 0.3631494641304016, "learning_rate": 0.0002, "epoch": 0.18148820326678766, "step": 250}, {"loss": 1.1584, "grad_norm": 0.24658171832561493, "learning_rate": 0.0002, "epoch": 0.18874773139745918, "step": 260}, {"loss": 1.0065, "grad_norm": 0.5780664682388306, "learning_rate": 0.0002, "epoch": 0.19600725952813067, "step": 270}, {"loss": 1.1251, "grad_norm": 0.3056720495223999, "learning_rate": 0.0002, "epoch": 0.20326678765880218, "step": 280}, {"loss": 1.1503, "grad_norm": 0.2857084572315216, "learning_rate": 0.0002, "epoch": 0.21052631578947367, "step": 290}, {"loss": 1.1215, "grad_norm": 0.6645345687866211, "learning_rate": 0.0002, "epoch": 0.2177858439201452, "step": 300}, {"loss": 1.127, "grad_norm": 0.5966078639030457, "learning_rate": 0.0002, "epoch": 0.2250453720508167, "step": 310}, {"loss": 1.1974, "grad_norm": 0.40937140583992004, "learning_rate": 0.0002, "epoch": 0.2323049001814882, "step": 320}, {"loss": 1.2385, "grad_norm": 0.5642806887626648, "learning_rate": 0.0002, "epoch": 0.2395644283121597, "step": 330}, {"loss": 1.1703, "grad_norm": 0.2750748097896576, "learning_rate": 0.0002, "epoch": 0.24682395644283123, "step": 340}, {"loss": 1.2598, "grad_norm": 0.34350234270095825, "learning_rate": 0.0002, "epoch": 0.2540834845735027, "step": 350}, {"loss": 1.1942, "grad_norm": 0.6767239570617676, "learning_rate": 0.0002, "epoch": 0.2613430127041742, "step": 360}, {"loss": 1.1436, "grad_norm": 0.31006959080696106, "learning_rate": 0.0002, "epoch": 0.26860254083484575, "step": 370}, {"loss": 1.2084, "grad_norm": 0.3825474679470062, "learning_rate": 0.0002, "epoch": 0.27586206896551724, "step": 380}, {"loss": 1.1523, "grad_norm": 0.30241551995277405, "learning_rate": 0.0002, "epoch": 0.2831215970961887, "step": 390}, {"loss": 1.1298, "grad_norm": 0.2962397336959839, "learning_rate": 0.0002, "epoch": 0.29038112522686027, "step": 400}, {"loss": 1.1299, "grad_norm": 0.2600369155406952, "learning_rate": 0.0002, "epoch": 0.29764065335753176, "step": 410}, {"loss": 1.1366, "grad_norm": 0.3675060272216797, "learning_rate": 0.0002, "epoch": 0.30490018148820325, "step": 420}, {"loss": 1.156, "grad_norm": 0.3429498076438904, "learning_rate": 0.0002, "epoch": 0.3121597096188748, "step": 430}, {"loss": 1.2741, "grad_norm": 0.34311825037002563, "learning_rate": 0.0002, "epoch": 0.3194192377495463, "step": 440}, {"loss": 1.3523, "grad_norm": 0.37872210144996643, "learning_rate": 0.0002, "epoch": 0.32667876588021777, "step": 450}, {"loss": 1.1365, "grad_norm": 0.33271121978759766, "learning_rate": 0.0002, "epoch": 0.3339382940108893, "step": 460}, {"loss": 1.1572, "grad_norm": 0.34605276584625244, "learning_rate": 0.0002, "epoch": 0.3411978221415608, "step": 470}, {"loss": 1.2755, "grad_norm": 0.41050562262535095, "learning_rate": 0.0002, "epoch": 0.3484573502722323, "step": 480}, {"loss": 1.075, "grad_norm": 0.2066836953163147, "learning_rate": 0.0002, "epoch": 0.35571687840290384, "step": 490}, {"loss": 1.2072, "grad_norm": 0.2859014868736267, "learning_rate": 0.0002, "epoch": 0.3629764065335753, "step": 500}, {"loss": 1.1435, "grad_norm": 0.28763777017593384, "learning_rate": 0.0002, "epoch": 0.3702359346642468, "step": 510}, {"loss": 1.1341, "grad_norm": 0.2730471193790436, "learning_rate": 0.0002, "epoch": 0.37749546279491836, "step": 520}, {"loss": 1.348, "grad_norm": 0.3968936800956726, "learning_rate": 0.0002, "epoch": 0.38475499092558985, "step": 530}, {"loss": 1.2363, "grad_norm": 0.3624701201915741, "learning_rate": 0.0002, "epoch": 0.39201451905626133, "step": 540}, {"loss": 1.1206, "grad_norm": 0.3303608298301697, "learning_rate": 0.0002, "epoch": 0.3992740471869328, "step": 550}, {"loss": 1.203, "grad_norm": 0.33507466316223145, "learning_rate": 0.0002, "epoch": 0.40653357531760437, "step": 560}, {"loss": 1.2294, "grad_norm": 0.3297670781612396, "learning_rate": 0.0002, "epoch": 0.41379310344827586, "step": 570}, {"loss": 1.2, "grad_norm": 0.32334890961647034, "learning_rate": 0.0002, "epoch": 0.42105263157894735, "step": 580}, {"loss": 1.112, "grad_norm": 0.30281195044517517, "learning_rate": 0.0002, "epoch": 0.4283121597096189, "step": 590}, {"loss": 1.1352, "grad_norm": 0.5900027751922607, "learning_rate": 0.0002, "epoch": 0.4355716878402904, "step": 600}, {"loss": 1.1575, "grad_norm": 0.28437477350234985, "learning_rate": 0.0002, "epoch": 0.44283121597096187, "step": 610}, {"loss": 1.2009, "grad_norm": 0.39601704478263855, "learning_rate": 0.0002, "epoch": 0.4500907441016334, "step": 620}, {"loss": 1.2302, "grad_norm": 0.41971510648727417, "learning_rate": 0.0002, "epoch": 0.4573502722323049, "step": 630}, {"loss": 1.236, "grad_norm": 0.33814409375190735, "learning_rate": 0.0002, "epoch": 0.4646098003629764, "step": 640}, {"loss": 1.2471, "grad_norm": 0.575718104839325, "learning_rate": 0.0002, "epoch": 0.47186932849364793, "step": 650}, {"loss": 1.1012, "grad_norm": 0.37927401065826416, "learning_rate": 0.0002, "epoch": 0.4791288566243194, "step": 660}, {"loss": 1.1552, "grad_norm": 0.3224332630634308, "learning_rate": 0.0002, "epoch": 0.4863883847549909, "step": 670}, {"loss": 1.1354, "grad_norm": 0.32683515548706055, "learning_rate": 0.0002, "epoch": 0.49364791288566245, "step": 680}, {"loss": 1.207, "grad_norm": 0.4316163659095764, "learning_rate": 0.0002, "epoch": 0.5009074410163339, "step": 690}, {"loss": 1.2116, "grad_norm": 0.342602401971817, "learning_rate": 0.0002, "epoch": 0.5081669691470054, "step": 700}, {"loss": 1.0433, "grad_norm": 0.2794898748397827, "learning_rate": 0.0002, "epoch": 0.515426497277677, "step": 710}, {"loss": 1.2395, "grad_norm": 0.3322339951992035, "learning_rate": 0.0002, "epoch": 0.5226860254083484, "step": 720}, {"loss": 1.0762, "grad_norm": 0.3088509142398834, "learning_rate": 0.0002, "epoch": 0.52994555353902, "step": 730}, {"loss": 1.1495, "grad_norm": 0.24444378912448883, "learning_rate": 0.0002, "epoch": 0.5372050816696915, "step": 740}, {"loss": 1.2176, "grad_norm": 0.3483171761035919, "learning_rate": 0.0002, "epoch": 0.5444646098003629, "step": 750}, {"loss": 1.2248, "grad_norm": 0.3425690531730652, "learning_rate": 0.0002, "epoch": 0.5517241379310345, "step": 760}, {"loss": 1.1127, "grad_norm": 0.31841927766799927, "learning_rate": 0.0002, "epoch": 0.558983666061706, "step": 770}, {"loss": 1.1841, "grad_norm": 0.39423868060112, "learning_rate": 0.0002, "epoch": 0.5662431941923775, "step": 780}, {"loss": 1.0583, "grad_norm": 0.30328479409217834, "learning_rate": 0.0002, "epoch": 0.573502722323049, "step": 790}, {"loss": 1.0718, "grad_norm": 0.24475938081741333, "learning_rate": 0.0002, "epoch": 0.5807622504537205, "step": 800}, {"loss": 1.2006, "grad_norm": 0.37132805585861206, "learning_rate": 0.0002, "epoch": 0.588021778584392, "step": 810}, {"loss": 1.1544, "grad_norm": 0.32195979356765747, "learning_rate": 0.0002, "epoch": 0.5952813067150635, "step": 820}, {"loss": 0.9937, "grad_norm": 0.2848738729953766, "learning_rate": 0.0002, "epoch": 0.6025408348457351, "step": 830}, {"loss": 1.0937, "grad_norm": 0.28015264868736267, "learning_rate": 0.0002, "epoch": 0.6098003629764065, "step": 840}, {"loss": 1.1234, "grad_norm": 0.37796008586883545, "learning_rate": 0.0002, "epoch": 0.617059891107078, "step": 850}, {"loss": 1.1455, "grad_norm": 0.39311841130256653, "learning_rate": 0.0002, "epoch": 0.6243194192377496, "step": 860}, {"loss": 1.1066, "grad_norm": 0.2761685252189636, "learning_rate": 0.0002, "epoch": 0.631578947368421, "step": 870}, {"loss": 1.0859, "grad_norm": 0.3826720118522644, "learning_rate": 0.0002, "epoch": 0.6388384754990926, "step": 880}, {"loss": 1.1182, "grad_norm": 0.30076679587364197, "learning_rate": 0.0002, "epoch": 0.6460980036297641, "step": 890}, {"loss": 1.0927, "grad_norm": 0.21997687220573425, "learning_rate": 0.0002, "epoch": 0.6533575317604355, "step": 900}, {"loss": 1.1603, "grad_norm": 0.32593777775764465, "learning_rate": 0.0002, "epoch": 0.6606170598911071, "step": 910}, {"loss": 1.1578, "grad_norm": 0.30347898602485657, "learning_rate": 0.0002, "epoch": 0.6678765880217786, "step": 920}, {"loss": 1.1642, "grad_norm": 0.44173774123191833, "learning_rate": 0.0002, "epoch": 0.6751361161524501, "step": 930}, {"loss": 1.1832, "grad_norm": 0.2507467269897461, "learning_rate": 0.0002, "epoch": 0.6823956442831216, "step": 940}, {"loss": 1.3001, "grad_norm": 0.29463833570480347, "learning_rate": 0.0002, "epoch": 0.6896551724137931, "step": 950}, {"loss": 1.1651, "grad_norm": 0.9363154172897339, "learning_rate": 0.0002, "epoch": 0.6969147005444646, "step": 960}, {"loss": 1.158, "grad_norm": 0.3236212134361267, "learning_rate": 0.0002, "epoch": 0.7041742286751361, "step": 970}, {"loss": 1.1078, "grad_norm": 0.3123254179954529, "learning_rate": 0.0002, "epoch": 0.7114337568058077, "step": 980}, {"loss": 1.1578, "grad_norm": 0.3395805060863495, "learning_rate": 0.0002, "epoch": 0.7186932849364791, "step": 990}, {"loss": 1.0754, "grad_norm": 0.3240964412689209, "learning_rate": 0.0002, "epoch": 0.7259528130671506, "step": 1000}, {"loss": 1.0313, "grad_norm": 0.31902948021888733, "learning_rate": 0.0002, "epoch": 0.7332123411978222, "step": 1010}, {"loss": 1.1664, "grad_norm": 0.4848408102989197, "learning_rate": 0.0002, "epoch": 0.7404718693284936, "step": 1020}, {"loss": 1.0838, "grad_norm": 0.33006033301353455, "learning_rate": 0.0002, "epoch": 0.7477313974591652, "step": 1030}, {"loss": 1.0979, "grad_norm": 0.2928730547428131, "learning_rate": 0.0002, "epoch": 0.7549909255898367, "step": 1040}, {"loss": 1.1351, "grad_norm": 0.3529164791107178, "learning_rate": 0.0002, "epoch": 0.7622504537205081, "step": 1050}, {"loss": 1.1274, "grad_norm": 0.2736213803291321, "learning_rate": 0.0002, "epoch": 0.7695099818511797, "step": 1060}, {"loss": 1.1169, "grad_norm": 0.7200686931610107, "learning_rate": 0.0002, "epoch": 0.7767695099818511, "step": 1070}, {"loss": 1.1171, "grad_norm": 0.33396708965301514, "learning_rate": 0.0002, "epoch": 0.7840290381125227, "step": 1080}, {"loss": 1.2447, "grad_norm": 1.5760449171066284, "learning_rate": 0.0002, "epoch": 0.7912885662431942, "step": 1090}, {"loss": 1.1126, "grad_norm": 0.28138381242752075, "learning_rate": 0.0002, "epoch": 0.7985480943738656, "step": 1100}, {"loss": 1.2012, "grad_norm": 0.2597472369670868, "learning_rate": 0.0002, "epoch": 0.8058076225045372, "step": 1110}, {"loss": 1.1177, "grad_norm": 0.3305445611476898, "learning_rate": 0.0002, "epoch": 0.8130671506352087, "step": 1120}, {"loss": 1.0849, "grad_norm": 0.3934599459171295, "learning_rate": 0.0002, "epoch": 0.8203266787658802, "step": 1130}, {"loss": 1.0889, "grad_norm": 0.3472191393375397, "learning_rate": 0.0002, "epoch": 0.8275862068965517, "step": 1140}, {"loss": 1.265, "grad_norm": 0.2857365906238556, "learning_rate": 0.0002, "epoch": 0.8348457350272233, "step": 1150}, {"loss": 1.03, "grad_norm": 0.3207702934741974, "learning_rate": 0.0002, "epoch": 0.8421052631578947, "step": 1160}, {"loss": 1.1669, "grad_norm": 0.3176484704017639, "learning_rate": 0.0002, "epoch": 0.8493647912885662, "step": 1170}, {"loss": 1.1386, "grad_norm": 0.40685558319091797, "learning_rate": 0.0002, "epoch": 0.8566243194192378, "step": 1180}, {"loss": 1.1383, "grad_norm": 0.31125199794769287, "learning_rate": 0.0002, "epoch": 0.8638838475499092, "step": 1190}, {"loss": 1.1512, "grad_norm": 0.7361181378364563, "learning_rate": 0.0002, "epoch": 0.8711433756805808, "step": 1200}, {"loss": 0.9983, "grad_norm": 0.33699527382850647, "learning_rate": 0.0002, "epoch": 0.8784029038112523, "step": 1210}, {"loss": 1.11, "grad_norm": 0.3315220773220062, "learning_rate": 0.0002, "epoch": 0.8856624319419237, "step": 1220}, {"loss": 1.0266, "grad_norm": 0.6256054043769836, "learning_rate": 0.0002, "epoch": 0.8929219600725953, "step": 1230}, {"loss": 1.1738, "grad_norm": 0.3692137897014618, "learning_rate": 0.0002, "epoch": 0.9001814882032668, "step": 1240}, {"loss": 1.1112, "grad_norm": 0.3538484573364258, "learning_rate": 0.0002, "epoch": 0.9074410163339383, "step": 1250}, {"loss": 1.0878, "grad_norm": 0.27863040566444397, "learning_rate": 0.0002, "epoch": 0.9147005444646098, "step": 1260}, {"loss": 1.0826, "grad_norm": 0.3322528302669525, "learning_rate": 0.0002, "epoch": 0.9219600725952813, "step": 1270}, {"loss": 1.0158, "grad_norm": 0.7553173303604126, "learning_rate": 0.0002, "epoch": 0.9292196007259528, "step": 1280}, {"loss": 1.198, "grad_norm": 0.4856191575527191, "learning_rate": 0.0002, "epoch": 0.9364791288566243, "step": 1290}, {"loss": 1.1678, "grad_norm": 0.3668074905872345, "learning_rate": 0.0002, "epoch": 0.9437386569872959, "step": 1300}, {"loss": 1.1809, "grad_norm": 0.29851067066192627, "learning_rate": 0.0002, "epoch": 0.9509981851179673, "step": 1310}, {"loss": 1.1299, "grad_norm": 0.276664674282074, "learning_rate": 0.0002, "epoch": 0.9582577132486388, "step": 1320}, {"loss": 1.0597, "grad_norm": 0.2941018342971802, "learning_rate": 0.0002, "epoch": 0.9655172413793104, "step": 1330}, {"loss": 1.1731, "grad_norm": 0.3505859076976776, "learning_rate": 0.0002, "epoch": 0.9727767695099818, "step": 1340}, {"loss": 1.1809, "grad_norm": 0.3067687451839447, "learning_rate": 0.0002, "epoch": 0.9800362976406534, "step": 1350}, {"loss": 1.1511, "grad_norm": 0.27151066064834595, "learning_rate": 0.0002, "epoch": 0.9872958257713249, "step": 1360}, {"loss": 1.1919, "grad_norm": 0.36370083689689636, "learning_rate": 0.0002, "epoch": 0.9945553539019963, "step": 1370}, {"eval_loss": 1.1381088495254517, "eval_runtime": 96.0848, "eval_samples_per_second": 4.538, "eval_steps_per_second": 0.572, "epoch": 0.9996370235934664, "step": 1377}, {"loss": 1.0082, "grad_norm": 0.27980583906173706, "learning_rate": 0.0002, "epoch": 1.0018148820326678, "step": 1380}, {"loss": 1.0565, "grad_norm": 0.26713913679122925, "learning_rate": 0.0002, "epoch": 1.0090744101633393, "step": 1390}, {"loss": 1.0241, "grad_norm": 0.3089541494846344, "learning_rate": 0.0002, "epoch": 1.0163339382940109, "step": 1400}, {"loss": 0.9649, "grad_norm": 0.4188242256641388, "learning_rate": 0.0002, "epoch": 1.0235934664246824, "step": 1410}, {"loss": 0.9821, "grad_norm": 0.5246463418006897, "learning_rate": 0.0002, "epoch": 1.030852994555354, "step": 1420}, {"loss": 1.0143, "grad_norm": 0.2728777825832367, "learning_rate": 0.0002, "epoch": 1.0381125226860255, "step": 1430}, {"loss": 1.075, "grad_norm": 0.38167616724967957, "learning_rate": 0.0002, "epoch": 1.0453720508166968, "step": 1440}, {"loss": 1.0229, "grad_norm": 0.4439380168914795, "learning_rate": 0.0002, "epoch": 1.0526315789473684, "step": 1450}, {"loss": 1.0451, "grad_norm": 0.30954182147979736, "learning_rate": 0.0002, "epoch": 1.05989110707804, "step": 1460}, {"loss": 1.0762, "grad_norm": 0.4022280275821686, "learning_rate": 0.0002, "epoch": 1.0671506352087115, "step": 1470}, {"loss": 1.0754, "grad_norm": 0.7390811443328857, "learning_rate": 0.0002, "epoch": 1.074410163339383, "step": 1480}, {"loss": 1.1378, "grad_norm": 0.3885486423969269, "learning_rate": 0.0002, "epoch": 1.0816696914700545, "step": 1490}, {"loss": 1.005, "grad_norm": 0.5275560617446899, "learning_rate": 0.0002, "epoch": 1.0889292196007259, "step": 1500}, {"loss": 1.0147, "grad_norm": 0.35112282633781433, "learning_rate": 0.0002, "epoch": 1.0961887477313974, "step": 1510}, {"loss": 1.0331, "grad_norm": 0.33714351058006287, "learning_rate": 0.0002, "epoch": 1.103448275862069, "step": 1520}, {"loss": 0.8723, "grad_norm": 0.31221693754196167, "learning_rate": 0.0002, "epoch": 1.1107078039927405, "step": 1530}, {"loss": 1.0701, "grad_norm": 0.27549654245376587, "learning_rate": 0.0002, "epoch": 1.117967332123412, "step": 1540}, {"loss": 1.0114, "grad_norm": 0.8465521335601807, "learning_rate": 0.0002, "epoch": 1.1252268602540836, "step": 1550}, {"loss": 0.9643, "grad_norm": 0.36125949025154114, "learning_rate": 0.0002, "epoch": 1.132486388384755, "step": 1560}, {"loss": 1.0744, "grad_norm": 0.37420371174812317, "learning_rate": 0.0002, "epoch": 1.1397459165154264, "step": 1570}, {"loss": 1.165, "grad_norm": 0.3294760584831238, "learning_rate": 0.0002, "epoch": 1.147005444646098, "step": 1580}, {"loss": 0.9207, "grad_norm": 0.3881238102912903, "learning_rate": 0.0002, "epoch": 1.1542649727767695, "step": 1590}, {"loss": 1.0983, "grad_norm": 0.4766491651535034, "learning_rate": 0.0002, "epoch": 1.161524500907441, "step": 1600}, {"loss": 1.0092, "grad_norm": 0.395530104637146, "learning_rate": 0.0002, "epoch": 1.1687840290381124, "step": 1610}, {"loss": 0.9895, "grad_norm": 0.3297106623649597, "learning_rate": 0.0002, "epoch": 1.176043557168784, "step": 1620}, {"loss": 1.1121, "grad_norm": 0.39528271555900574, "learning_rate": 0.0002, "epoch": 1.1833030852994555, "step": 1630}, {"loss": 0.9202, "grad_norm": 0.3370221257209778, "learning_rate": 0.0002, "epoch": 1.190562613430127, "step": 1640}, {"loss": 1.102, "grad_norm": 0.31922030448913574, "learning_rate": 0.0002, "epoch": 1.1978221415607986, "step": 1650}, {"loss": 1.0565, "grad_norm": 0.6142027378082275, "learning_rate": 0.0002, "epoch": 1.2050816696914701, "step": 1660}, {"loss": 0.9541, "grad_norm": 0.44769710302352905, "learning_rate": 0.0002, "epoch": 1.2123411978221417, "step": 1670}, {"loss": 1.2501, "grad_norm": 0.41233646869659424, "learning_rate": 0.0002, "epoch": 1.219600725952813, "step": 1680}, {"loss": 1.0147, "grad_norm": 0.2928866147994995, "learning_rate": 0.0002, "epoch": 1.2268602540834845, "step": 1690}, {"loss": 0.9074, "grad_norm": 0.36913734674453735, "learning_rate": 0.0002, "epoch": 1.234119782214156, "step": 1700}, {"loss": 1.0847, "grad_norm": 0.5281891226768494, "learning_rate": 0.0002, "epoch": 1.2413793103448276, "step": 1710}, {"loss": 0.9482, "grad_norm": 0.3374697268009186, "learning_rate": 0.0002, "epoch": 1.2486388384754992, "step": 1720}, {"loss": 0.9826, "grad_norm": 0.3802020847797394, "learning_rate": 0.0002, "epoch": 1.2558983666061705, "step": 1730}, {"loss": 1.0191, "grad_norm": 0.38048651814460754, "learning_rate": 0.0002, "epoch": 1.263157894736842, "step": 1740}, {"loss": 1.0142, "grad_norm": 0.6676169633865356, "learning_rate": 0.0002, "epoch": 1.2704174228675136, "step": 1750}, {"loss": 0.992, "grad_norm": 0.4075961410999298, "learning_rate": 0.0002, "epoch": 1.2776769509981851, "step": 1760}, {"loss": 1.0301, "grad_norm": 0.4374721348285675, "learning_rate": 0.0002, "epoch": 1.2849364791288567, "step": 1770}, {"loss": 1.0601, "grad_norm": 0.4638824164867401, "learning_rate": 0.0002, "epoch": 1.2921960072595282, "step": 1780}, {"loss": 0.9593, "grad_norm": 0.38631564378738403, "learning_rate": 0.0002, "epoch": 1.2994555353901998, "step": 1790}, {"loss": 1.0042, "grad_norm": 0.35873809456825256, "learning_rate": 0.0002, "epoch": 1.306715063520871, "step": 1800}, {"loss": 0.9835, "grad_norm": 0.33602237701416016, "learning_rate": 0.0002, "epoch": 1.3139745916515426, "step": 1810}, {"loss": 1.1032, "grad_norm": 0.46696463227272034, "learning_rate": 0.0002, "epoch": 1.3212341197822142, "step": 1820}, {"loss": 1.0123, "grad_norm": 0.368958979845047, "learning_rate": 0.0002, "epoch": 1.3284936479128857, "step": 1830}, {"loss": 1.0456, "grad_norm": 0.3160957396030426, "learning_rate": 0.0002, "epoch": 1.335753176043557, "step": 1840}, {"loss": 1.0264, "grad_norm": 0.4511511027812958, "learning_rate": 0.0002, "epoch": 1.3430127041742286, "step": 1850}, {"loss": 1.0179, "grad_norm": 0.6769845485687256, "learning_rate": 0.0002, "epoch": 1.3502722323049001, "step": 1860}, {"loss": 1.011, "grad_norm": 0.3749309480190277, "learning_rate": 0.0002, "epoch": 1.3575317604355717, "step": 1870}, {"loss": 1.0223, "grad_norm": 0.553798496723175, "learning_rate": 0.0002, "epoch": 1.3647912885662432, "step": 1880}, {"loss": 0.9634, "grad_norm": 0.3538985550403595, "learning_rate": 0.0002, "epoch": 1.3720508166969148, "step": 1890}, {"loss": 1.0275, "grad_norm": 0.5501534938812256, "learning_rate": 0.0002, "epoch": 1.3793103448275863, "step": 1900}, {"loss": 1.0461, "grad_norm": 0.4432051181793213, "learning_rate": 0.0002, "epoch": 1.3865698729582578, "step": 1910}, {"loss": 0.9082, "grad_norm": 0.41755786538124084, "learning_rate": 0.0002, "epoch": 1.3938294010889292, "step": 1920}, {"loss": 1.0858, "grad_norm": 0.5626114010810852, "learning_rate": 0.0002, "epoch": 1.4010889292196007, "step": 1930}, {"loss": 1.0687, "grad_norm": 0.44038185477256775, "learning_rate": 0.0002, "epoch": 1.4083484573502723, "step": 1940}, {"loss": 0.9454, "grad_norm": 0.3397001624107361, "learning_rate": 0.0002, "epoch": 1.4156079854809438, "step": 1950}, {"loss": 0.943, "grad_norm": 0.4325368106365204, "learning_rate": 0.0002, "epoch": 1.4228675136116151, "step": 1960}, {"loss": 1.0488, "grad_norm": 0.3900907039642334, "learning_rate": 0.0002, "epoch": 1.4301270417422867, "step": 1970}, {"loss": 0.9699, "grad_norm": 0.369612455368042, "learning_rate": 0.0002, "epoch": 1.4373865698729582, "step": 1980}, {"loss": 1.0609, "grad_norm": 0.4389338791370392, "learning_rate": 0.0002, "epoch": 1.4446460980036298, "step": 1990}, {"loss": 1.042, "grad_norm": 1.694450855255127, "learning_rate": 0.0002, "epoch": 1.4519056261343013, "step": 2000}, {"loss": 0.997, "grad_norm": 0.516957700252533, "learning_rate": 0.0002, "epoch": 1.4591651542649728, "step": 2010}, {"loss": 1.0979, "grad_norm": 0.45515501499176025, "learning_rate": 0.0002, "epoch": 1.4664246823956444, "step": 2020}, {"loss": 1.0188, "grad_norm": 0.4153139591217041, "learning_rate": 0.0002, "epoch": 1.4736842105263157, "step": 2030}, {"loss": 1.1521, "grad_norm": 0.44353052973747253, "learning_rate": 0.0002, "epoch": 1.4809437386569873, "step": 2040}, {"loss": 0.9653, "grad_norm": 0.570554256439209, "learning_rate": 0.0002, "epoch": 1.4882032667876588, "step": 2050}, {"loss": 1.0352, "grad_norm": 0.5742740035057068, "learning_rate": 0.0002, "epoch": 1.4954627949183303, "step": 2060}, {"loss": 1.0963, "grad_norm": 0.5890517830848694, "learning_rate": 0.0002, "epoch": 1.5027223230490017, "step": 2070}, {"loss": 1.0639, "grad_norm": 0.4162650406360626, "learning_rate": 0.0002, "epoch": 1.5099818511796732, "step": 2080}, {"loss": 1.071, "grad_norm": 0.7334717512130737, "learning_rate": 0.0002, "epoch": 1.5172413793103448, "step": 2090}, {"loss": 0.8957, "grad_norm": 0.2963249683380127, "learning_rate": 0.0002, "epoch": 1.5245009074410163, "step": 2100}, {"loss": 1.0446, "grad_norm": 0.30676454305648804, "learning_rate": 0.0002, "epoch": 1.5317604355716878, "step": 2110}, {"loss": 0.9967, "grad_norm": 0.35984641313552856, "learning_rate": 0.0002, "epoch": 1.5390199637023594, "step": 2120}, {"loss": 1.0352, "grad_norm": 0.3384549617767334, "learning_rate": 0.0002, "epoch": 1.546279491833031, "step": 2130}, {"loss": 0.9874, "grad_norm": 0.4725518226623535, "learning_rate": 0.0002, "epoch": 1.5535390199637025, "step": 2140}, {"loss": 1.1166, "grad_norm": 0.3252685070037842, "learning_rate": 0.0002, "epoch": 1.560798548094374, "step": 2150}, {"loss": 0.9127, "grad_norm": 0.45043081045150757, "learning_rate": 0.0002, "epoch": 1.5680580762250453, "step": 2160}, {"loss": 1.0767, "grad_norm": 0.374208003282547, "learning_rate": 0.0002, "epoch": 1.5753176043557169, "step": 2170}, {"loss": 1.0255, "grad_norm": 0.5118404030799866, "learning_rate": 0.0002, "epoch": 1.5825771324863884, "step": 2180}, {"loss": 1.0062, "grad_norm": 0.482170969247818, "learning_rate": 0.0002, "epoch": 1.5898366606170597, "step": 2190}, {"loss": 1.0872, "grad_norm": 0.5337533950805664, "learning_rate": 0.0002, "epoch": 1.5970961887477313, "step": 2200}, {"loss": 1.0405, "grad_norm": 0.5195064544677734, "learning_rate": 0.0002, "epoch": 1.6043557168784028, "step": 2210}, {"loss": 1.0454, "grad_norm": 0.30807098746299744, "learning_rate": 0.0002, "epoch": 1.6116152450090744, "step": 2220}, {"loss": 1.0293, "grad_norm": 0.3962925672531128, "learning_rate": 0.0002, "epoch": 1.618874773139746, "step": 2230}, {"loss": 1.0137, "grad_norm": 0.7636962532997131, "learning_rate": 0.0002, "epoch": 1.6261343012704175, "step": 2240}, {"loss": 0.991, "grad_norm": 0.32380592823028564, "learning_rate": 0.0002, "epoch": 1.633393829401089, "step": 2250}, {"loss": 0.9471, "grad_norm": 0.5767741799354553, "learning_rate": 0.0002, "epoch": 1.6406533575317606, "step": 2260}, {"loss": 0.9524, "grad_norm": 0.39964812994003296, "learning_rate": 0.0002, "epoch": 1.647912885662432, "step": 2270}, {"loss": 0.9866, "grad_norm": 0.622629702091217, "learning_rate": 0.0002, "epoch": 1.6551724137931034, "step": 2280}, {"loss": 0.9609, "grad_norm": 0.40202152729034424, "learning_rate": 0.0002, "epoch": 1.662431941923775, "step": 2290}, {"loss": 0.9859, "grad_norm": 0.4467349052429199, "learning_rate": 0.0002, "epoch": 1.6696914700544465, "step": 2300}, {"loss": 1.0312, "grad_norm": 0.5026949048042297, "learning_rate": 0.0002, "epoch": 1.6769509981851178, "step": 2310}, {"loss": 1.0776, "grad_norm": 0.43754541873931885, "learning_rate": 0.0002, "epoch": 1.6842105263157894, "step": 2320}, {"loss": 0.9786, "grad_norm": 0.42869430780410767, "learning_rate": 0.0002, "epoch": 1.691470054446461, "step": 2330}, {"loss": 1.0747, "grad_norm": 0.4192679524421692, "learning_rate": 0.0002, "epoch": 1.6987295825771325, "step": 2340}, {"loss": 1.0054, "grad_norm": 0.3243155777454376, "learning_rate": 0.0002, "epoch": 1.705989110707804, "step": 2350}, {"loss": 0.9165, "grad_norm": 1.0514075756072998, "learning_rate": 0.0002, "epoch": 1.7132486388384756, "step": 2360}, {"loss": 1.1353, "grad_norm": 0.4719122648239136, "learning_rate": 0.0002, "epoch": 1.720508166969147, "step": 2370}, {"loss": 0.9913, "grad_norm": 0.3846144676208496, "learning_rate": 0.0002, "epoch": 1.7277676950998186, "step": 2380}, {"loss": 0.9935, "grad_norm": 0.7266581058502197, "learning_rate": 0.0002, "epoch": 1.73502722323049, "step": 2390}, {"loss": 0.9509, "grad_norm": 0.6183241605758667, "learning_rate": 0.0002, "epoch": 1.7422867513611615, "step": 2400}, {"loss": 0.9713, "grad_norm": 0.3658260405063629, "learning_rate": 0.0002, "epoch": 1.749546279491833, "step": 2410}, {"loss": 1.0033, "grad_norm": 0.6036322712898254, "learning_rate": 0.0002, "epoch": 1.7568058076225044, "step": 2420}, {"loss": 1.0526, "grad_norm": 0.7872936129570007, "learning_rate": 0.0002, "epoch": 1.764065335753176, "step": 2430}, {"loss": 0.99, "grad_norm": 0.35946124792099, "learning_rate": 0.0002, "epoch": 1.7713248638838475, "step": 2440}, {"loss": 0.8845, "grad_norm": 0.3740338981151581, "learning_rate": 0.0002, "epoch": 1.778584392014519, "step": 2450}, {"loss": 0.9333, "grad_norm": 0.6150230169296265, "learning_rate": 0.0002, "epoch": 1.7858439201451906, "step": 2460}, {"loss": 1.0373, "grad_norm": 0.4726075530052185, "learning_rate": 0.0002, "epoch": 1.793103448275862, "step": 2470}, {"loss": 0.974, "grad_norm": 0.31292253732681274, "learning_rate": 0.0002, "epoch": 1.8003629764065336, "step": 2480}, {"loss": 1.0205, "grad_norm": 0.4463104009628296, "learning_rate": 0.0002, "epoch": 1.8076225045372052, "step": 2490}, {"loss": 1.0771, "grad_norm": 0.7848200798034668, "learning_rate": 0.0002, "epoch": 1.8148820326678767, "step": 2500}, {"loss": 0.9508, "grad_norm": 0.5562082529067993, "learning_rate": 0.0002, "epoch": 1.822141560798548, "step": 2510}, {"loss": 0.9343, "grad_norm": 0.39892756938934326, "learning_rate": 0.0002, "epoch": 1.8294010889292196, "step": 2520}, {"loss": 1.0489, "grad_norm": 1.2923320531845093, "learning_rate": 0.0002, "epoch": 1.8366606170598911, "step": 2530}, {"loss": 1.1203, "grad_norm": 0.6316490769386292, "learning_rate": 0.0002, "epoch": 1.8439201451905625, "step": 2540}, {"loss": 0.9131, "grad_norm": 0.46100497245788574, "learning_rate": 0.0002, "epoch": 1.851179673321234, "step": 2550}, {"loss": 1.0977, "grad_norm": 0.7902987003326416, "learning_rate": 0.0002, "epoch": 1.8584392014519056, "step": 2560}, {"loss": 0.9702, "grad_norm": 0.4596365690231323, "learning_rate": 0.0002, "epoch": 1.865698729582577, "step": 2570}, {"loss": 0.9322, "grad_norm": 0.6592172384262085, "learning_rate": 0.0002, "epoch": 1.8729582577132486, "step": 2580}, {"loss": 0.9549, "grad_norm": 0.5764662027359009, "learning_rate": 0.0002, "epoch": 1.8802177858439202, "step": 2590}, {"loss": 0.9485, "grad_norm": 0.8421637415885925, "learning_rate": 0.0002, "epoch": 1.8874773139745917, "step": 2600}, {"loss": 1.012, "grad_norm": 0.5635305047035217, "learning_rate": 0.0002, "epoch": 1.8947368421052633, "step": 2610}, {"loss": 0.8907, "grad_norm": 0.46231237053871155, "learning_rate": 0.0002, "epoch": 1.9019963702359348, "step": 2620}, {"loss": 0.9543, "grad_norm": 0.3944607079029083, "learning_rate": 0.0002, "epoch": 1.9092558983666061, "step": 2630}, {"loss": 0.9964, "grad_norm": 0.4753907322883606, "learning_rate": 0.0002, "epoch": 1.9165154264972777, "step": 2640}, {"loss": 1.0217, "grad_norm": 0.4151090979576111, "learning_rate": 0.0002, "epoch": 1.9237749546279492, "step": 2650}, {"loss": 1.0397, "grad_norm": 0.6793725490570068, "learning_rate": 0.0002, "epoch": 1.9310344827586206, "step": 2660}, {"loss": 1.0401, "grad_norm": 0.339755117893219, "learning_rate": 0.0002, "epoch": 1.938294010889292, "step": 2670}, {"loss": 1.082, "grad_norm": 0.40789374709129333, "learning_rate": 0.0002, "epoch": 1.9455535390199636, "step": 2680}, {"loss": 0.9973, "grad_norm": 0.3750005066394806, "learning_rate": 0.0002, "epoch": 1.9528130671506352, "step": 2690}, {"loss": 0.9837, "grad_norm": 0.39684441685676575, "learning_rate": 0.0002, "epoch": 1.9600725952813067, "step": 2700}, {"loss": 1.1204, "grad_norm": 0.378287672996521, "learning_rate": 0.0002, "epoch": 1.9673321234119783, "step": 2710}, {"loss": 0.9284, "grad_norm": 0.3668482005596161, "learning_rate": 0.0002, "epoch": 1.9745916515426498, "step": 2720}, {"loss": 1.0512, "grad_norm": 0.49997565150260925, "learning_rate": 0.0002, "epoch": 1.9818511796733214, "step": 2730}, {"loss": 0.9311, "grad_norm": 0.36852124333381653, "learning_rate": 0.0002, "epoch": 1.989110707803993, "step": 2740}, {"loss": 1.0898, "grad_norm": 0.5203380584716797, "learning_rate": 0.0002, "epoch": 1.9963702359346642, "step": 2750}, {"eval_loss": 1.1086540222167969, "eval_runtime": 95.6866, "eval_samples_per_second": 4.557, "eval_steps_per_second": 0.575, "epoch": 2.0, "step": 2755}, {"loss": 0.9791, "grad_norm": 0.35921573638916016, "learning_rate": 0.0002, "epoch": 2.0036297640653356, "step": 2760}, {"loss": 0.8968, "grad_norm": 1.013861894607544, "learning_rate": 0.0002, "epoch": 2.010889292196007, "step": 2770}, {"loss": 0.7873, "grad_norm": 0.4425240159034729, "learning_rate": 0.0002, "epoch": 2.0181488203266786, "step": 2780}, {"loss": 0.8758, "grad_norm": 0.60174161195755, "learning_rate": 0.0002, "epoch": 2.02540834845735, "step": 2790}, {"loss": 0.7989, "grad_norm": 0.47582098841667175, "learning_rate": 0.0002, "epoch": 2.0326678765880217, "step": 2800}, {"loss": 0.8898, "grad_norm": 0.6012811660766602, "learning_rate": 0.0002, "epoch": 2.0399274047186933, "step": 2810}, {"loss": 0.8109, "grad_norm": 0.4444408118724823, "learning_rate": 0.0002, "epoch": 2.047186932849365, "step": 2820}, {"loss": 0.8621, "grad_norm": 0.4864003360271454, "learning_rate": 0.0002, "epoch": 2.0544464609800364, "step": 2830}, {"loss": 0.8817, "grad_norm": 0.5104215741157532, "learning_rate": 0.0002, "epoch": 2.061705989110708, "step": 2840}, {"loss": 0.7789, "grad_norm": 0.6218489408493042, "learning_rate": 0.0002, "epoch": 2.0689655172413794, "step": 2850}, {"loss": 0.8778, "grad_norm": 0.705784261226654, "learning_rate": 0.0002, "epoch": 2.076225045372051, "step": 2860}, {"loss": 0.7851, "grad_norm": 0.48091503977775574, "learning_rate": 0.0002, "epoch": 2.0834845735027225, "step": 2870}, {"loss": 0.8444, "grad_norm": 0.5062456727027893, "learning_rate": 0.0002, "epoch": 2.0907441016333936, "step": 2880}, {"loss": 0.7898, "grad_norm": 0.4862022399902344, "learning_rate": 0.0002, "epoch": 2.098003629764065, "step": 2890}, {"loss": 0.8593, "grad_norm": 0.48264044523239136, "learning_rate": 0.0002, "epoch": 2.1052631578947367, "step": 2900}, {"loss": 0.8325, "grad_norm": 0.43744346499443054, "learning_rate": 0.0002, "epoch": 2.1125226860254083, "step": 2910}, {"loss": 0.9099, "grad_norm": 0.5480492115020752, "learning_rate": 0.0002, "epoch": 2.11978221415608, "step": 2920}, {"loss": 0.7727, "grad_norm": 0.5068560838699341, "learning_rate": 0.0002, "epoch": 2.1270417422867514, "step": 2930}, {"loss": 0.8403, "grad_norm": 0.4650852680206299, "learning_rate": 0.0002, "epoch": 2.134301270417423, "step": 2940}, {"loss": 0.8467, "grad_norm": 0.4929981231689453, "learning_rate": 0.0002, "epoch": 2.1415607985480944, "step": 2950}, {"loss": 0.8747, "grad_norm": 0.6537389755249023, "learning_rate": 0.0002, "epoch": 2.148820326678766, "step": 2960}, {"loss": 0.7663, "grad_norm": 0.8032940626144409, "learning_rate": 0.0002, "epoch": 2.1560798548094375, "step": 2970}, {"loss": 0.7604, "grad_norm": 0.7131643891334534, "learning_rate": 0.0002, "epoch": 2.163339382940109, "step": 2980}, {"loss": 0.8424, "grad_norm": 0.6034275889396667, "learning_rate": 0.0002, "epoch": 2.1705989110707806, "step": 2990}, {"loss": 0.8838, "grad_norm": 0.6081095933914185, "learning_rate": 0.0002, "epoch": 2.1778584392014517, "step": 3000}, {"loss": 0.9013, "grad_norm": 0.5706912875175476, "learning_rate": 0.0002, "epoch": 2.1851179673321233, "step": 3010}, {"loss": 0.8473, "grad_norm": 0.6742380261421204, "learning_rate": 0.0002, "epoch": 2.192377495462795, "step": 3020}, {"loss": 0.8372, "grad_norm": 0.4847496449947357, "learning_rate": 0.0002, "epoch": 2.1996370235934664, "step": 3030}, {"loss": 0.8487, "grad_norm": 0.5529342889785767, "learning_rate": 0.0002, "epoch": 2.206896551724138, "step": 3040}, {"loss": 0.9057, "grad_norm": 0.6108783483505249, "learning_rate": 0.0002, "epoch": 2.2141560798548094, "step": 3050}, {"loss": 0.9659, "grad_norm": 0.8841571807861328, "learning_rate": 0.0002, "epoch": 2.221415607985481, "step": 3060}, {"loss": 0.8538, "grad_norm": 0.4227530360221863, "learning_rate": 0.0002, "epoch": 2.2286751361161525, "step": 3070}, {"loss": 0.7925, "grad_norm": 0.461935818195343, "learning_rate": 0.0002, "epoch": 2.235934664246824, "step": 3080}, {"loss": 0.8542, "grad_norm": 0.5407412648200989, "learning_rate": 0.0002, "epoch": 2.2431941923774956, "step": 3090}, {"loss": 0.8392, "grad_norm": 0.5057998895645142, "learning_rate": 0.0002, "epoch": 2.250453720508167, "step": 3100}, {"loss": 0.7898, "grad_norm": 0.530057966709137, "learning_rate": 0.0002, "epoch": 2.2577132486388383, "step": 3110}, {"loss": 0.8507, "grad_norm": 0.5066842436790466, "learning_rate": 0.0002, "epoch": 2.26497277676951, "step": 3120}, {"loss": 0.7903, "grad_norm": 0.5069178342819214, "learning_rate": 0.0002, "epoch": 2.2722323049001814, "step": 3130}, {"loss": 0.8458, "grad_norm": 0.6095499396324158, "learning_rate": 0.0002, "epoch": 2.279491833030853, "step": 3140}, {"loss": 0.8171, "grad_norm": 0.49524766206741333, "learning_rate": 0.0002, "epoch": 2.2867513611615244, "step": 3150}, {"loss": 0.7568, "grad_norm": 0.5334409475326538, "learning_rate": 0.0002, "epoch": 2.294010889292196, "step": 3160}, {"loss": 0.7533, "grad_norm": 1.681748867034912, "learning_rate": 0.0002, "epoch": 2.3012704174228675, "step": 3170}, {"loss": 0.8155, "grad_norm": 0.7225565314292908, "learning_rate": 0.0002, "epoch": 2.308529945553539, "step": 3180}, {"loss": 0.8034, "grad_norm": 0.5379388928413391, "learning_rate": 0.0002, "epoch": 2.3157894736842106, "step": 3190}, {"loss": 0.8833, "grad_norm": 0.45770326256752014, "learning_rate": 0.0002, "epoch": 2.323049001814882, "step": 3200}, {"loss": 0.8419, "grad_norm": 0.6984533071517944, "learning_rate": 0.0002, "epoch": 2.3303085299455537, "step": 3210}, {"loss": 0.7414, "grad_norm": 0.6725744605064392, "learning_rate": 0.0002, "epoch": 2.337568058076225, "step": 3220}, {"loss": 0.8657, "grad_norm": 1.1247471570968628, "learning_rate": 0.0002, "epoch": 2.344827586206897, "step": 3230}, {"loss": 0.7782, "grad_norm": 1.0240263938903809, "learning_rate": 0.0002, "epoch": 2.352087114337568, "step": 3240}, {"loss": 0.826, "grad_norm": 0.5608096122741699, "learning_rate": 0.0002, "epoch": 2.3593466424682394, "step": 3250}, {"loss": 0.8191, "grad_norm": 0.8294990062713623, "learning_rate": 0.0002, "epoch": 2.366606170598911, "step": 3260}, {"loss": 0.9845, "grad_norm": 0.6734224557876587, "learning_rate": 0.0002, "epoch": 2.3738656987295825, "step": 3270}, {"loss": 0.7921, "grad_norm": 0.6862800717353821, "learning_rate": 0.0002, "epoch": 2.381125226860254, "step": 3280}, {"loss": 0.7752, "grad_norm": 0.5442930459976196, "learning_rate": 0.0002, "epoch": 2.3883847549909256, "step": 3290}, {"loss": 0.8543, "grad_norm": 0.5745394229888916, "learning_rate": 0.0002, "epoch": 2.395644283121597, "step": 3300}, {"loss": 0.833, "grad_norm": 0.6257799863815308, "learning_rate": 0.0002, "epoch": 2.4029038112522687, "step": 3310}, {"loss": 0.8191, "grad_norm": 0.5608420968055725, "learning_rate": 0.0002, "epoch": 2.4101633393829403, "step": 3320}, {"loss": 0.8745, "grad_norm": 0.5512017011642456, "learning_rate": 0.0002, "epoch": 2.417422867513612, "step": 3330}, {"loss": 0.8642, "grad_norm": 0.7452999353408813, "learning_rate": 0.0002, "epoch": 2.4246823956442833, "step": 3340}, {"loss": 0.8417, "grad_norm": 0.4604301452636719, "learning_rate": 0.0002, "epoch": 2.4319419237749544, "step": 3350}, {"loss": 0.9148, "grad_norm": 0.8225823640823364, "learning_rate": 0.0002, "epoch": 2.439201451905626, "step": 3360}, {"loss": 0.8144, "grad_norm": 0.8604981303215027, "learning_rate": 0.0002, "epoch": 2.4464609800362975, "step": 3370}, {"loss": 0.9276, "grad_norm": 0.6620925664901733, "learning_rate": 0.0002, "epoch": 2.453720508166969, "step": 3380}, {"loss": 0.8381, "grad_norm": 0.4750158488750458, "learning_rate": 0.0002, "epoch": 2.4609800362976406, "step": 3390}, {"loss": 0.8249, "grad_norm": 0.6061418056488037, "learning_rate": 0.0002, "epoch": 2.468239564428312, "step": 3400}, {"loss": 0.8944, "grad_norm": 0.5977247357368469, "learning_rate": 0.0002, "epoch": 2.4754990925589837, "step": 3410}, {"loss": 0.8366, "grad_norm": 0.5004227757453918, "learning_rate": 0.0002, "epoch": 2.4827586206896552, "step": 3420}, {"loss": 0.9133, "grad_norm": 0.46428972482681274, "learning_rate": 0.0002, "epoch": 2.490018148820327, "step": 3430}, {"loss": 0.8534, "grad_norm": 0.7658395171165466, "learning_rate": 0.0002, "epoch": 2.4972776769509983, "step": 3440}, {"loss": 0.9066, "grad_norm": 0.621819794178009, "learning_rate": 0.0002, "epoch": 2.50453720508167, "step": 3450}, {"loss": 0.8481, "grad_norm": 0.4556088149547577, "learning_rate": 0.0002, "epoch": 2.511796733212341, "step": 3460}, {"loss": 0.8522, "grad_norm": 0.6124140024185181, "learning_rate": 0.0002, "epoch": 2.519056261343013, "step": 3470}, {"loss": 0.8158, "grad_norm": 0.6256231665611267, "learning_rate": 0.0002, "epoch": 2.526315789473684, "step": 3480}, {"loss": 0.9097, "grad_norm": 0.5464141964912415, "learning_rate": 0.0002, "epoch": 2.5335753176043556, "step": 3490}, {"loss": 0.8708, "grad_norm": 0.51471346616745, "learning_rate": 0.0002, "epoch": 2.540834845735027, "step": 3500}, {"loss": 0.9042, "grad_norm": 0.5326165556907654, "learning_rate": 0.0002, "epoch": 2.5480943738656987, "step": 3510}, {"loss": 0.8749, "grad_norm": 0.4750378429889679, "learning_rate": 0.0002, "epoch": 2.5553539019963702, "step": 3520}, {"loss": 0.8248, "grad_norm": 0.5292279124259949, "learning_rate": 0.0002, "epoch": 2.562613430127042, "step": 3530}, {"loss": 0.8808, "grad_norm": 0.6145227551460266, "learning_rate": 0.0002, "epoch": 2.5698729582577133, "step": 3540}, {"loss": 0.846, "grad_norm": 0.8275189399719238, "learning_rate": 0.0002, "epoch": 2.577132486388385, "step": 3550}, {"loss": 0.875, "grad_norm": 0.5037438273429871, "learning_rate": 0.0002, "epoch": 2.5843920145190564, "step": 3560}, {"loss": 0.8324, "grad_norm": 0.5838707685470581, "learning_rate": 0.0002, "epoch": 2.5916515426497275, "step": 3570}, {"loss": 0.8716, "grad_norm": 0.5398710370063782, "learning_rate": 0.0002, "epoch": 2.5989110707803995, "step": 3580}, {"loss": 0.7843, "grad_norm": 0.6115376949310303, "learning_rate": 0.0002, "epoch": 2.6061705989110706, "step": 3590}, {"loss": 0.8142, "grad_norm": 1.118809461593628, "learning_rate": 0.0002, "epoch": 2.613430127041742, "step": 3600}, {"loss": 0.8282, "grad_norm": 0.7811771631240845, "learning_rate": 0.0002, "epoch": 2.6206896551724137, "step": 3610}, {"loss": 0.7997, "grad_norm": 0.5753175020217896, "learning_rate": 0.0002, "epoch": 2.6279491833030852, "step": 3620}, {"loss": 0.7705, "grad_norm": 0.550829291343689, "learning_rate": 0.0002, "epoch": 2.635208711433757, "step": 3630}, {"loss": 0.8564, "grad_norm": 0.5360019207000732, "learning_rate": 0.0002, "epoch": 2.6424682395644283, "step": 3640}, {"loss": 0.7648, "grad_norm": 0.63050377368927, "learning_rate": 0.0002, "epoch": 2.6497277676951, "step": 3650}, {"loss": 0.9156, "grad_norm": 0.5833110213279724, "learning_rate": 0.0002, "epoch": 2.6569872958257714, "step": 3660}, {"loss": 0.8033, "grad_norm": 0.5543047189712524, "learning_rate": 0.0002, "epoch": 2.664246823956443, "step": 3670}, {"loss": 0.8867, "grad_norm": 0.6842212080955505, "learning_rate": 0.0002, "epoch": 2.671506352087114, "step": 3680}, {"loss": 0.8569, "grad_norm": 0.6859333515167236, "learning_rate": 0.0002, "epoch": 2.678765880217786, "step": 3690}, {"loss": 0.8081, "grad_norm": 0.7038410902023315, "learning_rate": 0.0002, "epoch": 2.686025408348457, "step": 3700}, {"loss": 0.8686, "grad_norm": 0.447233647108078, "learning_rate": 0.0002, "epoch": 2.6932849364791287, "step": 3710}, {"loss": 0.8951, "grad_norm": 0.6453872919082642, "learning_rate": 0.0002, "epoch": 2.7005444646098002, "step": 3720}, {"loss": 0.8292, "grad_norm": 0.8025672435760498, "learning_rate": 0.0002, "epoch": 2.707803992740472, "step": 3730}, {"loss": 0.8488, "grad_norm": 0.5997087955474854, "learning_rate": 0.0002, "epoch": 2.7150635208711433, "step": 3740}, {"loss": 0.8308, "grad_norm": 0.6901142001152039, "learning_rate": 0.0002, "epoch": 2.722323049001815, "step": 3750}, {"loss": 0.8517, "grad_norm": 1.036145567893982, "learning_rate": 0.0002, "epoch": 2.7295825771324864, "step": 3760}, {"loss": 0.8455, "grad_norm": 0.7207072377204895, "learning_rate": 0.0002, "epoch": 2.736842105263158, "step": 3770}, {"loss": 0.8009, "grad_norm": 1.0452989339828491, "learning_rate": 0.0002, "epoch": 2.7441016333938295, "step": 3780}, {"loss": 0.8868, "grad_norm": 0.5615278482437134, "learning_rate": 0.0002, "epoch": 2.751361161524501, "step": 3790}, {"loss": 0.9047, "grad_norm": 0.46439215540885925, "learning_rate": 0.0002, "epoch": 2.7586206896551726, "step": 3800}, {"loss": 0.7847, "grad_norm": 0.7134917974472046, "learning_rate": 0.0002, "epoch": 2.7658802177858437, "step": 3810}, {"loss": 0.8315, "grad_norm": 0.5139115452766418, "learning_rate": 0.0002, "epoch": 2.7731397459165157, "step": 3820}, {"loss": 0.8995, "grad_norm": 0.8595781326293945, "learning_rate": 0.0002, "epoch": 2.780399274047187, "step": 3830}, {"loss": 0.9079, "grad_norm": 0.544614851474762, "learning_rate": 0.0002, "epoch": 2.7876588021778583, "step": 3840}, {"loss": 0.8322, "grad_norm": 0.6073850393295288, "learning_rate": 0.0002, "epoch": 2.79491833030853, "step": 3850}, {"loss": 0.8277, "grad_norm": 0.8224069476127625, "learning_rate": 0.0002, "epoch": 2.8021778584392014, "step": 3860}, {"loss": 0.815, "grad_norm": 0.5347970128059387, "learning_rate": 0.0002, "epoch": 2.809437386569873, "step": 3870}, {"loss": 0.8219, "grad_norm": 0.6799601912498474, "learning_rate": 0.0002, "epoch": 2.8166969147005445, "step": 3880}, {"loss": 0.8796, "grad_norm": 0.5219197869300842, "learning_rate": 0.0002, "epoch": 2.823956442831216, "step": 3890}, {"loss": 0.8649, "grad_norm": 0.5710130333900452, "learning_rate": 0.0002, "epoch": 2.8312159709618876, "step": 3900}, {"loss": 0.8741, "grad_norm": 0.5857213139533997, "learning_rate": 0.0002, "epoch": 2.838475499092559, "step": 3910}, {"loss": 0.8154, "grad_norm": 0.5206644535064697, "learning_rate": 0.0002, "epoch": 2.8457350272232302, "step": 3920}, {"loss": 0.9328, "grad_norm": 1.2902015447616577, "learning_rate": 0.0002, "epoch": 2.8529945553539022, "step": 3930}, {"loss": 0.8192, "grad_norm": 0.5252797603607178, "learning_rate": 0.0002, "epoch": 2.8602540834845733, "step": 3940}, {"loss": 0.9047, "grad_norm": 0.5925108790397644, "learning_rate": 0.0002, "epoch": 2.867513611615245, "step": 3950}, {"loss": 0.8611, "grad_norm": 0.9719610810279846, "learning_rate": 0.0002, "epoch": 2.8747731397459164, "step": 3960}, {"loss": 0.8531, "grad_norm": 0.6834747195243835, "learning_rate": 0.0002, "epoch": 2.882032667876588, "step": 3970}, {"loss": 0.8124, "grad_norm": 0.6952353119850159, "learning_rate": 0.0002, "epoch": 2.8892921960072595, "step": 3980}, {"loss": 0.8365, "grad_norm": 0.49889910221099854, "learning_rate": 0.0002, "epoch": 2.896551724137931, "step": 3990}, {"loss": 0.8197, "grad_norm": 0.5007026791572571, "learning_rate": 0.0002, "epoch": 2.9038112522686026, "step": 4000}, {"loss": 0.7752, "grad_norm": 0.5474239587783813, "learning_rate": 0.0002, "epoch": 2.911070780399274, "step": 4010}, {"loss": 0.8579, "grad_norm": 0.6617428660392761, "learning_rate": 0.0002, "epoch": 2.9183303085299457, "step": 4020}, {"loss": 0.8583, "grad_norm": 0.6097776293754578, "learning_rate": 0.0002, "epoch": 2.925589836660617, "step": 4030}, {"loss": 0.8296, "grad_norm": 0.5985828638076782, "learning_rate": 0.0002, "epoch": 2.9328493647912888, "step": 4040}, {"loss": 0.9134, "grad_norm": 0.769488513469696, "learning_rate": 0.0002, "epoch": 2.94010889292196, "step": 4050}, {"loss": 0.8148, "grad_norm": 0.5167055130004883, "learning_rate": 0.0002, "epoch": 2.9473684210526314, "step": 4060}, {"loss": 0.8665, "grad_norm": 0.6265496015548706, "learning_rate": 0.0002, "epoch": 2.954627949183303, "step": 4070}, {"loss": 0.9218, "grad_norm": 1.2644082307815552, "learning_rate": 0.0002, "epoch": 2.9618874773139745, "step": 4080}, {"loss": 0.8026, "grad_norm": 0.6007561087608337, "learning_rate": 0.0002, "epoch": 2.969147005444646, "step": 4090}, {"loss": 0.8262, "grad_norm": 0.47984927892684937, "learning_rate": 0.0002, "epoch": 2.9764065335753176, "step": 4100}, {"loss": 0.7897, "grad_norm": 1.128198504447937, "learning_rate": 0.0002, "epoch": 2.983666061705989, "step": 4110}, {"loss": 0.7883, "grad_norm": 0.526292085647583, "learning_rate": 0.0002, "epoch": 2.9909255898366607, "step": 4120}, {"loss": 0.7801, "grad_norm": 0.5462674498558044, "learning_rate": 0.0002, "epoch": 2.9981851179673322, "step": 4130}]} +{"epoch": 4.0, "step": 5510, "epoch_duration": 2056.9718704223633, "total_accumulated_duration": 14286.312230110168, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7751.47119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 24486.0}, "peak_memory_reserved": {"GPU_0": 24752.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.4936, "grad_norm": 0.46010470390319824, "learning_rate": 0.0002, "epoch": 0.007259528130671506, "step": 10}, {"loss": 1.8442, "grad_norm": 0.5103179216384888, "learning_rate": 0.0002, "epoch": 0.014519056261343012, "step": 20}, {"loss": 1.559, "grad_norm": 0.6282716989517212, "learning_rate": 0.0002, "epoch": 0.021778584392014518, "step": 30}, {"loss": 1.3618, "grad_norm": 1.2480497360229492, "learning_rate": 0.0002, "epoch": 0.029038112522686024, "step": 40}, {"loss": 1.3874, "grad_norm": 0.4114855229854584, "learning_rate": 0.0002, "epoch": 0.036297640653357534, "step": 50}, {"loss": 1.2836, "grad_norm": 0.49482840299606323, "learning_rate": 0.0002, "epoch": 0.043557168784029036, "step": 60}, {"loss": 1.1728, "grad_norm": 0.4536272883415222, "learning_rate": 0.0002, "epoch": 0.050816696914700546, "step": 70}, {"loss": 1.1125, "grad_norm": 0.32328274846076965, "learning_rate": 0.0002, "epoch": 0.05807622504537205, "step": 80}, {"loss": 1.2447, "grad_norm": 0.40990468859672546, "learning_rate": 0.0002, "epoch": 0.06533575317604355, "step": 90}, {"loss": 1.2216, "grad_norm": 0.37273502349853516, "learning_rate": 0.0002, "epoch": 0.07259528130671507, "step": 100}, {"loss": 1.2259, "grad_norm": 0.3903871476650238, "learning_rate": 0.0002, "epoch": 0.07985480943738657, "step": 110}, {"loss": 1.3718, "grad_norm": 0.3272787928581238, "learning_rate": 0.0002, "epoch": 0.08711433756805807, "step": 120}, {"loss": 1.1832, "grad_norm": 0.3622824251651764, "learning_rate": 0.0002, "epoch": 0.09437386569872959, "step": 130}, {"loss": 1.2966, "grad_norm": 0.3503916561603546, "learning_rate": 0.0002, "epoch": 0.10163339382940109, "step": 140}, {"loss": 1.0913, "grad_norm": 0.32787832617759705, "learning_rate": 0.0002, "epoch": 0.1088929219600726, "step": 150}, {"loss": 1.2143, "grad_norm": 0.5822657942771912, "learning_rate": 0.0002, "epoch": 0.1161524500907441, "step": 160}, {"loss": 1.2091, "grad_norm": 0.28028249740600586, "learning_rate": 0.0002, "epoch": 0.12341197822141561, "step": 170}, {"loss": 1.1731, "grad_norm": 0.33602750301361084, "learning_rate": 0.0002, "epoch": 0.1306715063520871, "step": 180}, {"loss": 1.127, "grad_norm": 0.29106274247169495, "learning_rate": 0.0002, "epoch": 0.13793103448275862, "step": 190}, {"loss": 1.3261, "grad_norm": 0.38753363490104675, "learning_rate": 0.0002, "epoch": 0.14519056261343014, "step": 200}, {"loss": 1.1536, "grad_norm": 0.361009418964386, "learning_rate": 0.0002, "epoch": 0.15245009074410162, "step": 210}, {"loss": 1.1483, "grad_norm": 0.6743836402893066, "learning_rate": 0.0002, "epoch": 0.15970961887477314, "step": 220}, {"loss": 1.264, "grad_norm": 0.3923613727092743, "learning_rate": 0.0002, "epoch": 0.16696914700544466, "step": 230}, {"loss": 1.0437, "grad_norm": 0.2809699773788452, "learning_rate": 0.0002, "epoch": 0.17422867513611615, "step": 240}, {"loss": 1.1826, "grad_norm": 0.3631494641304016, "learning_rate": 0.0002, "epoch": 0.18148820326678766, "step": 250}, {"loss": 1.1584, "grad_norm": 0.24658171832561493, "learning_rate": 0.0002, "epoch": 0.18874773139745918, "step": 260}, {"loss": 1.0065, "grad_norm": 0.5780664682388306, "learning_rate": 0.0002, "epoch": 0.19600725952813067, "step": 270}, {"loss": 1.1251, "grad_norm": 0.3056720495223999, "learning_rate": 0.0002, "epoch": 0.20326678765880218, "step": 280}, {"loss": 1.1503, "grad_norm": 0.2857084572315216, "learning_rate": 0.0002, "epoch": 0.21052631578947367, "step": 290}, {"loss": 1.1215, "grad_norm": 0.6645345687866211, "learning_rate": 0.0002, "epoch": 0.2177858439201452, "step": 300}, {"loss": 1.127, "grad_norm": 0.5966078639030457, "learning_rate": 0.0002, "epoch": 0.2250453720508167, "step": 310}, {"loss": 1.1974, "grad_norm": 0.40937140583992004, "learning_rate": 0.0002, "epoch": 0.2323049001814882, "step": 320}, {"loss": 1.2385, "grad_norm": 0.5642806887626648, "learning_rate": 0.0002, "epoch": 0.2395644283121597, "step": 330}, {"loss": 1.1703, "grad_norm": 0.2750748097896576, "learning_rate": 0.0002, "epoch": 0.24682395644283123, "step": 340}, {"loss": 1.2598, "grad_norm": 0.34350234270095825, "learning_rate": 0.0002, "epoch": 0.2540834845735027, "step": 350}, {"loss": 1.1942, "grad_norm": 0.6767239570617676, "learning_rate": 0.0002, "epoch": 0.2613430127041742, "step": 360}, {"loss": 1.1436, "grad_norm": 0.31006959080696106, "learning_rate": 0.0002, "epoch": 0.26860254083484575, "step": 370}, {"loss": 1.2084, "grad_norm": 0.3825474679470062, "learning_rate": 0.0002, "epoch": 0.27586206896551724, "step": 380}, {"loss": 1.1523, "grad_norm": 0.30241551995277405, "learning_rate": 0.0002, "epoch": 0.2831215970961887, "step": 390}, {"loss": 1.1298, "grad_norm": 0.2962397336959839, "learning_rate": 0.0002, "epoch": 0.29038112522686027, "step": 400}, {"loss": 1.1299, "grad_norm": 0.2600369155406952, "learning_rate": 0.0002, "epoch": 0.29764065335753176, "step": 410}, {"loss": 1.1366, "grad_norm": 0.3675060272216797, "learning_rate": 0.0002, "epoch": 0.30490018148820325, "step": 420}, {"loss": 1.156, "grad_norm": 0.3429498076438904, "learning_rate": 0.0002, "epoch": 0.3121597096188748, "step": 430}, {"loss": 1.2741, "grad_norm": 0.34311825037002563, "learning_rate": 0.0002, "epoch": 0.3194192377495463, "step": 440}, {"loss": 1.3523, "grad_norm": 0.37872210144996643, "learning_rate": 0.0002, "epoch": 0.32667876588021777, "step": 450}, {"loss": 1.1365, "grad_norm": 0.33271121978759766, "learning_rate": 0.0002, "epoch": 0.3339382940108893, "step": 460}, {"loss": 1.1572, "grad_norm": 0.34605276584625244, "learning_rate": 0.0002, "epoch": 0.3411978221415608, "step": 470}, {"loss": 1.2755, "grad_norm": 0.41050562262535095, "learning_rate": 0.0002, "epoch": 0.3484573502722323, "step": 480}, {"loss": 1.075, "grad_norm": 0.2066836953163147, "learning_rate": 0.0002, "epoch": 0.35571687840290384, "step": 490}, {"loss": 1.2072, "grad_norm": 0.2859014868736267, "learning_rate": 0.0002, "epoch": 0.3629764065335753, "step": 500}, {"loss": 1.1435, "grad_norm": 0.28763777017593384, "learning_rate": 0.0002, "epoch": 0.3702359346642468, "step": 510}, {"loss": 1.1341, "grad_norm": 0.2730471193790436, "learning_rate": 0.0002, "epoch": 0.37749546279491836, "step": 520}, {"loss": 1.348, "grad_norm": 0.3968936800956726, "learning_rate": 0.0002, "epoch": 0.38475499092558985, "step": 530}, {"loss": 1.2363, "grad_norm": 0.3624701201915741, "learning_rate": 0.0002, "epoch": 0.39201451905626133, "step": 540}, {"loss": 1.1206, "grad_norm": 0.3303608298301697, "learning_rate": 0.0002, "epoch": 0.3992740471869328, "step": 550}, {"loss": 1.203, "grad_norm": 0.33507466316223145, "learning_rate": 0.0002, "epoch": 0.40653357531760437, "step": 560}, {"loss": 1.2294, "grad_norm": 0.3297670781612396, "learning_rate": 0.0002, "epoch": 0.41379310344827586, "step": 570}, {"loss": 1.2, "grad_norm": 0.32334890961647034, "learning_rate": 0.0002, "epoch": 0.42105263157894735, "step": 580}, {"loss": 1.112, "grad_norm": 0.30281195044517517, "learning_rate": 0.0002, "epoch": 0.4283121597096189, "step": 590}, {"loss": 1.1352, "grad_norm": 0.5900027751922607, "learning_rate": 0.0002, "epoch": 0.4355716878402904, "step": 600}, {"loss": 1.1575, "grad_norm": 0.28437477350234985, "learning_rate": 0.0002, "epoch": 0.44283121597096187, "step": 610}, {"loss": 1.2009, "grad_norm": 0.39601704478263855, "learning_rate": 0.0002, "epoch": 0.4500907441016334, "step": 620}, {"loss": 1.2302, "grad_norm": 0.41971510648727417, "learning_rate": 0.0002, "epoch": 0.4573502722323049, "step": 630}, {"loss": 1.236, "grad_norm": 0.33814409375190735, "learning_rate": 0.0002, "epoch": 0.4646098003629764, "step": 640}, {"loss": 1.2471, "grad_norm": 0.575718104839325, "learning_rate": 0.0002, "epoch": 0.47186932849364793, "step": 650}, {"loss": 1.1012, "grad_norm": 0.37927401065826416, "learning_rate": 0.0002, "epoch": 0.4791288566243194, "step": 660}, {"loss": 1.1552, "grad_norm": 0.3224332630634308, "learning_rate": 0.0002, "epoch": 0.4863883847549909, "step": 670}, {"loss": 1.1354, "grad_norm": 0.32683515548706055, "learning_rate": 0.0002, "epoch": 0.49364791288566245, "step": 680}, {"loss": 1.207, "grad_norm": 0.4316163659095764, "learning_rate": 0.0002, "epoch": 0.5009074410163339, "step": 690}, {"loss": 1.2116, "grad_norm": 0.342602401971817, "learning_rate": 0.0002, "epoch": 0.5081669691470054, "step": 700}, {"loss": 1.0433, "grad_norm": 0.2794898748397827, "learning_rate": 0.0002, "epoch": 0.515426497277677, "step": 710}, {"loss": 1.2395, "grad_norm": 0.3322339951992035, "learning_rate": 0.0002, "epoch": 0.5226860254083484, "step": 720}, {"loss": 1.0762, "grad_norm": 0.3088509142398834, "learning_rate": 0.0002, "epoch": 0.52994555353902, "step": 730}, {"loss": 1.1495, "grad_norm": 0.24444378912448883, "learning_rate": 0.0002, "epoch": 0.5372050816696915, "step": 740}, {"loss": 1.2176, "grad_norm": 0.3483171761035919, "learning_rate": 0.0002, "epoch": 0.5444646098003629, "step": 750}, {"loss": 1.2248, "grad_norm": 0.3425690531730652, "learning_rate": 0.0002, "epoch": 0.5517241379310345, "step": 760}, {"loss": 1.1127, "grad_norm": 0.31841927766799927, "learning_rate": 0.0002, "epoch": 0.558983666061706, "step": 770}, {"loss": 1.1841, "grad_norm": 0.39423868060112, "learning_rate": 0.0002, "epoch": 0.5662431941923775, "step": 780}, {"loss": 1.0583, "grad_norm": 0.30328479409217834, "learning_rate": 0.0002, "epoch": 0.573502722323049, "step": 790}, {"loss": 1.0718, "grad_norm": 0.24475938081741333, "learning_rate": 0.0002, "epoch": 0.5807622504537205, "step": 800}, {"loss": 1.2006, "grad_norm": 0.37132805585861206, "learning_rate": 0.0002, "epoch": 0.588021778584392, "step": 810}, {"loss": 1.1544, "grad_norm": 0.32195979356765747, "learning_rate": 0.0002, "epoch": 0.5952813067150635, "step": 820}, {"loss": 0.9937, "grad_norm": 0.2848738729953766, "learning_rate": 0.0002, "epoch": 0.6025408348457351, "step": 830}, {"loss": 1.0937, "grad_norm": 0.28015264868736267, "learning_rate": 0.0002, "epoch": 0.6098003629764065, "step": 840}, {"loss": 1.1234, "grad_norm": 0.37796008586883545, "learning_rate": 0.0002, "epoch": 0.617059891107078, "step": 850}, {"loss": 1.1455, "grad_norm": 0.39311841130256653, "learning_rate": 0.0002, "epoch": 0.6243194192377496, "step": 860}, {"loss": 1.1066, "grad_norm": 0.2761685252189636, "learning_rate": 0.0002, "epoch": 0.631578947368421, "step": 870}, {"loss": 1.0859, "grad_norm": 0.3826720118522644, "learning_rate": 0.0002, "epoch": 0.6388384754990926, "step": 880}, {"loss": 1.1182, "grad_norm": 0.30076679587364197, "learning_rate": 0.0002, "epoch": 0.6460980036297641, "step": 890}, {"loss": 1.0927, "grad_norm": 0.21997687220573425, "learning_rate": 0.0002, "epoch": 0.6533575317604355, "step": 900}, {"loss": 1.1603, "grad_norm": 0.32593777775764465, "learning_rate": 0.0002, "epoch": 0.6606170598911071, "step": 910}, {"loss": 1.1578, "grad_norm": 0.30347898602485657, "learning_rate": 0.0002, "epoch": 0.6678765880217786, "step": 920}, {"loss": 1.1642, "grad_norm": 0.44173774123191833, "learning_rate": 0.0002, "epoch": 0.6751361161524501, "step": 930}, {"loss": 1.1832, "grad_norm": 0.2507467269897461, "learning_rate": 0.0002, "epoch": 0.6823956442831216, "step": 940}, {"loss": 1.3001, "grad_norm": 0.29463833570480347, "learning_rate": 0.0002, "epoch": 0.6896551724137931, "step": 950}, {"loss": 1.1651, "grad_norm": 0.9363154172897339, "learning_rate": 0.0002, "epoch": 0.6969147005444646, "step": 960}, {"loss": 1.158, "grad_norm": 0.3236212134361267, "learning_rate": 0.0002, "epoch": 0.7041742286751361, "step": 970}, {"loss": 1.1078, "grad_norm": 0.3123254179954529, "learning_rate": 0.0002, "epoch": 0.7114337568058077, "step": 980}, {"loss": 1.1578, "grad_norm": 0.3395805060863495, "learning_rate": 0.0002, "epoch": 0.7186932849364791, "step": 990}, {"loss": 1.0754, "grad_norm": 0.3240964412689209, "learning_rate": 0.0002, "epoch": 0.7259528130671506, "step": 1000}, {"loss": 1.0313, "grad_norm": 0.31902948021888733, "learning_rate": 0.0002, "epoch": 0.7332123411978222, "step": 1010}, {"loss": 1.1664, "grad_norm": 0.4848408102989197, "learning_rate": 0.0002, "epoch": 0.7404718693284936, "step": 1020}, {"loss": 1.0838, "grad_norm": 0.33006033301353455, "learning_rate": 0.0002, "epoch": 0.7477313974591652, "step": 1030}, {"loss": 1.0979, "grad_norm": 0.2928730547428131, "learning_rate": 0.0002, "epoch": 0.7549909255898367, "step": 1040}, {"loss": 1.1351, "grad_norm": 0.3529164791107178, "learning_rate": 0.0002, "epoch": 0.7622504537205081, "step": 1050}, {"loss": 1.1274, "grad_norm": 0.2736213803291321, "learning_rate": 0.0002, "epoch": 0.7695099818511797, "step": 1060}, {"loss": 1.1169, "grad_norm": 0.7200686931610107, "learning_rate": 0.0002, "epoch": 0.7767695099818511, "step": 1070}, {"loss": 1.1171, "grad_norm": 0.33396708965301514, "learning_rate": 0.0002, "epoch": 0.7840290381125227, "step": 1080}, {"loss": 1.2447, "grad_norm": 1.5760449171066284, "learning_rate": 0.0002, "epoch": 0.7912885662431942, "step": 1090}, {"loss": 1.1126, "grad_norm": 0.28138381242752075, "learning_rate": 0.0002, "epoch": 0.7985480943738656, "step": 1100}, {"loss": 1.2012, "grad_norm": 0.2597472369670868, "learning_rate": 0.0002, "epoch": 0.8058076225045372, "step": 1110}, {"loss": 1.1177, "grad_norm": 0.3305445611476898, "learning_rate": 0.0002, "epoch": 0.8130671506352087, "step": 1120}, {"loss": 1.0849, "grad_norm": 0.3934599459171295, "learning_rate": 0.0002, "epoch": 0.8203266787658802, "step": 1130}, {"loss": 1.0889, "grad_norm": 0.3472191393375397, "learning_rate": 0.0002, "epoch": 0.8275862068965517, "step": 1140}, {"loss": 1.265, "grad_norm": 0.2857365906238556, "learning_rate": 0.0002, "epoch": 0.8348457350272233, "step": 1150}, {"loss": 1.03, "grad_norm": 0.3207702934741974, "learning_rate": 0.0002, "epoch": 0.8421052631578947, "step": 1160}, {"loss": 1.1669, "grad_norm": 0.3176484704017639, "learning_rate": 0.0002, "epoch": 0.8493647912885662, "step": 1170}, {"loss": 1.1386, "grad_norm": 0.40685558319091797, "learning_rate": 0.0002, "epoch": 0.8566243194192378, "step": 1180}, {"loss": 1.1383, "grad_norm": 0.31125199794769287, "learning_rate": 0.0002, "epoch": 0.8638838475499092, "step": 1190}, {"loss": 1.1512, "grad_norm": 0.7361181378364563, "learning_rate": 0.0002, "epoch": 0.8711433756805808, "step": 1200}, {"loss": 0.9983, "grad_norm": 0.33699527382850647, "learning_rate": 0.0002, "epoch": 0.8784029038112523, "step": 1210}, {"loss": 1.11, "grad_norm": 0.3315220773220062, "learning_rate": 0.0002, "epoch": 0.8856624319419237, "step": 1220}, {"loss": 1.0266, "grad_norm": 0.6256054043769836, "learning_rate": 0.0002, "epoch": 0.8929219600725953, "step": 1230}, {"loss": 1.1738, "grad_norm": 0.3692137897014618, "learning_rate": 0.0002, "epoch": 0.9001814882032668, "step": 1240}, {"loss": 1.1112, "grad_norm": 0.3538484573364258, "learning_rate": 0.0002, "epoch": 0.9074410163339383, "step": 1250}, {"loss": 1.0878, "grad_norm": 0.27863040566444397, "learning_rate": 0.0002, "epoch": 0.9147005444646098, "step": 1260}, {"loss": 1.0826, "grad_norm": 0.3322528302669525, "learning_rate": 0.0002, "epoch": 0.9219600725952813, "step": 1270}, {"loss": 1.0158, "grad_norm": 0.7553173303604126, "learning_rate": 0.0002, "epoch": 0.9292196007259528, "step": 1280}, {"loss": 1.198, "grad_norm": 0.4856191575527191, "learning_rate": 0.0002, "epoch": 0.9364791288566243, "step": 1290}, {"loss": 1.1678, "grad_norm": 0.3668074905872345, "learning_rate": 0.0002, "epoch": 0.9437386569872959, "step": 1300}, {"loss": 1.1809, "grad_norm": 0.29851067066192627, "learning_rate": 0.0002, "epoch": 0.9509981851179673, "step": 1310}, {"loss": 1.1299, "grad_norm": 0.276664674282074, "learning_rate": 0.0002, "epoch": 0.9582577132486388, "step": 1320}, {"loss": 1.0597, "grad_norm": 0.2941018342971802, "learning_rate": 0.0002, "epoch": 0.9655172413793104, "step": 1330}, {"loss": 1.1731, "grad_norm": 0.3505859076976776, "learning_rate": 0.0002, "epoch": 0.9727767695099818, "step": 1340}, {"loss": 1.1809, "grad_norm": 0.3067687451839447, "learning_rate": 0.0002, "epoch": 0.9800362976406534, "step": 1350}, {"loss": 1.1511, "grad_norm": 0.27151066064834595, "learning_rate": 0.0002, "epoch": 0.9872958257713249, "step": 1360}, {"loss": 1.1919, "grad_norm": 0.36370083689689636, "learning_rate": 0.0002, "epoch": 0.9945553539019963, "step": 1370}, {"eval_loss": 1.1381088495254517, "eval_runtime": 96.0848, "eval_samples_per_second": 4.538, "eval_steps_per_second": 0.572, "epoch": 0.9996370235934664, "step": 1377}, {"loss": 1.0082, "grad_norm": 0.27980583906173706, "learning_rate": 0.0002, "epoch": 1.0018148820326678, "step": 1380}, {"loss": 1.0565, "grad_norm": 0.26713913679122925, "learning_rate": 0.0002, "epoch": 1.0090744101633393, "step": 1390}, {"loss": 1.0241, "grad_norm": 0.3089541494846344, "learning_rate": 0.0002, "epoch": 1.0163339382940109, "step": 1400}, {"loss": 0.9649, "grad_norm": 0.4188242256641388, "learning_rate": 0.0002, "epoch": 1.0235934664246824, "step": 1410}, {"loss": 0.9821, "grad_norm": 0.5246463418006897, "learning_rate": 0.0002, "epoch": 1.030852994555354, "step": 1420}, {"loss": 1.0143, "grad_norm": 0.2728777825832367, "learning_rate": 0.0002, "epoch": 1.0381125226860255, "step": 1430}, {"loss": 1.075, "grad_norm": 0.38167616724967957, "learning_rate": 0.0002, "epoch": 1.0453720508166968, "step": 1440}, {"loss": 1.0229, "grad_norm": 0.4439380168914795, "learning_rate": 0.0002, "epoch": 1.0526315789473684, "step": 1450}, {"loss": 1.0451, "grad_norm": 0.30954182147979736, "learning_rate": 0.0002, "epoch": 1.05989110707804, "step": 1460}, {"loss": 1.0762, "grad_norm": 0.4022280275821686, "learning_rate": 0.0002, "epoch": 1.0671506352087115, "step": 1470}, {"loss": 1.0754, "grad_norm": 0.7390811443328857, "learning_rate": 0.0002, "epoch": 1.074410163339383, "step": 1480}, {"loss": 1.1378, "grad_norm": 0.3885486423969269, "learning_rate": 0.0002, "epoch": 1.0816696914700545, "step": 1490}, {"loss": 1.005, "grad_norm": 0.5275560617446899, "learning_rate": 0.0002, "epoch": 1.0889292196007259, "step": 1500}, {"loss": 1.0147, "grad_norm": 0.35112282633781433, "learning_rate": 0.0002, "epoch": 1.0961887477313974, "step": 1510}, {"loss": 1.0331, "grad_norm": 0.33714351058006287, "learning_rate": 0.0002, "epoch": 1.103448275862069, "step": 1520}, {"loss": 0.8723, "grad_norm": 0.31221693754196167, "learning_rate": 0.0002, "epoch": 1.1107078039927405, "step": 1530}, {"loss": 1.0701, "grad_norm": 0.27549654245376587, "learning_rate": 0.0002, "epoch": 1.117967332123412, "step": 1540}, {"loss": 1.0114, "grad_norm": 0.8465521335601807, "learning_rate": 0.0002, "epoch": 1.1252268602540836, "step": 1550}, {"loss": 0.9643, "grad_norm": 0.36125949025154114, "learning_rate": 0.0002, "epoch": 1.132486388384755, "step": 1560}, {"loss": 1.0744, "grad_norm": 0.37420371174812317, "learning_rate": 0.0002, "epoch": 1.1397459165154264, "step": 1570}, {"loss": 1.165, "grad_norm": 0.3294760584831238, "learning_rate": 0.0002, "epoch": 1.147005444646098, "step": 1580}, {"loss": 0.9207, "grad_norm": 0.3881238102912903, "learning_rate": 0.0002, "epoch": 1.1542649727767695, "step": 1590}, {"loss": 1.0983, "grad_norm": 0.4766491651535034, "learning_rate": 0.0002, "epoch": 1.161524500907441, "step": 1600}, {"loss": 1.0092, "grad_norm": 0.395530104637146, "learning_rate": 0.0002, "epoch": 1.1687840290381124, "step": 1610}, {"loss": 0.9895, "grad_norm": 0.3297106623649597, "learning_rate": 0.0002, "epoch": 1.176043557168784, "step": 1620}, {"loss": 1.1121, "grad_norm": 0.39528271555900574, "learning_rate": 0.0002, "epoch": 1.1833030852994555, "step": 1630}, {"loss": 0.9202, "grad_norm": 0.3370221257209778, "learning_rate": 0.0002, "epoch": 1.190562613430127, "step": 1640}, {"loss": 1.102, "grad_norm": 0.31922030448913574, "learning_rate": 0.0002, "epoch": 1.1978221415607986, "step": 1650}, {"loss": 1.0565, "grad_norm": 0.6142027378082275, "learning_rate": 0.0002, "epoch": 1.2050816696914701, "step": 1660}, {"loss": 0.9541, "grad_norm": 0.44769710302352905, "learning_rate": 0.0002, "epoch": 1.2123411978221417, "step": 1670}, {"loss": 1.2501, "grad_norm": 0.41233646869659424, "learning_rate": 0.0002, "epoch": 1.219600725952813, "step": 1680}, {"loss": 1.0147, "grad_norm": 0.2928866147994995, "learning_rate": 0.0002, "epoch": 1.2268602540834845, "step": 1690}, {"loss": 0.9074, "grad_norm": 0.36913734674453735, "learning_rate": 0.0002, "epoch": 1.234119782214156, "step": 1700}, {"loss": 1.0847, "grad_norm": 0.5281891226768494, "learning_rate": 0.0002, "epoch": 1.2413793103448276, "step": 1710}, {"loss": 0.9482, "grad_norm": 0.3374697268009186, "learning_rate": 0.0002, "epoch": 1.2486388384754992, "step": 1720}, {"loss": 0.9826, "grad_norm": 0.3802020847797394, "learning_rate": 0.0002, "epoch": 1.2558983666061705, "step": 1730}, {"loss": 1.0191, "grad_norm": 0.38048651814460754, "learning_rate": 0.0002, "epoch": 1.263157894736842, "step": 1740}, {"loss": 1.0142, "grad_norm": 0.6676169633865356, "learning_rate": 0.0002, "epoch": 1.2704174228675136, "step": 1750}, {"loss": 0.992, "grad_norm": 0.4075961410999298, "learning_rate": 0.0002, "epoch": 1.2776769509981851, "step": 1760}, {"loss": 1.0301, "grad_norm": 0.4374721348285675, "learning_rate": 0.0002, "epoch": 1.2849364791288567, "step": 1770}, {"loss": 1.0601, "grad_norm": 0.4638824164867401, "learning_rate": 0.0002, "epoch": 1.2921960072595282, "step": 1780}, {"loss": 0.9593, "grad_norm": 0.38631564378738403, "learning_rate": 0.0002, "epoch": 1.2994555353901998, "step": 1790}, {"loss": 1.0042, "grad_norm": 0.35873809456825256, "learning_rate": 0.0002, "epoch": 1.306715063520871, "step": 1800}, {"loss": 0.9835, "grad_norm": 0.33602237701416016, "learning_rate": 0.0002, "epoch": 1.3139745916515426, "step": 1810}, {"loss": 1.1032, "grad_norm": 0.46696463227272034, "learning_rate": 0.0002, "epoch": 1.3212341197822142, "step": 1820}, {"loss": 1.0123, "grad_norm": 0.368958979845047, "learning_rate": 0.0002, "epoch": 1.3284936479128857, "step": 1830}, {"loss": 1.0456, "grad_norm": 0.3160957396030426, "learning_rate": 0.0002, "epoch": 1.335753176043557, "step": 1840}, {"loss": 1.0264, "grad_norm": 0.4511511027812958, "learning_rate": 0.0002, "epoch": 1.3430127041742286, "step": 1850}, {"loss": 1.0179, "grad_norm": 0.6769845485687256, "learning_rate": 0.0002, "epoch": 1.3502722323049001, "step": 1860}, {"loss": 1.011, "grad_norm": 0.3749309480190277, "learning_rate": 0.0002, "epoch": 1.3575317604355717, "step": 1870}, {"loss": 1.0223, "grad_norm": 0.553798496723175, "learning_rate": 0.0002, "epoch": 1.3647912885662432, "step": 1880}, {"loss": 0.9634, "grad_norm": 0.3538985550403595, "learning_rate": 0.0002, "epoch": 1.3720508166969148, "step": 1890}, {"loss": 1.0275, "grad_norm": 0.5501534938812256, "learning_rate": 0.0002, "epoch": 1.3793103448275863, "step": 1900}, {"loss": 1.0461, "grad_norm": 0.4432051181793213, "learning_rate": 0.0002, "epoch": 1.3865698729582578, "step": 1910}, {"loss": 0.9082, "grad_norm": 0.41755786538124084, "learning_rate": 0.0002, "epoch": 1.3938294010889292, "step": 1920}, {"loss": 1.0858, "grad_norm": 0.5626114010810852, "learning_rate": 0.0002, "epoch": 1.4010889292196007, "step": 1930}, {"loss": 1.0687, "grad_norm": 0.44038185477256775, "learning_rate": 0.0002, "epoch": 1.4083484573502723, "step": 1940}, {"loss": 0.9454, "grad_norm": 0.3397001624107361, "learning_rate": 0.0002, "epoch": 1.4156079854809438, "step": 1950}, {"loss": 0.943, "grad_norm": 0.4325368106365204, "learning_rate": 0.0002, "epoch": 1.4228675136116151, "step": 1960}, {"loss": 1.0488, "grad_norm": 0.3900907039642334, "learning_rate": 0.0002, "epoch": 1.4301270417422867, "step": 1970}, {"loss": 0.9699, "grad_norm": 0.369612455368042, "learning_rate": 0.0002, "epoch": 1.4373865698729582, "step": 1980}, {"loss": 1.0609, "grad_norm": 0.4389338791370392, "learning_rate": 0.0002, "epoch": 1.4446460980036298, "step": 1990}, {"loss": 1.042, "grad_norm": 1.694450855255127, "learning_rate": 0.0002, "epoch": 1.4519056261343013, "step": 2000}, {"loss": 0.997, "grad_norm": 0.516957700252533, "learning_rate": 0.0002, "epoch": 1.4591651542649728, "step": 2010}, {"loss": 1.0979, "grad_norm": 0.45515501499176025, "learning_rate": 0.0002, "epoch": 1.4664246823956444, "step": 2020}, {"loss": 1.0188, "grad_norm": 0.4153139591217041, "learning_rate": 0.0002, "epoch": 1.4736842105263157, "step": 2030}, {"loss": 1.1521, "grad_norm": 0.44353052973747253, "learning_rate": 0.0002, "epoch": 1.4809437386569873, "step": 2040}, {"loss": 0.9653, "grad_norm": 0.570554256439209, "learning_rate": 0.0002, "epoch": 1.4882032667876588, "step": 2050}, {"loss": 1.0352, "grad_norm": 0.5742740035057068, "learning_rate": 0.0002, "epoch": 1.4954627949183303, "step": 2060}, {"loss": 1.0963, "grad_norm": 0.5890517830848694, "learning_rate": 0.0002, "epoch": 1.5027223230490017, "step": 2070}, {"loss": 1.0639, "grad_norm": 0.4162650406360626, "learning_rate": 0.0002, "epoch": 1.5099818511796732, "step": 2080}, {"loss": 1.071, "grad_norm": 0.7334717512130737, "learning_rate": 0.0002, "epoch": 1.5172413793103448, "step": 2090}, {"loss": 0.8957, "grad_norm": 0.2963249683380127, "learning_rate": 0.0002, "epoch": 1.5245009074410163, "step": 2100}, {"loss": 1.0446, "grad_norm": 0.30676454305648804, "learning_rate": 0.0002, "epoch": 1.5317604355716878, "step": 2110}, {"loss": 0.9967, "grad_norm": 0.35984641313552856, "learning_rate": 0.0002, "epoch": 1.5390199637023594, "step": 2120}, {"loss": 1.0352, "grad_norm": 0.3384549617767334, "learning_rate": 0.0002, "epoch": 1.546279491833031, "step": 2130}, {"loss": 0.9874, "grad_norm": 0.4725518226623535, "learning_rate": 0.0002, "epoch": 1.5535390199637025, "step": 2140}, {"loss": 1.1166, "grad_norm": 0.3252685070037842, "learning_rate": 0.0002, "epoch": 1.560798548094374, "step": 2150}, {"loss": 0.9127, "grad_norm": 0.45043081045150757, "learning_rate": 0.0002, "epoch": 1.5680580762250453, "step": 2160}, {"loss": 1.0767, "grad_norm": 0.374208003282547, "learning_rate": 0.0002, "epoch": 1.5753176043557169, "step": 2170}, {"loss": 1.0255, "grad_norm": 0.5118404030799866, "learning_rate": 0.0002, "epoch": 1.5825771324863884, "step": 2180}, {"loss": 1.0062, "grad_norm": 0.482170969247818, "learning_rate": 0.0002, "epoch": 1.5898366606170597, "step": 2190}, {"loss": 1.0872, "grad_norm": 0.5337533950805664, "learning_rate": 0.0002, "epoch": 1.5970961887477313, "step": 2200}, {"loss": 1.0405, "grad_norm": 0.5195064544677734, "learning_rate": 0.0002, "epoch": 1.6043557168784028, "step": 2210}, {"loss": 1.0454, "grad_norm": 0.30807098746299744, "learning_rate": 0.0002, "epoch": 1.6116152450090744, "step": 2220}, {"loss": 1.0293, "grad_norm": 0.3962925672531128, "learning_rate": 0.0002, "epoch": 1.618874773139746, "step": 2230}, {"loss": 1.0137, "grad_norm": 0.7636962532997131, "learning_rate": 0.0002, "epoch": 1.6261343012704175, "step": 2240}, {"loss": 0.991, "grad_norm": 0.32380592823028564, "learning_rate": 0.0002, "epoch": 1.633393829401089, "step": 2250}, {"loss": 0.9471, "grad_norm": 0.5767741799354553, "learning_rate": 0.0002, "epoch": 1.6406533575317606, "step": 2260}, {"loss": 0.9524, "grad_norm": 0.39964812994003296, "learning_rate": 0.0002, "epoch": 1.647912885662432, "step": 2270}, {"loss": 0.9866, "grad_norm": 0.622629702091217, "learning_rate": 0.0002, "epoch": 1.6551724137931034, "step": 2280}, {"loss": 0.9609, "grad_norm": 0.40202152729034424, "learning_rate": 0.0002, "epoch": 1.662431941923775, "step": 2290}, {"loss": 0.9859, "grad_norm": 0.4467349052429199, "learning_rate": 0.0002, "epoch": 1.6696914700544465, "step": 2300}, {"loss": 1.0312, "grad_norm": 0.5026949048042297, "learning_rate": 0.0002, "epoch": 1.6769509981851178, "step": 2310}, {"loss": 1.0776, "grad_norm": 0.43754541873931885, "learning_rate": 0.0002, "epoch": 1.6842105263157894, "step": 2320}, {"loss": 0.9786, "grad_norm": 0.42869430780410767, "learning_rate": 0.0002, "epoch": 1.691470054446461, "step": 2330}, {"loss": 1.0747, "grad_norm": 0.4192679524421692, "learning_rate": 0.0002, "epoch": 1.6987295825771325, "step": 2340}, {"loss": 1.0054, "grad_norm": 0.3243155777454376, "learning_rate": 0.0002, "epoch": 1.705989110707804, "step": 2350}, {"loss": 0.9165, "grad_norm": 1.0514075756072998, "learning_rate": 0.0002, "epoch": 1.7132486388384756, "step": 2360}, {"loss": 1.1353, "grad_norm": 0.4719122648239136, "learning_rate": 0.0002, "epoch": 1.720508166969147, "step": 2370}, {"loss": 0.9913, "grad_norm": 0.3846144676208496, "learning_rate": 0.0002, "epoch": 1.7277676950998186, "step": 2380}, {"loss": 0.9935, "grad_norm": 0.7266581058502197, "learning_rate": 0.0002, "epoch": 1.73502722323049, "step": 2390}, {"loss": 0.9509, "grad_norm": 0.6183241605758667, "learning_rate": 0.0002, "epoch": 1.7422867513611615, "step": 2400}, {"loss": 0.9713, "grad_norm": 0.3658260405063629, "learning_rate": 0.0002, "epoch": 1.749546279491833, "step": 2410}, {"loss": 1.0033, "grad_norm": 0.6036322712898254, "learning_rate": 0.0002, "epoch": 1.7568058076225044, "step": 2420}, {"loss": 1.0526, "grad_norm": 0.7872936129570007, "learning_rate": 0.0002, "epoch": 1.764065335753176, "step": 2430}, {"loss": 0.99, "grad_norm": 0.35946124792099, "learning_rate": 0.0002, "epoch": 1.7713248638838475, "step": 2440}, {"loss": 0.8845, "grad_norm": 0.3740338981151581, "learning_rate": 0.0002, "epoch": 1.778584392014519, "step": 2450}, {"loss": 0.9333, "grad_norm": 0.6150230169296265, "learning_rate": 0.0002, "epoch": 1.7858439201451906, "step": 2460}, {"loss": 1.0373, "grad_norm": 0.4726075530052185, "learning_rate": 0.0002, "epoch": 1.793103448275862, "step": 2470}, {"loss": 0.974, "grad_norm": 0.31292253732681274, "learning_rate": 0.0002, "epoch": 1.8003629764065336, "step": 2480}, {"loss": 1.0205, "grad_norm": 0.4463104009628296, "learning_rate": 0.0002, "epoch": 1.8076225045372052, "step": 2490}, {"loss": 1.0771, "grad_norm": 0.7848200798034668, "learning_rate": 0.0002, "epoch": 1.8148820326678767, "step": 2500}, {"loss": 0.9508, "grad_norm": 0.5562082529067993, "learning_rate": 0.0002, "epoch": 1.822141560798548, "step": 2510}, {"loss": 0.9343, "grad_norm": 0.39892756938934326, "learning_rate": 0.0002, "epoch": 1.8294010889292196, "step": 2520}, {"loss": 1.0489, "grad_norm": 1.2923320531845093, "learning_rate": 0.0002, "epoch": 1.8366606170598911, "step": 2530}, {"loss": 1.1203, "grad_norm": 0.6316490769386292, "learning_rate": 0.0002, "epoch": 1.8439201451905625, "step": 2540}, {"loss": 0.9131, "grad_norm": 0.46100497245788574, "learning_rate": 0.0002, "epoch": 1.851179673321234, "step": 2550}, {"loss": 1.0977, "grad_norm": 0.7902987003326416, "learning_rate": 0.0002, "epoch": 1.8584392014519056, "step": 2560}, {"loss": 0.9702, "grad_norm": 0.4596365690231323, "learning_rate": 0.0002, "epoch": 1.865698729582577, "step": 2570}, {"loss": 0.9322, "grad_norm": 0.6592172384262085, "learning_rate": 0.0002, "epoch": 1.8729582577132486, "step": 2580}, {"loss": 0.9549, "grad_norm": 0.5764662027359009, "learning_rate": 0.0002, "epoch": 1.8802177858439202, "step": 2590}, {"loss": 0.9485, "grad_norm": 0.8421637415885925, "learning_rate": 0.0002, "epoch": 1.8874773139745917, "step": 2600}, {"loss": 1.012, "grad_norm": 0.5635305047035217, "learning_rate": 0.0002, "epoch": 1.8947368421052633, "step": 2610}, {"loss": 0.8907, "grad_norm": 0.46231237053871155, "learning_rate": 0.0002, "epoch": 1.9019963702359348, "step": 2620}, {"loss": 0.9543, "grad_norm": 0.3944607079029083, "learning_rate": 0.0002, "epoch": 1.9092558983666061, "step": 2630}, {"loss": 0.9964, "grad_norm": 0.4753907322883606, "learning_rate": 0.0002, "epoch": 1.9165154264972777, "step": 2640}, {"loss": 1.0217, "grad_norm": 0.4151090979576111, "learning_rate": 0.0002, "epoch": 1.9237749546279492, "step": 2650}, {"loss": 1.0397, "grad_norm": 0.6793725490570068, "learning_rate": 0.0002, "epoch": 1.9310344827586206, "step": 2660}, {"loss": 1.0401, "grad_norm": 0.339755117893219, "learning_rate": 0.0002, "epoch": 1.938294010889292, "step": 2670}, {"loss": 1.082, "grad_norm": 0.40789374709129333, "learning_rate": 0.0002, "epoch": 1.9455535390199636, "step": 2680}, {"loss": 0.9973, "grad_norm": 0.3750005066394806, "learning_rate": 0.0002, "epoch": 1.9528130671506352, "step": 2690}, {"loss": 0.9837, "grad_norm": 0.39684441685676575, "learning_rate": 0.0002, "epoch": 1.9600725952813067, "step": 2700}, {"loss": 1.1204, "grad_norm": 0.378287672996521, "learning_rate": 0.0002, "epoch": 1.9673321234119783, "step": 2710}, {"loss": 0.9284, "grad_norm": 0.3668482005596161, "learning_rate": 0.0002, "epoch": 1.9745916515426498, "step": 2720}, {"loss": 1.0512, "grad_norm": 0.49997565150260925, "learning_rate": 0.0002, "epoch": 1.9818511796733214, "step": 2730}, {"loss": 0.9311, "grad_norm": 0.36852124333381653, "learning_rate": 0.0002, "epoch": 1.989110707803993, "step": 2740}, {"loss": 1.0898, "grad_norm": 0.5203380584716797, "learning_rate": 0.0002, "epoch": 1.9963702359346642, "step": 2750}, {"eval_loss": 1.1086540222167969, "eval_runtime": 95.6866, "eval_samples_per_second": 4.557, "eval_steps_per_second": 0.575, "epoch": 2.0, "step": 2755}, {"loss": 0.9791, "grad_norm": 0.35921573638916016, "learning_rate": 0.0002, "epoch": 2.0036297640653356, "step": 2760}, {"loss": 0.8968, "grad_norm": 1.013861894607544, "learning_rate": 0.0002, "epoch": 2.010889292196007, "step": 2770}, {"loss": 0.7873, "grad_norm": 0.4425240159034729, "learning_rate": 0.0002, "epoch": 2.0181488203266786, "step": 2780}, {"loss": 0.8758, "grad_norm": 0.60174161195755, "learning_rate": 0.0002, "epoch": 2.02540834845735, "step": 2790}, {"loss": 0.7989, "grad_norm": 0.47582098841667175, "learning_rate": 0.0002, "epoch": 2.0326678765880217, "step": 2800}, {"loss": 0.8898, "grad_norm": 0.6012811660766602, "learning_rate": 0.0002, "epoch": 2.0399274047186933, "step": 2810}, {"loss": 0.8109, "grad_norm": 0.4444408118724823, "learning_rate": 0.0002, "epoch": 2.047186932849365, "step": 2820}, {"loss": 0.8621, "grad_norm": 0.4864003360271454, "learning_rate": 0.0002, "epoch": 2.0544464609800364, "step": 2830}, {"loss": 0.8817, "grad_norm": 0.5104215741157532, "learning_rate": 0.0002, "epoch": 2.061705989110708, "step": 2840}, {"loss": 0.7789, "grad_norm": 0.6218489408493042, "learning_rate": 0.0002, "epoch": 2.0689655172413794, "step": 2850}, {"loss": 0.8778, "grad_norm": 0.705784261226654, "learning_rate": 0.0002, "epoch": 2.076225045372051, "step": 2860}, {"loss": 0.7851, "grad_norm": 0.48091503977775574, "learning_rate": 0.0002, "epoch": 2.0834845735027225, "step": 2870}, {"loss": 0.8444, "grad_norm": 0.5062456727027893, "learning_rate": 0.0002, "epoch": 2.0907441016333936, "step": 2880}, {"loss": 0.7898, "grad_norm": 0.4862022399902344, "learning_rate": 0.0002, "epoch": 2.098003629764065, "step": 2890}, {"loss": 0.8593, "grad_norm": 0.48264044523239136, "learning_rate": 0.0002, "epoch": 2.1052631578947367, "step": 2900}, {"loss": 0.8325, "grad_norm": 0.43744346499443054, "learning_rate": 0.0002, "epoch": 2.1125226860254083, "step": 2910}, {"loss": 0.9099, "grad_norm": 0.5480492115020752, "learning_rate": 0.0002, "epoch": 2.11978221415608, "step": 2920}, {"loss": 0.7727, "grad_norm": 0.5068560838699341, "learning_rate": 0.0002, "epoch": 2.1270417422867514, "step": 2930}, {"loss": 0.8403, "grad_norm": 0.4650852680206299, "learning_rate": 0.0002, "epoch": 2.134301270417423, "step": 2940}, {"loss": 0.8467, "grad_norm": 0.4929981231689453, "learning_rate": 0.0002, "epoch": 2.1415607985480944, "step": 2950}, {"loss": 0.8747, "grad_norm": 0.6537389755249023, "learning_rate": 0.0002, "epoch": 2.148820326678766, "step": 2960}, {"loss": 0.7663, "grad_norm": 0.8032940626144409, "learning_rate": 0.0002, "epoch": 2.1560798548094375, "step": 2970}, {"loss": 0.7604, "grad_norm": 0.7131643891334534, "learning_rate": 0.0002, "epoch": 2.163339382940109, "step": 2980}, {"loss": 0.8424, "grad_norm": 0.6034275889396667, "learning_rate": 0.0002, "epoch": 2.1705989110707806, "step": 2990}, {"loss": 0.8838, "grad_norm": 0.6081095933914185, "learning_rate": 0.0002, "epoch": 2.1778584392014517, "step": 3000}, {"loss": 0.9013, "grad_norm": 0.5706912875175476, "learning_rate": 0.0002, "epoch": 2.1851179673321233, "step": 3010}, {"loss": 0.8473, "grad_norm": 0.6742380261421204, "learning_rate": 0.0002, "epoch": 2.192377495462795, "step": 3020}, {"loss": 0.8372, "grad_norm": 0.4847496449947357, "learning_rate": 0.0002, "epoch": 2.1996370235934664, "step": 3030}, {"loss": 0.8487, "grad_norm": 0.5529342889785767, "learning_rate": 0.0002, "epoch": 2.206896551724138, "step": 3040}, {"loss": 0.9057, "grad_norm": 0.6108783483505249, "learning_rate": 0.0002, "epoch": 2.2141560798548094, "step": 3050}, {"loss": 0.9659, "grad_norm": 0.8841571807861328, "learning_rate": 0.0002, "epoch": 2.221415607985481, "step": 3060}, {"loss": 0.8538, "grad_norm": 0.4227530360221863, "learning_rate": 0.0002, "epoch": 2.2286751361161525, "step": 3070}, {"loss": 0.7925, "grad_norm": 0.461935818195343, "learning_rate": 0.0002, "epoch": 2.235934664246824, "step": 3080}, {"loss": 0.8542, "grad_norm": 0.5407412648200989, "learning_rate": 0.0002, "epoch": 2.2431941923774956, "step": 3090}, {"loss": 0.8392, "grad_norm": 0.5057998895645142, "learning_rate": 0.0002, "epoch": 2.250453720508167, "step": 3100}, {"loss": 0.7898, "grad_norm": 0.530057966709137, "learning_rate": 0.0002, "epoch": 2.2577132486388383, "step": 3110}, {"loss": 0.8507, "grad_norm": 0.5066842436790466, "learning_rate": 0.0002, "epoch": 2.26497277676951, "step": 3120}, {"loss": 0.7903, "grad_norm": 0.5069178342819214, "learning_rate": 0.0002, "epoch": 2.2722323049001814, "step": 3130}, {"loss": 0.8458, "grad_norm": 0.6095499396324158, "learning_rate": 0.0002, "epoch": 2.279491833030853, "step": 3140}, {"loss": 0.8171, "grad_norm": 0.49524766206741333, "learning_rate": 0.0002, "epoch": 2.2867513611615244, "step": 3150}, {"loss": 0.7568, "grad_norm": 0.5334409475326538, "learning_rate": 0.0002, "epoch": 2.294010889292196, "step": 3160}, {"loss": 0.7533, "grad_norm": 1.681748867034912, "learning_rate": 0.0002, "epoch": 2.3012704174228675, "step": 3170}, {"loss": 0.8155, "grad_norm": 0.7225565314292908, "learning_rate": 0.0002, "epoch": 2.308529945553539, "step": 3180}, {"loss": 0.8034, "grad_norm": 0.5379388928413391, "learning_rate": 0.0002, "epoch": 2.3157894736842106, "step": 3190}, {"loss": 0.8833, "grad_norm": 0.45770326256752014, "learning_rate": 0.0002, "epoch": 2.323049001814882, "step": 3200}, {"loss": 0.8419, "grad_norm": 0.6984533071517944, "learning_rate": 0.0002, "epoch": 2.3303085299455537, "step": 3210}, {"loss": 0.7414, "grad_norm": 0.6725744605064392, "learning_rate": 0.0002, "epoch": 2.337568058076225, "step": 3220}, {"loss": 0.8657, "grad_norm": 1.1247471570968628, "learning_rate": 0.0002, "epoch": 2.344827586206897, "step": 3230}, {"loss": 0.7782, "grad_norm": 1.0240263938903809, "learning_rate": 0.0002, "epoch": 2.352087114337568, "step": 3240}, {"loss": 0.826, "grad_norm": 0.5608096122741699, "learning_rate": 0.0002, "epoch": 2.3593466424682394, "step": 3250}, {"loss": 0.8191, "grad_norm": 0.8294990062713623, "learning_rate": 0.0002, "epoch": 2.366606170598911, "step": 3260}, {"loss": 0.9845, "grad_norm": 0.6734224557876587, "learning_rate": 0.0002, "epoch": 2.3738656987295825, "step": 3270}, {"loss": 0.7921, "grad_norm": 0.6862800717353821, "learning_rate": 0.0002, "epoch": 2.381125226860254, "step": 3280}, {"loss": 0.7752, "grad_norm": 0.5442930459976196, "learning_rate": 0.0002, "epoch": 2.3883847549909256, "step": 3290}, {"loss": 0.8543, "grad_norm": 0.5745394229888916, "learning_rate": 0.0002, "epoch": 2.395644283121597, "step": 3300}, {"loss": 0.833, "grad_norm": 0.6257799863815308, "learning_rate": 0.0002, "epoch": 2.4029038112522687, "step": 3310}, {"loss": 0.8191, "grad_norm": 0.5608420968055725, "learning_rate": 0.0002, "epoch": 2.4101633393829403, "step": 3320}, {"loss": 0.8745, "grad_norm": 0.5512017011642456, "learning_rate": 0.0002, "epoch": 2.417422867513612, "step": 3330}, {"loss": 0.8642, "grad_norm": 0.7452999353408813, "learning_rate": 0.0002, "epoch": 2.4246823956442833, "step": 3340}, {"loss": 0.8417, "grad_norm": 0.4604301452636719, "learning_rate": 0.0002, "epoch": 2.4319419237749544, "step": 3350}, {"loss": 0.9148, "grad_norm": 0.8225823640823364, "learning_rate": 0.0002, "epoch": 2.439201451905626, "step": 3360}, {"loss": 0.8144, "grad_norm": 0.8604981303215027, "learning_rate": 0.0002, "epoch": 2.4464609800362975, "step": 3370}, {"loss": 0.9276, "grad_norm": 0.6620925664901733, "learning_rate": 0.0002, "epoch": 2.453720508166969, "step": 3380}, {"loss": 0.8381, "grad_norm": 0.4750158488750458, "learning_rate": 0.0002, "epoch": 2.4609800362976406, "step": 3390}, {"loss": 0.8249, "grad_norm": 0.6061418056488037, "learning_rate": 0.0002, "epoch": 2.468239564428312, "step": 3400}, {"loss": 0.8944, "grad_norm": 0.5977247357368469, "learning_rate": 0.0002, "epoch": 2.4754990925589837, "step": 3410}, {"loss": 0.8366, "grad_norm": 0.5004227757453918, "learning_rate": 0.0002, "epoch": 2.4827586206896552, "step": 3420}, {"loss": 0.9133, "grad_norm": 0.46428972482681274, "learning_rate": 0.0002, "epoch": 2.490018148820327, "step": 3430}, {"loss": 0.8534, "grad_norm": 0.7658395171165466, "learning_rate": 0.0002, "epoch": 2.4972776769509983, "step": 3440}, {"loss": 0.9066, "grad_norm": 0.621819794178009, "learning_rate": 0.0002, "epoch": 2.50453720508167, "step": 3450}, {"loss": 0.8481, "grad_norm": 0.4556088149547577, "learning_rate": 0.0002, "epoch": 2.511796733212341, "step": 3460}, {"loss": 0.8522, "grad_norm": 0.6124140024185181, "learning_rate": 0.0002, "epoch": 2.519056261343013, "step": 3470}, {"loss": 0.8158, "grad_norm": 0.6256231665611267, "learning_rate": 0.0002, "epoch": 2.526315789473684, "step": 3480}, {"loss": 0.9097, "grad_norm": 0.5464141964912415, "learning_rate": 0.0002, "epoch": 2.5335753176043556, "step": 3490}, {"loss": 0.8708, "grad_norm": 0.51471346616745, "learning_rate": 0.0002, "epoch": 2.540834845735027, "step": 3500}, {"loss": 0.9042, "grad_norm": 0.5326165556907654, "learning_rate": 0.0002, "epoch": 2.5480943738656987, "step": 3510}, {"loss": 0.8749, "grad_norm": 0.4750378429889679, "learning_rate": 0.0002, "epoch": 2.5553539019963702, "step": 3520}, {"loss": 0.8248, "grad_norm": 0.5292279124259949, "learning_rate": 0.0002, "epoch": 2.562613430127042, "step": 3530}, {"loss": 0.8808, "grad_norm": 0.6145227551460266, "learning_rate": 0.0002, "epoch": 2.5698729582577133, "step": 3540}, {"loss": 0.846, "grad_norm": 0.8275189399719238, "learning_rate": 0.0002, "epoch": 2.577132486388385, "step": 3550}, {"loss": 0.875, "grad_norm": 0.5037438273429871, "learning_rate": 0.0002, "epoch": 2.5843920145190564, "step": 3560}, {"loss": 0.8324, "grad_norm": 0.5838707685470581, "learning_rate": 0.0002, "epoch": 2.5916515426497275, "step": 3570}, {"loss": 0.8716, "grad_norm": 0.5398710370063782, "learning_rate": 0.0002, "epoch": 2.5989110707803995, "step": 3580}, {"loss": 0.7843, "grad_norm": 0.6115376949310303, "learning_rate": 0.0002, "epoch": 2.6061705989110706, "step": 3590}, {"loss": 0.8142, "grad_norm": 1.118809461593628, "learning_rate": 0.0002, "epoch": 2.613430127041742, "step": 3600}, {"loss": 0.8282, "grad_norm": 0.7811771631240845, "learning_rate": 0.0002, "epoch": 2.6206896551724137, "step": 3610}, {"loss": 0.7997, "grad_norm": 0.5753175020217896, "learning_rate": 0.0002, "epoch": 2.6279491833030852, "step": 3620}, {"loss": 0.7705, "grad_norm": 0.550829291343689, "learning_rate": 0.0002, "epoch": 2.635208711433757, "step": 3630}, {"loss": 0.8564, "grad_norm": 0.5360019207000732, "learning_rate": 0.0002, "epoch": 2.6424682395644283, "step": 3640}, {"loss": 0.7648, "grad_norm": 0.63050377368927, "learning_rate": 0.0002, "epoch": 2.6497277676951, "step": 3650}, {"loss": 0.9156, "grad_norm": 0.5833110213279724, "learning_rate": 0.0002, "epoch": 2.6569872958257714, "step": 3660}, {"loss": 0.8033, "grad_norm": 0.5543047189712524, "learning_rate": 0.0002, "epoch": 2.664246823956443, "step": 3670}, {"loss": 0.8867, "grad_norm": 0.6842212080955505, "learning_rate": 0.0002, "epoch": 2.671506352087114, "step": 3680}, {"loss": 0.8569, "grad_norm": 0.6859333515167236, "learning_rate": 0.0002, "epoch": 2.678765880217786, "step": 3690}, {"loss": 0.8081, "grad_norm": 0.7038410902023315, "learning_rate": 0.0002, "epoch": 2.686025408348457, "step": 3700}, {"loss": 0.8686, "grad_norm": 0.447233647108078, "learning_rate": 0.0002, "epoch": 2.6932849364791287, "step": 3710}, {"loss": 0.8951, "grad_norm": 0.6453872919082642, "learning_rate": 0.0002, "epoch": 2.7005444646098002, "step": 3720}, {"loss": 0.8292, "grad_norm": 0.8025672435760498, "learning_rate": 0.0002, "epoch": 2.707803992740472, "step": 3730}, {"loss": 0.8488, "grad_norm": 0.5997087955474854, "learning_rate": 0.0002, "epoch": 2.7150635208711433, "step": 3740}, {"loss": 0.8308, "grad_norm": 0.6901142001152039, "learning_rate": 0.0002, "epoch": 2.722323049001815, "step": 3750}, {"loss": 0.8517, "grad_norm": 1.036145567893982, "learning_rate": 0.0002, "epoch": 2.7295825771324864, "step": 3760}, {"loss": 0.8455, "grad_norm": 0.7207072377204895, "learning_rate": 0.0002, "epoch": 2.736842105263158, "step": 3770}, {"loss": 0.8009, "grad_norm": 1.0452989339828491, "learning_rate": 0.0002, "epoch": 2.7441016333938295, "step": 3780}, {"loss": 0.8868, "grad_norm": 0.5615278482437134, "learning_rate": 0.0002, "epoch": 2.751361161524501, "step": 3790}, {"loss": 0.9047, "grad_norm": 0.46439215540885925, "learning_rate": 0.0002, "epoch": 2.7586206896551726, "step": 3800}, {"loss": 0.7847, "grad_norm": 0.7134917974472046, "learning_rate": 0.0002, "epoch": 2.7658802177858437, "step": 3810}, {"loss": 0.8315, "grad_norm": 0.5139115452766418, "learning_rate": 0.0002, "epoch": 2.7731397459165157, "step": 3820}, {"loss": 0.8995, "grad_norm": 0.8595781326293945, "learning_rate": 0.0002, "epoch": 2.780399274047187, "step": 3830}, {"loss": 0.9079, "grad_norm": 0.544614851474762, "learning_rate": 0.0002, "epoch": 2.7876588021778583, "step": 3840}, {"loss": 0.8322, "grad_norm": 0.6073850393295288, "learning_rate": 0.0002, "epoch": 2.79491833030853, "step": 3850}, {"loss": 0.8277, "grad_norm": 0.8224069476127625, "learning_rate": 0.0002, "epoch": 2.8021778584392014, "step": 3860}, {"loss": 0.815, "grad_norm": 0.5347970128059387, "learning_rate": 0.0002, "epoch": 2.809437386569873, "step": 3870}, {"loss": 0.8219, "grad_norm": 0.6799601912498474, "learning_rate": 0.0002, "epoch": 2.8166969147005445, "step": 3880}, {"loss": 0.8796, "grad_norm": 0.5219197869300842, "learning_rate": 0.0002, "epoch": 2.823956442831216, "step": 3890}, {"loss": 0.8649, "grad_norm": 0.5710130333900452, "learning_rate": 0.0002, "epoch": 2.8312159709618876, "step": 3900}, {"loss": 0.8741, "grad_norm": 0.5857213139533997, "learning_rate": 0.0002, "epoch": 2.838475499092559, "step": 3910}, {"loss": 0.8154, "grad_norm": 0.5206644535064697, "learning_rate": 0.0002, "epoch": 2.8457350272232302, "step": 3920}, {"loss": 0.9328, "grad_norm": 1.2902015447616577, "learning_rate": 0.0002, "epoch": 2.8529945553539022, "step": 3930}, {"loss": 0.8192, "grad_norm": 0.5252797603607178, "learning_rate": 0.0002, "epoch": 2.8602540834845733, "step": 3940}, {"loss": 0.9047, "grad_norm": 0.5925108790397644, "learning_rate": 0.0002, "epoch": 2.867513611615245, "step": 3950}, {"loss": 0.8611, "grad_norm": 0.9719610810279846, "learning_rate": 0.0002, "epoch": 2.8747731397459164, "step": 3960}, {"loss": 0.8531, "grad_norm": 0.6834747195243835, "learning_rate": 0.0002, "epoch": 2.882032667876588, "step": 3970}, {"loss": 0.8124, "grad_norm": 0.6952353119850159, "learning_rate": 0.0002, "epoch": 2.8892921960072595, "step": 3980}, {"loss": 0.8365, "grad_norm": 0.49889910221099854, "learning_rate": 0.0002, "epoch": 2.896551724137931, "step": 3990}, {"loss": 0.8197, "grad_norm": 0.5007026791572571, "learning_rate": 0.0002, "epoch": 2.9038112522686026, "step": 4000}, {"loss": 0.7752, "grad_norm": 0.5474239587783813, "learning_rate": 0.0002, "epoch": 2.911070780399274, "step": 4010}, {"loss": 0.8579, "grad_norm": 0.6617428660392761, "learning_rate": 0.0002, "epoch": 2.9183303085299457, "step": 4020}, {"loss": 0.8583, "grad_norm": 0.6097776293754578, "learning_rate": 0.0002, "epoch": 2.925589836660617, "step": 4030}, {"loss": 0.8296, "grad_norm": 0.5985828638076782, "learning_rate": 0.0002, "epoch": 2.9328493647912888, "step": 4040}, {"loss": 0.9134, "grad_norm": 0.769488513469696, "learning_rate": 0.0002, "epoch": 2.94010889292196, "step": 4050}, {"loss": 0.8148, "grad_norm": 0.5167055130004883, "learning_rate": 0.0002, "epoch": 2.9473684210526314, "step": 4060}, {"loss": 0.8665, "grad_norm": 0.6265496015548706, "learning_rate": 0.0002, "epoch": 2.954627949183303, "step": 4070}, {"loss": 0.9218, "grad_norm": 1.2644082307815552, "learning_rate": 0.0002, "epoch": 2.9618874773139745, "step": 4080}, {"loss": 0.8026, "grad_norm": 0.6007561087608337, "learning_rate": 0.0002, "epoch": 2.969147005444646, "step": 4090}, {"loss": 0.8262, "grad_norm": 0.47984927892684937, "learning_rate": 0.0002, "epoch": 2.9764065335753176, "step": 4100}, {"loss": 0.7897, "grad_norm": 1.128198504447937, "learning_rate": 0.0002, "epoch": 2.983666061705989, "step": 4110}, {"loss": 0.7883, "grad_norm": 0.526292085647583, "learning_rate": 0.0002, "epoch": 2.9909255898366607, "step": 4120}, {"loss": 0.7801, "grad_norm": 0.5462674498558044, "learning_rate": 0.0002, "epoch": 2.9981851179673322, "step": 4130}, {"eval_loss": 1.1093357801437378, "eval_runtime": 46.2498, "eval_samples_per_second": 9.427, "eval_steps_per_second": 1.189, "epoch": 2.9996370235934666, "step": 4132}, {"loss": 0.721, "grad_norm": 0.48288026452064514, "learning_rate": 0.0002, "epoch": 3.0054446460980038, "step": 4140}, {"loss": 0.6769, "grad_norm": 1.0181782245635986, "learning_rate": 0.0002, "epoch": 3.0127041742286753, "step": 4150}, {"loss": 0.7185, "grad_norm": 0.7718019485473633, "learning_rate": 0.0002, "epoch": 3.019963702359347, "step": 4160}, {"loss": 0.6552, "grad_norm": 0.7492219805717468, "learning_rate": 0.0002, "epoch": 3.027223230490018, "step": 4170}, {"loss": 0.6678, "grad_norm": 0.9363632798194885, "learning_rate": 0.0002, "epoch": 3.0344827586206895, "step": 4180}, {"loss": 0.7187, "grad_norm": 0.6888533234596252, "learning_rate": 0.0002, "epoch": 3.041742286751361, "step": 4190}, {"loss": 0.6469, "grad_norm": 0.7072834968566895, "learning_rate": 0.0002, "epoch": 3.0490018148820326, "step": 4200}, {"loss": 0.6387, "grad_norm": 0.7182047963142395, "learning_rate": 0.0002, "epoch": 3.056261343012704, "step": 4210}, {"loss": 0.6385, "grad_norm": 0.7194355130195618, "learning_rate": 0.0002, "epoch": 3.0635208711433757, "step": 4220}, {"loss": 0.5812, "grad_norm": 0.9454023838043213, "learning_rate": 0.0002, "epoch": 3.0707803992740472, "step": 4230}, {"loss": 0.6036, "grad_norm": 0.838657557964325, "learning_rate": 0.0002, "epoch": 3.0780399274047188, "step": 4240}, {"loss": 0.646, "grad_norm": 0.740113377571106, "learning_rate": 0.0002, "epoch": 3.0852994555353903, "step": 4250}, {"loss": 0.604, "grad_norm": 0.6616561412811279, "learning_rate": 0.0002, "epoch": 3.092558983666062, "step": 4260}, {"loss": 0.6462, "grad_norm": 0.8846506476402283, "learning_rate": 0.0002, "epoch": 3.0998185117967334, "step": 4270}, {"loss": 0.6037, "grad_norm": 0.6322125792503357, "learning_rate": 0.0002, "epoch": 3.107078039927405, "step": 4280}, {"loss": 0.5953, "grad_norm": 0.7461467385292053, "learning_rate": 0.0002, "epoch": 3.114337568058076, "step": 4290}, {"loss": 0.6099, "grad_norm": 0.8251287341117859, "learning_rate": 0.0002, "epoch": 3.1215970961887476, "step": 4300}, {"loss": 0.6284, "grad_norm": 0.8767673373222351, "learning_rate": 0.0002, "epoch": 3.128856624319419, "step": 4310}, {"loss": 0.7535, "grad_norm": 0.7758759260177612, "learning_rate": 0.0002, "epoch": 3.1361161524500907, "step": 4320}, {"loss": 0.6624, "grad_norm": 1.1056879758834839, "learning_rate": 0.0002, "epoch": 3.143375680580762, "step": 4330}, {"loss": 0.691, "grad_norm": 0.8259835243225098, "learning_rate": 0.0002, "epoch": 3.1506352087114338, "step": 4340}, {"loss": 0.6635, "grad_norm": 0.6607027053833008, "learning_rate": 0.0002, "epoch": 3.1578947368421053, "step": 4350}, {"loss": 0.5911, "grad_norm": 0.7983301281929016, "learning_rate": 0.0002, "epoch": 3.165154264972777, "step": 4360}, {"loss": 0.6496, "grad_norm": 0.6725239157676697, "learning_rate": 0.0002, "epoch": 3.1724137931034484, "step": 4370}, {"loss": 0.5966, "grad_norm": 0.9052095413208008, "learning_rate": 0.0002, "epoch": 3.17967332123412, "step": 4380}, {"loss": 0.6877, "grad_norm": 0.8131307363510132, "learning_rate": 0.0002, "epoch": 3.1869328493647915, "step": 4390}, {"loss": 0.6384, "grad_norm": 0.6435626149177551, "learning_rate": 0.0002, "epoch": 3.1941923774954626, "step": 4400}, {"loss": 0.5819, "grad_norm": 0.84367436170578, "learning_rate": 0.0002, "epoch": 3.201451905626134, "step": 4410}, {"loss": 0.6104, "grad_norm": 1.5018867254257202, "learning_rate": 0.0002, "epoch": 3.2087114337568057, "step": 4420}, {"loss": 0.6838, "grad_norm": 0.7019091844558716, "learning_rate": 0.0002, "epoch": 3.215970961887477, "step": 4430}, {"loss": 0.6153, "grad_norm": 0.9164197444915771, "learning_rate": 0.0002, "epoch": 3.2232304900181488, "step": 4440}, {"loss": 0.6618, "grad_norm": 0.7890861630439758, "learning_rate": 0.0002, "epoch": 3.2304900181488203, "step": 4450}, {"loss": 0.6401, "grad_norm": 0.6517660617828369, "learning_rate": 0.0002, "epoch": 3.237749546279492, "step": 4460}, {"loss": 0.6699, "grad_norm": 1.10188889503479, "learning_rate": 0.0002, "epoch": 3.2450090744101634, "step": 4470}, {"loss": 0.6356, "grad_norm": 0.8158330917358398, "learning_rate": 0.0002, "epoch": 3.252268602540835, "step": 4480}, {"loss": 0.7757, "grad_norm": 0.7663109302520752, "learning_rate": 0.0002, "epoch": 3.2595281306715065, "step": 4490}, {"loss": 0.6539, "grad_norm": 0.8473444581031799, "learning_rate": 0.0002, "epoch": 3.266787658802178, "step": 4500}, {"loss": 0.6511, "grad_norm": 0.9724768996238708, "learning_rate": 0.0002, "epoch": 3.274047186932849, "step": 4510}, {"loss": 0.5464, "grad_norm": 0.8516759276390076, "learning_rate": 0.0002, "epoch": 3.281306715063521, "step": 4520}, {"loss": 0.6534, "grad_norm": 0.7543437480926514, "learning_rate": 0.0002, "epoch": 3.288566243194192, "step": 4530}, {"loss": 0.6095, "grad_norm": 1.0472029447555542, "learning_rate": 0.0002, "epoch": 3.2958257713248638, "step": 4540}, {"loss": 0.6216, "grad_norm": 0.6240826845169067, "learning_rate": 0.0002, "epoch": 3.3030852994555353, "step": 4550}, {"loss": 0.6223, "grad_norm": 0.9957774877548218, "learning_rate": 0.0002, "epoch": 3.310344827586207, "step": 4560}, {"loss": 0.618, "grad_norm": 0.6448912620544434, "learning_rate": 0.0002, "epoch": 3.3176043557168784, "step": 4570}, {"loss": 0.6188, "grad_norm": 0.7519692778587341, "learning_rate": 0.0002, "epoch": 3.32486388384755, "step": 4580}, {"loss": 0.6672, "grad_norm": 0.7367453575134277, "learning_rate": 0.0002, "epoch": 3.3321234119782215, "step": 4590}, {"loss": 0.6517, "grad_norm": 0.8064960837364197, "learning_rate": 0.0002, "epoch": 3.339382940108893, "step": 4600}, {"loss": 0.6062, "grad_norm": 0.7664631009101868, "learning_rate": 0.0002, "epoch": 3.3466424682395646, "step": 4610}, {"loss": 0.6834, "grad_norm": 0.7803396582603455, "learning_rate": 0.0002, "epoch": 3.353901996370236, "step": 4620}, {"loss": 0.6961, "grad_norm": 0.9141599535942078, "learning_rate": 0.0002, "epoch": 3.3611615245009077, "step": 4630}, {"loss": 0.6889, "grad_norm": 0.9719856381416321, "learning_rate": 0.0002, "epoch": 3.3684210526315788, "step": 4640}, {"loss": 0.6914, "grad_norm": 0.9223218560218811, "learning_rate": 0.0002, "epoch": 3.3756805807622503, "step": 4650}, {"loss": 0.5981, "grad_norm": 0.7289277911186218, "learning_rate": 0.0002, "epoch": 3.382940108892922, "step": 4660}, {"loss": 0.595, "grad_norm": 1.039724349975586, "learning_rate": 0.0002, "epoch": 3.3901996370235934, "step": 4670}, {"loss": 0.8121, "grad_norm": 1.397438883781433, "learning_rate": 0.0002, "epoch": 3.397459165154265, "step": 4680}, {"loss": 0.6334, "grad_norm": 1.0069999694824219, "learning_rate": 0.0002, "epoch": 3.4047186932849365, "step": 4690}, {"loss": 0.6598, "grad_norm": 0.816291332244873, "learning_rate": 0.0002, "epoch": 3.411978221415608, "step": 4700}, {"loss": 0.6748, "grad_norm": 1.2831530570983887, "learning_rate": 0.0002, "epoch": 3.4192377495462796, "step": 4710}, {"loss": 0.6625, "grad_norm": 0.9573889970779419, "learning_rate": 0.0002, "epoch": 3.426497277676951, "step": 4720}, {"loss": 0.7279, "grad_norm": 0.7685632705688477, "learning_rate": 0.0002, "epoch": 3.4337568058076227, "step": 4730}, {"loss": 0.6104, "grad_norm": 0.7019195556640625, "learning_rate": 0.0002, "epoch": 3.441016333938294, "step": 4740}, {"loss": 0.7606, "grad_norm": 0.7244833707809448, "learning_rate": 0.0002, "epoch": 3.4482758620689653, "step": 4750}, {"loss": 0.6951, "grad_norm": 1.3468551635742188, "learning_rate": 0.0002, "epoch": 3.455535390199637, "step": 4760}, {"loss": 0.6945, "grad_norm": 0.822846531867981, "learning_rate": 0.0002, "epoch": 3.4627949183303084, "step": 4770}, {"loss": 0.6431, "grad_norm": 0.7311608195304871, "learning_rate": 0.0002, "epoch": 3.47005444646098, "step": 4780}, {"loss": 0.7019, "grad_norm": 0.9466770887374878, "learning_rate": 0.0002, "epoch": 3.4773139745916515, "step": 4790}, {"loss": 0.7767, "grad_norm": 1.1527155637741089, "learning_rate": 0.0002, "epoch": 3.484573502722323, "step": 4800}, {"loss": 0.6882, "grad_norm": 1.1288906335830688, "learning_rate": 0.0002, "epoch": 3.4918330308529946, "step": 4810}, {"loss": 0.6564, "grad_norm": 0.9096164107322693, "learning_rate": 0.0002, "epoch": 3.499092558983666, "step": 4820}, {"loss": 0.6127, "grad_norm": 0.7988565564155579, "learning_rate": 0.0002, "epoch": 3.5063520871143377, "step": 4830}, {"loss": 0.7004, "grad_norm": 0.7183415293693542, "learning_rate": 0.0002, "epoch": 3.513611615245009, "step": 4840}, {"loss": 0.74, "grad_norm": 0.6614915132522583, "learning_rate": 0.0002, "epoch": 3.5208711433756807, "step": 4850}, {"loss": 0.7271, "grad_norm": 0.8609521985054016, "learning_rate": 0.0002, "epoch": 3.528130671506352, "step": 4860}, {"loss": 0.6664, "grad_norm": 0.86552894115448, "learning_rate": 0.0002, "epoch": 3.535390199637024, "step": 4870}, {"loss": 0.6432, "grad_norm": 0.6926496028900146, "learning_rate": 0.0002, "epoch": 3.542649727767695, "step": 4880}, {"loss": 0.7117, "grad_norm": 0.8157467246055603, "learning_rate": 0.0002, "epoch": 3.5499092558983665, "step": 4890}, {"loss": 0.6201, "grad_norm": 0.9085357189178467, "learning_rate": 0.0002, "epoch": 3.557168784029038, "step": 4900}, {"loss": 0.6521, "grad_norm": 0.6322644948959351, "learning_rate": 0.0002, "epoch": 3.5644283121597096, "step": 4910}, {"loss": 0.6607, "grad_norm": 1.263205885887146, "learning_rate": 0.0002, "epoch": 3.571687840290381, "step": 4920}, {"loss": 0.6657, "grad_norm": 0.8901070356369019, "learning_rate": 0.0002, "epoch": 3.5789473684210527, "step": 4930}, {"loss": 0.6434, "grad_norm": 0.7983952164649963, "learning_rate": 0.0002, "epoch": 3.586206896551724, "step": 4940}, {"loss": 0.6861, "grad_norm": 0.9887813925743103, "learning_rate": 0.0002, "epoch": 3.5934664246823957, "step": 4950}, {"loss": 0.6502, "grad_norm": 0.7895187735557556, "learning_rate": 0.0002, "epoch": 3.6007259528130673, "step": 4960}, {"loss": 0.7111, "grad_norm": 0.9685819745063782, "learning_rate": 0.0002, "epoch": 3.6079854809437384, "step": 4970}, {"loss": 0.6915, "grad_norm": 0.6576591730117798, "learning_rate": 0.0002, "epoch": 3.6152450090744104, "step": 4980}, {"loss": 0.6195, "grad_norm": 0.856985330581665, "learning_rate": 0.0002, "epoch": 3.6225045372050815, "step": 4990}, {"loss": 0.6318, "grad_norm": 0.7230252623558044, "learning_rate": 0.0002, "epoch": 3.629764065335753, "step": 5000}, {"loss": 0.742, "grad_norm": 0.8260893821716309, "learning_rate": 0.0002, "epoch": 3.6370235934664246, "step": 5010}, {"loss": 0.7223, "grad_norm": 0.7635950446128845, "learning_rate": 0.0002, "epoch": 3.644283121597096, "step": 5020}, {"loss": 0.6837, "grad_norm": 0.7060768604278564, "learning_rate": 0.0002, "epoch": 3.6515426497277677, "step": 5030}, {"loss": 0.6921, "grad_norm": 0.8020303249359131, "learning_rate": 0.0002, "epoch": 3.658802177858439, "step": 5040}, {"loss": 0.6446, "grad_norm": 0.8530341386795044, "learning_rate": 0.0002, "epoch": 3.6660617059891107, "step": 5050}, {"loss": 0.7222, "grad_norm": 0.6667101979255676, "learning_rate": 0.0002, "epoch": 3.6733212341197823, "step": 5060}, {"loss": 0.7081, "grad_norm": 0.7385406494140625, "learning_rate": 0.0002, "epoch": 3.680580762250454, "step": 5070}, {"loss": 0.7035, "grad_norm": 0.7753380537033081, "learning_rate": 0.0002, "epoch": 3.6878402903811254, "step": 5080}, {"loss": 0.6491, "grad_norm": 0.7516207098960876, "learning_rate": 0.0002, "epoch": 3.695099818511797, "step": 5090}, {"loss": 0.672, "grad_norm": 0.8171586394309998, "learning_rate": 0.0002, "epoch": 3.702359346642468, "step": 5100}, {"loss": 0.7459, "grad_norm": 1.0796279907226562, "learning_rate": 0.0002, "epoch": 3.70961887477314, "step": 5110}, {"loss": 0.5948, "grad_norm": 0.6957688927650452, "learning_rate": 0.0002, "epoch": 3.716878402903811, "step": 5120}, {"loss": 0.7515, "grad_norm": 0.8550161719322205, "learning_rate": 0.0002, "epoch": 3.7241379310344827, "step": 5130}, {"loss": 0.7286, "grad_norm": 0.9396728277206421, "learning_rate": 0.0002, "epoch": 3.731397459165154, "step": 5140}, {"loss": 0.7594, "grad_norm": 1.4264805316925049, "learning_rate": 0.0002, "epoch": 3.7386569872958257, "step": 5150}, {"loss": 0.6575, "grad_norm": 0.8725108504295349, "learning_rate": 0.0002, "epoch": 3.7459165154264973, "step": 5160}, {"loss": 0.6778, "grad_norm": 1.0346195697784424, "learning_rate": 0.0002, "epoch": 3.753176043557169, "step": 5170}, {"loss": 0.6371, "grad_norm": 0.5395554304122925, "learning_rate": 0.0002, "epoch": 3.7604355716878404, "step": 5180}, {"loss": 0.7308, "grad_norm": 1.3153616189956665, "learning_rate": 0.0002, "epoch": 3.767695099818512, "step": 5190}, {"loss": 0.78, "grad_norm": 0.9879828691482544, "learning_rate": 0.0002, "epoch": 3.7749546279491835, "step": 5200}, {"loss": 0.7068, "grad_norm": 0.8876672983169556, "learning_rate": 0.0002, "epoch": 3.7822141560798546, "step": 5210}, {"loss": 0.6283, "grad_norm": 0.8363267779350281, "learning_rate": 0.0002, "epoch": 3.7894736842105265, "step": 5220}, {"loss": 0.6255, "grad_norm": 0.637294590473175, "learning_rate": 0.0002, "epoch": 3.7967332123411976, "step": 5230}, {"loss": 0.6685, "grad_norm": 1.1408970355987549, "learning_rate": 0.0002, "epoch": 3.803992740471869, "step": 5240}, {"loss": 0.6761, "grad_norm": 1.0128360986709595, "learning_rate": 0.0002, "epoch": 3.8112522686025407, "step": 5250}, {"loss": 0.6764, "grad_norm": 0.8061144351959229, "learning_rate": 0.0002, "epoch": 3.8185117967332123, "step": 5260}, {"loss": 0.7254, "grad_norm": 0.9626626968383789, "learning_rate": 0.0002, "epoch": 3.825771324863884, "step": 5270}, {"loss": 0.7367, "grad_norm": 0.9013627171516418, "learning_rate": 0.0002, "epoch": 3.8330308529945554, "step": 5280}, {"loss": 0.6806, "grad_norm": 0.8411344289779663, "learning_rate": 0.0002, "epoch": 3.840290381125227, "step": 5290}, {"loss": 0.6818, "grad_norm": 0.7426059246063232, "learning_rate": 0.0002, "epoch": 3.8475499092558985, "step": 5300}, {"loss": 0.6748, "grad_norm": 1.003413438796997, "learning_rate": 0.0002, "epoch": 3.85480943738657, "step": 5310}, {"loss": 0.8554, "grad_norm": 0.7527840733528137, "learning_rate": 0.0002, "epoch": 3.862068965517241, "step": 5320}, {"loss": 0.7521, "grad_norm": 0.738610565662384, "learning_rate": 0.0002, "epoch": 3.869328493647913, "step": 5330}, {"loss": 0.7266, "grad_norm": 0.7277999520301819, "learning_rate": 0.0002, "epoch": 3.876588021778584, "step": 5340}, {"loss": 0.7503, "grad_norm": 0.5951359272003174, "learning_rate": 0.0002, "epoch": 3.8838475499092557, "step": 5350}, {"loss": 0.7447, "grad_norm": 1.043884038925171, "learning_rate": 0.0002, "epoch": 3.8911070780399273, "step": 5360}, {"loss": 0.6862, "grad_norm": 0.8436498045921326, "learning_rate": 0.0002, "epoch": 3.898366606170599, "step": 5370}, {"loss": 0.665, "grad_norm": 0.5603365302085876, "learning_rate": 0.0002, "epoch": 3.9056261343012704, "step": 5380}, {"loss": 0.7098, "grad_norm": 1.0128886699676514, "learning_rate": 0.0002, "epoch": 3.912885662431942, "step": 5390}, {"loss": 0.6707, "grad_norm": 0.7970930337905884, "learning_rate": 0.0002, "epoch": 3.9201451905626135, "step": 5400}, {"loss": 0.637, "grad_norm": 0.7699369192123413, "learning_rate": 0.0002, "epoch": 3.927404718693285, "step": 5410}, {"loss": 0.6742, "grad_norm": 0.800561249256134, "learning_rate": 0.0002, "epoch": 3.9346642468239565, "step": 5420}, {"loss": 0.7208, "grad_norm": 0.8020331859588623, "learning_rate": 0.0002, "epoch": 3.941923774954628, "step": 5430}, {"loss": 0.7294, "grad_norm": 0.7461140155792236, "learning_rate": 0.0002, "epoch": 3.9491833030852996, "step": 5440}, {"loss": 0.7013, "grad_norm": 0.8346918821334839, "learning_rate": 0.0002, "epoch": 3.9564428312159707, "step": 5450}, {"loss": 0.6289, "grad_norm": 0.9723302125930786, "learning_rate": 0.0002, "epoch": 3.9637023593466427, "step": 5460}, {"loss": 0.8029, "grad_norm": 0.6809740662574768, "learning_rate": 0.0002, "epoch": 3.970961887477314, "step": 5470}, {"loss": 0.6896, "grad_norm": 0.7353498339653015, "learning_rate": 0.0002, "epoch": 3.9782214156079854, "step": 5480}, {"loss": 0.6722, "grad_norm": 0.748009443283081, "learning_rate": 0.0002, "epoch": 3.985480943738657, "step": 5490}, {"loss": 0.6866, "grad_norm": 1.3656195402145386, "learning_rate": 0.0002, "epoch": 3.9927404718693285, "step": 5500}, {"loss": 0.7368, "grad_norm": 0.8402108550071716, "learning_rate": 0.0002, "epoch": 4.0, "step": 5510}]} +{"epoch": 4.999637023593467, "step": 6887, "epoch_duration": 2059.8379588127136, "total_accumulated_duration": 16346.150188922882, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 24486.0}, "peak_memory_reserved": {"GPU_0": 24752.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.4936, "grad_norm": 0.46010470390319824, "learning_rate": 0.0002, "epoch": 0.007259528130671506, "step": 10}, {"loss": 1.8442, "grad_norm": 0.5103179216384888, "learning_rate": 0.0002, "epoch": 0.014519056261343012, "step": 20}, {"loss": 1.559, "grad_norm": 0.6282716989517212, "learning_rate": 0.0002, "epoch": 0.021778584392014518, "step": 30}, {"loss": 1.3618, "grad_norm": 1.2480497360229492, "learning_rate": 0.0002, "epoch": 0.029038112522686024, "step": 40}, {"loss": 1.3874, "grad_norm": 0.4114855229854584, "learning_rate": 0.0002, "epoch": 0.036297640653357534, "step": 50}, {"loss": 1.2836, "grad_norm": 0.49482840299606323, "learning_rate": 0.0002, "epoch": 0.043557168784029036, "step": 60}, {"loss": 1.1728, "grad_norm": 0.4536272883415222, "learning_rate": 0.0002, "epoch": 0.050816696914700546, "step": 70}, {"loss": 1.1125, "grad_norm": 0.32328274846076965, "learning_rate": 0.0002, "epoch": 0.05807622504537205, "step": 80}, {"loss": 1.2447, "grad_norm": 0.40990468859672546, "learning_rate": 0.0002, "epoch": 0.06533575317604355, "step": 90}, {"loss": 1.2216, "grad_norm": 0.37273502349853516, "learning_rate": 0.0002, "epoch": 0.07259528130671507, "step": 100}, {"loss": 1.2259, "grad_norm": 0.3903871476650238, "learning_rate": 0.0002, "epoch": 0.07985480943738657, "step": 110}, {"loss": 1.3718, "grad_norm": 0.3272787928581238, "learning_rate": 0.0002, "epoch": 0.08711433756805807, "step": 120}, {"loss": 1.1832, "grad_norm": 0.3622824251651764, "learning_rate": 0.0002, "epoch": 0.09437386569872959, "step": 130}, {"loss": 1.2966, "grad_norm": 0.3503916561603546, "learning_rate": 0.0002, "epoch": 0.10163339382940109, "step": 140}, {"loss": 1.0913, "grad_norm": 0.32787832617759705, "learning_rate": 0.0002, "epoch": 0.1088929219600726, "step": 150}, {"loss": 1.2143, "grad_norm": 0.5822657942771912, "learning_rate": 0.0002, "epoch": 0.1161524500907441, "step": 160}, {"loss": 1.2091, "grad_norm": 0.28028249740600586, "learning_rate": 0.0002, "epoch": 0.12341197822141561, "step": 170}, {"loss": 1.1731, "grad_norm": 0.33602750301361084, "learning_rate": 0.0002, "epoch": 0.1306715063520871, "step": 180}, {"loss": 1.127, "grad_norm": 0.29106274247169495, "learning_rate": 0.0002, "epoch": 0.13793103448275862, "step": 190}, {"loss": 1.3261, "grad_norm": 0.38753363490104675, "learning_rate": 0.0002, "epoch": 0.14519056261343014, "step": 200}, {"loss": 1.1536, "grad_norm": 0.361009418964386, "learning_rate": 0.0002, "epoch": 0.15245009074410162, "step": 210}, {"loss": 1.1483, "grad_norm": 0.6743836402893066, "learning_rate": 0.0002, "epoch": 0.15970961887477314, "step": 220}, {"loss": 1.264, "grad_norm": 0.3923613727092743, "learning_rate": 0.0002, "epoch": 0.16696914700544466, "step": 230}, {"loss": 1.0437, "grad_norm": 0.2809699773788452, "learning_rate": 0.0002, "epoch": 0.17422867513611615, "step": 240}, {"loss": 1.1826, "grad_norm": 0.3631494641304016, "learning_rate": 0.0002, "epoch": 0.18148820326678766, "step": 250}, {"loss": 1.1584, "grad_norm": 0.24658171832561493, "learning_rate": 0.0002, "epoch": 0.18874773139745918, "step": 260}, {"loss": 1.0065, "grad_norm": 0.5780664682388306, "learning_rate": 0.0002, "epoch": 0.19600725952813067, "step": 270}, {"loss": 1.1251, "grad_norm": 0.3056720495223999, "learning_rate": 0.0002, "epoch": 0.20326678765880218, "step": 280}, {"loss": 1.1503, "grad_norm": 0.2857084572315216, "learning_rate": 0.0002, "epoch": 0.21052631578947367, "step": 290}, {"loss": 1.1215, "grad_norm": 0.6645345687866211, "learning_rate": 0.0002, "epoch": 0.2177858439201452, "step": 300}, {"loss": 1.127, "grad_norm": 0.5966078639030457, "learning_rate": 0.0002, "epoch": 0.2250453720508167, "step": 310}, {"loss": 1.1974, "grad_norm": 0.40937140583992004, "learning_rate": 0.0002, "epoch": 0.2323049001814882, "step": 320}, {"loss": 1.2385, "grad_norm": 0.5642806887626648, "learning_rate": 0.0002, "epoch": 0.2395644283121597, "step": 330}, {"loss": 1.1703, "grad_norm": 0.2750748097896576, "learning_rate": 0.0002, "epoch": 0.24682395644283123, "step": 340}, {"loss": 1.2598, "grad_norm": 0.34350234270095825, "learning_rate": 0.0002, "epoch": 0.2540834845735027, "step": 350}, {"loss": 1.1942, "grad_norm": 0.6767239570617676, "learning_rate": 0.0002, "epoch": 0.2613430127041742, "step": 360}, {"loss": 1.1436, "grad_norm": 0.31006959080696106, "learning_rate": 0.0002, "epoch": 0.26860254083484575, "step": 370}, {"loss": 1.2084, "grad_norm": 0.3825474679470062, "learning_rate": 0.0002, "epoch": 0.27586206896551724, "step": 380}, {"loss": 1.1523, "grad_norm": 0.30241551995277405, "learning_rate": 0.0002, "epoch": 0.2831215970961887, "step": 390}, {"loss": 1.1298, "grad_norm": 0.2962397336959839, "learning_rate": 0.0002, "epoch": 0.29038112522686027, "step": 400}, {"loss": 1.1299, "grad_norm": 0.2600369155406952, "learning_rate": 0.0002, "epoch": 0.29764065335753176, "step": 410}, {"loss": 1.1366, "grad_norm": 0.3675060272216797, "learning_rate": 0.0002, "epoch": 0.30490018148820325, "step": 420}, {"loss": 1.156, "grad_norm": 0.3429498076438904, "learning_rate": 0.0002, "epoch": 0.3121597096188748, "step": 430}, {"loss": 1.2741, "grad_norm": 0.34311825037002563, "learning_rate": 0.0002, "epoch": 0.3194192377495463, "step": 440}, {"loss": 1.3523, "grad_norm": 0.37872210144996643, "learning_rate": 0.0002, "epoch": 0.32667876588021777, "step": 450}, {"loss": 1.1365, "grad_norm": 0.33271121978759766, "learning_rate": 0.0002, "epoch": 0.3339382940108893, "step": 460}, {"loss": 1.1572, "grad_norm": 0.34605276584625244, "learning_rate": 0.0002, "epoch": 0.3411978221415608, "step": 470}, {"loss": 1.2755, "grad_norm": 0.41050562262535095, "learning_rate": 0.0002, "epoch": 0.3484573502722323, "step": 480}, {"loss": 1.075, "grad_norm": 0.2066836953163147, "learning_rate": 0.0002, "epoch": 0.35571687840290384, "step": 490}, {"loss": 1.2072, "grad_norm": 0.2859014868736267, "learning_rate": 0.0002, "epoch": 0.3629764065335753, "step": 500}, {"loss": 1.1435, "grad_norm": 0.28763777017593384, "learning_rate": 0.0002, "epoch": 0.3702359346642468, "step": 510}, {"loss": 1.1341, "grad_norm": 0.2730471193790436, "learning_rate": 0.0002, "epoch": 0.37749546279491836, "step": 520}, {"loss": 1.348, "grad_norm": 0.3968936800956726, "learning_rate": 0.0002, "epoch": 0.38475499092558985, "step": 530}, {"loss": 1.2363, "grad_norm": 0.3624701201915741, "learning_rate": 0.0002, "epoch": 0.39201451905626133, "step": 540}, {"loss": 1.1206, "grad_norm": 0.3303608298301697, "learning_rate": 0.0002, "epoch": 0.3992740471869328, "step": 550}, {"loss": 1.203, "grad_norm": 0.33507466316223145, "learning_rate": 0.0002, "epoch": 0.40653357531760437, "step": 560}, {"loss": 1.2294, "grad_norm": 0.3297670781612396, "learning_rate": 0.0002, "epoch": 0.41379310344827586, "step": 570}, {"loss": 1.2, "grad_norm": 0.32334890961647034, "learning_rate": 0.0002, "epoch": 0.42105263157894735, "step": 580}, {"loss": 1.112, "grad_norm": 0.30281195044517517, "learning_rate": 0.0002, "epoch": 0.4283121597096189, "step": 590}, {"loss": 1.1352, "grad_norm": 0.5900027751922607, "learning_rate": 0.0002, "epoch": 0.4355716878402904, "step": 600}, {"loss": 1.1575, "grad_norm": 0.28437477350234985, "learning_rate": 0.0002, "epoch": 0.44283121597096187, "step": 610}, {"loss": 1.2009, "grad_norm": 0.39601704478263855, "learning_rate": 0.0002, "epoch": 0.4500907441016334, "step": 620}, {"loss": 1.2302, "grad_norm": 0.41971510648727417, "learning_rate": 0.0002, "epoch": 0.4573502722323049, "step": 630}, {"loss": 1.236, "grad_norm": 0.33814409375190735, "learning_rate": 0.0002, "epoch": 0.4646098003629764, "step": 640}, {"loss": 1.2471, "grad_norm": 0.575718104839325, "learning_rate": 0.0002, "epoch": 0.47186932849364793, "step": 650}, {"loss": 1.1012, "grad_norm": 0.37927401065826416, "learning_rate": 0.0002, "epoch": 0.4791288566243194, "step": 660}, {"loss": 1.1552, "grad_norm": 0.3224332630634308, "learning_rate": 0.0002, "epoch": 0.4863883847549909, "step": 670}, {"loss": 1.1354, "grad_norm": 0.32683515548706055, "learning_rate": 0.0002, "epoch": 0.49364791288566245, "step": 680}, {"loss": 1.207, "grad_norm": 0.4316163659095764, "learning_rate": 0.0002, "epoch": 0.5009074410163339, "step": 690}, {"loss": 1.2116, "grad_norm": 0.342602401971817, "learning_rate": 0.0002, "epoch": 0.5081669691470054, "step": 700}, {"loss": 1.0433, "grad_norm": 0.2794898748397827, "learning_rate": 0.0002, "epoch": 0.515426497277677, "step": 710}, {"loss": 1.2395, "grad_norm": 0.3322339951992035, "learning_rate": 0.0002, "epoch": 0.5226860254083484, "step": 720}, {"loss": 1.0762, "grad_norm": 0.3088509142398834, "learning_rate": 0.0002, "epoch": 0.52994555353902, "step": 730}, {"loss": 1.1495, "grad_norm": 0.24444378912448883, "learning_rate": 0.0002, "epoch": 0.5372050816696915, "step": 740}, {"loss": 1.2176, "grad_norm": 0.3483171761035919, "learning_rate": 0.0002, "epoch": 0.5444646098003629, "step": 750}, {"loss": 1.2248, "grad_norm": 0.3425690531730652, "learning_rate": 0.0002, "epoch": 0.5517241379310345, "step": 760}, {"loss": 1.1127, "grad_norm": 0.31841927766799927, "learning_rate": 0.0002, "epoch": 0.558983666061706, "step": 770}, {"loss": 1.1841, "grad_norm": 0.39423868060112, "learning_rate": 0.0002, "epoch": 0.5662431941923775, "step": 780}, {"loss": 1.0583, "grad_norm": 0.30328479409217834, "learning_rate": 0.0002, "epoch": 0.573502722323049, "step": 790}, {"loss": 1.0718, "grad_norm": 0.24475938081741333, "learning_rate": 0.0002, "epoch": 0.5807622504537205, "step": 800}, {"loss": 1.2006, "grad_norm": 0.37132805585861206, "learning_rate": 0.0002, "epoch": 0.588021778584392, "step": 810}, {"loss": 1.1544, "grad_norm": 0.32195979356765747, "learning_rate": 0.0002, "epoch": 0.5952813067150635, "step": 820}, {"loss": 0.9937, "grad_norm": 0.2848738729953766, "learning_rate": 0.0002, "epoch": 0.6025408348457351, "step": 830}, {"loss": 1.0937, "grad_norm": 0.28015264868736267, "learning_rate": 0.0002, "epoch": 0.6098003629764065, "step": 840}, {"loss": 1.1234, "grad_norm": 0.37796008586883545, "learning_rate": 0.0002, "epoch": 0.617059891107078, "step": 850}, {"loss": 1.1455, "grad_norm": 0.39311841130256653, "learning_rate": 0.0002, "epoch": 0.6243194192377496, "step": 860}, {"loss": 1.1066, "grad_norm": 0.2761685252189636, "learning_rate": 0.0002, "epoch": 0.631578947368421, "step": 870}, {"loss": 1.0859, "grad_norm": 0.3826720118522644, "learning_rate": 0.0002, "epoch": 0.6388384754990926, "step": 880}, {"loss": 1.1182, "grad_norm": 0.30076679587364197, "learning_rate": 0.0002, "epoch": 0.6460980036297641, "step": 890}, {"loss": 1.0927, "grad_norm": 0.21997687220573425, "learning_rate": 0.0002, "epoch": 0.6533575317604355, "step": 900}, {"loss": 1.1603, "grad_norm": 0.32593777775764465, "learning_rate": 0.0002, "epoch": 0.6606170598911071, "step": 910}, {"loss": 1.1578, "grad_norm": 0.30347898602485657, "learning_rate": 0.0002, "epoch": 0.6678765880217786, "step": 920}, {"loss": 1.1642, "grad_norm": 0.44173774123191833, "learning_rate": 0.0002, "epoch": 0.6751361161524501, "step": 930}, {"loss": 1.1832, "grad_norm": 0.2507467269897461, "learning_rate": 0.0002, "epoch": 0.6823956442831216, "step": 940}, {"loss": 1.3001, "grad_norm": 0.29463833570480347, "learning_rate": 0.0002, "epoch": 0.6896551724137931, "step": 950}, {"loss": 1.1651, "grad_norm": 0.9363154172897339, "learning_rate": 0.0002, "epoch": 0.6969147005444646, "step": 960}, {"loss": 1.158, "grad_norm": 0.3236212134361267, "learning_rate": 0.0002, "epoch": 0.7041742286751361, "step": 970}, {"loss": 1.1078, "grad_norm": 0.3123254179954529, "learning_rate": 0.0002, "epoch": 0.7114337568058077, "step": 980}, {"loss": 1.1578, "grad_norm": 0.3395805060863495, "learning_rate": 0.0002, "epoch": 0.7186932849364791, "step": 990}, {"loss": 1.0754, "grad_norm": 0.3240964412689209, "learning_rate": 0.0002, "epoch": 0.7259528130671506, "step": 1000}, {"loss": 1.0313, "grad_norm": 0.31902948021888733, "learning_rate": 0.0002, "epoch": 0.7332123411978222, "step": 1010}, {"loss": 1.1664, "grad_norm": 0.4848408102989197, "learning_rate": 0.0002, "epoch": 0.7404718693284936, "step": 1020}, {"loss": 1.0838, "grad_norm": 0.33006033301353455, "learning_rate": 0.0002, "epoch": 0.7477313974591652, "step": 1030}, {"loss": 1.0979, "grad_norm": 0.2928730547428131, "learning_rate": 0.0002, "epoch": 0.7549909255898367, "step": 1040}, {"loss": 1.1351, "grad_norm": 0.3529164791107178, "learning_rate": 0.0002, "epoch": 0.7622504537205081, "step": 1050}, {"loss": 1.1274, "grad_norm": 0.2736213803291321, "learning_rate": 0.0002, "epoch": 0.7695099818511797, "step": 1060}, {"loss": 1.1169, "grad_norm": 0.7200686931610107, "learning_rate": 0.0002, "epoch": 0.7767695099818511, "step": 1070}, {"loss": 1.1171, "grad_norm": 0.33396708965301514, "learning_rate": 0.0002, "epoch": 0.7840290381125227, "step": 1080}, {"loss": 1.2447, "grad_norm": 1.5760449171066284, "learning_rate": 0.0002, "epoch": 0.7912885662431942, "step": 1090}, {"loss": 1.1126, "grad_norm": 0.28138381242752075, "learning_rate": 0.0002, "epoch": 0.7985480943738656, "step": 1100}, {"loss": 1.2012, "grad_norm": 0.2597472369670868, "learning_rate": 0.0002, "epoch": 0.8058076225045372, "step": 1110}, {"loss": 1.1177, "grad_norm": 0.3305445611476898, "learning_rate": 0.0002, "epoch": 0.8130671506352087, "step": 1120}, {"loss": 1.0849, "grad_norm": 0.3934599459171295, "learning_rate": 0.0002, "epoch": 0.8203266787658802, "step": 1130}, {"loss": 1.0889, "grad_norm": 0.3472191393375397, "learning_rate": 0.0002, "epoch": 0.8275862068965517, "step": 1140}, {"loss": 1.265, "grad_norm": 0.2857365906238556, "learning_rate": 0.0002, "epoch": 0.8348457350272233, "step": 1150}, {"loss": 1.03, "grad_norm": 0.3207702934741974, "learning_rate": 0.0002, "epoch": 0.8421052631578947, "step": 1160}, {"loss": 1.1669, "grad_norm": 0.3176484704017639, "learning_rate": 0.0002, "epoch": 0.8493647912885662, "step": 1170}, {"loss": 1.1386, "grad_norm": 0.40685558319091797, "learning_rate": 0.0002, "epoch": 0.8566243194192378, "step": 1180}, {"loss": 1.1383, "grad_norm": 0.31125199794769287, "learning_rate": 0.0002, "epoch": 0.8638838475499092, "step": 1190}, {"loss": 1.1512, "grad_norm": 0.7361181378364563, "learning_rate": 0.0002, "epoch": 0.8711433756805808, "step": 1200}, {"loss": 0.9983, "grad_norm": 0.33699527382850647, "learning_rate": 0.0002, "epoch": 0.8784029038112523, "step": 1210}, {"loss": 1.11, "grad_norm": 0.3315220773220062, "learning_rate": 0.0002, "epoch": 0.8856624319419237, "step": 1220}, {"loss": 1.0266, "grad_norm": 0.6256054043769836, "learning_rate": 0.0002, "epoch": 0.8929219600725953, "step": 1230}, {"loss": 1.1738, "grad_norm": 0.3692137897014618, "learning_rate": 0.0002, "epoch": 0.9001814882032668, "step": 1240}, {"loss": 1.1112, "grad_norm": 0.3538484573364258, "learning_rate": 0.0002, "epoch": 0.9074410163339383, "step": 1250}, {"loss": 1.0878, "grad_norm": 0.27863040566444397, "learning_rate": 0.0002, "epoch": 0.9147005444646098, "step": 1260}, {"loss": 1.0826, "grad_norm": 0.3322528302669525, "learning_rate": 0.0002, "epoch": 0.9219600725952813, "step": 1270}, {"loss": 1.0158, "grad_norm": 0.7553173303604126, "learning_rate": 0.0002, "epoch": 0.9292196007259528, "step": 1280}, {"loss": 1.198, "grad_norm": 0.4856191575527191, "learning_rate": 0.0002, "epoch": 0.9364791288566243, "step": 1290}, {"loss": 1.1678, "grad_norm": 0.3668074905872345, "learning_rate": 0.0002, "epoch": 0.9437386569872959, "step": 1300}, {"loss": 1.1809, "grad_norm": 0.29851067066192627, "learning_rate": 0.0002, "epoch": 0.9509981851179673, "step": 1310}, {"loss": 1.1299, "grad_norm": 0.276664674282074, "learning_rate": 0.0002, "epoch": 0.9582577132486388, "step": 1320}, {"loss": 1.0597, "grad_norm": 0.2941018342971802, "learning_rate": 0.0002, "epoch": 0.9655172413793104, "step": 1330}, {"loss": 1.1731, "grad_norm": 0.3505859076976776, "learning_rate": 0.0002, "epoch": 0.9727767695099818, "step": 1340}, {"loss": 1.1809, "grad_norm": 0.3067687451839447, "learning_rate": 0.0002, "epoch": 0.9800362976406534, "step": 1350}, {"loss": 1.1511, "grad_norm": 0.27151066064834595, "learning_rate": 0.0002, "epoch": 0.9872958257713249, "step": 1360}, {"loss": 1.1919, "grad_norm": 0.36370083689689636, "learning_rate": 0.0002, "epoch": 0.9945553539019963, "step": 1370}, {"eval_loss": 1.1381088495254517, "eval_runtime": 96.0848, "eval_samples_per_second": 4.538, "eval_steps_per_second": 0.572, "epoch": 0.9996370235934664, "step": 1377}, {"loss": 1.0082, "grad_norm": 0.27980583906173706, "learning_rate": 0.0002, "epoch": 1.0018148820326678, "step": 1380}, {"loss": 1.0565, "grad_norm": 0.26713913679122925, "learning_rate": 0.0002, "epoch": 1.0090744101633393, "step": 1390}, {"loss": 1.0241, "grad_norm": 0.3089541494846344, "learning_rate": 0.0002, "epoch": 1.0163339382940109, "step": 1400}, {"loss": 0.9649, "grad_norm": 0.4188242256641388, "learning_rate": 0.0002, "epoch": 1.0235934664246824, "step": 1410}, {"loss": 0.9821, "grad_norm": 0.5246463418006897, "learning_rate": 0.0002, "epoch": 1.030852994555354, "step": 1420}, {"loss": 1.0143, "grad_norm": 0.2728777825832367, "learning_rate": 0.0002, "epoch": 1.0381125226860255, "step": 1430}, {"loss": 1.075, "grad_norm": 0.38167616724967957, "learning_rate": 0.0002, "epoch": 1.0453720508166968, "step": 1440}, {"loss": 1.0229, "grad_norm": 0.4439380168914795, "learning_rate": 0.0002, "epoch": 1.0526315789473684, "step": 1450}, {"loss": 1.0451, "grad_norm": 0.30954182147979736, "learning_rate": 0.0002, "epoch": 1.05989110707804, "step": 1460}, {"loss": 1.0762, "grad_norm": 0.4022280275821686, "learning_rate": 0.0002, "epoch": 1.0671506352087115, "step": 1470}, {"loss": 1.0754, "grad_norm": 0.7390811443328857, "learning_rate": 0.0002, "epoch": 1.074410163339383, "step": 1480}, {"loss": 1.1378, "grad_norm": 0.3885486423969269, "learning_rate": 0.0002, "epoch": 1.0816696914700545, "step": 1490}, {"loss": 1.005, "grad_norm": 0.5275560617446899, "learning_rate": 0.0002, "epoch": 1.0889292196007259, "step": 1500}, {"loss": 1.0147, "grad_norm": 0.35112282633781433, "learning_rate": 0.0002, "epoch": 1.0961887477313974, "step": 1510}, {"loss": 1.0331, "grad_norm": 0.33714351058006287, "learning_rate": 0.0002, "epoch": 1.103448275862069, "step": 1520}, {"loss": 0.8723, "grad_norm": 0.31221693754196167, "learning_rate": 0.0002, "epoch": 1.1107078039927405, "step": 1530}, {"loss": 1.0701, "grad_norm": 0.27549654245376587, "learning_rate": 0.0002, "epoch": 1.117967332123412, "step": 1540}, {"loss": 1.0114, "grad_norm": 0.8465521335601807, "learning_rate": 0.0002, "epoch": 1.1252268602540836, "step": 1550}, {"loss": 0.9643, "grad_norm": 0.36125949025154114, "learning_rate": 0.0002, "epoch": 1.132486388384755, "step": 1560}, {"loss": 1.0744, "grad_norm": 0.37420371174812317, "learning_rate": 0.0002, "epoch": 1.1397459165154264, "step": 1570}, {"loss": 1.165, "grad_norm": 0.3294760584831238, "learning_rate": 0.0002, "epoch": 1.147005444646098, "step": 1580}, {"loss": 0.9207, "grad_norm": 0.3881238102912903, "learning_rate": 0.0002, "epoch": 1.1542649727767695, "step": 1590}, {"loss": 1.0983, "grad_norm": 0.4766491651535034, "learning_rate": 0.0002, "epoch": 1.161524500907441, "step": 1600}, {"loss": 1.0092, "grad_norm": 0.395530104637146, "learning_rate": 0.0002, "epoch": 1.1687840290381124, "step": 1610}, {"loss": 0.9895, "grad_norm": 0.3297106623649597, "learning_rate": 0.0002, "epoch": 1.176043557168784, "step": 1620}, {"loss": 1.1121, "grad_norm": 0.39528271555900574, "learning_rate": 0.0002, "epoch": 1.1833030852994555, "step": 1630}, {"loss": 0.9202, "grad_norm": 0.3370221257209778, "learning_rate": 0.0002, "epoch": 1.190562613430127, "step": 1640}, {"loss": 1.102, "grad_norm": 0.31922030448913574, "learning_rate": 0.0002, "epoch": 1.1978221415607986, "step": 1650}, {"loss": 1.0565, "grad_norm": 0.6142027378082275, "learning_rate": 0.0002, "epoch": 1.2050816696914701, "step": 1660}, {"loss": 0.9541, "grad_norm": 0.44769710302352905, "learning_rate": 0.0002, "epoch": 1.2123411978221417, "step": 1670}, {"loss": 1.2501, "grad_norm": 0.41233646869659424, "learning_rate": 0.0002, "epoch": 1.219600725952813, "step": 1680}, {"loss": 1.0147, "grad_norm": 0.2928866147994995, "learning_rate": 0.0002, "epoch": 1.2268602540834845, "step": 1690}, {"loss": 0.9074, "grad_norm": 0.36913734674453735, "learning_rate": 0.0002, "epoch": 1.234119782214156, "step": 1700}, {"loss": 1.0847, "grad_norm": 0.5281891226768494, "learning_rate": 0.0002, "epoch": 1.2413793103448276, "step": 1710}, {"loss": 0.9482, "grad_norm": 0.3374697268009186, "learning_rate": 0.0002, "epoch": 1.2486388384754992, "step": 1720}, {"loss": 0.9826, "grad_norm": 0.3802020847797394, "learning_rate": 0.0002, "epoch": 1.2558983666061705, "step": 1730}, {"loss": 1.0191, "grad_norm": 0.38048651814460754, "learning_rate": 0.0002, "epoch": 1.263157894736842, "step": 1740}, {"loss": 1.0142, "grad_norm": 0.6676169633865356, "learning_rate": 0.0002, "epoch": 1.2704174228675136, "step": 1750}, {"loss": 0.992, "grad_norm": 0.4075961410999298, "learning_rate": 0.0002, "epoch": 1.2776769509981851, "step": 1760}, {"loss": 1.0301, "grad_norm": 0.4374721348285675, "learning_rate": 0.0002, "epoch": 1.2849364791288567, "step": 1770}, {"loss": 1.0601, "grad_norm": 0.4638824164867401, "learning_rate": 0.0002, "epoch": 1.2921960072595282, "step": 1780}, {"loss": 0.9593, "grad_norm": 0.38631564378738403, "learning_rate": 0.0002, "epoch": 1.2994555353901998, "step": 1790}, {"loss": 1.0042, "grad_norm": 0.35873809456825256, "learning_rate": 0.0002, "epoch": 1.306715063520871, "step": 1800}, {"loss": 0.9835, "grad_norm": 0.33602237701416016, "learning_rate": 0.0002, "epoch": 1.3139745916515426, "step": 1810}, {"loss": 1.1032, "grad_norm": 0.46696463227272034, "learning_rate": 0.0002, "epoch": 1.3212341197822142, "step": 1820}, {"loss": 1.0123, "grad_norm": 0.368958979845047, "learning_rate": 0.0002, "epoch": 1.3284936479128857, "step": 1830}, {"loss": 1.0456, "grad_norm": 0.3160957396030426, "learning_rate": 0.0002, "epoch": 1.335753176043557, "step": 1840}, {"loss": 1.0264, "grad_norm": 0.4511511027812958, "learning_rate": 0.0002, "epoch": 1.3430127041742286, "step": 1850}, {"loss": 1.0179, "grad_norm": 0.6769845485687256, "learning_rate": 0.0002, "epoch": 1.3502722323049001, "step": 1860}, {"loss": 1.011, "grad_norm": 0.3749309480190277, "learning_rate": 0.0002, "epoch": 1.3575317604355717, "step": 1870}, {"loss": 1.0223, "grad_norm": 0.553798496723175, "learning_rate": 0.0002, "epoch": 1.3647912885662432, "step": 1880}, {"loss": 0.9634, "grad_norm": 0.3538985550403595, "learning_rate": 0.0002, "epoch": 1.3720508166969148, "step": 1890}, {"loss": 1.0275, "grad_norm": 0.5501534938812256, "learning_rate": 0.0002, "epoch": 1.3793103448275863, "step": 1900}, {"loss": 1.0461, "grad_norm": 0.4432051181793213, "learning_rate": 0.0002, "epoch": 1.3865698729582578, "step": 1910}, {"loss": 0.9082, "grad_norm": 0.41755786538124084, "learning_rate": 0.0002, "epoch": 1.3938294010889292, "step": 1920}, {"loss": 1.0858, "grad_norm": 0.5626114010810852, "learning_rate": 0.0002, "epoch": 1.4010889292196007, "step": 1930}, {"loss": 1.0687, "grad_norm": 0.44038185477256775, "learning_rate": 0.0002, "epoch": 1.4083484573502723, "step": 1940}, {"loss": 0.9454, "grad_norm": 0.3397001624107361, "learning_rate": 0.0002, "epoch": 1.4156079854809438, "step": 1950}, {"loss": 0.943, "grad_norm": 0.4325368106365204, "learning_rate": 0.0002, "epoch": 1.4228675136116151, "step": 1960}, {"loss": 1.0488, "grad_norm": 0.3900907039642334, "learning_rate": 0.0002, "epoch": 1.4301270417422867, "step": 1970}, {"loss": 0.9699, "grad_norm": 0.369612455368042, "learning_rate": 0.0002, "epoch": 1.4373865698729582, "step": 1980}, {"loss": 1.0609, "grad_norm": 0.4389338791370392, "learning_rate": 0.0002, "epoch": 1.4446460980036298, "step": 1990}, {"loss": 1.042, "grad_norm": 1.694450855255127, "learning_rate": 0.0002, "epoch": 1.4519056261343013, "step": 2000}, {"loss": 0.997, "grad_norm": 0.516957700252533, "learning_rate": 0.0002, "epoch": 1.4591651542649728, "step": 2010}, {"loss": 1.0979, "grad_norm": 0.45515501499176025, "learning_rate": 0.0002, "epoch": 1.4664246823956444, "step": 2020}, {"loss": 1.0188, "grad_norm": 0.4153139591217041, "learning_rate": 0.0002, "epoch": 1.4736842105263157, "step": 2030}, {"loss": 1.1521, "grad_norm": 0.44353052973747253, "learning_rate": 0.0002, "epoch": 1.4809437386569873, "step": 2040}, {"loss": 0.9653, "grad_norm": 0.570554256439209, "learning_rate": 0.0002, "epoch": 1.4882032667876588, "step": 2050}, {"loss": 1.0352, "grad_norm": 0.5742740035057068, "learning_rate": 0.0002, "epoch": 1.4954627949183303, "step": 2060}, {"loss": 1.0963, "grad_norm": 0.5890517830848694, "learning_rate": 0.0002, "epoch": 1.5027223230490017, "step": 2070}, {"loss": 1.0639, "grad_norm": 0.4162650406360626, "learning_rate": 0.0002, "epoch": 1.5099818511796732, "step": 2080}, {"loss": 1.071, "grad_norm": 0.7334717512130737, "learning_rate": 0.0002, "epoch": 1.5172413793103448, "step": 2090}, {"loss": 0.8957, "grad_norm": 0.2963249683380127, "learning_rate": 0.0002, "epoch": 1.5245009074410163, "step": 2100}, {"loss": 1.0446, "grad_norm": 0.30676454305648804, "learning_rate": 0.0002, "epoch": 1.5317604355716878, "step": 2110}, {"loss": 0.9967, "grad_norm": 0.35984641313552856, "learning_rate": 0.0002, "epoch": 1.5390199637023594, "step": 2120}, {"loss": 1.0352, "grad_norm": 0.3384549617767334, "learning_rate": 0.0002, "epoch": 1.546279491833031, "step": 2130}, {"loss": 0.9874, "grad_norm": 0.4725518226623535, "learning_rate": 0.0002, "epoch": 1.5535390199637025, "step": 2140}, {"loss": 1.1166, "grad_norm": 0.3252685070037842, "learning_rate": 0.0002, "epoch": 1.560798548094374, "step": 2150}, {"loss": 0.9127, "grad_norm": 0.45043081045150757, "learning_rate": 0.0002, "epoch": 1.5680580762250453, "step": 2160}, {"loss": 1.0767, "grad_norm": 0.374208003282547, "learning_rate": 0.0002, "epoch": 1.5753176043557169, "step": 2170}, {"loss": 1.0255, "grad_norm": 0.5118404030799866, "learning_rate": 0.0002, "epoch": 1.5825771324863884, "step": 2180}, {"loss": 1.0062, "grad_norm": 0.482170969247818, "learning_rate": 0.0002, "epoch": 1.5898366606170597, "step": 2190}, {"loss": 1.0872, "grad_norm": 0.5337533950805664, "learning_rate": 0.0002, "epoch": 1.5970961887477313, "step": 2200}, {"loss": 1.0405, "grad_norm": 0.5195064544677734, "learning_rate": 0.0002, "epoch": 1.6043557168784028, "step": 2210}, {"loss": 1.0454, "grad_norm": 0.30807098746299744, "learning_rate": 0.0002, "epoch": 1.6116152450090744, "step": 2220}, {"loss": 1.0293, "grad_norm": 0.3962925672531128, "learning_rate": 0.0002, "epoch": 1.618874773139746, "step": 2230}, {"loss": 1.0137, "grad_norm": 0.7636962532997131, "learning_rate": 0.0002, "epoch": 1.6261343012704175, "step": 2240}, {"loss": 0.991, "grad_norm": 0.32380592823028564, "learning_rate": 0.0002, "epoch": 1.633393829401089, "step": 2250}, {"loss": 0.9471, "grad_norm": 0.5767741799354553, "learning_rate": 0.0002, "epoch": 1.6406533575317606, "step": 2260}, {"loss": 0.9524, "grad_norm": 0.39964812994003296, "learning_rate": 0.0002, "epoch": 1.647912885662432, "step": 2270}, {"loss": 0.9866, "grad_norm": 0.622629702091217, "learning_rate": 0.0002, "epoch": 1.6551724137931034, "step": 2280}, {"loss": 0.9609, "grad_norm": 0.40202152729034424, "learning_rate": 0.0002, "epoch": 1.662431941923775, "step": 2290}, {"loss": 0.9859, "grad_norm": 0.4467349052429199, "learning_rate": 0.0002, "epoch": 1.6696914700544465, "step": 2300}, {"loss": 1.0312, "grad_norm": 0.5026949048042297, "learning_rate": 0.0002, "epoch": 1.6769509981851178, "step": 2310}, {"loss": 1.0776, "grad_norm": 0.43754541873931885, "learning_rate": 0.0002, "epoch": 1.6842105263157894, "step": 2320}, {"loss": 0.9786, "grad_norm": 0.42869430780410767, "learning_rate": 0.0002, "epoch": 1.691470054446461, "step": 2330}, {"loss": 1.0747, "grad_norm": 0.4192679524421692, "learning_rate": 0.0002, "epoch": 1.6987295825771325, "step": 2340}, {"loss": 1.0054, "grad_norm": 0.3243155777454376, "learning_rate": 0.0002, "epoch": 1.705989110707804, "step": 2350}, {"loss": 0.9165, "grad_norm": 1.0514075756072998, "learning_rate": 0.0002, "epoch": 1.7132486388384756, "step": 2360}, {"loss": 1.1353, "grad_norm": 0.4719122648239136, "learning_rate": 0.0002, "epoch": 1.720508166969147, "step": 2370}, {"loss": 0.9913, "grad_norm": 0.3846144676208496, "learning_rate": 0.0002, "epoch": 1.7277676950998186, "step": 2380}, {"loss": 0.9935, "grad_norm": 0.7266581058502197, "learning_rate": 0.0002, "epoch": 1.73502722323049, "step": 2390}, {"loss": 0.9509, "grad_norm": 0.6183241605758667, "learning_rate": 0.0002, "epoch": 1.7422867513611615, "step": 2400}, {"loss": 0.9713, "grad_norm": 0.3658260405063629, "learning_rate": 0.0002, "epoch": 1.749546279491833, "step": 2410}, {"loss": 1.0033, "grad_norm": 0.6036322712898254, "learning_rate": 0.0002, "epoch": 1.7568058076225044, "step": 2420}, {"loss": 1.0526, "grad_norm": 0.7872936129570007, "learning_rate": 0.0002, "epoch": 1.764065335753176, "step": 2430}, {"loss": 0.99, "grad_norm": 0.35946124792099, "learning_rate": 0.0002, "epoch": 1.7713248638838475, "step": 2440}, {"loss": 0.8845, "grad_norm": 0.3740338981151581, "learning_rate": 0.0002, "epoch": 1.778584392014519, "step": 2450}, {"loss": 0.9333, "grad_norm": 0.6150230169296265, "learning_rate": 0.0002, "epoch": 1.7858439201451906, "step": 2460}, {"loss": 1.0373, "grad_norm": 0.4726075530052185, "learning_rate": 0.0002, "epoch": 1.793103448275862, "step": 2470}, {"loss": 0.974, "grad_norm": 0.31292253732681274, "learning_rate": 0.0002, "epoch": 1.8003629764065336, "step": 2480}, {"loss": 1.0205, "grad_norm": 0.4463104009628296, "learning_rate": 0.0002, "epoch": 1.8076225045372052, "step": 2490}, {"loss": 1.0771, "grad_norm": 0.7848200798034668, "learning_rate": 0.0002, "epoch": 1.8148820326678767, "step": 2500}, {"loss": 0.9508, "grad_norm": 0.5562082529067993, "learning_rate": 0.0002, "epoch": 1.822141560798548, "step": 2510}, {"loss": 0.9343, "grad_norm": 0.39892756938934326, "learning_rate": 0.0002, "epoch": 1.8294010889292196, "step": 2520}, {"loss": 1.0489, "grad_norm": 1.2923320531845093, "learning_rate": 0.0002, "epoch": 1.8366606170598911, "step": 2530}, {"loss": 1.1203, "grad_norm": 0.6316490769386292, "learning_rate": 0.0002, "epoch": 1.8439201451905625, "step": 2540}, {"loss": 0.9131, "grad_norm": 0.46100497245788574, "learning_rate": 0.0002, "epoch": 1.851179673321234, "step": 2550}, {"loss": 1.0977, "grad_norm": 0.7902987003326416, "learning_rate": 0.0002, "epoch": 1.8584392014519056, "step": 2560}, {"loss": 0.9702, "grad_norm": 0.4596365690231323, "learning_rate": 0.0002, "epoch": 1.865698729582577, "step": 2570}, {"loss": 0.9322, "grad_norm": 0.6592172384262085, "learning_rate": 0.0002, "epoch": 1.8729582577132486, "step": 2580}, {"loss": 0.9549, "grad_norm": 0.5764662027359009, "learning_rate": 0.0002, "epoch": 1.8802177858439202, "step": 2590}, {"loss": 0.9485, "grad_norm": 0.8421637415885925, "learning_rate": 0.0002, "epoch": 1.8874773139745917, "step": 2600}, {"loss": 1.012, "grad_norm": 0.5635305047035217, "learning_rate": 0.0002, "epoch": 1.8947368421052633, "step": 2610}, {"loss": 0.8907, "grad_norm": 0.46231237053871155, "learning_rate": 0.0002, "epoch": 1.9019963702359348, "step": 2620}, {"loss": 0.9543, "grad_norm": 0.3944607079029083, "learning_rate": 0.0002, "epoch": 1.9092558983666061, "step": 2630}, {"loss": 0.9964, "grad_norm": 0.4753907322883606, "learning_rate": 0.0002, "epoch": 1.9165154264972777, "step": 2640}, {"loss": 1.0217, "grad_norm": 0.4151090979576111, "learning_rate": 0.0002, "epoch": 1.9237749546279492, "step": 2650}, {"loss": 1.0397, "grad_norm": 0.6793725490570068, "learning_rate": 0.0002, "epoch": 1.9310344827586206, "step": 2660}, {"loss": 1.0401, "grad_norm": 0.339755117893219, "learning_rate": 0.0002, "epoch": 1.938294010889292, "step": 2670}, {"loss": 1.082, "grad_norm": 0.40789374709129333, "learning_rate": 0.0002, "epoch": 1.9455535390199636, "step": 2680}, {"loss": 0.9973, "grad_norm": 0.3750005066394806, "learning_rate": 0.0002, "epoch": 1.9528130671506352, "step": 2690}, {"loss": 0.9837, "grad_norm": 0.39684441685676575, "learning_rate": 0.0002, "epoch": 1.9600725952813067, "step": 2700}, {"loss": 1.1204, "grad_norm": 0.378287672996521, "learning_rate": 0.0002, "epoch": 1.9673321234119783, "step": 2710}, {"loss": 0.9284, "grad_norm": 0.3668482005596161, "learning_rate": 0.0002, "epoch": 1.9745916515426498, "step": 2720}, {"loss": 1.0512, "grad_norm": 0.49997565150260925, "learning_rate": 0.0002, "epoch": 1.9818511796733214, "step": 2730}, {"loss": 0.9311, "grad_norm": 0.36852124333381653, "learning_rate": 0.0002, "epoch": 1.989110707803993, "step": 2740}, {"loss": 1.0898, "grad_norm": 0.5203380584716797, "learning_rate": 0.0002, "epoch": 1.9963702359346642, "step": 2750}, {"eval_loss": 1.1086540222167969, "eval_runtime": 95.6866, "eval_samples_per_second": 4.557, "eval_steps_per_second": 0.575, "epoch": 2.0, "step": 2755}, {"loss": 0.9791, "grad_norm": 0.35921573638916016, "learning_rate": 0.0002, "epoch": 2.0036297640653356, "step": 2760}, {"loss": 0.8968, "grad_norm": 1.013861894607544, "learning_rate": 0.0002, "epoch": 2.010889292196007, "step": 2770}, {"loss": 0.7873, "grad_norm": 0.4425240159034729, "learning_rate": 0.0002, "epoch": 2.0181488203266786, "step": 2780}, {"loss": 0.8758, "grad_norm": 0.60174161195755, "learning_rate": 0.0002, "epoch": 2.02540834845735, "step": 2790}, {"loss": 0.7989, "grad_norm": 0.47582098841667175, "learning_rate": 0.0002, "epoch": 2.0326678765880217, "step": 2800}, {"loss": 0.8898, "grad_norm": 0.6012811660766602, "learning_rate": 0.0002, "epoch": 2.0399274047186933, "step": 2810}, {"loss": 0.8109, "grad_norm": 0.4444408118724823, "learning_rate": 0.0002, "epoch": 2.047186932849365, "step": 2820}, {"loss": 0.8621, "grad_norm": 0.4864003360271454, "learning_rate": 0.0002, "epoch": 2.0544464609800364, "step": 2830}, {"loss": 0.8817, "grad_norm": 0.5104215741157532, "learning_rate": 0.0002, "epoch": 2.061705989110708, "step": 2840}, {"loss": 0.7789, "grad_norm": 0.6218489408493042, "learning_rate": 0.0002, "epoch": 2.0689655172413794, "step": 2850}, {"loss": 0.8778, "grad_norm": 0.705784261226654, "learning_rate": 0.0002, "epoch": 2.076225045372051, "step": 2860}, {"loss": 0.7851, "grad_norm": 0.48091503977775574, "learning_rate": 0.0002, "epoch": 2.0834845735027225, "step": 2870}, {"loss": 0.8444, "grad_norm": 0.5062456727027893, "learning_rate": 0.0002, "epoch": 2.0907441016333936, "step": 2880}, {"loss": 0.7898, "grad_norm": 0.4862022399902344, "learning_rate": 0.0002, "epoch": 2.098003629764065, "step": 2890}, {"loss": 0.8593, "grad_norm": 0.48264044523239136, "learning_rate": 0.0002, "epoch": 2.1052631578947367, "step": 2900}, {"loss": 0.8325, "grad_norm": 0.43744346499443054, "learning_rate": 0.0002, "epoch": 2.1125226860254083, "step": 2910}, {"loss": 0.9099, "grad_norm": 0.5480492115020752, "learning_rate": 0.0002, "epoch": 2.11978221415608, "step": 2920}, {"loss": 0.7727, "grad_norm": 0.5068560838699341, "learning_rate": 0.0002, "epoch": 2.1270417422867514, "step": 2930}, {"loss": 0.8403, "grad_norm": 0.4650852680206299, "learning_rate": 0.0002, "epoch": 2.134301270417423, "step": 2940}, {"loss": 0.8467, "grad_norm": 0.4929981231689453, "learning_rate": 0.0002, "epoch": 2.1415607985480944, "step": 2950}, {"loss": 0.8747, "grad_norm": 0.6537389755249023, "learning_rate": 0.0002, "epoch": 2.148820326678766, "step": 2960}, {"loss": 0.7663, "grad_norm": 0.8032940626144409, "learning_rate": 0.0002, "epoch": 2.1560798548094375, "step": 2970}, {"loss": 0.7604, "grad_norm": 0.7131643891334534, "learning_rate": 0.0002, "epoch": 2.163339382940109, "step": 2980}, {"loss": 0.8424, "grad_norm": 0.6034275889396667, "learning_rate": 0.0002, "epoch": 2.1705989110707806, "step": 2990}, {"loss": 0.8838, "grad_norm": 0.6081095933914185, "learning_rate": 0.0002, "epoch": 2.1778584392014517, "step": 3000}, {"loss": 0.9013, "grad_norm": 0.5706912875175476, "learning_rate": 0.0002, "epoch": 2.1851179673321233, "step": 3010}, {"loss": 0.8473, "grad_norm": 0.6742380261421204, "learning_rate": 0.0002, "epoch": 2.192377495462795, "step": 3020}, {"loss": 0.8372, "grad_norm": 0.4847496449947357, "learning_rate": 0.0002, "epoch": 2.1996370235934664, "step": 3030}, {"loss": 0.8487, "grad_norm": 0.5529342889785767, "learning_rate": 0.0002, "epoch": 2.206896551724138, "step": 3040}, {"loss": 0.9057, "grad_norm": 0.6108783483505249, "learning_rate": 0.0002, "epoch": 2.2141560798548094, "step": 3050}, {"loss": 0.9659, "grad_norm": 0.8841571807861328, "learning_rate": 0.0002, "epoch": 2.221415607985481, "step": 3060}, {"loss": 0.8538, "grad_norm": 0.4227530360221863, "learning_rate": 0.0002, "epoch": 2.2286751361161525, "step": 3070}, {"loss": 0.7925, "grad_norm": 0.461935818195343, "learning_rate": 0.0002, "epoch": 2.235934664246824, "step": 3080}, {"loss": 0.8542, "grad_norm": 0.5407412648200989, "learning_rate": 0.0002, "epoch": 2.2431941923774956, "step": 3090}, {"loss": 0.8392, "grad_norm": 0.5057998895645142, "learning_rate": 0.0002, "epoch": 2.250453720508167, "step": 3100}, {"loss": 0.7898, "grad_norm": 0.530057966709137, "learning_rate": 0.0002, "epoch": 2.2577132486388383, "step": 3110}, {"loss": 0.8507, "grad_norm": 0.5066842436790466, "learning_rate": 0.0002, "epoch": 2.26497277676951, "step": 3120}, {"loss": 0.7903, "grad_norm": 0.5069178342819214, "learning_rate": 0.0002, "epoch": 2.2722323049001814, "step": 3130}, {"loss": 0.8458, "grad_norm": 0.6095499396324158, "learning_rate": 0.0002, "epoch": 2.279491833030853, "step": 3140}, {"loss": 0.8171, "grad_norm": 0.49524766206741333, "learning_rate": 0.0002, "epoch": 2.2867513611615244, "step": 3150}, {"loss": 0.7568, "grad_norm": 0.5334409475326538, "learning_rate": 0.0002, "epoch": 2.294010889292196, "step": 3160}, {"loss": 0.7533, "grad_norm": 1.681748867034912, "learning_rate": 0.0002, "epoch": 2.3012704174228675, "step": 3170}, {"loss": 0.8155, "grad_norm": 0.7225565314292908, "learning_rate": 0.0002, "epoch": 2.308529945553539, "step": 3180}, {"loss": 0.8034, "grad_norm": 0.5379388928413391, "learning_rate": 0.0002, "epoch": 2.3157894736842106, "step": 3190}, {"loss": 0.8833, "grad_norm": 0.45770326256752014, "learning_rate": 0.0002, "epoch": 2.323049001814882, "step": 3200}, {"loss": 0.8419, "grad_norm": 0.6984533071517944, "learning_rate": 0.0002, "epoch": 2.3303085299455537, "step": 3210}, {"loss": 0.7414, "grad_norm": 0.6725744605064392, "learning_rate": 0.0002, "epoch": 2.337568058076225, "step": 3220}, {"loss": 0.8657, "grad_norm": 1.1247471570968628, "learning_rate": 0.0002, "epoch": 2.344827586206897, "step": 3230}, {"loss": 0.7782, "grad_norm": 1.0240263938903809, "learning_rate": 0.0002, "epoch": 2.352087114337568, "step": 3240}, {"loss": 0.826, "grad_norm": 0.5608096122741699, "learning_rate": 0.0002, "epoch": 2.3593466424682394, "step": 3250}, {"loss": 0.8191, "grad_norm": 0.8294990062713623, "learning_rate": 0.0002, "epoch": 2.366606170598911, "step": 3260}, {"loss": 0.9845, "grad_norm": 0.6734224557876587, "learning_rate": 0.0002, "epoch": 2.3738656987295825, "step": 3270}, {"loss": 0.7921, "grad_norm": 0.6862800717353821, "learning_rate": 0.0002, "epoch": 2.381125226860254, "step": 3280}, {"loss": 0.7752, "grad_norm": 0.5442930459976196, "learning_rate": 0.0002, "epoch": 2.3883847549909256, "step": 3290}, {"loss": 0.8543, "grad_norm": 0.5745394229888916, "learning_rate": 0.0002, "epoch": 2.395644283121597, "step": 3300}, {"loss": 0.833, "grad_norm": 0.6257799863815308, "learning_rate": 0.0002, "epoch": 2.4029038112522687, "step": 3310}, {"loss": 0.8191, "grad_norm": 0.5608420968055725, "learning_rate": 0.0002, "epoch": 2.4101633393829403, "step": 3320}, {"loss": 0.8745, "grad_norm": 0.5512017011642456, "learning_rate": 0.0002, "epoch": 2.417422867513612, "step": 3330}, {"loss": 0.8642, "grad_norm": 0.7452999353408813, "learning_rate": 0.0002, "epoch": 2.4246823956442833, "step": 3340}, {"loss": 0.8417, "grad_norm": 0.4604301452636719, "learning_rate": 0.0002, "epoch": 2.4319419237749544, "step": 3350}, {"loss": 0.9148, "grad_norm": 0.8225823640823364, "learning_rate": 0.0002, "epoch": 2.439201451905626, "step": 3360}, {"loss": 0.8144, "grad_norm": 0.8604981303215027, "learning_rate": 0.0002, "epoch": 2.4464609800362975, "step": 3370}, {"loss": 0.9276, "grad_norm": 0.6620925664901733, "learning_rate": 0.0002, "epoch": 2.453720508166969, "step": 3380}, {"loss": 0.8381, "grad_norm": 0.4750158488750458, "learning_rate": 0.0002, "epoch": 2.4609800362976406, "step": 3390}, {"loss": 0.8249, "grad_norm": 0.6061418056488037, "learning_rate": 0.0002, "epoch": 2.468239564428312, "step": 3400}, {"loss": 0.8944, "grad_norm": 0.5977247357368469, "learning_rate": 0.0002, "epoch": 2.4754990925589837, "step": 3410}, {"loss": 0.8366, "grad_norm": 0.5004227757453918, "learning_rate": 0.0002, "epoch": 2.4827586206896552, "step": 3420}, {"loss": 0.9133, "grad_norm": 0.46428972482681274, "learning_rate": 0.0002, "epoch": 2.490018148820327, "step": 3430}, {"loss": 0.8534, "grad_norm": 0.7658395171165466, "learning_rate": 0.0002, "epoch": 2.4972776769509983, "step": 3440}, {"loss": 0.9066, "grad_norm": 0.621819794178009, "learning_rate": 0.0002, "epoch": 2.50453720508167, "step": 3450}, {"loss": 0.8481, "grad_norm": 0.4556088149547577, "learning_rate": 0.0002, "epoch": 2.511796733212341, "step": 3460}, {"loss": 0.8522, "grad_norm": 0.6124140024185181, "learning_rate": 0.0002, "epoch": 2.519056261343013, "step": 3470}, {"loss": 0.8158, "grad_norm": 0.6256231665611267, "learning_rate": 0.0002, "epoch": 2.526315789473684, "step": 3480}, {"loss": 0.9097, "grad_norm": 0.5464141964912415, "learning_rate": 0.0002, "epoch": 2.5335753176043556, "step": 3490}, {"loss": 0.8708, "grad_norm": 0.51471346616745, "learning_rate": 0.0002, "epoch": 2.540834845735027, "step": 3500}, {"loss": 0.9042, "grad_norm": 0.5326165556907654, "learning_rate": 0.0002, "epoch": 2.5480943738656987, "step": 3510}, {"loss": 0.8749, "grad_norm": 0.4750378429889679, "learning_rate": 0.0002, "epoch": 2.5553539019963702, "step": 3520}, {"loss": 0.8248, "grad_norm": 0.5292279124259949, "learning_rate": 0.0002, "epoch": 2.562613430127042, "step": 3530}, {"loss": 0.8808, "grad_norm": 0.6145227551460266, "learning_rate": 0.0002, "epoch": 2.5698729582577133, "step": 3540}, {"loss": 0.846, "grad_norm": 0.8275189399719238, "learning_rate": 0.0002, "epoch": 2.577132486388385, "step": 3550}, {"loss": 0.875, "grad_norm": 0.5037438273429871, "learning_rate": 0.0002, "epoch": 2.5843920145190564, "step": 3560}, {"loss": 0.8324, "grad_norm": 0.5838707685470581, "learning_rate": 0.0002, "epoch": 2.5916515426497275, "step": 3570}, {"loss": 0.8716, "grad_norm": 0.5398710370063782, "learning_rate": 0.0002, "epoch": 2.5989110707803995, "step": 3580}, {"loss": 0.7843, "grad_norm": 0.6115376949310303, "learning_rate": 0.0002, "epoch": 2.6061705989110706, "step": 3590}, {"loss": 0.8142, "grad_norm": 1.118809461593628, "learning_rate": 0.0002, "epoch": 2.613430127041742, "step": 3600}, {"loss": 0.8282, "grad_norm": 0.7811771631240845, "learning_rate": 0.0002, "epoch": 2.6206896551724137, "step": 3610}, {"loss": 0.7997, "grad_norm": 0.5753175020217896, "learning_rate": 0.0002, "epoch": 2.6279491833030852, "step": 3620}, {"loss": 0.7705, "grad_norm": 0.550829291343689, "learning_rate": 0.0002, "epoch": 2.635208711433757, "step": 3630}, {"loss": 0.8564, "grad_norm": 0.5360019207000732, "learning_rate": 0.0002, "epoch": 2.6424682395644283, "step": 3640}, {"loss": 0.7648, "grad_norm": 0.63050377368927, "learning_rate": 0.0002, "epoch": 2.6497277676951, "step": 3650}, {"loss": 0.9156, "grad_norm": 0.5833110213279724, "learning_rate": 0.0002, "epoch": 2.6569872958257714, "step": 3660}, {"loss": 0.8033, "grad_norm": 0.5543047189712524, "learning_rate": 0.0002, "epoch": 2.664246823956443, "step": 3670}, {"loss": 0.8867, "grad_norm": 0.6842212080955505, "learning_rate": 0.0002, "epoch": 2.671506352087114, "step": 3680}, {"loss": 0.8569, "grad_norm": 0.6859333515167236, "learning_rate": 0.0002, "epoch": 2.678765880217786, "step": 3690}, {"loss": 0.8081, "grad_norm": 0.7038410902023315, "learning_rate": 0.0002, "epoch": 2.686025408348457, "step": 3700}, {"loss": 0.8686, "grad_norm": 0.447233647108078, "learning_rate": 0.0002, "epoch": 2.6932849364791287, "step": 3710}, {"loss": 0.8951, "grad_norm": 0.6453872919082642, "learning_rate": 0.0002, "epoch": 2.7005444646098002, "step": 3720}, {"loss": 0.8292, "grad_norm": 0.8025672435760498, "learning_rate": 0.0002, "epoch": 2.707803992740472, "step": 3730}, {"loss": 0.8488, "grad_norm": 0.5997087955474854, "learning_rate": 0.0002, "epoch": 2.7150635208711433, "step": 3740}, {"loss": 0.8308, "grad_norm": 0.6901142001152039, "learning_rate": 0.0002, "epoch": 2.722323049001815, "step": 3750}, {"loss": 0.8517, "grad_norm": 1.036145567893982, "learning_rate": 0.0002, "epoch": 2.7295825771324864, "step": 3760}, {"loss": 0.8455, "grad_norm": 0.7207072377204895, "learning_rate": 0.0002, "epoch": 2.736842105263158, "step": 3770}, {"loss": 0.8009, "grad_norm": 1.0452989339828491, "learning_rate": 0.0002, "epoch": 2.7441016333938295, "step": 3780}, {"loss": 0.8868, "grad_norm": 0.5615278482437134, "learning_rate": 0.0002, "epoch": 2.751361161524501, "step": 3790}, {"loss": 0.9047, "grad_norm": 0.46439215540885925, "learning_rate": 0.0002, "epoch": 2.7586206896551726, "step": 3800}, {"loss": 0.7847, "grad_norm": 0.7134917974472046, "learning_rate": 0.0002, "epoch": 2.7658802177858437, "step": 3810}, {"loss": 0.8315, "grad_norm": 0.5139115452766418, "learning_rate": 0.0002, "epoch": 2.7731397459165157, "step": 3820}, {"loss": 0.8995, "grad_norm": 0.8595781326293945, "learning_rate": 0.0002, "epoch": 2.780399274047187, "step": 3830}, {"loss": 0.9079, "grad_norm": 0.544614851474762, "learning_rate": 0.0002, "epoch": 2.7876588021778583, "step": 3840}, {"loss": 0.8322, "grad_norm": 0.6073850393295288, "learning_rate": 0.0002, "epoch": 2.79491833030853, "step": 3850}, {"loss": 0.8277, "grad_norm": 0.8224069476127625, "learning_rate": 0.0002, "epoch": 2.8021778584392014, "step": 3860}, {"loss": 0.815, "grad_norm": 0.5347970128059387, "learning_rate": 0.0002, "epoch": 2.809437386569873, "step": 3870}, {"loss": 0.8219, "grad_norm": 0.6799601912498474, "learning_rate": 0.0002, "epoch": 2.8166969147005445, "step": 3880}, {"loss": 0.8796, "grad_norm": 0.5219197869300842, "learning_rate": 0.0002, "epoch": 2.823956442831216, "step": 3890}, {"loss": 0.8649, "grad_norm": 0.5710130333900452, "learning_rate": 0.0002, "epoch": 2.8312159709618876, "step": 3900}, {"loss": 0.8741, "grad_norm": 0.5857213139533997, "learning_rate": 0.0002, "epoch": 2.838475499092559, "step": 3910}, {"loss": 0.8154, "grad_norm": 0.5206644535064697, "learning_rate": 0.0002, "epoch": 2.8457350272232302, "step": 3920}, {"loss": 0.9328, "grad_norm": 1.2902015447616577, "learning_rate": 0.0002, "epoch": 2.8529945553539022, "step": 3930}, {"loss": 0.8192, "grad_norm": 0.5252797603607178, "learning_rate": 0.0002, "epoch": 2.8602540834845733, "step": 3940}, {"loss": 0.9047, "grad_norm": 0.5925108790397644, "learning_rate": 0.0002, "epoch": 2.867513611615245, "step": 3950}, {"loss": 0.8611, "grad_norm": 0.9719610810279846, "learning_rate": 0.0002, "epoch": 2.8747731397459164, "step": 3960}, {"loss": 0.8531, "grad_norm": 0.6834747195243835, "learning_rate": 0.0002, "epoch": 2.882032667876588, "step": 3970}, {"loss": 0.8124, "grad_norm": 0.6952353119850159, "learning_rate": 0.0002, "epoch": 2.8892921960072595, "step": 3980}, {"loss": 0.8365, "grad_norm": 0.49889910221099854, "learning_rate": 0.0002, "epoch": 2.896551724137931, "step": 3990}, {"loss": 0.8197, "grad_norm": 0.5007026791572571, "learning_rate": 0.0002, "epoch": 2.9038112522686026, "step": 4000}, {"loss": 0.7752, "grad_norm": 0.5474239587783813, "learning_rate": 0.0002, "epoch": 2.911070780399274, "step": 4010}, {"loss": 0.8579, "grad_norm": 0.6617428660392761, "learning_rate": 0.0002, "epoch": 2.9183303085299457, "step": 4020}, {"loss": 0.8583, "grad_norm": 0.6097776293754578, "learning_rate": 0.0002, "epoch": 2.925589836660617, "step": 4030}, {"loss": 0.8296, "grad_norm": 0.5985828638076782, "learning_rate": 0.0002, "epoch": 2.9328493647912888, "step": 4040}, {"loss": 0.9134, "grad_norm": 0.769488513469696, "learning_rate": 0.0002, "epoch": 2.94010889292196, "step": 4050}, {"loss": 0.8148, "grad_norm": 0.5167055130004883, "learning_rate": 0.0002, "epoch": 2.9473684210526314, "step": 4060}, {"loss": 0.8665, "grad_norm": 0.6265496015548706, "learning_rate": 0.0002, "epoch": 2.954627949183303, "step": 4070}, {"loss": 0.9218, "grad_norm": 1.2644082307815552, "learning_rate": 0.0002, "epoch": 2.9618874773139745, "step": 4080}, {"loss": 0.8026, "grad_norm": 0.6007561087608337, "learning_rate": 0.0002, "epoch": 2.969147005444646, "step": 4090}, {"loss": 0.8262, "grad_norm": 0.47984927892684937, "learning_rate": 0.0002, "epoch": 2.9764065335753176, "step": 4100}, {"loss": 0.7897, "grad_norm": 1.128198504447937, "learning_rate": 0.0002, "epoch": 2.983666061705989, "step": 4110}, {"loss": 0.7883, "grad_norm": 0.526292085647583, "learning_rate": 0.0002, "epoch": 2.9909255898366607, "step": 4120}, {"loss": 0.7801, "grad_norm": 0.5462674498558044, "learning_rate": 0.0002, "epoch": 2.9981851179673322, "step": 4130}, {"eval_loss": 1.1093357801437378, "eval_runtime": 46.2498, "eval_samples_per_second": 9.427, "eval_steps_per_second": 1.189, "epoch": 2.9996370235934666, "step": 4132}, {"loss": 0.721, "grad_norm": 0.48288026452064514, "learning_rate": 0.0002, "epoch": 3.0054446460980038, "step": 4140}, {"loss": 0.6769, "grad_norm": 1.0181782245635986, "learning_rate": 0.0002, "epoch": 3.0127041742286753, "step": 4150}, {"loss": 0.7185, "grad_norm": 0.7718019485473633, "learning_rate": 0.0002, "epoch": 3.019963702359347, "step": 4160}, {"loss": 0.6552, "grad_norm": 0.7492219805717468, "learning_rate": 0.0002, "epoch": 3.027223230490018, "step": 4170}, {"loss": 0.6678, "grad_norm": 0.9363632798194885, "learning_rate": 0.0002, "epoch": 3.0344827586206895, "step": 4180}, {"loss": 0.7187, "grad_norm": 0.6888533234596252, "learning_rate": 0.0002, "epoch": 3.041742286751361, "step": 4190}, {"loss": 0.6469, "grad_norm": 0.7072834968566895, "learning_rate": 0.0002, "epoch": 3.0490018148820326, "step": 4200}, {"loss": 0.6387, "grad_norm": 0.7182047963142395, "learning_rate": 0.0002, "epoch": 3.056261343012704, "step": 4210}, {"loss": 0.6385, "grad_norm": 0.7194355130195618, "learning_rate": 0.0002, "epoch": 3.0635208711433757, "step": 4220}, {"loss": 0.5812, "grad_norm": 0.9454023838043213, "learning_rate": 0.0002, "epoch": 3.0707803992740472, "step": 4230}, {"loss": 0.6036, "grad_norm": 0.838657557964325, "learning_rate": 0.0002, "epoch": 3.0780399274047188, "step": 4240}, {"loss": 0.646, "grad_norm": 0.740113377571106, "learning_rate": 0.0002, "epoch": 3.0852994555353903, "step": 4250}, {"loss": 0.604, "grad_norm": 0.6616561412811279, "learning_rate": 0.0002, "epoch": 3.092558983666062, "step": 4260}, {"loss": 0.6462, "grad_norm": 0.8846506476402283, "learning_rate": 0.0002, "epoch": 3.0998185117967334, "step": 4270}, {"loss": 0.6037, "grad_norm": 0.6322125792503357, "learning_rate": 0.0002, "epoch": 3.107078039927405, "step": 4280}, {"loss": 0.5953, "grad_norm": 0.7461467385292053, "learning_rate": 0.0002, "epoch": 3.114337568058076, "step": 4290}, {"loss": 0.6099, "grad_norm": 0.8251287341117859, "learning_rate": 0.0002, "epoch": 3.1215970961887476, "step": 4300}, {"loss": 0.6284, "grad_norm": 0.8767673373222351, "learning_rate": 0.0002, "epoch": 3.128856624319419, "step": 4310}, {"loss": 0.7535, "grad_norm": 0.7758759260177612, "learning_rate": 0.0002, "epoch": 3.1361161524500907, "step": 4320}, {"loss": 0.6624, "grad_norm": 1.1056879758834839, "learning_rate": 0.0002, "epoch": 3.143375680580762, "step": 4330}, {"loss": 0.691, "grad_norm": 0.8259835243225098, "learning_rate": 0.0002, "epoch": 3.1506352087114338, "step": 4340}, {"loss": 0.6635, "grad_norm": 0.6607027053833008, "learning_rate": 0.0002, "epoch": 3.1578947368421053, "step": 4350}, {"loss": 0.5911, "grad_norm": 0.7983301281929016, "learning_rate": 0.0002, "epoch": 3.165154264972777, "step": 4360}, {"loss": 0.6496, "grad_norm": 0.6725239157676697, "learning_rate": 0.0002, "epoch": 3.1724137931034484, "step": 4370}, {"loss": 0.5966, "grad_norm": 0.9052095413208008, "learning_rate": 0.0002, "epoch": 3.17967332123412, "step": 4380}, {"loss": 0.6877, "grad_norm": 0.8131307363510132, "learning_rate": 0.0002, "epoch": 3.1869328493647915, "step": 4390}, {"loss": 0.6384, "grad_norm": 0.6435626149177551, "learning_rate": 0.0002, "epoch": 3.1941923774954626, "step": 4400}, {"loss": 0.5819, "grad_norm": 0.84367436170578, "learning_rate": 0.0002, "epoch": 3.201451905626134, "step": 4410}, {"loss": 0.6104, "grad_norm": 1.5018867254257202, "learning_rate": 0.0002, "epoch": 3.2087114337568057, "step": 4420}, {"loss": 0.6838, "grad_norm": 0.7019091844558716, "learning_rate": 0.0002, "epoch": 3.215970961887477, "step": 4430}, {"loss": 0.6153, "grad_norm": 0.9164197444915771, "learning_rate": 0.0002, "epoch": 3.2232304900181488, "step": 4440}, {"loss": 0.6618, "grad_norm": 0.7890861630439758, "learning_rate": 0.0002, "epoch": 3.2304900181488203, "step": 4450}, {"loss": 0.6401, "grad_norm": 0.6517660617828369, "learning_rate": 0.0002, "epoch": 3.237749546279492, "step": 4460}, {"loss": 0.6699, "grad_norm": 1.10188889503479, "learning_rate": 0.0002, "epoch": 3.2450090744101634, "step": 4470}, {"loss": 0.6356, "grad_norm": 0.8158330917358398, "learning_rate": 0.0002, "epoch": 3.252268602540835, "step": 4480}, {"loss": 0.7757, "grad_norm": 0.7663109302520752, "learning_rate": 0.0002, "epoch": 3.2595281306715065, "step": 4490}, {"loss": 0.6539, "grad_norm": 0.8473444581031799, "learning_rate": 0.0002, "epoch": 3.266787658802178, "step": 4500}, {"loss": 0.6511, "grad_norm": 0.9724768996238708, "learning_rate": 0.0002, "epoch": 3.274047186932849, "step": 4510}, {"loss": 0.5464, "grad_norm": 0.8516759276390076, "learning_rate": 0.0002, "epoch": 3.281306715063521, "step": 4520}, {"loss": 0.6534, "grad_norm": 0.7543437480926514, "learning_rate": 0.0002, "epoch": 3.288566243194192, "step": 4530}, {"loss": 0.6095, "grad_norm": 1.0472029447555542, "learning_rate": 0.0002, "epoch": 3.2958257713248638, "step": 4540}, {"loss": 0.6216, "grad_norm": 0.6240826845169067, "learning_rate": 0.0002, "epoch": 3.3030852994555353, "step": 4550}, {"loss": 0.6223, "grad_norm": 0.9957774877548218, "learning_rate": 0.0002, "epoch": 3.310344827586207, "step": 4560}, {"loss": 0.618, "grad_norm": 0.6448912620544434, "learning_rate": 0.0002, "epoch": 3.3176043557168784, "step": 4570}, {"loss": 0.6188, "grad_norm": 0.7519692778587341, "learning_rate": 0.0002, "epoch": 3.32486388384755, "step": 4580}, {"loss": 0.6672, "grad_norm": 0.7367453575134277, "learning_rate": 0.0002, "epoch": 3.3321234119782215, "step": 4590}, {"loss": 0.6517, "grad_norm": 0.8064960837364197, "learning_rate": 0.0002, "epoch": 3.339382940108893, "step": 4600}, {"loss": 0.6062, "grad_norm": 0.7664631009101868, "learning_rate": 0.0002, "epoch": 3.3466424682395646, "step": 4610}, {"loss": 0.6834, "grad_norm": 0.7803396582603455, "learning_rate": 0.0002, "epoch": 3.353901996370236, "step": 4620}, {"loss": 0.6961, "grad_norm": 0.9141599535942078, "learning_rate": 0.0002, "epoch": 3.3611615245009077, "step": 4630}, {"loss": 0.6889, "grad_norm": 0.9719856381416321, "learning_rate": 0.0002, "epoch": 3.3684210526315788, "step": 4640}, {"loss": 0.6914, "grad_norm": 0.9223218560218811, "learning_rate": 0.0002, "epoch": 3.3756805807622503, "step": 4650}, {"loss": 0.5981, "grad_norm": 0.7289277911186218, "learning_rate": 0.0002, "epoch": 3.382940108892922, "step": 4660}, {"loss": 0.595, "grad_norm": 1.039724349975586, "learning_rate": 0.0002, "epoch": 3.3901996370235934, "step": 4670}, {"loss": 0.8121, "grad_norm": 1.397438883781433, "learning_rate": 0.0002, "epoch": 3.397459165154265, "step": 4680}, {"loss": 0.6334, "grad_norm": 1.0069999694824219, "learning_rate": 0.0002, "epoch": 3.4047186932849365, "step": 4690}, {"loss": 0.6598, "grad_norm": 0.816291332244873, "learning_rate": 0.0002, "epoch": 3.411978221415608, "step": 4700}, {"loss": 0.6748, "grad_norm": 1.2831530570983887, "learning_rate": 0.0002, "epoch": 3.4192377495462796, "step": 4710}, {"loss": 0.6625, "grad_norm": 0.9573889970779419, "learning_rate": 0.0002, "epoch": 3.426497277676951, "step": 4720}, {"loss": 0.7279, "grad_norm": 0.7685632705688477, "learning_rate": 0.0002, "epoch": 3.4337568058076227, "step": 4730}, {"loss": 0.6104, "grad_norm": 0.7019195556640625, "learning_rate": 0.0002, "epoch": 3.441016333938294, "step": 4740}, {"loss": 0.7606, "grad_norm": 0.7244833707809448, "learning_rate": 0.0002, "epoch": 3.4482758620689653, "step": 4750}, {"loss": 0.6951, "grad_norm": 1.3468551635742188, "learning_rate": 0.0002, "epoch": 3.455535390199637, "step": 4760}, {"loss": 0.6945, "grad_norm": 0.822846531867981, "learning_rate": 0.0002, "epoch": 3.4627949183303084, "step": 4770}, {"loss": 0.6431, "grad_norm": 0.7311608195304871, "learning_rate": 0.0002, "epoch": 3.47005444646098, "step": 4780}, {"loss": 0.7019, "grad_norm": 0.9466770887374878, "learning_rate": 0.0002, "epoch": 3.4773139745916515, "step": 4790}, {"loss": 0.7767, "grad_norm": 1.1527155637741089, "learning_rate": 0.0002, "epoch": 3.484573502722323, "step": 4800}, {"loss": 0.6882, "grad_norm": 1.1288906335830688, "learning_rate": 0.0002, "epoch": 3.4918330308529946, "step": 4810}, {"loss": 0.6564, "grad_norm": 0.9096164107322693, "learning_rate": 0.0002, "epoch": 3.499092558983666, "step": 4820}, {"loss": 0.6127, "grad_norm": 0.7988565564155579, "learning_rate": 0.0002, "epoch": 3.5063520871143377, "step": 4830}, {"loss": 0.7004, "grad_norm": 0.7183415293693542, "learning_rate": 0.0002, "epoch": 3.513611615245009, "step": 4840}, {"loss": 0.74, "grad_norm": 0.6614915132522583, "learning_rate": 0.0002, "epoch": 3.5208711433756807, "step": 4850}, {"loss": 0.7271, "grad_norm": 0.8609521985054016, "learning_rate": 0.0002, "epoch": 3.528130671506352, "step": 4860}, {"loss": 0.6664, "grad_norm": 0.86552894115448, "learning_rate": 0.0002, "epoch": 3.535390199637024, "step": 4870}, {"loss": 0.6432, "grad_norm": 0.6926496028900146, "learning_rate": 0.0002, "epoch": 3.542649727767695, "step": 4880}, {"loss": 0.7117, "grad_norm": 0.8157467246055603, "learning_rate": 0.0002, "epoch": 3.5499092558983665, "step": 4890}, {"loss": 0.6201, "grad_norm": 0.9085357189178467, "learning_rate": 0.0002, "epoch": 3.557168784029038, "step": 4900}, {"loss": 0.6521, "grad_norm": 0.6322644948959351, "learning_rate": 0.0002, "epoch": 3.5644283121597096, "step": 4910}, {"loss": 0.6607, "grad_norm": 1.263205885887146, "learning_rate": 0.0002, "epoch": 3.571687840290381, "step": 4920}, {"loss": 0.6657, "grad_norm": 0.8901070356369019, "learning_rate": 0.0002, "epoch": 3.5789473684210527, "step": 4930}, {"loss": 0.6434, "grad_norm": 0.7983952164649963, "learning_rate": 0.0002, "epoch": 3.586206896551724, "step": 4940}, {"loss": 0.6861, "grad_norm": 0.9887813925743103, "learning_rate": 0.0002, "epoch": 3.5934664246823957, "step": 4950}, {"loss": 0.6502, "grad_norm": 0.7895187735557556, "learning_rate": 0.0002, "epoch": 3.6007259528130673, "step": 4960}, {"loss": 0.7111, "grad_norm": 0.9685819745063782, "learning_rate": 0.0002, "epoch": 3.6079854809437384, "step": 4970}, {"loss": 0.6915, "grad_norm": 0.6576591730117798, "learning_rate": 0.0002, "epoch": 3.6152450090744104, "step": 4980}, {"loss": 0.6195, "grad_norm": 0.856985330581665, "learning_rate": 0.0002, "epoch": 3.6225045372050815, "step": 4990}, {"loss": 0.6318, "grad_norm": 0.7230252623558044, "learning_rate": 0.0002, "epoch": 3.629764065335753, "step": 5000}, {"loss": 0.742, "grad_norm": 0.8260893821716309, "learning_rate": 0.0002, "epoch": 3.6370235934664246, "step": 5010}, {"loss": 0.7223, "grad_norm": 0.7635950446128845, "learning_rate": 0.0002, "epoch": 3.644283121597096, "step": 5020}, {"loss": 0.6837, "grad_norm": 0.7060768604278564, "learning_rate": 0.0002, "epoch": 3.6515426497277677, "step": 5030}, {"loss": 0.6921, "grad_norm": 0.8020303249359131, "learning_rate": 0.0002, "epoch": 3.658802177858439, "step": 5040}, {"loss": 0.6446, "grad_norm": 0.8530341386795044, "learning_rate": 0.0002, "epoch": 3.6660617059891107, "step": 5050}, {"loss": 0.7222, "grad_norm": 0.6667101979255676, "learning_rate": 0.0002, "epoch": 3.6733212341197823, "step": 5060}, {"loss": 0.7081, "grad_norm": 0.7385406494140625, "learning_rate": 0.0002, "epoch": 3.680580762250454, "step": 5070}, {"loss": 0.7035, "grad_norm": 0.7753380537033081, "learning_rate": 0.0002, "epoch": 3.6878402903811254, "step": 5080}, {"loss": 0.6491, "grad_norm": 0.7516207098960876, "learning_rate": 0.0002, "epoch": 3.695099818511797, "step": 5090}, {"loss": 0.672, "grad_norm": 0.8171586394309998, "learning_rate": 0.0002, "epoch": 3.702359346642468, "step": 5100}, {"loss": 0.7459, "grad_norm": 1.0796279907226562, "learning_rate": 0.0002, "epoch": 3.70961887477314, "step": 5110}, {"loss": 0.5948, "grad_norm": 0.6957688927650452, "learning_rate": 0.0002, "epoch": 3.716878402903811, "step": 5120}, {"loss": 0.7515, "grad_norm": 0.8550161719322205, "learning_rate": 0.0002, "epoch": 3.7241379310344827, "step": 5130}, {"loss": 0.7286, "grad_norm": 0.9396728277206421, "learning_rate": 0.0002, "epoch": 3.731397459165154, "step": 5140}, {"loss": 0.7594, "grad_norm": 1.4264805316925049, "learning_rate": 0.0002, "epoch": 3.7386569872958257, "step": 5150}, {"loss": 0.6575, "grad_norm": 0.8725108504295349, "learning_rate": 0.0002, "epoch": 3.7459165154264973, "step": 5160}, {"loss": 0.6778, "grad_norm": 1.0346195697784424, "learning_rate": 0.0002, "epoch": 3.753176043557169, "step": 5170}, {"loss": 0.6371, "grad_norm": 0.5395554304122925, "learning_rate": 0.0002, "epoch": 3.7604355716878404, "step": 5180}, {"loss": 0.7308, "grad_norm": 1.3153616189956665, "learning_rate": 0.0002, "epoch": 3.767695099818512, "step": 5190}, {"loss": 0.78, "grad_norm": 0.9879828691482544, "learning_rate": 0.0002, "epoch": 3.7749546279491835, "step": 5200}, {"loss": 0.7068, "grad_norm": 0.8876672983169556, "learning_rate": 0.0002, "epoch": 3.7822141560798546, "step": 5210}, {"loss": 0.6283, "grad_norm": 0.8363267779350281, "learning_rate": 0.0002, "epoch": 3.7894736842105265, "step": 5220}, {"loss": 0.6255, "grad_norm": 0.637294590473175, "learning_rate": 0.0002, "epoch": 3.7967332123411976, "step": 5230}, {"loss": 0.6685, "grad_norm": 1.1408970355987549, "learning_rate": 0.0002, "epoch": 3.803992740471869, "step": 5240}, {"loss": 0.6761, "grad_norm": 1.0128360986709595, "learning_rate": 0.0002, "epoch": 3.8112522686025407, "step": 5250}, {"loss": 0.6764, "grad_norm": 0.8061144351959229, "learning_rate": 0.0002, "epoch": 3.8185117967332123, "step": 5260}, {"loss": 0.7254, "grad_norm": 0.9626626968383789, "learning_rate": 0.0002, "epoch": 3.825771324863884, "step": 5270}, {"loss": 0.7367, "grad_norm": 0.9013627171516418, "learning_rate": 0.0002, "epoch": 3.8330308529945554, "step": 5280}, {"loss": 0.6806, "grad_norm": 0.8411344289779663, "learning_rate": 0.0002, "epoch": 3.840290381125227, "step": 5290}, {"loss": 0.6818, "grad_norm": 0.7426059246063232, "learning_rate": 0.0002, "epoch": 3.8475499092558985, "step": 5300}, {"loss": 0.6748, "grad_norm": 1.003413438796997, "learning_rate": 0.0002, "epoch": 3.85480943738657, "step": 5310}, {"loss": 0.8554, "grad_norm": 0.7527840733528137, "learning_rate": 0.0002, "epoch": 3.862068965517241, "step": 5320}, {"loss": 0.7521, "grad_norm": 0.738610565662384, "learning_rate": 0.0002, "epoch": 3.869328493647913, "step": 5330}, {"loss": 0.7266, "grad_norm": 0.7277999520301819, "learning_rate": 0.0002, "epoch": 3.876588021778584, "step": 5340}, {"loss": 0.7503, "grad_norm": 0.5951359272003174, "learning_rate": 0.0002, "epoch": 3.8838475499092557, "step": 5350}, {"loss": 0.7447, "grad_norm": 1.043884038925171, "learning_rate": 0.0002, "epoch": 3.8911070780399273, "step": 5360}, {"loss": 0.6862, "grad_norm": 0.8436498045921326, "learning_rate": 0.0002, "epoch": 3.898366606170599, "step": 5370}, {"loss": 0.665, "grad_norm": 0.5603365302085876, "learning_rate": 0.0002, "epoch": 3.9056261343012704, "step": 5380}, {"loss": 0.7098, "grad_norm": 1.0128886699676514, "learning_rate": 0.0002, "epoch": 3.912885662431942, "step": 5390}, {"loss": 0.6707, "grad_norm": 0.7970930337905884, "learning_rate": 0.0002, "epoch": 3.9201451905626135, "step": 5400}, {"loss": 0.637, "grad_norm": 0.7699369192123413, "learning_rate": 0.0002, "epoch": 3.927404718693285, "step": 5410}, {"loss": 0.6742, "grad_norm": 0.800561249256134, "learning_rate": 0.0002, "epoch": 3.9346642468239565, "step": 5420}, {"loss": 0.7208, "grad_norm": 0.8020331859588623, "learning_rate": 0.0002, "epoch": 3.941923774954628, "step": 5430}, {"loss": 0.7294, "grad_norm": 0.7461140155792236, "learning_rate": 0.0002, "epoch": 3.9491833030852996, "step": 5440}, {"loss": 0.7013, "grad_norm": 0.8346918821334839, "learning_rate": 0.0002, "epoch": 3.9564428312159707, "step": 5450}, {"loss": 0.6289, "grad_norm": 0.9723302125930786, "learning_rate": 0.0002, "epoch": 3.9637023593466427, "step": 5460}, {"loss": 0.8029, "grad_norm": 0.6809740662574768, "learning_rate": 0.0002, "epoch": 3.970961887477314, "step": 5470}, {"loss": 0.6896, "grad_norm": 0.7353498339653015, "learning_rate": 0.0002, "epoch": 3.9782214156079854, "step": 5480}, {"loss": 0.6722, "grad_norm": 0.748009443283081, "learning_rate": 0.0002, "epoch": 3.985480943738657, "step": 5490}, {"loss": 0.6866, "grad_norm": 1.3656195402145386, "learning_rate": 0.0002, "epoch": 3.9927404718693285, "step": 5500}, {"loss": 0.7368, "grad_norm": 0.8402108550071716, "learning_rate": 0.0002, "epoch": 4.0, "step": 5510}, {"eval_loss": 1.17229425907135, "eval_runtime": 46.2554, "eval_samples_per_second": 9.426, "eval_steps_per_second": 1.189, "epoch": 4.0, "step": 5510}, {"loss": 0.4637, "grad_norm": 0.8601235747337341, "learning_rate": 0.0002, "epoch": 4.007259528130671, "step": 5520}, {"loss": 0.4717, "grad_norm": 1.2635200023651123, "learning_rate": 0.0002, "epoch": 4.014519056261343, "step": 5530}, {"loss": 0.503, "grad_norm": 1.0257477760314941, "learning_rate": 0.0002, "epoch": 4.021778584392014, "step": 5540}, {"loss": 0.4547, "grad_norm": 0.9436745047569275, "learning_rate": 0.0002, "epoch": 4.029038112522686, "step": 5550}, {"loss": 0.459, "grad_norm": 0.9443606734275818, "learning_rate": 0.0002, "epoch": 4.036297640653357, "step": 5560}, {"loss": 0.5386, "grad_norm": 1.3965742588043213, "learning_rate": 0.0002, "epoch": 4.043557168784029, "step": 5570}, {"loss": 0.4248, "grad_norm": 0.8973520398139954, "learning_rate": 0.0002, "epoch": 4.0508166969147, "step": 5580}, {"loss": 0.4111, "grad_norm": 0.9998409748077393, "learning_rate": 0.0002, "epoch": 4.058076225045372, "step": 5590}, {"loss": 0.4828, "grad_norm": 1.1213387250900269, "learning_rate": 0.0002, "epoch": 4.0653357531760435, "step": 5600}, {"loss": 0.439, "grad_norm": 0.7064558863639832, "learning_rate": 0.0002, "epoch": 4.072595281306715, "step": 5610}, {"loss": 0.4607, "grad_norm": 1.2390803098678589, "learning_rate": 0.0002, "epoch": 4.0798548094373865, "step": 5620}, {"loss": 0.5014, "grad_norm": 1.123469591140747, "learning_rate": 0.0002, "epoch": 4.087114337568058, "step": 5630}, {"loss": 0.513, "grad_norm": 1.229573369026184, "learning_rate": 0.0002, "epoch": 4.09437386569873, "step": 5640}, {"loss": 0.5258, "grad_norm": 1.7182831764221191, "learning_rate": 0.0002, "epoch": 4.101633393829401, "step": 5650}, {"loss": 0.5371, "grad_norm": 0.894903302192688, "learning_rate": 0.0002, "epoch": 4.108892921960073, "step": 5660}, {"loss": 0.4813, "grad_norm": 0.8754552006721497, "learning_rate": 0.0002, "epoch": 4.116152450090744, "step": 5670}, {"loss": 0.491, "grad_norm": 1.2401553392410278, "learning_rate": 0.0002, "epoch": 4.123411978221416, "step": 5680}, {"loss": 0.4549, "grad_norm": 0.8631148934364319, "learning_rate": 0.0002, "epoch": 4.130671506352087, "step": 5690}, {"loss": 0.487, "grad_norm": 1.1798022985458374, "learning_rate": 0.0002, "epoch": 4.137931034482759, "step": 5700}, {"loss": 0.4522, "grad_norm": 0.8344549536705017, "learning_rate": 0.0002, "epoch": 4.14519056261343, "step": 5710}, {"loss": 0.4559, "grad_norm": 1.2342697381973267, "learning_rate": 0.0002, "epoch": 4.152450090744102, "step": 5720}, {"loss": 0.53, "grad_norm": 1.1601094007492065, "learning_rate": 0.0002, "epoch": 4.159709618874773, "step": 5730}, {"loss": 0.4755, "grad_norm": 1.2925703525543213, "learning_rate": 0.0002, "epoch": 4.166969147005445, "step": 5740}, {"loss": 0.4973, "grad_norm": 1.0870997905731201, "learning_rate": 0.0002, "epoch": 4.174228675136116, "step": 5750}, {"loss": 0.5184, "grad_norm": 0.9077792763710022, "learning_rate": 0.0002, "epoch": 4.181488203266787, "step": 5760}, {"loss": 0.4606, "grad_norm": 1.009273886680603, "learning_rate": 0.0002, "epoch": 4.188747731397459, "step": 5770}, {"loss": 0.5383, "grad_norm": 1.2465530633926392, "learning_rate": 0.0002, "epoch": 4.19600725952813, "step": 5780}, {"loss": 0.4938, "grad_norm": 1.2261253595352173, "learning_rate": 0.0002, "epoch": 4.203266787658802, "step": 5790}, {"loss": 0.5255, "grad_norm": 1.1498041152954102, "learning_rate": 0.0002, "epoch": 4.2105263157894735, "step": 5800}, {"loss": 0.5269, "grad_norm": 1.1966725587844849, "learning_rate": 0.0002, "epoch": 4.217785843920145, "step": 5810}, {"loss": 0.5626, "grad_norm": 1.2651296854019165, "learning_rate": 0.0002, "epoch": 4.2250453720508165, "step": 5820}, {"loss": 0.5213, "grad_norm": 1.0388574600219727, "learning_rate": 0.0002, "epoch": 4.2323049001814885, "step": 5830}, {"loss": 0.4965, "grad_norm": 1.3042771816253662, "learning_rate": 0.0002, "epoch": 4.23956442831216, "step": 5840}, {"loss": 0.5116, "grad_norm": 1.1127727031707764, "learning_rate": 0.0002, "epoch": 4.246823956442832, "step": 5850}, {"loss": 0.5197, "grad_norm": 0.9653958082199097, "learning_rate": 0.0002, "epoch": 4.254083484573503, "step": 5860}, {"loss": 0.4414, "grad_norm": 1.0500504970550537, "learning_rate": 0.0002, "epoch": 4.261343012704174, "step": 5870}, {"loss": 0.544, "grad_norm": 1.1476165056228638, "learning_rate": 0.0002, "epoch": 4.268602540834846, "step": 5880}, {"loss": 0.4667, "grad_norm": 0.9424414038658142, "learning_rate": 0.0002, "epoch": 4.275862068965517, "step": 5890}, {"loss": 0.5039, "grad_norm": 1.3309166431427002, "learning_rate": 0.0002, "epoch": 4.283121597096189, "step": 5900}, {"loss": 0.5472, "grad_norm": 1.3025873899459839, "learning_rate": 0.0002, "epoch": 4.29038112522686, "step": 5910}, {"loss": 0.4644, "grad_norm": 1.1442325115203857, "learning_rate": 0.0002, "epoch": 4.297640653357532, "step": 5920}, {"loss": 0.5066, "grad_norm": 0.9820859432220459, "learning_rate": 0.0002, "epoch": 4.304900181488203, "step": 5930}, {"loss": 0.5224, "grad_norm": 0.9615740180015564, "learning_rate": 0.0002, "epoch": 4.312159709618875, "step": 5940}, {"loss": 0.5665, "grad_norm": 1.1627109050750732, "learning_rate": 0.0002, "epoch": 4.319419237749546, "step": 5950}, {"loss": 0.4852, "grad_norm": 0.9381322860717773, "learning_rate": 0.0002, "epoch": 4.326678765880218, "step": 5960}, {"loss": 0.4532, "grad_norm": 0.8154335618019104, "learning_rate": 0.0002, "epoch": 4.333938294010889, "step": 5970}, {"loss": 0.5237, "grad_norm": 0.877671480178833, "learning_rate": 0.0002, "epoch": 4.341197822141561, "step": 5980}, {"loss": 0.6113, "grad_norm": 1.1742031574249268, "learning_rate": 0.0002, "epoch": 4.348457350272232, "step": 5990}, {"loss": 0.5704, "grad_norm": 1.0352917909622192, "learning_rate": 0.0002, "epoch": 4.3557168784029034, "step": 6000}, {"loss": 0.4996, "grad_norm": 0.9963878989219666, "learning_rate": 0.0002, "epoch": 4.362976406533575, "step": 6010}, {"loss": 0.4813, "grad_norm": 1.1892237663269043, "learning_rate": 0.0002, "epoch": 4.3702359346642465, "step": 6020}, {"loss": 0.5224, "grad_norm": 1.2516111135482788, "learning_rate": 0.0002, "epoch": 4.3774954627949185, "step": 6030}, {"loss": 0.5952, "grad_norm": 1.2111951112747192, "learning_rate": 0.0002, "epoch": 4.38475499092559, "step": 6040}, {"loss": 0.4275, "grad_norm": 1.0820083618164062, "learning_rate": 0.0002, "epoch": 4.392014519056262, "step": 6050}, {"loss": 0.5117, "grad_norm": 1.033915638923645, "learning_rate": 0.0002, "epoch": 4.399274047186933, "step": 6060}, {"loss": 0.5431, "grad_norm": 1.0635870695114136, "learning_rate": 0.0002, "epoch": 4.406533575317605, "step": 6070}, {"loss": 0.5341, "grad_norm": 1.0520414113998413, "learning_rate": 0.0002, "epoch": 4.413793103448276, "step": 6080}, {"loss": 0.512, "grad_norm": 1.0821926593780518, "learning_rate": 0.0002, "epoch": 4.421052631578947, "step": 6090}, {"loss": 0.5065, "grad_norm": 1.0533246994018555, "learning_rate": 0.0002, "epoch": 4.428312159709619, "step": 6100}, {"loss": 0.4577, "grad_norm": 0.9231932759284973, "learning_rate": 0.0002, "epoch": 4.43557168784029, "step": 6110}, {"loss": 0.583, "grad_norm": 0.9910260438919067, "learning_rate": 0.0002, "epoch": 4.442831215970962, "step": 6120}, {"loss": 0.4717, "grad_norm": 1.061949372291565, "learning_rate": 0.0002, "epoch": 4.450090744101633, "step": 6130}, {"loss": 0.5893, "grad_norm": 1.2927039861679077, "learning_rate": 0.0002, "epoch": 4.457350272232305, "step": 6140}, {"loss": 0.4684, "grad_norm": 1.3966081142425537, "learning_rate": 0.0002, "epoch": 4.464609800362976, "step": 6150}, {"loss": 0.5507, "grad_norm": 1.3835992813110352, "learning_rate": 0.0002, "epoch": 4.471869328493648, "step": 6160}, {"loss": 0.5911, "grad_norm": 1.0892692804336548, "learning_rate": 0.0002, "epoch": 4.479128856624319, "step": 6170}, {"loss": 0.478, "grad_norm": 1.0318800210952759, "learning_rate": 0.0002, "epoch": 4.486388384754991, "step": 6180}, {"loss": 0.5198, "grad_norm": 0.8174677491188049, "learning_rate": 0.0002, "epoch": 4.493647912885662, "step": 6190}, {"loss": 0.5387, "grad_norm": 1.4157509803771973, "learning_rate": 0.0002, "epoch": 4.500907441016334, "step": 6200}, {"loss": 0.5868, "grad_norm": 1.5244114398956299, "learning_rate": 0.0002, "epoch": 4.508166969147005, "step": 6210}, {"loss": 0.4642, "grad_norm": 0.8164850473403931, "learning_rate": 0.0002, "epoch": 4.5154264972776765, "step": 6220}, {"loss": 0.522, "grad_norm": 1.2904746532440186, "learning_rate": 0.0002, "epoch": 4.5226860254083485, "step": 6230}, {"loss": 0.5103, "grad_norm": 0.7987732887268066, "learning_rate": 0.0002, "epoch": 4.52994555353902, "step": 6240}, {"loss": 0.4615, "grad_norm": 0.831040620803833, "learning_rate": 0.0002, "epoch": 4.537205081669692, "step": 6250}, {"loss": 0.5065, "grad_norm": 0.9545485973358154, "learning_rate": 0.0002, "epoch": 4.544464609800363, "step": 6260}, {"loss": 0.5515, "grad_norm": 0.9291793704032898, "learning_rate": 0.0002, "epoch": 4.551724137931035, "step": 6270}, {"loss": 0.4535, "grad_norm": 0.8977208733558655, "learning_rate": 0.0002, "epoch": 4.558983666061706, "step": 6280}, {"loss": 0.544, "grad_norm": 1.1768537759780884, "learning_rate": 0.0002, "epoch": 4.566243194192378, "step": 6290}, {"loss": 0.5925, "grad_norm": 1.0688952207565308, "learning_rate": 0.0002, "epoch": 4.573502722323049, "step": 6300}, {"loss": 0.5207, "grad_norm": 0.8800966739654541, "learning_rate": 0.0002, "epoch": 4.580762250453721, "step": 6310}, {"loss": 0.6106, "grad_norm": 1.0911834239959717, "learning_rate": 0.0002, "epoch": 4.588021778584392, "step": 6320}, {"loss": 0.5109, "grad_norm": 1.1420872211456299, "learning_rate": 0.0002, "epoch": 4.595281306715064, "step": 6330}, {"loss": 0.5147, "grad_norm": 1.0215224027633667, "learning_rate": 0.0002, "epoch": 4.602540834845735, "step": 6340}, {"loss": 0.592, "grad_norm": 0.9685489535331726, "learning_rate": 0.0002, "epoch": 4.609800362976406, "step": 6350}, {"loss": 0.5775, "grad_norm": 1.12773597240448, "learning_rate": 0.0002, "epoch": 4.617059891107078, "step": 6360}, {"loss": 0.5966, "grad_norm": 1.0663973093032837, "learning_rate": 0.0002, "epoch": 4.624319419237749, "step": 6370}, {"loss": 0.512, "grad_norm": 1.1707262992858887, "learning_rate": 0.0002, "epoch": 4.631578947368421, "step": 6380}, {"loss": 0.5497, "grad_norm": 1.0672980546951294, "learning_rate": 0.0002, "epoch": 4.638838475499092, "step": 6390}, {"loss": 0.5699, "grad_norm": 1.1464333534240723, "learning_rate": 0.0002, "epoch": 4.646098003629764, "step": 6400}, {"loss": 0.5514, "grad_norm": 1.070230484008789, "learning_rate": 0.0002, "epoch": 4.653357531760435, "step": 6410}, {"loss": 0.5013, "grad_norm": 0.9673764109611511, "learning_rate": 0.0002, "epoch": 4.660617059891107, "step": 6420}, {"loss": 0.5901, "grad_norm": 1.0189043283462524, "learning_rate": 0.0002, "epoch": 4.6678765880217785, "step": 6430}, {"loss": 0.5193, "grad_norm": 1.185896396636963, "learning_rate": 0.0002, "epoch": 4.67513611615245, "step": 6440}, {"loss": 0.5318, "grad_norm": 1.0682812929153442, "learning_rate": 0.0002, "epoch": 4.682395644283122, "step": 6450}, {"loss": 0.5773, "grad_norm": 1.3586071729660034, "learning_rate": 0.0002, "epoch": 4.689655172413794, "step": 6460}, {"loss": 0.5482, "grad_norm": 0.6561792492866516, "learning_rate": 0.0002, "epoch": 4.696914700544465, "step": 6470}, {"loss": 0.5711, "grad_norm": 1.1394113302230835, "learning_rate": 0.0002, "epoch": 4.704174228675136, "step": 6480}, {"loss": 0.5325, "grad_norm": 0.9683151245117188, "learning_rate": 0.0002, "epoch": 4.711433756805808, "step": 6490}, {"loss": 0.5139, "grad_norm": 1.0247553586959839, "learning_rate": 0.0002, "epoch": 4.718693284936479, "step": 6500}, {"loss": 0.5794, "grad_norm": 0.8046169281005859, "learning_rate": 0.0002, "epoch": 4.725952813067151, "step": 6510}, {"loss": 0.5471, "grad_norm": 1.0710240602493286, "learning_rate": 0.0002, "epoch": 4.733212341197822, "step": 6520}, {"loss": 0.4805, "grad_norm": 0.9438924193382263, "learning_rate": 0.0002, "epoch": 4.740471869328494, "step": 6530}, {"loss": 0.5404, "grad_norm": 0.869162380695343, "learning_rate": 0.0002, "epoch": 4.747731397459165, "step": 6540}, {"loss": 0.6379, "grad_norm": 0.9776787161827087, "learning_rate": 0.0002, "epoch": 4.754990925589837, "step": 6550}, {"loss": 0.5288, "grad_norm": 1.1990505456924438, "learning_rate": 0.0002, "epoch": 4.762250453720508, "step": 6560}, {"loss": 0.5539, "grad_norm": 1.0582209825515747, "learning_rate": 0.0002, "epoch": 4.769509981851179, "step": 6570}, {"loss": 0.489, "grad_norm": 0.9966367483139038, "learning_rate": 0.0002, "epoch": 4.776769509981851, "step": 6580}, {"loss": 0.5514, "grad_norm": 0.9130612015724182, "learning_rate": 0.0002, "epoch": 4.784029038112522, "step": 6590}, {"loss": 0.5864, "grad_norm": 1.0950500965118408, "learning_rate": 0.0002, "epoch": 4.791288566243194, "step": 6600}, {"loss": 0.5266, "grad_norm": 1.108681321144104, "learning_rate": 0.0002, "epoch": 4.798548094373865, "step": 6610}, {"loss": 0.5875, "grad_norm": 1.1873763799667358, "learning_rate": 0.0002, "epoch": 4.805807622504537, "step": 6620}, {"loss": 0.5736, "grad_norm": 1.305367112159729, "learning_rate": 0.0002, "epoch": 4.8130671506352085, "step": 6630}, {"loss": 0.5636, "grad_norm": 1.2801482677459717, "learning_rate": 0.0002, "epoch": 4.8203266787658805, "step": 6640}, {"loss": 0.582, "grad_norm": 1.26764976978302, "learning_rate": 0.0002, "epoch": 4.827586206896552, "step": 6650}, {"loss": 0.5259, "grad_norm": 1.0018208026885986, "learning_rate": 0.0002, "epoch": 4.834845735027224, "step": 6660}, {"loss": 0.548, "grad_norm": 1.2326326370239258, "learning_rate": 0.0002, "epoch": 4.842105263157895, "step": 6670}, {"loss": 0.5933, "grad_norm": 0.9707282781600952, "learning_rate": 0.0002, "epoch": 4.849364791288567, "step": 6680}, {"loss": 0.5612, "grad_norm": 1.2772048711776733, "learning_rate": 0.0002, "epoch": 4.856624319419238, "step": 6690}, {"loss": 0.5346, "grad_norm": 2.6652262210845947, "learning_rate": 0.0002, "epoch": 4.863883847549909, "step": 6700}, {"loss": 0.5428, "grad_norm": 1.215828537940979, "learning_rate": 0.0002, "epoch": 4.871143375680581, "step": 6710}, {"loss": 0.6571, "grad_norm": 1.3704510927200317, "learning_rate": 0.0002, "epoch": 4.878402903811252, "step": 6720}, {"loss": 0.4963, "grad_norm": 0.7781757116317749, "learning_rate": 0.0002, "epoch": 4.885662431941924, "step": 6730}, {"loss": 0.5989, "grad_norm": 1.1883646249771118, "learning_rate": 0.0002, "epoch": 4.892921960072595, "step": 6740}, {"loss": 0.6067, "grad_norm": 0.9216066002845764, "learning_rate": 0.0002, "epoch": 4.900181488203267, "step": 6750}, {"loss": 0.5085, "grad_norm": 1.0558464527130127, "learning_rate": 0.0002, "epoch": 4.907441016333938, "step": 6760}, {"loss": 0.5216, "grad_norm": 1.032656192779541, "learning_rate": 0.0002, "epoch": 4.91470054446461, "step": 6770}, {"loss": 0.5426, "grad_norm": 1.1261441707611084, "learning_rate": 0.0002, "epoch": 4.921960072595281, "step": 6780}, {"loss": 0.5295, "grad_norm": 1.2178640365600586, "learning_rate": 0.0002, "epoch": 4.929219600725952, "step": 6790}, {"loss": 0.5476, "grad_norm": 1.5369361639022827, "learning_rate": 0.0002, "epoch": 4.936479128856624, "step": 6800}, {"loss": 0.5358, "grad_norm": 1.1188377141952515, "learning_rate": 0.0002, "epoch": 4.943738656987296, "step": 6810}, {"loss": 0.5483, "grad_norm": 1.2506113052368164, "learning_rate": 0.0002, "epoch": 4.950998185117967, "step": 6820}, {"loss": 0.567, "grad_norm": 0.8776047825813293, "learning_rate": 0.0002, "epoch": 4.9582577132486385, "step": 6830}, {"loss": 0.5764, "grad_norm": 0.9700555205345154, "learning_rate": 0.0002, "epoch": 4.9655172413793105, "step": 6840}, {"loss": 0.5396, "grad_norm": 1.2713534832000732, "learning_rate": 0.0002, "epoch": 4.972776769509982, "step": 6850}, {"loss": 0.5451, "grad_norm": 0.9855955243110657, "learning_rate": 0.0002, "epoch": 4.980036297640654, "step": 6860}, {"loss": 0.5884, "grad_norm": 0.8734853863716125, "learning_rate": 0.0002, "epoch": 4.987295825771325, "step": 6870}, {"loss": 0.5189, "grad_norm": 0.8065403699874878, "learning_rate": 0.0002, "epoch": 4.994555353901997, "step": 6880}]} +{"epoch": 6.0, "step": 8265, "epoch_duration": 2057.1978249549866, "total_accumulated_duration": 18403.34801387787, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7751.47119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 24486.0}, "peak_memory_reserved": {"GPU_0": 24752.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.4936, "grad_norm": 0.46010470390319824, "learning_rate": 0.0002, "epoch": 0.007259528130671506, "step": 10}, {"loss": 1.8442, "grad_norm": 0.5103179216384888, "learning_rate": 0.0002, "epoch": 0.014519056261343012, "step": 20}, {"loss": 1.559, "grad_norm": 0.6282716989517212, "learning_rate": 0.0002, "epoch": 0.021778584392014518, "step": 30}, {"loss": 1.3618, "grad_norm": 1.2480497360229492, "learning_rate": 0.0002, "epoch": 0.029038112522686024, "step": 40}, {"loss": 1.3874, "grad_norm": 0.4114855229854584, "learning_rate": 0.0002, "epoch": 0.036297640653357534, "step": 50}, {"loss": 1.2836, "grad_norm": 0.49482840299606323, "learning_rate": 0.0002, "epoch": 0.043557168784029036, "step": 60}, {"loss": 1.1728, "grad_norm": 0.4536272883415222, "learning_rate": 0.0002, "epoch": 0.050816696914700546, "step": 70}, {"loss": 1.1125, "grad_norm": 0.32328274846076965, "learning_rate": 0.0002, "epoch": 0.05807622504537205, "step": 80}, {"loss": 1.2447, "grad_norm": 0.40990468859672546, "learning_rate": 0.0002, "epoch": 0.06533575317604355, "step": 90}, {"loss": 1.2216, "grad_norm": 0.37273502349853516, "learning_rate": 0.0002, "epoch": 0.07259528130671507, "step": 100}, {"loss": 1.2259, "grad_norm": 0.3903871476650238, "learning_rate": 0.0002, "epoch": 0.07985480943738657, "step": 110}, {"loss": 1.3718, "grad_norm": 0.3272787928581238, "learning_rate": 0.0002, "epoch": 0.08711433756805807, "step": 120}, {"loss": 1.1832, "grad_norm": 0.3622824251651764, "learning_rate": 0.0002, "epoch": 0.09437386569872959, "step": 130}, {"loss": 1.2966, "grad_norm": 0.3503916561603546, "learning_rate": 0.0002, "epoch": 0.10163339382940109, "step": 140}, {"loss": 1.0913, "grad_norm": 0.32787832617759705, "learning_rate": 0.0002, "epoch": 0.1088929219600726, "step": 150}, {"loss": 1.2143, "grad_norm": 0.5822657942771912, "learning_rate": 0.0002, "epoch": 0.1161524500907441, "step": 160}, {"loss": 1.2091, "grad_norm": 0.28028249740600586, "learning_rate": 0.0002, "epoch": 0.12341197822141561, "step": 170}, {"loss": 1.1731, "grad_norm": 0.33602750301361084, "learning_rate": 0.0002, "epoch": 0.1306715063520871, "step": 180}, {"loss": 1.127, "grad_norm": 0.29106274247169495, "learning_rate": 0.0002, "epoch": 0.13793103448275862, "step": 190}, {"loss": 1.3261, "grad_norm": 0.38753363490104675, "learning_rate": 0.0002, "epoch": 0.14519056261343014, "step": 200}, {"loss": 1.1536, "grad_norm": 0.361009418964386, "learning_rate": 0.0002, "epoch": 0.15245009074410162, "step": 210}, {"loss": 1.1483, "grad_norm": 0.6743836402893066, "learning_rate": 0.0002, "epoch": 0.15970961887477314, "step": 220}, {"loss": 1.264, "grad_norm": 0.3923613727092743, "learning_rate": 0.0002, "epoch": 0.16696914700544466, "step": 230}, {"loss": 1.0437, "grad_norm": 0.2809699773788452, "learning_rate": 0.0002, "epoch": 0.17422867513611615, "step": 240}, {"loss": 1.1826, "grad_norm": 0.3631494641304016, "learning_rate": 0.0002, "epoch": 0.18148820326678766, "step": 250}, {"loss": 1.1584, "grad_norm": 0.24658171832561493, "learning_rate": 0.0002, "epoch": 0.18874773139745918, "step": 260}, {"loss": 1.0065, "grad_norm": 0.5780664682388306, "learning_rate": 0.0002, "epoch": 0.19600725952813067, "step": 270}, {"loss": 1.1251, "grad_norm": 0.3056720495223999, "learning_rate": 0.0002, "epoch": 0.20326678765880218, "step": 280}, {"loss": 1.1503, "grad_norm": 0.2857084572315216, "learning_rate": 0.0002, "epoch": 0.21052631578947367, "step": 290}, {"loss": 1.1215, "grad_norm": 0.6645345687866211, "learning_rate": 0.0002, "epoch": 0.2177858439201452, "step": 300}, {"loss": 1.127, "grad_norm": 0.5966078639030457, "learning_rate": 0.0002, "epoch": 0.2250453720508167, "step": 310}, {"loss": 1.1974, "grad_norm": 0.40937140583992004, "learning_rate": 0.0002, "epoch": 0.2323049001814882, "step": 320}, {"loss": 1.2385, "grad_norm": 0.5642806887626648, "learning_rate": 0.0002, "epoch": 0.2395644283121597, "step": 330}, {"loss": 1.1703, "grad_norm": 0.2750748097896576, "learning_rate": 0.0002, "epoch": 0.24682395644283123, "step": 340}, {"loss": 1.2598, "grad_norm": 0.34350234270095825, "learning_rate": 0.0002, "epoch": 0.2540834845735027, "step": 350}, {"loss": 1.1942, "grad_norm": 0.6767239570617676, "learning_rate": 0.0002, "epoch": 0.2613430127041742, "step": 360}, {"loss": 1.1436, "grad_norm": 0.31006959080696106, "learning_rate": 0.0002, "epoch": 0.26860254083484575, "step": 370}, {"loss": 1.2084, "grad_norm": 0.3825474679470062, "learning_rate": 0.0002, "epoch": 0.27586206896551724, "step": 380}, {"loss": 1.1523, "grad_norm": 0.30241551995277405, "learning_rate": 0.0002, "epoch": 0.2831215970961887, "step": 390}, {"loss": 1.1298, "grad_norm": 0.2962397336959839, "learning_rate": 0.0002, "epoch": 0.29038112522686027, "step": 400}, {"loss": 1.1299, "grad_norm": 0.2600369155406952, "learning_rate": 0.0002, "epoch": 0.29764065335753176, "step": 410}, {"loss": 1.1366, "grad_norm": 0.3675060272216797, "learning_rate": 0.0002, "epoch": 0.30490018148820325, "step": 420}, {"loss": 1.156, "grad_norm": 0.3429498076438904, "learning_rate": 0.0002, "epoch": 0.3121597096188748, "step": 430}, {"loss": 1.2741, "grad_norm": 0.34311825037002563, "learning_rate": 0.0002, "epoch": 0.3194192377495463, "step": 440}, {"loss": 1.3523, "grad_norm": 0.37872210144996643, "learning_rate": 0.0002, "epoch": 0.32667876588021777, "step": 450}, {"loss": 1.1365, "grad_norm": 0.33271121978759766, "learning_rate": 0.0002, "epoch": 0.3339382940108893, "step": 460}, {"loss": 1.1572, "grad_norm": 0.34605276584625244, "learning_rate": 0.0002, "epoch": 0.3411978221415608, "step": 470}, {"loss": 1.2755, "grad_norm": 0.41050562262535095, "learning_rate": 0.0002, "epoch": 0.3484573502722323, "step": 480}, {"loss": 1.075, "grad_norm": 0.2066836953163147, "learning_rate": 0.0002, "epoch": 0.35571687840290384, "step": 490}, {"loss": 1.2072, "grad_norm": 0.2859014868736267, "learning_rate": 0.0002, "epoch": 0.3629764065335753, "step": 500}, {"loss": 1.1435, "grad_norm": 0.28763777017593384, "learning_rate": 0.0002, "epoch": 0.3702359346642468, "step": 510}, {"loss": 1.1341, "grad_norm": 0.2730471193790436, "learning_rate": 0.0002, "epoch": 0.37749546279491836, "step": 520}, {"loss": 1.348, "grad_norm": 0.3968936800956726, "learning_rate": 0.0002, "epoch": 0.38475499092558985, "step": 530}, {"loss": 1.2363, "grad_norm": 0.3624701201915741, "learning_rate": 0.0002, "epoch": 0.39201451905626133, "step": 540}, {"loss": 1.1206, "grad_norm": 0.3303608298301697, "learning_rate": 0.0002, "epoch": 0.3992740471869328, "step": 550}, {"loss": 1.203, "grad_norm": 0.33507466316223145, "learning_rate": 0.0002, "epoch": 0.40653357531760437, "step": 560}, {"loss": 1.2294, "grad_norm": 0.3297670781612396, "learning_rate": 0.0002, "epoch": 0.41379310344827586, "step": 570}, {"loss": 1.2, "grad_norm": 0.32334890961647034, "learning_rate": 0.0002, "epoch": 0.42105263157894735, "step": 580}, {"loss": 1.112, "grad_norm": 0.30281195044517517, "learning_rate": 0.0002, "epoch": 0.4283121597096189, "step": 590}, {"loss": 1.1352, "grad_norm": 0.5900027751922607, "learning_rate": 0.0002, "epoch": 0.4355716878402904, "step": 600}, {"loss": 1.1575, "grad_norm": 0.28437477350234985, "learning_rate": 0.0002, "epoch": 0.44283121597096187, "step": 610}, {"loss": 1.2009, "grad_norm": 0.39601704478263855, "learning_rate": 0.0002, "epoch": 0.4500907441016334, "step": 620}, {"loss": 1.2302, "grad_norm": 0.41971510648727417, "learning_rate": 0.0002, "epoch": 0.4573502722323049, "step": 630}, {"loss": 1.236, "grad_norm": 0.33814409375190735, "learning_rate": 0.0002, "epoch": 0.4646098003629764, "step": 640}, {"loss": 1.2471, "grad_norm": 0.575718104839325, "learning_rate": 0.0002, "epoch": 0.47186932849364793, "step": 650}, {"loss": 1.1012, "grad_norm": 0.37927401065826416, "learning_rate": 0.0002, "epoch": 0.4791288566243194, "step": 660}, {"loss": 1.1552, "grad_norm": 0.3224332630634308, "learning_rate": 0.0002, "epoch": 0.4863883847549909, "step": 670}, {"loss": 1.1354, "grad_norm": 0.32683515548706055, "learning_rate": 0.0002, "epoch": 0.49364791288566245, "step": 680}, {"loss": 1.207, "grad_norm": 0.4316163659095764, "learning_rate": 0.0002, "epoch": 0.5009074410163339, "step": 690}, {"loss": 1.2116, "grad_norm": 0.342602401971817, "learning_rate": 0.0002, "epoch": 0.5081669691470054, "step": 700}, {"loss": 1.0433, "grad_norm": 0.2794898748397827, "learning_rate": 0.0002, "epoch": 0.515426497277677, "step": 710}, {"loss": 1.2395, "grad_norm": 0.3322339951992035, "learning_rate": 0.0002, "epoch": 0.5226860254083484, "step": 720}, {"loss": 1.0762, "grad_norm": 0.3088509142398834, "learning_rate": 0.0002, "epoch": 0.52994555353902, "step": 730}, {"loss": 1.1495, "grad_norm": 0.24444378912448883, "learning_rate": 0.0002, "epoch": 0.5372050816696915, "step": 740}, {"loss": 1.2176, "grad_norm": 0.3483171761035919, "learning_rate": 0.0002, "epoch": 0.5444646098003629, "step": 750}, {"loss": 1.2248, "grad_norm": 0.3425690531730652, "learning_rate": 0.0002, "epoch": 0.5517241379310345, "step": 760}, {"loss": 1.1127, "grad_norm": 0.31841927766799927, "learning_rate": 0.0002, "epoch": 0.558983666061706, "step": 770}, {"loss": 1.1841, "grad_norm": 0.39423868060112, "learning_rate": 0.0002, "epoch": 0.5662431941923775, "step": 780}, {"loss": 1.0583, "grad_norm": 0.30328479409217834, "learning_rate": 0.0002, "epoch": 0.573502722323049, "step": 790}, {"loss": 1.0718, "grad_norm": 0.24475938081741333, "learning_rate": 0.0002, "epoch": 0.5807622504537205, "step": 800}, {"loss": 1.2006, "grad_norm": 0.37132805585861206, "learning_rate": 0.0002, "epoch": 0.588021778584392, "step": 810}, {"loss": 1.1544, "grad_norm": 0.32195979356765747, "learning_rate": 0.0002, "epoch": 0.5952813067150635, "step": 820}, {"loss": 0.9937, "grad_norm": 0.2848738729953766, "learning_rate": 0.0002, "epoch": 0.6025408348457351, "step": 830}, {"loss": 1.0937, "grad_norm": 0.28015264868736267, "learning_rate": 0.0002, "epoch": 0.6098003629764065, "step": 840}, {"loss": 1.1234, "grad_norm": 0.37796008586883545, "learning_rate": 0.0002, "epoch": 0.617059891107078, "step": 850}, {"loss": 1.1455, "grad_norm": 0.39311841130256653, "learning_rate": 0.0002, "epoch": 0.6243194192377496, "step": 860}, {"loss": 1.1066, "grad_norm": 0.2761685252189636, "learning_rate": 0.0002, "epoch": 0.631578947368421, "step": 870}, {"loss": 1.0859, "grad_norm": 0.3826720118522644, "learning_rate": 0.0002, "epoch": 0.6388384754990926, "step": 880}, {"loss": 1.1182, "grad_norm": 0.30076679587364197, "learning_rate": 0.0002, "epoch": 0.6460980036297641, "step": 890}, {"loss": 1.0927, "grad_norm": 0.21997687220573425, "learning_rate": 0.0002, "epoch": 0.6533575317604355, "step": 900}, {"loss": 1.1603, "grad_norm": 0.32593777775764465, "learning_rate": 0.0002, "epoch": 0.6606170598911071, "step": 910}, {"loss": 1.1578, "grad_norm": 0.30347898602485657, "learning_rate": 0.0002, "epoch": 0.6678765880217786, "step": 920}, {"loss": 1.1642, "grad_norm": 0.44173774123191833, "learning_rate": 0.0002, "epoch": 0.6751361161524501, "step": 930}, {"loss": 1.1832, "grad_norm": 0.2507467269897461, "learning_rate": 0.0002, "epoch": 0.6823956442831216, "step": 940}, {"loss": 1.3001, "grad_norm": 0.29463833570480347, "learning_rate": 0.0002, "epoch": 0.6896551724137931, "step": 950}, {"loss": 1.1651, "grad_norm": 0.9363154172897339, "learning_rate": 0.0002, "epoch": 0.6969147005444646, "step": 960}, {"loss": 1.158, "grad_norm": 0.3236212134361267, "learning_rate": 0.0002, "epoch": 0.7041742286751361, "step": 970}, {"loss": 1.1078, "grad_norm": 0.3123254179954529, "learning_rate": 0.0002, "epoch": 0.7114337568058077, "step": 980}, {"loss": 1.1578, "grad_norm": 0.3395805060863495, "learning_rate": 0.0002, "epoch": 0.7186932849364791, "step": 990}, {"loss": 1.0754, "grad_norm": 0.3240964412689209, "learning_rate": 0.0002, "epoch": 0.7259528130671506, "step": 1000}, {"loss": 1.0313, "grad_norm": 0.31902948021888733, "learning_rate": 0.0002, "epoch": 0.7332123411978222, "step": 1010}, {"loss": 1.1664, "grad_norm": 0.4848408102989197, "learning_rate": 0.0002, "epoch": 0.7404718693284936, "step": 1020}, {"loss": 1.0838, "grad_norm": 0.33006033301353455, "learning_rate": 0.0002, "epoch": 0.7477313974591652, "step": 1030}, {"loss": 1.0979, "grad_norm": 0.2928730547428131, "learning_rate": 0.0002, "epoch": 0.7549909255898367, "step": 1040}, {"loss": 1.1351, "grad_norm": 0.3529164791107178, "learning_rate": 0.0002, "epoch": 0.7622504537205081, "step": 1050}, {"loss": 1.1274, "grad_norm": 0.2736213803291321, "learning_rate": 0.0002, "epoch": 0.7695099818511797, "step": 1060}, {"loss": 1.1169, "grad_norm": 0.7200686931610107, "learning_rate": 0.0002, "epoch": 0.7767695099818511, "step": 1070}, {"loss": 1.1171, "grad_norm": 0.33396708965301514, "learning_rate": 0.0002, "epoch": 0.7840290381125227, "step": 1080}, {"loss": 1.2447, "grad_norm": 1.5760449171066284, "learning_rate": 0.0002, "epoch": 0.7912885662431942, "step": 1090}, {"loss": 1.1126, "grad_norm": 0.28138381242752075, "learning_rate": 0.0002, "epoch": 0.7985480943738656, "step": 1100}, {"loss": 1.2012, "grad_norm": 0.2597472369670868, "learning_rate": 0.0002, "epoch": 0.8058076225045372, "step": 1110}, {"loss": 1.1177, "grad_norm": 0.3305445611476898, "learning_rate": 0.0002, "epoch": 0.8130671506352087, "step": 1120}, {"loss": 1.0849, "grad_norm": 0.3934599459171295, "learning_rate": 0.0002, "epoch": 0.8203266787658802, "step": 1130}, {"loss": 1.0889, "grad_norm": 0.3472191393375397, "learning_rate": 0.0002, "epoch": 0.8275862068965517, "step": 1140}, {"loss": 1.265, "grad_norm": 0.2857365906238556, "learning_rate": 0.0002, "epoch": 0.8348457350272233, "step": 1150}, {"loss": 1.03, "grad_norm": 0.3207702934741974, "learning_rate": 0.0002, "epoch": 0.8421052631578947, "step": 1160}, {"loss": 1.1669, "grad_norm": 0.3176484704017639, "learning_rate": 0.0002, "epoch": 0.8493647912885662, "step": 1170}, {"loss": 1.1386, "grad_norm": 0.40685558319091797, "learning_rate": 0.0002, "epoch": 0.8566243194192378, "step": 1180}, {"loss": 1.1383, "grad_norm": 0.31125199794769287, "learning_rate": 0.0002, "epoch": 0.8638838475499092, "step": 1190}, {"loss": 1.1512, "grad_norm": 0.7361181378364563, "learning_rate": 0.0002, "epoch": 0.8711433756805808, "step": 1200}, {"loss": 0.9983, "grad_norm": 0.33699527382850647, "learning_rate": 0.0002, "epoch": 0.8784029038112523, "step": 1210}, {"loss": 1.11, "grad_norm": 0.3315220773220062, "learning_rate": 0.0002, "epoch": 0.8856624319419237, "step": 1220}, {"loss": 1.0266, "grad_norm": 0.6256054043769836, "learning_rate": 0.0002, "epoch": 0.8929219600725953, "step": 1230}, {"loss": 1.1738, "grad_norm": 0.3692137897014618, "learning_rate": 0.0002, "epoch": 0.9001814882032668, "step": 1240}, {"loss": 1.1112, "grad_norm": 0.3538484573364258, "learning_rate": 0.0002, "epoch": 0.9074410163339383, "step": 1250}, {"loss": 1.0878, "grad_norm": 0.27863040566444397, "learning_rate": 0.0002, "epoch": 0.9147005444646098, "step": 1260}, {"loss": 1.0826, "grad_norm": 0.3322528302669525, "learning_rate": 0.0002, "epoch": 0.9219600725952813, "step": 1270}, {"loss": 1.0158, "grad_norm": 0.7553173303604126, "learning_rate": 0.0002, "epoch": 0.9292196007259528, "step": 1280}, {"loss": 1.198, "grad_norm": 0.4856191575527191, "learning_rate": 0.0002, "epoch": 0.9364791288566243, "step": 1290}, {"loss": 1.1678, "grad_norm": 0.3668074905872345, "learning_rate": 0.0002, "epoch": 0.9437386569872959, "step": 1300}, {"loss": 1.1809, "grad_norm": 0.29851067066192627, "learning_rate": 0.0002, "epoch": 0.9509981851179673, "step": 1310}, {"loss": 1.1299, "grad_norm": 0.276664674282074, "learning_rate": 0.0002, "epoch": 0.9582577132486388, "step": 1320}, {"loss": 1.0597, "grad_norm": 0.2941018342971802, "learning_rate": 0.0002, "epoch": 0.9655172413793104, "step": 1330}, {"loss": 1.1731, "grad_norm": 0.3505859076976776, "learning_rate": 0.0002, "epoch": 0.9727767695099818, "step": 1340}, {"loss": 1.1809, "grad_norm": 0.3067687451839447, "learning_rate": 0.0002, "epoch": 0.9800362976406534, "step": 1350}, {"loss": 1.1511, "grad_norm": 0.27151066064834595, "learning_rate": 0.0002, "epoch": 0.9872958257713249, "step": 1360}, {"loss": 1.1919, "grad_norm": 0.36370083689689636, "learning_rate": 0.0002, "epoch": 0.9945553539019963, "step": 1370}, {"eval_loss": 1.1381088495254517, "eval_runtime": 96.0848, "eval_samples_per_second": 4.538, "eval_steps_per_second": 0.572, "epoch": 0.9996370235934664, "step": 1377}, {"loss": 1.0082, "grad_norm": 0.27980583906173706, "learning_rate": 0.0002, "epoch": 1.0018148820326678, "step": 1380}, {"loss": 1.0565, "grad_norm": 0.26713913679122925, "learning_rate": 0.0002, "epoch": 1.0090744101633393, "step": 1390}, {"loss": 1.0241, "grad_norm": 0.3089541494846344, "learning_rate": 0.0002, "epoch": 1.0163339382940109, "step": 1400}, {"loss": 0.9649, "grad_norm": 0.4188242256641388, "learning_rate": 0.0002, "epoch": 1.0235934664246824, "step": 1410}, {"loss": 0.9821, "grad_norm": 0.5246463418006897, "learning_rate": 0.0002, "epoch": 1.030852994555354, "step": 1420}, {"loss": 1.0143, "grad_norm": 0.2728777825832367, "learning_rate": 0.0002, "epoch": 1.0381125226860255, "step": 1430}, {"loss": 1.075, "grad_norm": 0.38167616724967957, "learning_rate": 0.0002, "epoch": 1.0453720508166968, "step": 1440}, {"loss": 1.0229, "grad_norm": 0.4439380168914795, "learning_rate": 0.0002, "epoch": 1.0526315789473684, "step": 1450}, {"loss": 1.0451, "grad_norm": 0.30954182147979736, "learning_rate": 0.0002, "epoch": 1.05989110707804, "step": 1460}, {"loss": 1.0762, "grad_norm": 0.4022280275821686, "learning_rate": 0.0002, "epoch": 1.0671506352087115, "step": 1470}, {"loss": 1.0754, "grad_norm": 0.7390811443328857, "learning_rate": 0.0002, "epoch": 1.074410163339383, "step": 1480}, {"loss": 1.1378, "grad_norm": 0.3885486423969269, "learning_rate": 0.0002, "epoch": 1.0816696914700545, "step": 1490}, {"loss": 1.005, "grad_norm": 0.5275560617446899, "learning_rate": 0.0002, "epoch": 1.0889292196007259, "step": 1500}, {"loss": 1.0147, "grad_norm": 0.35112282633781433, "learning_rate": 0.0002, "epoch": 1.0961887477313974, "step": 1510}, {"loss": 1.0331, "grad_norm": 0.33714351058006287, "learning_rate": 0.0002, "epoch": 1.103448275862069, "step": 1520}, {"loss": 0.8723, "grad_norm": 0.31221693754196167, "learning_rate": 0.0002, "epoch": 1.1107078039927405, "step": 1530}, {"loss": 1.0701, "grad_norm": 0.27549654245376587, "learning_rate": 0.0002, "epoch": 1.117967332123412, "step": 1540}, {"loss": 1.0114, "grad_norm": 0.8465521335601807, "learning_rate": 0.0002, "epoch": 1.1252268602540836, "step": 1550}, {"loss": 0.9643, "grad_norm": 0.36125949025154114, "learning_rate": 0.0002, "epoch": 1.132486388384755, "step": 1560}, {"loss": 1.0744, "grad_norm": 0.37420371174812317, "learning_rate": 0.0002, "epoch": 1.1397459165154264, "step": 1570}, {"loss": 1.165, "grad_norm": 0.3294760584831238, "learning_rate": 0.0002, "epoch": 1.147005444646098, "step": 1580}, {"loss": 0.9207, "grad_norm": 0.3881238102912903, "learning_rate": 0.0002, "epoch": 1.1542649727767695, "step": 1590}, {"loss": 1.0983, "grad_norm": 0.4766491651535034, "learning_rate": 0.0002, "epoch": 1.161524500907441, "step": 1600}, {"loss": 1.0092, "grad_norm": 0.395530104637146, "learning_rate": 0.0002, "epoch": 1.1687840290381124, "step": 1610}, {"loss": 0.9895, "grad_norm": 0.3297106623649597, "learning_rate": 0.0002, "epoch": 1.176043557168784, "step": 1620}, {"loss": 1.1121, "grad_norm": 0.39528271555900574, "learning_rate": 0.0002, "epoch": 1.1833030852994555, "step": 1630}, {"loss": 0.9202, "grad_norm": 0.3370221257209778, "learning_rate": 0.0002, "epoch": 1.190562613430127, "step": 1640}, {"loss": 1.102, "grad_norm": 0.31922030448913574, "learning_rate": 0.0002, "epoch": 1.1978221415607986, "step": 1650}, {"loss": 1.0565, "grad_norm": 0.6142027378082275, "learning_rate": 0.0002, "epoch": 1.2050816696914701, "step": 1660}, {"loss": 0.9541, "grad_norm": 0.44769710302352905, "learning_rate": 0.0002, "epoch": 1.2123411978221417, "step": 1670}, {"loss": 1.2501, "grad_norm": 0.41233646869659424, "learning_rate": 0.0002, "epoch": 1.219600725952813, "step": 1680}, {"loss": 1.0147, "grad_norm": 0.2928866147994995, "learning_rate": 0.0002, "epoch": 1.2268602540834845, "step": 1690}, {"loss": 0.9074, "grad_norm": 0.36913734674453735, "learning_rate": 0.0002, "epoch": 1.234119782214156, "step": 1700}, {"loss": 1.0847, "grad_norm": 0.5281891226768494, "learning_rate": 0.0002, "epoch": 1.2413793103448276, "step": 1710}, {"loss": 0.9482, "grad_norm": 0.3374697268009186, "learning_rate": 0.0002, "epoch": 1.2486388384754992, "step": 1720}, {"loss": 0.9826, "grad_norm": 0.3802020847797394, "learning_rate": 0.0002, "epoch": 1.2558983666061705, "step": 1730}, {"loss": 1.0191, "grad_norm": 0.38048651814460754, "learning_rate": 0.0002, "epoch": 1.263157894736842, "step": 1740}, {"loss": 1.0142, "grad_norm": 0.6676169633865356, "learning_rate": 0.0002, "epoch": 1.2704174228675136, "step": 1750}, {"loss": 0.992, "grad_norm": 0.4075961410999298, "learning_rate": 0.0002, "epoch": 1.2776769509981851, "step": 1760}, {"loss": 1.0301, "grad_norm": 0.4374721348285675, "learning_rate": 0.0002, "epoch": 1.2849364791288567, "step": 1770}, {"loss": 1.0601, "grad_norm": 0.4638824164867401, "learning_rate": 0.0002, "epoch": 1.2921960072595282, "step": 1780}, {"loss": 0.9593, "grad_norm": 0.38631564378738403, "learning_rate": 0.0002, "epoch": 1.2994555353901998, "step": 1790}, {"loss": 1.0042, "grad_norm": 0.35873809456825256, "learning_rate": 0.0002, "epoch": 1.306715063520871, "step": 1800}, {"loss": 0.9835, "grad_norm": 0.33602237701416016, "learning_rate": 0.0002, "epoch": 1.3139745916515426, "step": 1810}, {"loss": 1.1032, "grad_norm": 0.46696463227272034, "learning_rate": 0.0002, "epoch": 1.3212341197822142, "step": 1820}, {"loss": 1.0123, "grad_norm": 0.368958979845047, "learning_rate": 0.0002, "epoch": 1.3284936479128857, "step": 1830}, {"loss": 1.0456, "grad_norm": 0.3160957396030426, "learning_rate": 0.0002, "epoch": 1.335753176043557, "step": 1840}, {"loss": 1.0264, "grad_norm": 0.4511511027812958, "learning_rate": 0.0002, "epoch": 1.3430127041742286, "step": 1850}, {"loss": 1.0179, "grad_norm": 0.6769845485687256, "learning_rate": 0.0002, "epoch": 1.3502722323049001, "step": 1860}, {"loss": 1.011, "grad_norm": 0.3749309480190277, "learning_rate": 0.0002, "epoch": 1.3575317604355717, "step": 1870}, {"loss": 1.0223, "grad_norm": 0.553798496723175, "learning_rate": 0.0002, "epoch": 1.3647912885662432, "step": 1880}, {"loss": 0.9634, "grad_norm": 0.3538985550403595, "learning_rate": 0.0002, "epoch": 1.3720508166969148, "step": 1890}, {"loss": 1.0275, "grad_norm": 0.5501534938812256, "learning_rate": 0.0002, "epoch": 1.3793103448275863, "step": 1900}, {"loss": 1.0461, "grad_norm": 0.4432051181793213, "learning_rate": 0.0002, "epoch": 1.3865698729582578, "step": 1910}, {"loss": 0.9082, "grad_norm": 0.41755786538124084, "learning_rate": 0.0002, "epoch": 1.3938294010889292, "step": 1920}, {"loss": 1.0858, "grad_norm": 0.5626114010810852, "learning_rate": 0.0002, "epoch": 1.4010889292196007, "step": 1930}, {"loss": 1.0687, "grad_norm": 0.44038185477256775, "learning_rate": 0.0002, "epoch": 1.4083484573502723, "step": 1940}, {"loss": 0.9454, "grad_norm": 0.3397001624107361, "learning_rate": 0.0002, "epoch": 1.4156079854809438, "step": 1950}, {"loss": 0.943, "grad_norm": 0.4325368106365204, "learning_rate": 0.0002, "epoch": 1.4228675136116151, "step": 1960}, {"loss": 1.0488, "grad_norm": 0.3900907039642334, "learning_rate": 0.0002, "epoch": 1.4301270417422867, "step": 1970}, {"loss": 0.9699, "grad_norm": 0.369612455368042, "learning_rate": 0.0002, "epoch": 1.4373865698729582, "step": 1980}, {"loss": 1.0609, "grad_norm": 0.4389338791370392, "learning_rate": 0.0002, "epoch": 1.4446460980036298, "step": 1990}, {"loss": 1.042, "grad_norm": 1.694450855255127, "learning_rate": 0.0002, "epoch": 1.4519056261343013, "step": 2000}, {"loss": 0.997, "grad_norm": 0.516957700252533, "learning_rate": 0.0002, "epoch": 1.4591651542649728, "step": 2010}, {"loss": 1.0979, "grad_norm": 0.45515501499176025, "learning_rate": 0.0002, "epoch": 1.4664246823956444, "step": 2020}, {"loss": 1.0188, "grad_norm": 0.4153139591217041, "learning_rate": 0.0002, "epoch": 1.4736842105263157, "step": 2030}, {"loss": 1.1521, "grad_norm": 0.44353052973747253, "learning_rate": 0.0002, "epoch": 1.4809437386569873, "step": 2040}, {"loss": 0.9653, "grad_norm": 0.570554256439209, "learning_rate": 0.0002, "epoch": 1.4882032667876588, "step": 2050}, {"loss": 1.0352, "grad_norm": 0.5742740035057068, "learning_rate": 0.0002, "epoch": 1.4954627949183303, "step": 2060}, {"loss": 1.0963, "grad_norm": 0.5890517830848694, "learning_rate": 0.0002, "epoch": 1.5027223230490017, "step": 2070}, {"loss": 1.0639, "grad_norm": 0.4162650406360626, "learning_rate": 0.0002, "epoch": 1.5099818511796732, "step": 2080}, {"loss": 1.071, "grad_norm": 0.7334717512130737, "learning_rate": 0.0002, "epoch": 1.5172413793103448, "step": 2090}, {"loss": 0.8957, "grad_norm": 0.2963249683380127, "learning_rate": 0.0002, "epoch": 1.5245009074410163, "step": 2100}, {"loss": 1.0446, "grad_norm": 0.30676454305648804, "learning_rate": 0.0002, "epoch": 1.5317604355716878, "step": 2110}, {"loss": 0.9967, "grad_norm": 0.35984641313552856, "learning_rate": 0.0002, "epoch": 1.5390199637023594, "step": 2120}, {"loss": 1.0352, "grad_norm": 0.3384549617767334, "learning_rate": 0.0002, "epoch": 1.546279491833031, "step": 2130}, {"loss": 0.9874, "grad_norm": 0.4725518226623535, "learning_rate": 0.0002, "epoch": 1.5535390199637025, "step": 2140}, {"loss": 1.1166, "grad_norm": 0.3252685070037842, "learning_rate": 0.0002, "epoch": 1.560798548094374, "step": 2150}, {"loss": 0.9127, "grad_norm": 0.45043081045150757, "learning_rate": 0.0002, "epoch": 1.5680580762250453, "step": 2160}, {"loss": 1.0767, "grad_norm": 0.374208003282547, "learning_rate": 0.0002, "epoch": 1.5753176043557169, "step": 2170}, {"loss": 1.0255, "grad_norm": 0.5118404030799866, "learning_rate": 0.0002, "epoch": 1.5825771324863884, "step": 2180}, {"loss": 1.0062, "grad_norm": 0.482170969247818, "learning_rate": 0.0002, "epoch": 1.5898366606170597, "step": 2190}, {"loss": 1.0872, "grad_norm": 0.5337533950805664, "learning_rate": 0.0002, "epoch": 1.5970961887477313, "step": 2200}, {"loss": 1.0405, "grad_norm": 0.5195064544677734, "learning_rate": 0.0002, "epoch": 1.6043557168784028, "step": 2210}, {"loss": 1.0454, "grad_norm": 0.30807098746299744, "learning_rate": 0.0002, "epoch": 1.6116152450090744, "step": 2220}, {"loss": 1.0293, "grad_norm": 0.3962925672531128, "learning_rate": 0.0002, "epoch": 1.618874773139746, "step": 2230}, {"loss": 1.0137, "grad_norm": 0.7636962532997131, "learning_rate": 0.0002, "epoch": 1.6261343012704175, "step": 2240}, {"loss": 0.991, "grad_norm": 0.32380592823028564, "learning_rate": 0.0002, "epoch": 1.633393829401089, "step": 2250}, {"loss": 0.9471, "grad_norm": 0.5767741799354553, "learning_rate": 0.0002, "epoch": 1.6406533575317606, "step": 2260}, {"loss": 0.9524, "grad_norm": 0.39964812994003296, "learning_rate": 0.0002, "epoch": 1.647912885662432, "step": 2270}, {"loss": 0.9866, "grad_norm": 0.622629702091217, "learning_rate": 0.0002, "epoch": 1.6551724137931034, "step": 2280}, {"loss": 0.9609, "grad_norm": 0.40202152729034424, "learning_rate": 0.0002, "epoch": 1.662431941923775, "step": 2290}, {"loss": 0.9859, "grad_norm": 0.4467349052429199, "learning_rate": 0.0002, "epoch": 1.6696914700544465, "step": 2300}, {"loss": 1.0312, "grad_norm": 0.5026949048042297, "learning_rate": 0.0002, "epoch": 1.6769509981851178, "step": 2310}, {"loss": 1.0776, "grad_norm": 0.43754541873931885, "learning_rate": 0.0002, "epoch": 1.6842105263157894, "step": 2320}, {"loss": 0.9786, "grad_norm": 0.42869430780410767, "learning_rate": 0.0002, "epoch": 1.691470054446461, "step": 2330}, {"loss": 1.0747, "grad_norm": 0.4192679524421692, "learning_rate": 0.0002, "epoch": 1.6987295825771325, "step": 2340}, {"loss": 1.0054, "grad_norm": 0.3243155777454376, "learning_rate": 0.0002, "epoch": 1.705989110707804, "step": 2350}, {"loss": 0.9165, "grad_norm": 1.0514075756072998, "learning_rate": 0.0002, "epoch": 1.7132486388384756, "step": 2360}, {"loss": 1.1353, "grad_norm": 0.4719122648239136, "learning_rate": 0.0002, "epoch": 1.720508166969147, "step": 2370}, {"loss": 0.9913, "grad_norm": 0.3846144676208496, "learning_rate": 0.0002, "epoch": 1.7277676950998186, "step": 2380}, {"loss": 0.9935, "grad_norm": 0.7266581058502197, "learning_rate": 0.0002, "epoch": 1.73502722323049, "step": 2390}, {"loss": 0.9509, "grad_norm": 0.6183241605758667, "learning_rate": 0.0002, "epoch": 1.7422867513611615, "step": 2400}, {"loss": 0.9713, "grad_norm": 0.3658260405063629, "learning_rate": 0.0002, "epoch": 1.749546279491833, "step": 2410}, {"loss": 1.0033, "grad_norm": 0.6036322712898254, "learning_rate": 0.0002, "epoch": 1.7568058076225044, "step": 2420}, {"loss": 1.0526, "grad_norm": 0.7872936129570007, "learning_rate": 0.0002, "epoch": 1.764065335753176, "step": 2430}, {"loss": 0.99, "grad_norm": 0.35946124792099, "learning_rate": 0.0002, "epoch": 1.7713248638838475, "step": 2440}, {"loss": 0.8845, "grad_norm": 0.3740338981151581, "learning_rate": 0.0002, "epoch": 1.778584392014519, "step": 2450}, {"loss": 0.9333, "grad_norm": 0.6150230169296265, "learning_rate": 0.0002, "epoch": 1.7858439201451906, "step": 2460}, {"loss": 1.0373, "grad_norm": 0.4726075530052185, "learning_rate": 0.0002, "epoch": 1.793103448275862, "step": 2470}, {"loss": 0.974, "grad_norm": 0.31292253732681274, "learning_rate": 0.0002, "epoch": 1.8003629764065336, "step": 2480}, {"loss": 1.0205, "grad_norm": 0.4463104009628296, "learning_rate": 0.0002, "epoch": 1.8076225045372052, "step": 2490}, {"loss": 1.0771, "grad_norm": 0.7848200798034668, "learning_rate": 0.0002, "epoch": 1.8148820326678767, "step": 2500}, {"loss": 0.9508, "grad_norm": 0.5562082529067993, "learning_rate": 0.0002, "epoch": 1.822141560798548, "step": 2510}, {"loss": 0.9343, "grad_norm": 0.39892756938934326, "learning_rate": 0.0002, "epoch": 1.8294010889292196, "step": 2520}, {"loss": 1.0489, "grad_norm": 1.2923320531845093, "learning_rate": 0.0002, "epoch": 1.8366606170598911, "step": 2530}, {"loss": 1.1203, "grad_norm": 0.6316490769386292, "learning_rate": 0.0002, "epoch": 1.8439201451905625, "step": 2540}, {"loss": 0.9131, "grad_norm": 0.46100497245788574, "learning_rate": 0.0002, "epoch": 1.851179673321234, "step": 2550}, {"loss": 1.0977, "grad_norm": 0.7902987003326416, "learning_rate": 0.0002, "epoch": 1.8584392014519056, "step": 2560}, {"loss": 0.9702, "grad_norm": 0.4596365690231323, "learning_rate": 0.0002, "epoch": 1.865698729582577, "step": 2570}, {"loss": 0.9322, "grad_norm": 0.6592172384262085, "learning_rate": 0.0002, "epoch": 1.8729582577132486, "step": 2580}, {"loss": 0.9549, "grad_norm": 0.5764662027359009, "learning_rate": 0.0002, "epoch": 1.8802177858439202, "step": 2590}, {"loss": 0.9485, "grad_norm": 0.8421637415885925, "learning_rate": 0.0002, "epoch": 1.8874773139745917, "step": 2600}, {"loss": 1.012, "grad_norm": 0.5635305047035217, "learning_rate": 0.0002, "epoch": 1.8947368421052633, "step": 2610}, {"loss": 0.8907, "grad_norm": 0.46231237053871155, "learning_rate": 0.0002, "epoch": 1.9019963702359348, "step": 2620}, {"loss": 0.9543, "grad_norm": 0.3944607079029083, "learning_rate": 0.0002, "epoch": 1.9092558983666061, "step": 2630}, {"loss": 0.9964, "grad_norm": 0.4753907322883606, "learning_rate": 0.0002, "epoch": 1.9165154264972777, "step": 2640}, {"loss": 1.0217, "grad_norm": 0.4151090979576111, "learning_rate": 0.0002, "epoch": 1.9237749546279492, "step": 2650}, {"loss": 1.0397, "grad_norm": 0.6793725490570068, "learning_rate": 0.0002, "epoch": 1.9310344827586206, "step": 2660}, {"loss": 1.0401, "grad_norm": 0.339755117893219, "learning_rate": 0.0002, "epoch": 1.938294010889292, "step": 2670}, {"loss": 1.082, "grad_norm": 0.40789374709129333, "learning_rate": 0.0002, "epoch": 1.9455535390199636, "step": 2680}, {"loss": 0.9973, "grad_norm": 0.3750005066394806, "learning_rate": 0.0002, "epoch": 1.9528130671506352, "step": 2690}, {"loss": 0.9837, "grad_norm": 0.39684441685676575, "learning_rate": 0.0002, "epoch": 1.9600725952813067, "step": 2700}, {"loss": 1.1204, "grad_norm": 0.378287672996521, "learning_rate": 0.0002, "epoch": 1.9673321234119783, "step": 2710}, {"loss": 0.9284, "grad_norm": 0.3668482005596161, "learning_rate": 0.0002, "epoch": 1.9745916515426498, "step": 2720}, {"loss": 1.0512, "grad_norm": 0.49997565150260925, "learning_rate": 0.0002, "epoch": 1.9818511796733214, "step": 2730}, {"loss": 0.9311, "grad_norm": 0.36852124333381653, "learning_rate": 0.0002, "epoch": 1.989110707803993, "step": 2740}, {"loss": 1.0898, "grad_norm": 0.5203380584716797, "learning_rate": 0.0002, "epoch": 1.9963702359346642, "step": 2750}, {"eval_loss": 1.1086540222167969, "eval_runtime": 95.6866, "eval_samples_per_second": 4.557, "eval_steps_per_second": 0.575, "epoch": 2.0, "step": 2755}, {"loss": 0.9791, "grad_norm": 0.35921573638916016, "learning_rate": 0.0002, "epoch": 2.0036297640653356, "step": 2760}, {"loss": 0.8968, "grad_norm": 1.013861894607544, "learning_rate": 0.0002, "epoch": 2.010889292196007, "step": 2770}, {"loss": 0.7873, "grad_norm": 0.4425240159034729, "learning_rate": 0.0002, "epoch": 2.0181488203266786, "step": 2780}, {"loss": 0.8758, "grad_norm": 0.60174161195755, "learning_rate": 0.0002, "epoch": 2.02540834845735, "step": 2790}, {"loss": 0.7989, "grad_norm": 0.47582098841667175, "learning_rate": 0.0002, "epoch": 2.0326678765880217, "step": 2800}, {"loss": 0.8898, "grad_norm": 0.6012811660766602, "learning_rate": 0.0002, "epoch": 2.0399274047186933, "step": 2810}, {"loss": 0.8109, "grad_norm": 0.4444408118724823, "learning_rate": 0.0002, "epoch": 2.047186932849365, "step": 2820}, {"loss": 0.8621, "grad_norm": 0.4864003360271454, "learning_rate": 0.0002, "epoch": 2.0544464609800364, "step": 2830}, {"loss": 0.8817, "grad_norm": 0.5104215741157532, "learning_rate": 0.0002, "epoch": 2.061705989110708, "step": 2840}, {"loss": 0.7789, "grad_norm": 0.6218489408493042, "learning_rate": 0.0002, "epoch": 2.0689655172413794, "step": 2850}, {"loss": 0.8778, "grad_norm": 0.705784261226654, "learning_rate": 0.0002, "epoch": 2.076225045372051, "step": 2860}, {"loss": 0.7851, "grad_norm": 0.48091503977775574, "learning_rate": 0.0002, "epoch": 2.0834845735027225, "step": 2870}, {"loss": 0.8444, "grad_norm": 0.5062456727027893, "learning_rate": 0.0002, "epoch": 2.0907441016333936, "step": 2880}, {"loss": 0.7898, "grad_norm": 0.4862022399902344, "learning_rate": 0.0002, "epoch": 2.098003629764065, "step": 2890}, {"loss": 0.8593, "grad_norm": 0.48264044523239136, "learning_rate": 0.0002, "epoch": 2.1052631578947367, "step": 2900}, {"loss": 0.8325, "grad_norm": 0.43744346499443054, "learning_rate": 0.0002, "epoch": 2.1125226860254083, "step": 2910}, {"loss": 0.9099, "grad_norm": 0.5480492115020752, "learning_rate": 0.0002, "epoch": 2.11978221415608, "step": 2920}, {"loss": 0.7727, "grad_norm": 0.5068560838699341, "learning_rate": 0.0002, "epoch": 2.1270417422867514, "step": 2930}, {"loss": 0.8403, "grad_norm": 0.4650852680206299, "learning_rate": 0.0002, "epoch": 2.134301270417423, "step": 2940}, {"loss": 0.8467, "grad_norm": 0.4929981231689453, "learning_rate": 0.0002, "epoch": 2.1415607985480944, "step": 2950}, {"loss": 0.8747, "grad_norm": 0.6537389755249023, "learning_rate": 0.0002, "epoch": 2.148820326678766, "step": 2960}, {"loss": 0.7663, "grad_norm": 0.8032940626144409, "learning_rate": 0.0002, "epoch": 2.1560798548094375, "step": 2970}, {"loss": 0.7604, "grad_norm": 0.7131643891334534, "learning_rate": 0.0002, "epoch": 2.163339382940109, "step": 2980}, {"loss": 0.8424, "grad_norm": 0.6034275889396667, "learning_rate": 0.0002, "epoch": 2.1705989110707806, "step": 2990}, {"loss": 0.8838, "grad_norm": 0.6081095933914185, "learning_rate": 0.0002, "epoch": 2.1778584392014517, "step": 3000}, {"loss": 0.9013, "grad_norm": 0.5706912875175476, "learning_rate": 0.0002, "epoch": 2.1851179673321233, "step": 3010}, {"loss": 0.8473, "grad_norm": 0.6742380261421204, "learning_rate": 0.0002, "epoch": 2.192377495462795, "step": 3020}, {"loss": 0.8372, "grad_norm": 0.4847496449947357, "learning_rate": 0.0002, "epoch": 2.1996370235934664, "step": 3030}, {"loss": 0.8487, "grad_norm": 0.5529342889785767, "learning_rate": 0.0002, "epoch": 2.206896551724138, "step": 3040}, {"loss": 0.9057, "grad_norm": 0.6108783483505249, "learning_rate": 0.0002, "epoch": 2.2141560798548094, "step": 3050}, {"loss": 0.9659, "grad_norm": 0.8841571807861328, "learning_rate": 0.0002, "epoch": 2.221415607985481, "step": 3060}, {"loss": 0.8538, "grad_norm": 0.4227530360221863, "learning_rate": 0.0002, "epoch": 2.2286751361161525, "step": 3070}, {"loss": 0.7925, "grad_norm": 0.461935818195343, "learning_rate": 0.0002, "epoch": 2.235934664246824, "step": 3080}, {"loss": 0.8542, "grad_norm": 0.5407412648200989, "learning_rate": 0.0002, "epoch": 2.2431941923774956, "step": 3090}, {"loss": 0.8392, "grad_norm": 0.5057998895645142, "learning_rate": 0.0002, "epoch": 2.250453720508167, "step": 3100}, {"loss": 0.7898, "grad_norm": 0.530057966709137, "learning_rate": 0.0002, "epoch": 2.2577132486388383, "step": 3110}, {"loss": 0.8507, "grad_norm": 0.5066842436790466, "learning_rate": 0.0002, "epoch": 2.26497277676951, "step": 3120}, {"loss": 0.7903, "grad_norm": 0.5069178342819214, "learning_rate": 0.0002, "epoch": 2.2722323049001814, "step": 3130}, {"loss": 0.8458, "grad_norm": 0.6095499396324158, "learning_rate": 0.0002, "epoch": 2.279491833030853, "step": 3140}, {"loss": 0.8171, "grad_norm": 0.49524766206741333, "learning_rate": 0.0002, "epoch": 2.2867513611615244, "step": 3150}, {"loss": 0.7568, "grad_norm": 0.5334409475326538, "learning_rate": 0.0002, "epoch": 2.294010889292196, "step": 3160}, {"loss": 0.7533, "grad_norm": 1.681748867034912, "learning_rate": 0.0002, "epoch": 2.3012704174228675, "step": 3170}, {"loss": 0.8155, "grad_norm": 0.7225565314292908, "learning_rate": 0.0002, "epoch": 2.308529945553539, "step": 3180}, {"loss": 0.8034, "grad_norm": 0.5379388928413391, "learning_rate": 0.0002, "epoch": 2.3157894736842106, "step": 3190}, {"loss": 0.8833, "grad_norm": 0.45770326256752014, "learning_rate": 0.0002, "epoch": 2.323049001814882, "step": 3200}, {"loss": 0.8419, "grad_norm": 0.6984533071517944, "learning_rate": 0.0002, "epoch": 2.3303085299455537, "step": 3210}, {"loss": 0.7414, "grad_norm": 0.6725744605064392, "learning_rate": 0.0002, "epoch": 2.337568058076225, "step": 3220}, {"loss": 0.8657, "grad_norm": 1.1247471570968628, "learning_rate": 0.0002, "epoch": 2.344827586206897, "step": 3230}, {"loss": 0.7782, "grad_norm": 1.0240263938903809, "learning_rate": 0.0002, "epoch": 2.352087114337568, "step": 3240}, {"loss": 0.826, "grad_norm": 0.5608096122741699, "learning_rate": 0.0002, "epoch": 2.3593466424682394, "step": 3250}, {"loss": 0.8191, "grad_norm": 0.8294990062713623, "learning_rate": 0.0002, "epoch": 2.366606170598911, "step": 3260}, {"loss": 0.9845, "grad_norm": 0.6734224557876587, "learning_rate": 0.0002, "epoch": 2.3738656987295825, "step": 3270}, {"loss": 0.7921, "grad_norm": 0.6862800717353821, "learning_rate": 0.0002, "epoch": 2.381125226860254, "step": 3280}, {"loss": 0.7752, "grad_norm": 0.5442930459976196, "learning_rate": 0.0002, "epoch": 2.3883847549909256, "step": 3290}, {"loss": 0.8543, "grad_norm": 0.5745394229888916, "learning_rate": 0.0002, "epoch": 2.395644283121597, "step": 3300}, {"loss": 0.833, "grad_norm": 0.6257799863815308, "learning_rate": 0.0002, "epoch": 2.4029038112522687, "step": 3310}, {"loss": 0.8191, "grad_norm": 0.5608420968055725, "learning_rate": 0.0002, "epoch": 2.4101633393829403, "step": 3320}, {"loss": 0.8745, "grad_norm": 0.5512017011642456, "learning_rate": 0.0002, "epoch": 2.417422867513612, "step": 3330}, {"loss": 0.8642, "grad_norm": 0.7452999353408813, "learning_rate": 0.0002, "epoch": 2.4246823956442833, "step": 3340}, {"loss": 0.8417, "grad_norm": 0.4604301452636719, "learning_rate": 0.0002, "epoch": 2.4319419237749544, "step": 3350}, {"loss": 0.9148, "grad_norm": 0.8225823640823364, "learning_rate": 0.0002, "epoch": 2.439201451905626, "step": 3360}, {"loss": 0.8144, "grad_norm": 0.8604981303215027, "learning_rate": 0.0002, "epoch": 2.4464609800362975, "step": 3370}, {"loss": 0.9276, "grad_norm": 0.6620925664901733, "learning_rate": 0.0002, "epoch": 2.453720508166969, "step": 3380}, {"loss": 0.8381, "grad_norm": 0.4750158488750458, "learning_rate": 0.0002, "epoch": 2.4609800362976406, "step": 3390}, {"loss": 0.8249, "grad_norm": 0.6061418056488037, "learning_rate": 0.0002, "epoch": 2.468239564428312, "step": 3400}, {"loss": 0.8944, "grad_norm": 0.5977247357368469, "learning_rate": 0.0002, "epoch": 2.4754990925589837, "step": 3410}, {"loss": 0.8366, "grad_norm": 0.5004227757453918, "learning_rate": 0.0002, "epoch": 2.4827586206896552, "step": 3420}, {"loss": 0.9133, "grad_norm": 0.46428972482681274, "learning_rate": 0.0002, "epoch": 2.490018148820327, "step": 3430}, {"loss": 0.8534, "grad_norm": 0.7658395171165466, "learning_rate": 0.0002, "epoch": 2.4972776769509983, "step": 3440}, {"loss": 0.9066, "grad_norm": 0.621819794178009, "learning_rate": 0.0002, "epoch": 2.50453720508167, "step": 3450}, {"loss": 0.8481, "grad_norm": 0.4556088149547577, "learning_rate": 0.0002, "epoch": 2.511796733212341, "step": 3460}, {"loss": 0.8522, "grad_norm": 0.6124140024185181, "learning_rate": 0.0002, "epoch": 2.519056261343013, "step": 3470}, {"loss": 0.8158, "grad_norm": 0.6256231665611267, "learning_rate": 0.0002, "epoch": 2.526315789473684, "step": 3480}, {"loss": 0.9097, "grad_norm": 0.5464141964912415, "learning_rate": 0.0002, "epoch": 2.5335753176043556, "step": 3490}, {"loss": 0.8708, "grad_norm": 0.51471346616745, "learning_rate": 0.0002, "epoch": 2.540834845735027, "step": 3500}, {"loss": 0.9042, "grad_norm": 0.5326165556907654, "learning_rate": 0.0002, "epoch": 2.5480943738656987, "step": 3510}, {"loss": 0.8749, "grad_norm": 0.4750378429889679, "learning_rate": 0.0002, "epoch": 2.5553539019963702, "step": 3520}, {"loss": 0.8248, "grad_norm": 0.5292279124259949, "learning_rate": 0.0002, "epoch": 2.562613430127042, "step": 3530}, {"loss": 0.8808, "grad_norm": 0.6145227551460266, "learning_rate": 0.0002, "epoch": 2.5698729582577133, "step": 3540}, {"loss": 0.846, "grad_norm": 0.8275189399719238, "learning_rate": 0.0002, "epoch": 2.577132486388385, "step": 3550}, {"loss": 0.875, "grad_norm": 0.5037438273429871, "learning_rate": 0.0002, "epoch": 2.5843920145190564, "step": 3560}, {"loss": 0.8324, "grad_norm": 0.5838707685470581, "learning_rate": 0.0002, "epoch": 2.5916515426497275, "step": 3570}, {"loss": 0.8716, "grad_norm": 0.5398710370063782, "learning_rate": 0.0002, "epoch": 2.5989110707803995, "step": 3580}, {"loss": 0.7843, "grad_norm": 0.6115376949310303, "learning_rate": 0.0002, "epoch": 2.6061705989110706, "step": 3590}, {"loss": 0.8142, "grad_norm": 1.118809461593628, "learning_rate": 0.0002, "epoch": 2.613430127041742, "step": 3600}, {"loss": 0.8282, "grad_norm": 0.7811771631240845, "learning_rate": 0.0002, "epoch": 2.6206896551724137, "step": 3610}, {"loss": 0.7997, "grad_norm": 0.5753175020217896, "learning_rate": 0.0002, "epoch": 2.6279491833030852, "step": 3620}, {"loss": 0.7705, "grad_norm": 0.550829291343689, "learning_rate": 0.0002, "epoch": 2.635208711433757, "step": 3630}, {"loss": 0.8564, "grad_norm": 0.5360019207000732, "learning_rate": 0.0002, "epoch": 2.6424682395644283, "step": 3640}, {"loss": 0.7648, "grad_norm": 0.63050377368927, "learning_rate": 0.0002, "epoch": 2.6497277676951, "step": 3650}, {"loss": 0.9156, "grad_norm": 0.5833110213279724, "learning_rate": 0.0002, "epoch": 2.6569872958257714, "step": 3660}, {"loss": 0.8033, "grad_norm": 0.5543047189712524, "learning_rate": 0.0002, "epoch": 2.664246823956443, "step": 3670}, {"loss": 0.8867, "grad_norm": 0.6842212080955505, "learning_rate": 0.0002, "epoch": 2.671506352087114, "step": 3680}, {"loss": 0.8569, "grad_norm": 0.6859333515167236, "learning_rate": 0.0002, "epoch": 2.678765880217786, "step": 3690}, {"loss": 0.8081, "grad_norm": 0.7038410902023315, "learning_rate": 0.0002, "epoch": 2.686025408348457, "step": 3700}, {"loss": 0.8686, "grad_norm": 0.447233647108078, "learning_rate": 0.0002, "epoch": 2.6932849364791287, "step": 3710}, {"loss": 0.8951, "grad_norm": 0.6453872919082642, "learning_rate": 0.0002, "epoch": 2.7005444646098002, "step": 3720}, {"loss": 0.8292, "grad_norm": 0.8025672435760498, "learning_rate": 0.0002, "epoch": 2.707803992740472, "step": 3730}, {"loss": 0.8488, "grad_norm": 0.5997087955474854, "learning_rate": 0.0002, "epoch": 2.7150635208711433, "step": 3740}, {"loss": 0.8308, "grad_norm": 0.6901142001152039, "learning_rate": 0.0002, "epoch": 2.722323049001815, "step": 3750}, {"loss": 0.8517, "grad_norm": 1.036145567893982, "learning_rate": 0.0002, "epoch": 2.7295825771324864, "step": 3760}, {"loss": 0.8455, "grad_norm": 0.7207072377204895, "learning_rate": 0.0002, "epoch": 2.736842105263158, "step": 3770}, {"loss": 0.8009, "grad_norm": 1.0452989339828491, "learning_rate": 0.0002, "epoch": 2.7441016333938295, "step": 3780}, {"loss": 0.8868, "grad_norm": 0.5615278482437134, "learning_rate": 0.0002, "epoch": 2.751361161524501, "step": 3790}, {"loss": 0.9047, "grad_norm": 0.46439215540885925, "learning_rate": 0.0002, "epoch": 2.7586206896551726, "step": 3800}, {"loss": 0.7847, "grad_norm": 0.7134917974472046, "learning_rate": 0.0002, "epoch": 2.7658802177858437, "step": 3810}, {"loss": 0.8315, "grad_norm": 0.5139115452766418, "learning_rate": 0.0002, "epoch": 2.7731397459165157, "step": 3820}, {"loss": 0.8995, "grad_norm": 0.8595781326293945, "learning_rate": 0.0002, "epoch": 2.780399274047187, "step": 3830}, {"loss": 0.9079, "grad_norm": 0.544614851474762, "learning_rate": 0.0002, "epoch": 2.7876588021778583, "step": 3840}, {"loss": 0.8322, "grad_norm": 0.6073850393295288, "learning_rate": 0.0002, "epoch": 2.79491833030853, "step": 3850}, {"loss": 0.8277, "grad_norm": 0.8224069476127625, "learning_rate": 0.0002, "epoch": 2.8021778584392014, "step": 3860}, {"loss": 0.815, "grad_norm": 0.5347970128059387, "learning_rate": 0.0002, "epoch": 2.809437386569873, "step": 3870}, {"loss": 0.8219, "grad_norm": 0.6799601912498474, "learning_rate": 0.0002, "epoch": 2.8166969147005445, "step": 3880}, {"loss": 0.8796, "grad_norm": 0.5219197869300842, "learning_rate": 0.0002, "epoch": 2.823956442831216, "step": 3890}, {"loss": 0.8649, "grad_norm": 0.5710130333900452, "learning_rate": 0.0002, "epoch": 2.8312159709618876, "step": 3900}, {"loss": 0.8741, "grad_norm": 0.5857213139533997, "learning_rate": 0.0002, "epoch": 2.838475499092559, "step": 3910}, {"loss": 0.8154, "grad_norm": 0.5206644535064697, "learning_rate": 0.0002, "epoch": 2.8457350272232302, "step": 3920}, {"loss": 0.9328, "grad_norm": 1.2902015447616577, "learning_rate": 0.0002, "epoch": 2.8529945553539022, "step": 3930}, {"loss": 0.8192, "grad_norm": 0.5252797603607178, "learning_rate": 0.0002, "epoch": 2.8602540834845733, "step": 3940}, {"loss": 0.9047, "grad_norm": 0.5925108790397644, "learning_rate": 0.0002, "epoch": 2.867513611615245, "step": 3950}, {"loss": 0.8611, "grad_norm": 0.9719610810279846, "learning_rate": 0.0002, "epoch": 2.8747731397459164, "step": 3960}, {"loss": 0.8531, "grad_norm": 0.6834747195243835, "learning_rate": 0.0002, "epoch": 2.882032667876588, "step": 3970}, {"loss": 0.8124, "grad_norm": 0.6952353119850159, "learning_rate": 0.0002, "epoch": 2.8892921960072595, "step": 3980}, {"loss": 0.8365, "grad_norm": 0.49889910221099854, "learning_rate": 0.0002, "epoch": 2.896551724137931, "step": 3990}, {"loss": 0.8197, "grad_norm": 0.5007026791572571, "learning_rate": 0.0002, "epoch": 2.9038112522686026, "step": 4000}, {"loss": 0.7752, "grad_norm": 0.5474239587783813, "learning_rate": 0.0002, "epoch": 2.911070780399274, "step": 4010}, {"loss": 0.8579, "grad_norm": 0.6617428660392761, "learning_rate": 0.0002, "epoch": 2.9183303085299457, "step": 4020}, {"loss": 0.8583, "grad_norm": 0.6097776293754578, "learning_rate": 0.0002, "epoch": 2.925589836660617, "step": 4030}, {"loss": 0.8296, "grad_norm": 0.5985828638076782, "learning_rate": 0.0002, "epoch": 2.9328493647912888, "step": 4040}, {"loss": 0.9134, "grad_norm": 0.769488513469696, "learning_rate": 0.0002, "epoch": 2.94010889292196, "step": 4050}, {"loss": 0.8148, "grad_norm": 0.5167055130004883, "learning_rate": 0.0002, "epoch": 2.9473684210526314, "step": 4060}, {"loss": 0.8665, "grad_norm": 0.6265496015548706, "learning_rate": 0.0002, "epoch": 2.954627949183303, "step": 4070}, {"loss": 0.9218, "grad_norm": 1.2644082307815552, "learning_rate": 0.0002, "epoch": 2.9618874773139745, "step": 4080}, {"loss": 0.8026, "grad_norm": 0.6007561087608337, "learning_rate": 0.0002, "epoch": 2.969147005444646, "step": 4090}, {"loss": 0.8262, "grad_norm": 0.47984927892684937, "learning_rate": 0.0002, "epoch": 2.9764065335753176, "step": 4100}, {"loss": 0.7897, "grad_norm": 1.128198504447937, "learning_rate": 0.0002, "epoch": 2.983666061705989, "step": 4110}, {"loss": 0.7883, "grad_norm": 0.526292085647583, "learning_rate": 0.0002, "epoch": 2.9909255898366607, "step": 4120}, {"loss": 0.7801, "grad_norm": 0.5462674498558044, "learning_rate": 0.0002, "epoch": 2.9981851179673322, "step": 4130}, {"eval_loss": 1.1093357801437378, "eval_runtime": 46.2498, "eval_samples_per_second": 9.427, "eval_steps_per_second": 1.189, "epoch": 2.9996370235934666, "step": 4132}, {"loss": 0.721, "grad_norm": 0.48288026452064514, "learning_rate": 0.0002, "epoch": 3.0054446460980038, "step": 4140}, {"loss": 0.6769, "grad_norm": 1.0181782245635986, "learning_rate": 0.0002, "epoch": 3.0127041742286753, "step": 4150}, {"loss": 0.7185, "grad_norm": 0.7718019485473633, "learning_rate": 0.0002, "epoch": 3.019963702359347, "step": 4160}, {"loss": 0.6552, "grad_norm": 0.7492219805717468, "learning_rate": 0.0002, "epoch": 3.027223230490018, "step": 4170}, {"loss": 0.6678, "grad_norm": 0.9363632798194885, "learning_rate": 0.0002, "epoch": 3.0344827586206895, "step": 4180}, {"loss": 0.7187, "grad_norm": 0.6888533234596252, "learning_rate": 0.0002, "epoch": 3.041742286751361, "step": 4190}, {"loss": 0.6469, "grad_norm": 0.7072834968566895, "learning_rate": 0.0002, "epoch": 3.0490018148820326, "step": 4200}, {"loss": 0.6387, "grad_norm": 0.7182047963142395, "learning_rate": 0.0002, "epoch": 3.056261343012704, "step": 4210}, {"loss": 0.6385, "grad_norm": 0.7194355130195618, "learning_rate": 0.0002, "epoch": 3.0635208711433757, "step": 4220}, {"loss": 0.5812, "grad_norm": 0.9454023838043213, "learning_rate": 0.0002, "epoch": 3.0707803992740472, "step": 4230}, {"loss": 0.6036, "grad_norm": 0.838657557964325, "learning_rate": 0.0002, "epoch": 3.0780399274047188, "step": 4240}, {"loss": 0.646, "grad_norm": 0.740113377571106, "learning_rate": 0.0002, "epoch": 3.0852994555353903, "step": 4250}, {"loss": 0.604, "grad_norm": 0.6616561412811279, "learning_rate": 0.0002, "epoch": 3.092558983666062, "step": 4260}, {"loss": 0.6462, "grad_norm": 0.8846506476402283, "learning_rate": 0.0002, "epoch": 3.0998185117967334, "step": 4270}, {"loss": 0.6037, "grad_norm": 0.6322125792503357, "learning_rate": 0.0002, "epoch": 3.107078039927405, "step": 4280}, {"loss": 0.5953, "grad_norm": 0.7461467385292053, "learning_rate": 0.0002, "epoch": 3.114337568058076, "step": 4290}, {"loss": 0.6099, "grad_norm": 0.8251287341117859, "learning_rate": 0.0002, "epoch": 3.1215970961887476, "step": 4300}, {"loss": 0.6284, "grad_norm": 0.8767673373222351, "learning_rate": 0.0002, "epoch": 3.128856624319419, "step": 4310}, {"loss": 0.7535, "grad_norm": 0.7758759260177612, "learning_rate": 0.0002, "epoch": 3.1361161524500907, "step": 4320}, {"loss": 0.6624, "grad_norm": 1.1056879758834839, "learning_rate": 0.0002, "epoch": 3.143375680580762, "step": 4330}, {"loss": 0.691, "grad_norm": 0.8259835243225098, "learning_rate": 0.0002, "epoch": 3.1506352087114338, "step": 4340}, {"loss": 0.6635, "grad_norm": 0.6607027053833008, "learning_rate": 0.0002, "epoch": 3.1578947368421053, "step": 4350}, {"loss": 0.5911, "grad_norm": 0.7983301281929016, "learning_rate": 0.0002, "epoch": 3.165154264972777, "step": 4360}, {"loss": 0.6496, "grad_norm": 0.6725239157676697, "learning_rate": 0.0002, "epoch": 3.1724137931034484, "step": 4370}, {"loss": 0.5966, "grad_norm": 0.9052095413208008, "learning_rate": 0.0002, "epoch": 3.17967332123412, "step": 4380}, {"loss": 0.6877, "grad_norm": 0.8131307363510132, "learning_rate": 0.0002, "epoch": 3.1869328493647915, "step": 4390}, {"loss": 0.6384, "grad_norm": 0.6435626149177551, "learning_rate": 0.0002, "epoch": 3.1941923774954626, "step": 4400}, {"loss": 0.5819, "grad_norm": 0.84367436170578, "learning_rate": 0.0002, "epoch": 3.201451905626134, "step": 4410}, {"loss": 0.6104, "grad_norm": 1.5018867254257202, "learning_rate": 0.0002, "epoch": 3.2087114337568057, "step": 4420}, {"loss": 0.6838, "grad_norm": 0.7019091844558716, "learning_rate": 0.0002, "epoch": 3.215970961887477, "step": 4430}, {"loss": 0.6153, "grad_norm": 0.9164197444915771, "learning_rate": 0.0002, "epoch": 3.2232304900181488, "step": 4440}, {"loss": 0.6618, "grad_norm": 0.7890861630439758, "learning_rate": 0.0002, "epoch": 3.2304900181488203, "step": 4450}, {"loss": 0.6401, "grad_norm": 0.6517660617828369, "learning_rate": 0.0002, "epoch": 3.237749546279492, "step": 4460}, {"loss": 0.6699, "grad_norm": 1.10188889503479, "learning_rate": 0.0002, "epoch": 3.2450090744101634, "step": 4470}, {"loss": 0.6356, "grad_norm": 0.8158330917358398, "learning_rate": 0.0002, "epoch": 3.252268602540835, "step": 4480}, {"loss": 0.7757, "grad_norm": 0.7663109302520752, "learning_rate": 0.0002, "epoch": 3.2595281306715065, "step": 4490}, {"loss": 0.6539, "grad_norm": 0.8473444581031799, "learning_rate": 0.0002, "epoch": 3.266787658802178, "step": 4500}, {"loss": 0.6511, "grad_norm": 0.9724768996238708, "learning_rate": 0.0002, "epoch": 3.274047186932849, "step": 4510}, {"loss": 0.5464, "grad_norm": 0.8516759276390076, "learning_rate": 0.0002, "epoch": 3.281306715063521, "step": 4520}, {"loss": 0.6534, "grad_norm": 0.7543437480926514, "learning_rate": 0.0002, "epoch": 3.288566243194192, "step": 4530}, {"loss": 0.6095, "grad_norm": 1.0472029447555542, "learning_rate": 0.0002, "epoch": 3.2958257713248638, "step": 4540}, {"loss": 0.6216, "grad_norm": 0.6240826845169067, "learning_rate": 0.0002, "epoch": 3.3030852994555353, "step": 4550}, {"loss": 0.6223, "grad_norm": 0.9957774877548218, "learning_rate": 0.0002, "epoch": 3.310344827586207, "step": 4560}, {"loss": 0.618, "grad_norm": 0.6448912620544434, "learning_rate": 0.0002, "epoch": 3.3176043557168784, "step": 4570}, {"loss": 0.6188, "grad_norm": 0.7519692778587341, "learning_rate": 0.0002, "epoch": 3.32486388384755, "step": 4580}, {"loss": 0.6672, "grad_norm": 0.7367453575134277, "learning_rate": 0.0002, "epoch": 3.3321234119782215, "step": 4590}, {"loss": 0.6517, "grad_norm": 0.8064960837364197, "learning_rate": 0.0002, "epoch": 3.339382940108893, "step": 4600}, {"loss": 0.6062, "grad_norm": 0.7664631009101868, "learning_rate": 0.0002, "epoch": 3.3466424682395646, "step": 4610}, {"loss": 0.6834, "grad_norm": 0.7803396582603455, "learning_rate": 0.0002, "epoch": 3.353901996370236, "step": 4620}, {"loss": 0.6961, "grad_norm": 0.9141599535942078, "learning_rate": 0.0002, "epoch": 3.3611615245009077, "step": 4630}, {"loss": 0.6889, "grad_norm": 0.9719856381416321, "learning_rate": 0.0002, "epoch": 3.3684210526315788, "step": 4640}, {"loss": 0.6914, "grad_norm": 0.9223218560218811, "learning_rate": 0.0002, "epoch": 3.3756805807622503, "step": 4650}, {"loss": 0.5981, "grad_norm": 0.7289277911186218, "learning_rate": 0.0002, "epoch": 3.382940108892922, "step": 4660}, {"loss": 0.595, "grad_norm": 1.039724349975586, "learning_rate": 0.0002, "epoch": 3.3901996370235934, "step": 4670}, {"loss": 0.8121, "grad_norm": 1.397438883781433, "learning_rate": 0.0002, "epoch": 3.397459165154265, "step": 4680}, {"loss": 0.6334, "grad_norm": 1.0069999694824219, "learning_rate": 0.0002, "epoch": 3.4047186932849365, "step": 4690}, {"loss": 0.6598, "grad_norm": 0.816291332244873, "learning_rate": 0.0002, "epoch": 3.411978221415608, "step": 4700}, {"loss": 0.6748, "grad_norm": 1.2831530570983887, "learning_rate": 0.0002, "epoch": 3.4192377495462796, "step": 4710}, {"loss": 0.6625, "grad_norm": 0.9573889970779419, "learning_rate": 0.0002, "epoch": 3.426497277676951, "step": 4720}, {"loss": 0.7279, "grad_norm": 0.7685632705688477, "learning_rate": 0.0002, "epoch": 3.4337568058076227, "step": 4730}, {"loss": 0.6104, "grad_norm": 0.7019195556640625, "learning_rate": 0.0002, "epoch": 3.441016333938294, "step": 4740}, {"loss": 0.7606, "grad_norm": 0.7244833707809448, "learning_rate": 0.0002, "epoch": 3.4482758620689653, "step": 4750}, {"loss": 0.6951, "grad_norm": 1.3468551635742188, "learning_rate": 0.0002, "epoch": 3.455535390199637, "step": 4760}, {"loss": 0.6945, "grad_norm": 0.822846531867981, "learning_rate": 0.0002, "epoch": 3.4627949183303084, "step": 4770}, {"loss": 0.6431, "grad_norm": 0.7311608195304871, "learning_rate": 0.0002, "epoch": 3.47005444646098, "step": 4780}, {"loss": 0.7019, "grad_norm": 0.9466770887374878, "learning_rate": 0.0002, "epoch": 3.4773139745916515, "step": 4790}, {"loss": 0.7767, "grad_norm": 1.1527155637741089, "learning_rate": 0.0002, "epoch": 3.484573502722323, "step": 4800}, {"loss": 0.6882, "grad_norm": 1.1288906335830688, "learning_rate": 0.0002, "epoch": 3.4918330308529946, "step": 4810}, {"loss": 0.6564, "grad_norm": 0.9096164107322693, "learning_rate": 0.0002, "epoch": 3.499092558983666, "step": 4820}, {"loss": 0.6127, "grad_norm": 0.7988565564155579, "learning_rate": 0.0002, "epoch": 3.5063520871143377, "step": 4830}, {"loss": 0.7004, "grad_norm": 0.7183415293693542, "learning_rate": 0.0002, "epoch": 3.513611615245009, "step": 4840}, {"loss": 0.74, "grad_norm": 0.6614915132522583, "learning_rate": 0.0002, "epoch": 3.5208711433756807, "step": 4850}, {"loss": 0.7271, "grad_norm": 0.8609521985054016, "learning_rate": 0.0002, "epoch": 3.528130671506352, "step": 4860}, {"loss": 0.6664, "grad_norm": 0.86552894115448, "learning_rate": 0.0002, "epoch": 3.535390199637024, "step": 4870}, {"loss": 0.6432, "grad_norm": 0.6926496028900146, "learning_rate": 0.0002, "epoch": 3.542649727767695, "step": 4880}, {"loss": 0.7117, "grad_norm": 0.8157467246055603, "learning_rate": 0.0002, "epoch": 3.5499092558983665, "step": 4890}, {"loss": 0.6201, "grad_norm": 0.9085357189178467, "learning_rate": 0.0002, "epoch": 3.557168784029038, "step": 4900}, {"loss": 0.6521, "grad_norm": 0.6322644948959351, "learning_rate": 0.0002, "epoch": 3.5644283121597096, "step": 4910}, {"loss": 0.6607, "grad_norm": 1.263205885887146, "learning_rate": 0.0002, "epoch": 3.571687840290381, "step": 4920}, {"loss": 0.6657, "grad_norm": 0.8901070356369019, "learning_rate": 0.0002, "epoch": 3.5789473684210527, "step": 4930}, {"loss": 0.6434, "grad_norm": 0.7983952164649963, "learning_rate": 0.0002, "epoch": 3.586206896551724, "step": 4940}, {"loss": 0.6861, "grad_norm": 0.9887813925743103, "learning_rate": 0.0002, "epoch": 3.5934664246823957, "step": 4950}, {"loss": 0.6502, "grad_norm": 0.7895187735557556, "learning_rate": 0.0002, "epoch": 3.6007259528130673, "step": 4960}, {"loss": 0.7111, "grad_norm": 0.9685819745063782, "learning_rate": 0.0002, "epoch": 3.6079854809437384, "step": 4970}, {"loss": 0.6915, "grad_norm": 0.6576591730117798, "learning_rate": 0.0002, "epoch": 3.6152450090744104, "step": 4980}, {"loss": 0.6195, "grad_norm": 0.856985330581665, "learning_rate": 0.0002, "epoch": 3.6225045372050815, "step": 4990}, {"loss": 0.6318, "grad_norm": 0.7230252623558044, "learning_rate": 0.0002, "epoch": 3.629764065335753, "step": 5000}, {"loss": 0.742, "grad_norm": 0.8260893821716309, "learning_rate": 0.0002, "epoch": 3.6370235934664246, "step": 5010}, {"loss": 0.7223, "grad_norm": 0.7635950446128845, "learning_rate": 0.0002, "epoch": 3.644283121597096, "step": 5020}, {"loss": 0.6837, "grad_norm": 0.7060768604278564, "learning_rate": 0.0002, "epoch": 3.6515426497277677, "step": 5030}, {"loss": 0.6921, "grad_norm": 0.8020303249359131, "learning_rate": 0.0002, "epoch": 3.658802177858439, "step": 5040}, {"loss": 0.6446, "grad_norm": 0.8530341386795044, "learning_rate": 0.0002, "epoch": 3.6660617059891107, "step": 5050}, {"loss": 0.7222, "grad_norm": 0.6667101979255676, "learning_rate": 0.0002, "epoch": 3.6733212341197823, "step": 5060}, {"loss": 0.7081, "grad_norm": 0.7385406494140625, "learning_rate": 0.0002, "epoch": 3.680580762250454, "step": 5070}, {"loss": 0.7035, "grad_norm": 0.7753380537033081, "learning_rate": 0.0002, "epoch": 3.6878402903811254, "step": 5080}, {"loss": 0.6491, "grad_norm": 0.7516207098960876, "learning_rate": 0.0002, "epoch": 3.695099818511797, "step": 5090}, {"loss": 0.672, "grad_norm": 0.8171586394309998, "learning_rate": 0.0002, "epoch": 3.702359346642468, "step": 5100}, {"loss": 0.7459, "grad_norm": 1.0796279907226562, "learning_rate": 0.0002, "epoch": 3.70961887477314, "step": 5110}, {"loss": 0.5948, "grad_norm": 0.6957688927650452, "learning_rate": 0.0002, "epoch": 3.716878402903811, "step": 5120}, {"loss": 0.7515, "grad_norm": 0.8550161719322205, "learning_rate": 0.0002, "epoch": 3.7241379310344827, "step": 5130}, {"loss": 0.7286, "grad_norm": 0.9396728277206421, "learning_rate": 0.0002, "epoch": 3.731397459165154, "step": 5140}, {"loss": 0.7594, "grad_norm": 1.4264805316925049, "learning_rate": 0.0002, "epoch": 3.7386569872958257, "step": 5150}, {"loss": 0.6575, "grad_norm": 0.8725108504295349, "learning_rate": 0.0002, "epoch": 3.7459165154264973, "step": 5160}, {"loss": 0.6778, "grad_norm": 1.0346195697784424, "learning_rate": 0.0002, "epoch": 3.753176043557169, "step": 5170}, {"loss": 0.6371, "grad_norm": 0.5395554304122925, "learning_rate": 0.0002, "epoch": 3.7604355716878404, "step": 5180}, {"loss": 0.7308, "grad_norm": 1.3153616189956665, "learning_rate": 0.0002, "epoch": 3.767695099818512, "step": 5190}, {"loss": 0.78, "grad_norm": 0.9879828691482544, "learning_rate": 0.0002, "epoch": 3.7749546279491835, "step": 5200}, {"loss": 0.7068, "grad_norm": 0.8876672983169556, "learning_rate": 0.0002, "epoch": 3.7822141560798546, "step": 5210}, {"loss": 0.6283, "grad_norm": 0.8363267779350281, "learning_rate": 0.0002, "epoch": 3.7894736842105265, "step": 5220}, {"loss": 0.6255, "grad_norm": 0.637294590473175, "learning_rate": 0.0002, "epoch": 3.7967332123411976, "step": 5230}, {"loss": 0.6685, "grad_norm": 1.1408970355987549, "learning_rate": 0.0002, "epoch": 3.803992740471869, "step": 5240}, {"loss": 0.6761, "grad_norm": 1.0128360986709595, "learning_rate": 0.0002, "epoch": 3.8112522686025407, "step": 5250}, {"loss": 0.6764, "grad_norm": 0.8061144351959229, "learning_rate": 0.0002, "epoch": 3.8185117967332123, "step": 5260}, {"loss": 0.7254, "grad_norm": 0.9626626968383789, "learning_rate": 0.0002, "epoch": 3.825771324863884, "step": 5270}, {"loss": 0.7367, "grad_norm": 0.9013627171516418, "learning_rate": 0.0002, "epoch": 3.8330308529945554, "step": 5280}, {"loss": 0.6806, "grad_norm": 0.8411344289779663, "learning_rate": 0.0002, "epoch": 3.840290381125227, "step": 5290}, {"loss": 0.6818, "grad_norm": 0.7426059246063232, "learning_rate": 0.0002, "epoch": 3.8475499092558985, "step": 5300}, {"loss": 0.6748, "grad_norm": 1.003413438796997, "learning_rate": 0.0002, "epoch": 3.85480943738657, "step": 5310}, {"loss": 0.8554, "grad_norm": 0.7527840733528137, "learning_rate": 0.0002, "epoch": 3.862068965517241, "step": 5320}, {"loss": 0.7521, "grad_norm": 0.738610565662384, "learning_rate": 0.0002, "epoch": 3.869328493647913, "step": 5330}, {"loss": 0.7266, "grad_norm": 0.7277999520301819, "learning_rate": 0.0002, "epoch": 3.876588021778584, "step": 5340}, {"loss": 0.7503, "grad_norm": 0.5951359272003174, "learning_rate": 0.0002, "epoch": 3.8838475499092557, "step": 5350}, {"loss": 0.7447, "grad_norm": 1.043884038925171, "learning_rate": 0.0002, "epoch": 3.8911070780399273, "step": 5360}, {"loss": 0.6862, "grad_norm": 0.8436498045921326, "learning_rate": 0.0002, "epoch": 3.898366606170599, "step": 5370}, {"loss": 0.665, "grad_norm": 0.5603365302085876, "learning_rate": 0.0002, "epoch": 3.9056261343012704, "step": 5380}, {"loss": 0.7098, "grad_norm": 1.0128886699676514, "learning_rate": 0.0002, "epoch": 3.912885662431942, "step": 5390}, {"loss": 0.6707, "grad_norm": 0.7970930337905884, "learning_rate": 0.0002, "epoch": 3.9201451905626135, "step": 5400}, {"loss": 0.637, "grad_norm": 0.7699369192123413, "learning_rate": 0.0002, "epoch": 3.927404718693285, "step": 5410}, {"loss": 0.6742, "grad_norm": 0.800561249256134, "learning_rate": 0.0002, "epoch": 3.9346642468239565, "step": 5420}, {"loss": 0.7208, "grad_norm": 0.8020331859588623, "learning_rate": 0.0002, "epoch": 3.941923774954628, "step": 5430}, {"loss": 0.7294, "grad_norm": 0.7461140155792236, "learning_rate": 0.0002, "epoch": 3.9491833030852996, "step": 5440}, {"loss": 0.7013, "grad_norm": 0.8346918821334839, "learning_rate": 0.0002, "epoch": 3.9564428312159707, "step": 5450}, {"loss": 0.6289, "grad_norm": 0.9723302125930786, "learning_rate": 0.0002, "epoch": 3.9637023593466427, "step": 5460}, {"loss": 0.8029, "grad_norm": 0.6809740662574768, "learning_rate": 0.0002, "epoch": 3.970961887477314, "step": 5470}, {"loss": 0.6896, "grad_norm": 0.7353498339653015, "learning_rate": 0.0002, "epoch": 3.9782214156079854, "step": 5480}, {"loss": 0.6722, "grad_norm": 0.748009443283081, "learning_rate": 0.0002, "epoch": 3.985480943738657, "step": 5490}, {"loss": 0.6866, "grad_norm": 1.3656195402145386, "learning_rate": 0.0002, "epoch": 3.9927404718693285, "step": 5500}, {"loss": 0.7368, "grad_norm": 0.8402108550071716, "learning_rate": 0.0002, "epoch": 4.0, "step": 5510}, {"eval_loss": 1.17229425907135, "eval_runtime": 46.2554, "eval_samples_per_second": 9.426, "eval_steps_per_second": 1.189, "epoch": 4.0, "step": 5510}, {"loss": 0.4637, "grad_norm": 0.8601235747337341, "learning_rate": 0.0002, "epoch": 4.007259528130671, "step": 5520}, {"loss": 0.4717, "grad_norm": 1.2635200023651123, "learning_rate": 0.0002, "epoch": 4.014519056261343, "step": 5530}, {"loss": 0.503, "grad_norm": 1.0257477760314941, "learning_rate": 0.0002, "epoch": 4.021778584392014, "step": 5540}, {"loss": 0.4547, "grad_norm": 0.9436745047569275, "learning_rate": 0.0002, "epoch": 4.029038112522686, "step": 5550}, {"loss": 0.459, "grad_norm": 0.9443606734275818, "learning_rate": 0.0002, "epoch": 4.036297640653357, "step": 5560}, {"loss": 0.5386, "grad_norm": 1.3965742588043213, "learning_rate": 0.0002, "epoch": 4.043557168784029, "step": 5570}, {"loss": 0.4248, "grad_norm": 0.8973520398139954, "learning_rate": 0.0002, "epoch": 4.0508166969147, "step": 5580}, {"loss": 0.4111, "grad_norm": 0.9998409748077393, "learning_rate": 0.0002, "epoch": 4.058076225045372, "step": 5590}, {"loss": 0.4828, "grad_norm": 1.1213387250900269, "learning_rate": 0.0002, "epoch": 4.0653357531760435, "step": 5600}, {"loss": 0.439, "grad_norm": 0.7064558863639832, "learning_rate": 0.0002, "epoch": 4.072595281306715, "step": 5610}, {"loss": 0.4607, "grad_norm": 1.2390803098678589, "learning_rate": 0.0002, "epoch": 4.0798548094373865, "step": 5620}, {"loss": 0.5014, "grad_norm": 1.123469591140747, "learning_rate": 0.0002, "epoch": 4.087114337568058, "step": 5630}, {"loss": 0.513, "grad_norm": 1.229573369026184, "learning_rate": 0.0002, "epoch": 4.09437386569873, "step": 5640}, {"loss": 0.5258, "grad_norm": 1.7182831764221191, "learning_rate": 0.0002, "epoch": 4.101633393829401, "step": 5650}, {"loss": 0.5371, "grad_norm": 0.894903302192688, "learning_rate": 0.0002, "epoch": 4.108892921960073, "step": 5660}, {"loss": 0.4813, "grad_norm": 0.8754552006721497, "learning_rate": 0.0002, "epoch": 4.116152450090744, "step": 5670}, {"loss": 0.491, "grad_norm": 1.2401553392410278, "learning_rate": 0.0002, "epoch": 4.123411978221416, "step": 5680}, {"loss": 0.4549, "grad_norm": 0.8631148934364319, "learning_rate": 0.0002, "epoch": 4.130671506352087, "step": 5690}, {"loss": 0.487, "grad_norm": 1.1798022985458374, "learning_rate": 0.0002, "epoch": 4.137931034482759, "step": 5700}, {"loss": 0.4522, "grad_norm": 0.8344549536705017, "learning_rate": 0.0002, "epoch": 4.14519056261343, "step": 5710}, {"loss": 0.4559, "grad_norm": 1.2342697381973267, "learning_rate": 0.0002, "epoch": 4.152450090744102, "step": 5720}, {"loss": 0.53, "grad_norm": 1.1601094007492065, "learning_rate": 0.0002, "epoch": 4.159709618874773, "step": 5730}, {"loss": 0.4755, "grad_norm": 1.2925703525543213, "learning_rate": 0.0002, "epoch": 4.166969147005445, "step": 5740}, {"loss": 0.4973, "grad_norm": 1.0870997905731201, "learning_rate": 0.0002, "epoch": 4.174228675136116, "step": 5750}, {"loss": 0.5184, "grad_norm": 0.9077792763710022, "learning_rate": 0.0002, "epoch": 4.181488203266787, "step": 5760}, {"loss": 0.4606, "grad_norm": 1.009273886680603, "learning_rate": 0.0002, "epoch": 4.188747731397459, "step": 5770}, {"loss": 0.5383, "grad_norm": 1.2465530633926392, "learning_rate": 0.0002, "epoch": 4.19600725952813, "step": 5780}, {"loss": 0.4938, "grad_norm": 1.2261253595352173, "learning_rate": 0.0002, "epoch": 4.203266787658802, "step": 5790}, {"loss": 0.5255, "grad_norm": 1.1498041152954102, "learning_rate": 0.0002, "epoch": 4.2105263157894735, "step": 5800}, {"loss": 0.5269, "grad_norm": 1.1966725587844849, "learning_rate": 0.0002, "epoch": 4.217785843920145, "step": 5810}, {"loss": 0.5626, "grad_norm": 1.2651296854019165, "learning_rate": 0.0002, "epoch": 4.2250453720508165, "step": 5820}, {"loss": 0.5213, "grad_norm": 1.0388574600219727, "learning_rate": 0.0002, "epoch": 4.2323049001814885, "step": 5830}, {"loss": 0.4965, "grad_norm": 1.3042771816253662, "learning_rate": 0.0002, "epoch": 4.23956442831216, "step": 5840}, {"loss": 0.5116, "grad_norm": 1.1127727031707764, "learning_rate": 0.0002, "epoch": 4.246823956442832, "step": 5850}, {"loss": 0.5197, "grad_norm": 0.9653958082199097, "learning_rate": 0.0002, "epoch": 4.254083484573503, "step": 5860}, {"loss": 0.4414, "grad_norm": 1.0500504970550537, "learning_rate": 0.0002, "epoch": 4.261343012704174, "step": 5870}, {"loss": 0.544, "grad_norm": 1.1476165056228638, "learning_rate": 0.0002, "epoch": 4.268602540834846, "step": 5880}, {"loss": 0.4667, "grad_norm": 0.9424414038658142, "learning_rate": 0.0002, "epoch": 4.275862068965517, "step": 5890}, {"loss": 0.5039, "grad_norm": 1.3309166431427002, "learning_rate": 0.0002, "epoch": 4.283121597096189, "step": 5900}, {"loss": 0.5472, "grad_norm": 1.3025873899459839, "learning_rate": 0.0002, "epoch": 4.29038112522686, "step": 5910}, {"loss": 0.4644, "grad_norm": 1.1442325115203857, "learning_rate": 0.0002, "epoch": 4.297640653357532, "step": 5920}, {"loss": 0.5066, "grad_norm": 0.9820859432220459, "learning_rate": 0.0002, "epoch": 4.304900181488203, "step": 5930}, {"loss": 0.5224, "grad_norm": 0.9615740180015564, "learning_rate": 0.0002, "epoch": 4.312159709618875, "step": 5940}, {"loss": 0.5665, "grad_norm": 1.1627109050750732, "learning_rate": 0.0002, "epoch": 4.319419237749546, "step": 5950}, {"loss": 0.4852, "grad_norm": 0.9381322860717773, "learning_rate": 0.0002, "epoch": 4.326678765880218, "step": 5960}, {"loss": 0.4532, "grad_norm": 0.8154335618019104, "learning_rate": 0.0002, "epoch": 4.333938294010889, "step": 5970}, {"loss": 0.5237, "grad_norm": 0.877671480178833, "learning_rate": 0.0002, "epoch": 4.341197822141561, "step": 5980}, {"loss": 0.6113, "grad_norm": 1.1742031574249268, "learning_rate": 0.0002, "epoch": 4.348457350272232, "step": 5990}, {"loss": 0.5704, "grad_norm": 1.0352917909622192, "learning_rate": 0.0002, "epoch": 4.3557168784029034, "step": 6000}, {"loss": 0.4996, "grad_norm": 0.9963878989219666, "learning_rate": 0.0002, "epoch": 4.362976406533575, "step": 6010}, {"loss": 0.4813, "grad_norm": 1.1892237663269043, "learning_rate": 0.0002, "epoch": 4.3702359346642465, "step": 6020}, {"loss": 0.5224, "grad_norm": 1.2516111135482788, "learning_rate": 0.0002, "epoch": 4.3774954627949185, "step": 6030}, {"loss": 0.5952, "grad_norm": 1.2111951112747192, "learning_rate": 0.0002, "epoch": 4.38475499092559, "step": 6040}, {"loss": 0.4275, "grad_norm": 1.0820083618164062, "learning_rate": 0.0002, "epoch": 4.392014519056262, "step": 6050}, {"loss": 0.5117, "grad_norm": 1.033915638923645, "learning_rate": 0.0002, "epoch": 4.399274047186933, "step": 6060}, {"loss": 0.5431, "grad_norm": 1.0635870695114136, "learning_rate": 0.0002, "epoch": 4.406533575317605, "step": 6070}, {"loss": 0.5341, "grad_norm": 1.0520414113998413, "learning_rate": 0.0002, "epoch": 4.413793103448276, "step": 6080}, {"loss": 0.512, "grad_norm": 1.0821926593780518, "learning_rate": 0.0002, "epoch": 4.421052631578947, "step": 6090}, {"loss": 0.5065, "grad_norm": 1.0533246994018555, "learning_rate": 0.0002, "epoch": 4.428312159709619, "step": 6100}, {"loss": 0.4577, "grad_norm": 0.9231932759284973, "learning_rate": 0.0002, "epoch": 4.43557168784029, "step": 6110}, {"loss": 0.583, "grad_norm": 0.9910260438919067, "learning_rate": 0.0002, "epoch": 4.442831215970962, "step": 6120}, {"loss": 0.4717, "grad_norm": 1.061949372291565, "learning_rate": 0.0002, "epoch": 4.450090744101633, "step": 6130}, {"loss": 0.5893, "grad_norm": 1.2927039861679077, "learning_rate": 0.0002, "epoch": 4.457350272232305, "step": 6140}, {"loss": 0.4684, "grad_norm": 1.3966081142425537, "learning_rate": 0.0002, "epoch": 4.464609800362976, "step": 6150}, {"loss": 0.5507, "grad_norm": 1.3835992813110352, "learning_rate": 0.0002, "epoch": 4.471869328493648, "step": 6160}, {"loss": 0.5911, "grad_norm": 1.0892692804336548, "learning_rate": 0.0002, "epoch": 4.479128856624319, "step": 6170}, {"loss": 0.478, "grad_norm": 1.0318800210952759, "learning_rate": 0.0002, "epoch": 4.486388384754991, "step": 6180}, {"loss": 0.5198, "grad_norm": 0.8174677491188049, "learning_rate": 0.0002, "epoch": 4.493647912885662, "step": 6190}, {"loss": 0.5387, "grad_norm": 1.4157509803771973, "learning_rate": 0.0002, "epoch": 4.500907441016334, "step": 6200}, {"loss": 0.5868, "grad_norm": 1.5244114398956299, "learning_rate": 0.0002, "epoch": 4.508166969147005, "step": 6210}, {"loss": 0.4642, "grad_norm": 0.8164850473403931, "learning_rate": 0.0002, "epoch": 4.5154264972776765, "step": 6220}, {"loss": 0.522, "grad_norm": 1.2904746532440186, "learning_rate": 0.0002, "epoch": 4.5226860254083485, "step": 6230}, {"loss": 0.5103, "grad_norm": 0.7987732887268066, "learning_rate": 0.0002, "epoch": 4.52994555353902, "step": 6240}, {"loss": 0.4615, "grad_norm": 0.831040620803833, "learning_rate": 0.0002, "epoch": 4.537205081669692, "step": 6250}, {"loss": 0.5065, "grad_norm": 0.9545485973358154, "learning_rate": 0.0002, "epoch": 4.544464609800363, "step": 6260}, {"loss": 0.5515, "grad_norm": 0.9291793704032898, "learning_rate": 0.0002, "epoch": 4.551724137931035, "step": 6270}, {"loss": 0.4535, "grad_norm": 0.8977208733558655, "learning_rate": 0.0002, "epoch": 4.558983666061706, "step": 6280}, {"loss": 0.544, "grad_norm": 1.1768537759780884, "learning_rate": 0.0002, "epoch": 4.566243194192378, "step": 6290}, {"loss": 0.5925, "grad_norm": 1.0688952207565308, "learning_rate": 0.0002, "epoch": 4.573502722323049, "step": 6300}, {"loss": 0.5207, "grad_norm": 0.8800966739654541, "learning_rate": 0.0002, "epoch": 4.580762250453721, "step": 6310}, {"loss": 0.6106, "grad_norm": 1.0911834239959717, "learning_rate": 0.0002, "epoch": 4.588021778584392, "step": 6320}, {"loss": 0.5109, "grad_norm": 1.1420872211456299, "learning_rate": 0.0002, "epoch": 4.595281306715064, "step": 6330}, {"loss": 0.5147, "grad_norm": 1.0215224027633667, "learning_rate": 0.0002, "epoch": 4.602540834845735, "step": 6340}, {"loss": 0.592, "grad_norm": 0.9685489535331726, "learning_rate": 0.0002, "epoch": 4.609800362976406, "step": 6350}, {"loss": 0.5775, "grad_norm": 1.12773597240448, "learning_rate": 0.0002, "epoch": 4.617059891107078, "step": 6360}, {"loss": 0.5966, "grad_norm": 1.0663973093032837, "learning_rate": 0.0002, "epoch": 4.624319419237749, "step": 6370}, {"loss": 0.512, "grad_norm": 1.1707262992858887, "learning_rate": 0.0002, "epoch": 4.631578947368421, "step": 6380}, {"loss": 0.5497, "grad_norm": 1.0672980546951294, "learning_rate": 0.0002, "epoch": 4.638838475499092, "step": 6390}, {"loss": 0.5699, "grad_norm": 1.1464333534240723, "learning_rate": 0.0002, "epoch": 4.646098003629764, "step": 6400}, {"loss": 0.5514, "grad_norm": 1.070230484008789, "learning_rate": 0.0002, "epoch": 4.653357531760435, "step": 6410}, {"loss": 0.5013, "grad_norm": 0.9673764109611511, "learning_rate": 0.0002, "epoch": 4.660617059891107, "step": 6420}, {"loss": 0.5901, "grad_norm": 1.0189043283462524, "learning_rate": 0.0002, "epoch": 4.6678765880217785, "step": 6430}, {"loss": 0.5193, "grad_norm": 1.185896396636963, "learning_rate": 0.0002, "epoch": 4.67513611615245, "step": 6440}, {"loss": 0.5318, "grad_norm": 1.0682812929153442, "learning_rate": 0.0002, "epoch": 4.682395644283122, "step": 6450}, {"loss": 0.5773, "grad_norm": 1.3586071729660034, "learning_rate": 0.0002, "epoch": 4.689655172413794, "step": 6460}, {"loss": 0.5482, "grad_norm": 0.6561792492866516, "learning_rate": 0.0002, "epoch": 4.696914700544465, "step": 6470}, {"loss": 0.5711, "grad_norm": 1.1394113302230835, "learning_rate": 0.0002, "epoch": 4.704174228675136, "step": 6480}, {"loss": 0.5325, "grad_norm": 0.9683151245117188, "learning_rate": 0.0002, "epoch": 4.711433756805808, "step": 6490}, {"loss": 0.5139, "grad_norm": 1.0247553586959839, "learning_rate": 0.0002, "epoch": 4.718693284936479, "step": 6500}, {"loss": 0.5794, "grad_norm": 0.8046169281005859, "learning_rate": 0.0002, "epoch": 4.725952813067151, "step": 6510}, {"loss": 0.5471, "grad_norm": 1.0710240602493286, "learning_rate": 0.0002, "epoch": 4.733212341197822, "step": 6520}, {"loss": 0.4805, "grad_norm": 0.9438924193382263, "learning_rate": 0.0002, "epoch": 4.740471869328494, "step": 6530}, {"loss": 0.5404, "grad_norm": 0.869162380695343, "learning_rate": 0.0002, "epoch": 4.747731397459165, "step": 6540}, {"loss": 0.6379, "grad_norm": 0.9776787161827087, "learning_rate": 0.0002, "epoch": 4.754990925589837, "step": 6550}, {"loss": 0.5288, "grad_norm": 1.1990505456924438, "learning_rate": 0.0002, "epoch": 4.762250453720508, "step": 6560}, {"loss": 0.5539, "grad_norm": 1.0582209825515747, "learning_rate": 0.0002, "epoch": 4.769509981851179, "step": 6570}, {"loss": 0.489, "grad_norm": 0.9966367483139038, "learning_rate": 0.0002, "epoch": 4.776769509981851, "step": 6580}, {"loss": 0.5514, "grad_norm": 0.9130612015724182, "learning_rate": 0.0002, "epoch": 4.784029038112522, "step": 6590}, {"loss": 0.5864, "grad_norm": 1.0950500965118408, "learning_rate": 0.0002, "epoch": 4.791288566243194, "step": 6600}, {"loss": 0.5266, "grad_norm": 1.108681321144104, "learning_rate": 0.0002, "epoch": 4.798548094373865, "step": 6610}, {"loss": 0.5875, "grad_norm": 1.1873763799667358, "learning_rate": 0.0002, "epoch": 4.805807622504537, "step": 6620}, {"loss": 0.5736, "grad_norm": 1.305367112159729, "learning_rate": 0.0002, "epoch": 4.8130671506352085, "step": 6630}, {"loss": 0.5636, "grad_norm": 1.2801482677459717, "learning_rate": 0.0002, "epoch": 4.8203266787658805, "step": 6640}, {"loss": 0.582, "grad_norm": 1.26764976978302, "learning_rate": 0.0002, "epoch": 4.827586206896552, "step": 6650}, {"loss": 0.5259, "grad_norm": 1.0018208026885986, "learning_rate": 0.0002, "epoch": 4.834845735027224, "step": 6660}, {"loss": 0.548, "grad_norm": 1.2326326370239258, "learning_rate": 0.0002, "epoch": 4.842105263157895, "step": 6670}, {"loss": 0.5933, "grad_norm": 0.9707282781600952, "learning_rate": 0.0002, "epoch": 4.849364791288567, "step": 6680}, {"loss": 0.5612, "grad_norm": 1.2772048711776733, "learning_rate": 0.0002, "epoch": 4.856624319419238, "step": 6690}, {"loss": 0.5346, "grad_norm": 2.6652262210845947, "learning_rate": 0.0002, "epoch": 4.863883847549909, "step": 6700}, {"loss": 0.5428, "grad_norm": 1.215828537940979, "learning_rate": 0.0002, "epoch": 4.871143375680581, "step": 6710}, {"loss": 0.6571, "grad_norm": 1.3704510927200317, "learning_rate": 0.0002, "epoch": 4.878402903811252, "step": 6720}, {"loss": 0.4963, "grad_norm": 0.7781757116317749, "learning_rate": 0.0002, "epoch": 4.885662431941924, "step": 6730}, {"loss": 0.5989, "grad_norm": 1.1883646249771118, "learning_rate": 0.0002, "epoch": 4.892921960072595, "step": 6740}, {"loss": 0.6067, "grad_norm": 0.9216066002845764, "learning_rate": 0.0002, "epoch": 4.900181488203267, "step": 6750}, {"loss": 0.5085, "grad_norm": 1.0558464527130127, "learning_rate": 0.0002, "epoch": 4.907441016333938, "step": 6760}, {"loss": 0.5216, "grad_norm": 1.032656192779541, "learning_rate": 0.0002, "epoch": 4.91470054446461, "step": 6770}, {"loss": 0.5426, "grad_norm": 1.1261441707611084, "learning_rate": 0.0002, "epoch": 4.921960072595281, "step": 6780}, {"loss": 0.5295, "grad_norm": 1.2178640365600586, "learning_rate": 0.0002, "epoch": 4.929219600725952, "step": 6790}, {"loss": 0.5476, "grad_norm": 1.5369361639022827, "learning_rate": 0.0002, "epoch": 4.936479128856624, "step": 6800}, {"loss": 0.5358, "grad_norm": 1.1188377141952515, "learning_rate": 0.0002, "epoch": 4.943738656987296, "step": 6810}, {"loss": 0.5483, "grad_norm": 1.2506113052368164, "learning_rate": 0.0002, "epoch": 4.950998185117967, "step": 6820}, {"loss": 0.567, "grad_norm": 0.8776047825813293, "learning_rate": 0.0002, "epoch": 4.9582577132486385, "step": 6830}, {"loss": 0.5764, "grad_norm": 0.9700555205345154, "learning_rate": 0.0002, "epoch": 4.9655172413793105, "step": 6840}, {"loss": 0.5396, "grad_norm": 1.2713534832000732, "learning_rate": 0.0002, "epoch": 4.972776769509982, "step": 6850}, {"loss": 0.5451, "grad_norm": 0.9855955243110657, "learning_rate": 0.0002, "epoch": 4.980036297640654, "step": 6860}, {"loss": 0.5884, "grad_norm": 0.8734853863716125, "learning_rate": 0.0002, "epoch": 4.987295825771325, "step": 6870}, {"loss": 0.5189, "grad_norm": 0.8065403699874878, "learning_rate": 0.0002, "epoch": 4.994555353901997, "step": 6880}, {"eval_loss": 1.3302682638168335, "eval_runtime": 46.2496, "eval_samples_per_second": 9.427, "eval_steps_per_second": 1.189, "epoch": 4.999637023593467, "step": 6887}, {"loss": 0.4889, "grad_norm": 0.5163813829421997, "learning_rate": 0.0002, "epoch": 5.001814882032668, "step": 6890}, {"loss": 0.3545, "grad_norm": 1.1496137380599976, "learning_rate": 0.0002, "epoch": 5.00907441016334, "step": 6900}, {"loss": 0.39, "grad_norm": 1.0133885145187378, "learning_rate": 0.0002, "epoch": 5.016333938294011, "step": 6910}, {"loss": 0.3693, "grad_norm": 0.9479621052742004, "learning_rate": 0.0002, "epoch": 5.023593466424682, "step": 6920}, {"loss": 0.4012, "grad_norm": 0.8587583303451538, "learning_rate": 0.0002, "epoch": 5.030852994555354, "step": 6930}, {"loss": 0.3428, "grad_norm": 1.3314697742462158, "learning_rate": 0.0002, "epoch": 5.038112522686025, "step": 6940}, {"loss": 0.3909, "grad_norm": 1.195448875427246, "learning_rate": 0.0002, "epoch": 5.045372050816697, "step": 6950}, {"loss": 0.3322, "grad_norm": 1.2482256889343262, "learning_rate": 0.0002, "epoch": 5.052631578947368, "step": 6960}, {"loss": 0.3893, "grad_norm": 1.2011528015136719, "learning_rate": 0.0002, "epoch": 5.05989110707804, "step": 6970}, {"loss": 0.3265, "grad_norm": 1.3997188806533813, "learning_rate": 0.0002, "epoch": 5.067150635208711, "step": 6980}, {"loss": 0.3716, "grad_norm": 1.2147513628005981, "learning_rate": 0.0002, "epoch": 5.074410163339383, "step": 6990}, {"loss": 0.4053, "grad_norm": 1.6030137538909912, "learning_rate": 0.0002, "epoch": 5.081669691470054, "step": 7000}, {"loss": 0.3665, "grad_norm": 0.9466970562934875, "learning_rate": 0.0002, "epoch": 5.088929219600726, "step": 7010}, {"loss": 0.3451, "grad_norm": 1.4593411684036255, "learning_rate": 0.0002, "epoch": 5.096188747731397, "step": 7020}, {"loss": 0.3843, "grad_norm": 1.2196033000946045, "learning_rate": 0.0002, "epoch": 5.103448275862069, "step": 7030}, {"loss": 0.3896, "grad_norm": 1.1341328620910645, "learning_rate": 0.0002, "epoch": 5.1107078039927405, "step": 7040}, {"loss": 0.3627, "grad_norm": 1.2248976230621338, "learning_rate": 0.0002, "epoch": 5.117967332123412, "step": 7050}, {"loss": 0.3784, "grad_norm": 1.1620593070983887, "learning_rate": 0.0002, "epoch": 5.125226860254084, "step": 7060}, {"loss": 0.3678, "grad_norm": 0.9300723671913147, "learning_rate": 0.0002, "epoch": 5.132486388384755, "step": 7070}, {"loss": 0.3756, "grad_norm": 1.2265169620513916, "learning_rate": 0.0002, "epoch": 5.139745916515427, "step": 7080}, {"loss": 0.3595, "grad_norm": 1.4430373907089233, "learning_rate": 0.0002, "epoch": 5.147005444646098, "step": 7090}, {"loss": 0.3788, "grad_norm": 1.0821576118469238, "learning_rate": 0.0002, "epoch": 5.15426497277677, "step": 7100}, {"loss": 0.383, "grad_norm": 1.2574739456176758, "learning_rate": 0.0002, "epoch": 5.161524500907441, "step": 7110}, {"loss": 0.3692, "grad_norm": 1.1806069612503052, "learning_rate": 0.0002, "epoch": 5.168784029038113, "step": 7120}, {"loss": 0.3978, "grad_norm": 0.9900956153869629, "learning_rate": 0.0002, "epoch": 5.176043557168784, "step": 7130}, {"loss": 0.4358, "grad_norm": 1.2414425611495972, "learning_rate": 0.0002, "epoch": 5.183303085299456, "step": 7140}, {"loss": 0.3485, "grad_norm": 0.8220699429512024, "learning_rate": 0.0002, "epoch": 5.190562613430127, "step": 7150}, {"loss": 0.3517, "grad_norm": 1.29408860206604, "learning_rate": 0.0002, "epoch": 5.197822141560798, "step": 7160}, {"loss": 0.3405, "grad_norm": 0.8510639071464539, "learning_rate": 0.0002, "epoch": 5.20508166969147, "step": 7170}, {"loss": 0.4233, "grad_norm": 1.3448902368545532, "learning_rate": 0.0002, "epoch": 5.212341197822141, "step": 7180}, {"loss": 0.3808, "grad_norm": 1.054451584815979, "learning_rate": 0.0002, "epoch": 5.219600725952813, "step": 7190}, {"loss": 0.368, "grad_norm": 1.3752713203430176, "learning_rate": 0.0002, "epoch": 5.226860254083484, "step": 7200}, {"loss": 0.3844, "grad_norm": 1.4848095178604126, "learning_rate": 0.0002, "epoch": 5.234119782214156, "step": 7210}, {"loss": 0.4187, "grad_norm": 1.428842544555664, "learning_rate": 0.0002, "epoch": 5.241379310344827, "step": 7220}, {"loss": 0.3778, "grad_norm": 1.1703591346740723, "learning_rate": 0.0002, "epoch": 5.248638838475499, "step": 7230}, {"loss": 0.417, "grad_norm": 1.2180451154708862, "learning_rate": 0.0002, "epoch": 5.2558983666061705, "step": 7240}, {"loss": 0.3656, "grad_norm": 1.094045877456665, "learning_rate": 0.0002, "epoch": 5.2631578947368425, "step": 7250}, {"loss": 0.4331, "grad_norm": 0.9545766115188599, "learning_rate": 0.0002, "epoch": 5.270417422867514, "step": 7260}, {"loss": 0.3642, "grad_norm": 0.8356652855873108, "learning_rate": 0.0002, "epoch": 5.277676950998185, "step": 7270}, {"loss": 0.3576, "grad_norm": 1.148160457611084, "learning_rate": 0.0002, "epoch": 5.284936479128857, "step": 7280}, {"loss": 0.4178, "grad_norm": 1.2009977102279663, "learning_rate": 0.0002, "epoch": 5.292196007259528, "step": 7290}, {"loss": 0.3977, "grad_norm": 1.3283873796463013, "learning_rate": 0.0002, "epoch": 5.2994555353902, "step": 7300}, {"loss": 0.3853, "grad_norm": 0.9850481748580933, "learning_rate": 0.0002, "epoch": 5.306715063520871, "step": 7310}, {"loss": 0.3645, "grad_norm": 1.367550015449524, "learning_rate": 0.0002, "epoch": 5.313974591651543, "step": 7320}, {"loss": 0.3898, "grad_norm": 0.8602936863899231, "learning_rate": 0.0002, "epoch": 5.321234119782214, "step": 7330}, {"loss": 0.4173, "grad_norm": 1.1130679845809937, "learning_rate": 0.0002, "epoch": 5.328493647912886, "step": 7340}, {"loss": 0.3642, "grad_norm": 1.3002253770828247, "learning_rate": 0.0002, "epoch": 5.335753176043557, "step": 7350}, {"loss": 0.4138, "grad_norm": 1.6235289573669434, "learning_rate": 0.0002, "epoch": 5.343012704174229, "step": 7360}, {"loss": 0.4779, "grad_norm": 1.156379222869873, "learning_rate": 0.0002, "epoch": 5.3502722323049, "step": 7370}, {"loss": 0.3222, "grad_norm": 1.0569308996200562, "learning_rate": 0.0002, "epoch": 5.357531760435572, "step": 7380}, {"loss": 0.3573, "grad_norm": 1.6674021482467651, "learning_rate": 0.0002, "epoch": 5.364791288566243, "step": 7390}, {"loss": 0.4325, "grad_norm": 1.2962018251419067, "learning_rate": 0.0002, "epoch": 5.372050816696914, "step": 7400}, {"loss": 0.3809, "grad_norm": 1.1904195547103882, "learning_rate": 0.0002, "epoch": 5.379310344827586, "step": 7410}, {"loss": 0.3728, "grad_norm": 1.316245675086975, "learning_rate": 0.0002, "epoch": 5.386569872958257, "step": 7420}, {"loss": 0.4096, "grad_norm": 1.127570390701294, "learning_rate": 0.0002, "epoch": 5.393829401088929, "step": 7430}, {"loss": 0.3933, "grad_norm": 1.3895777463912964, "learning_rate": 0.0002, "epoch": 5.4010889292196005, "step": 7440}, {"loss": 0.4085, "grad_norm": 1.626830816268921, "learning_rate": 0.0002, "epoch": 5.4083484573502725, "step": 7450}, {"loss": 0.4186, "grad_norm": 1.3703926801681519, "learning_rate": 0.0002, "epoch": 5.415607985480944, "step": 7460}, {"loss": 0.3517, "grad_norm": 1.3854840993881226, "learning_rate": 0.0002, "epoch": 5.422867513611616, "step": 7470}, {"loss": 0.3714, "grad_norm": 1.107065200805664, "learning_rate": 0.0002, "epoch": 5.430127041742287, "step": 7480}, {"loss": 0.3855, "grad_norm": 0.7843456268310547, "learning_rate": 0.0002, "epoch": 5.437386569872959, "step": 7490}, {"loss": 0.4159, "grad_norm": 1.6692372560501099, "learning_rate": 0.0002, "epoch": 5.44464609800363, "step": 7500}, {"loss": 0.4185, "grad_norm": 1.2583858966827393, "learning_rate": 0.0002, "epoch": 5.451905626134302, "step": 7510}, {"loss": 0.4401, "grad_norm": 1.6827000379562378, "learning_rate": 0.0002, "epoch": 5.459165154264973, "step": 7520}, {"loss": 0.397, "grad_norm": 1.6680560111999512, "learning_rate": 0.0002, "epoch": 5.466424682395644, "step": 7530}, {"loss": 0.4193, "grad_norm": 1.3696072101593018, "learning_rate": 0.0002, "epoch": 5.473684210526316, "step": 7540}, {"loss": 0.4244, "grad_norm": 1.4523496627807617, "learning_rate": 0.0002, "epoch": 5.480943738656987, "step": 7550}, {"loss": 0.3609, "grad_norm": 1.3432692289352417, "learning_rate": 0.0002, "epoch": 5.488203266787659, "step": 7560}, {"loss": 0.3675, "grad_norm": 1.363818645477295, "learning_rate": 0.0002, "epoch": 5.49546279491833, "step": 7570}, {"loss": 0.3726, "grad_norm": 1.0176721811294556, "learning_rate": 0.0002, "epoch": 5.502722323049002, "step": 7580}, {"loss": 0.3751, "grad_norm": 1.1625547409057617, "learning_rate": 0.0002, "epoch": 5.509981851179673, "step": 7590}, {"loss": 0.433, "grad_norm": 1.2480388879776, "learning_rate": 0.0002, "epoch": 5.517241379310345, "step": 7600}, {"loss": 0.4511, "grad_norm": 1.341509222984314, "learning_rate": 0.0002, "epoch": 5.524500907441016, "step": 7610}, {"loss": 0.4642, "grad_norm": 1.7048436403274536, "learning_rate": 0.0002, "epoch": 5.531760435571687, "step": 7620}, {"loss": 0.4509, "grad_norm": 1.1435480117797852, "learning_rate": 0.0002, "epoch": 5.539019963702359, "step": 7630}, {"loss": 0.4528, "grad_norm": 1.2381842136383057, "learning_rate": 0.0002, "epoch": 5.5462794918330305, "step": 7640}, {"loss": 0.4496, "grad_norm": 1.50786292552948, "learning_rate": 0.0002, "epoch": 5.5535390199637025, "step": 7650}, {"loss": 0.4242, "grad_norm": 1.2263519763946533, "learning_rate": 0.0002, "epoch": 5.560798548094374, "step": 7660}, {"loss": 0.418, "grad_norm": 1.2864696979522705, "learning_rate": 0.0002, "epoch": 5.568058076225046, "step": 7670}, {"loss": 0.3832, "grad_norm": 1.4443191289901733, "learning_rate": 0.0002, "epoch": 5.575317604355717, "step": 7680}, {"loss": 0.3964, "grad_norm": 1.3360971212387085, "learning_rate": 0.0002, "epoch": 5.582577132486389, "step": 7690}, {"loss": 0.4639, "grad_norm": 1.391828179359436, "learning_rate": 0.0002, "epoch": 5.58983666061706, "step": 7700}, {"loss": 0.4722, "grad_norm": 1.3699384927749634, "learning_rate": 0.0002, "epoch": 5.597096188747732, "step": 7710}, {"loss": 0.4302, "grad_norm": 1.3778468370437622, "learning_rate": 0.0002, "epoch": 5.604355716878403, "step": 7720}, {"loss": 0.4179, "grad_norm": 1.1009501218795776, "learning_rate": 0.0002, "epoch": 5.611615245009075, "step": 7730}, {"loss": 0.4104, "grad_norm": 1.0410021543502808, "learning_rate": 0.0002, "epoch": 5.618874773139746, "step": 7740}, {"loss": 0.4489, "grad_norm": 1.1012226343154907, "learning_rate": 0.0002, "epoch": 5.626134301270417, "step": 7750}, {"loss": 0.4544, "grad_norm": 1.3246384859085083, "learning_rate": 0.0002, "epoch": 5.633393829401089, "step": 7760}, {"loss": 0.4381, "grad_norm": 1.4301716089248657, "learning_rate": 0.0002, "epoch": 5.64065335753176, "step": 7770}, {"loss": 0.4297, "grad_norm": 1.1368978023529053, "learning_rate": 0.0002, "epoch": 5.647912885662432, "step": 7780}, {"loss": 0.4063, "grad_norm": 1.3493064641952515, "learning_rate": 0.0002, "epoch": 5.655172413793103, "step": 7790}, {"loss": 0.4562, "grad_norm": 1.3328721523284912, "learning_rate": 0.0002, "epoch": 5.662431941923775, "step": 7800}, {"loss": 0.4075, "grad_norm": 1.3235671520233154, "learning_rate": 0.0002, "epoch": 5.669691470054446, "step": 7810}, {"loss": 0.4589, "grad_norm": 1.1961841583251953, "learning_rate": 0.0002, "epoch": 5.676950998185118, "step": 7820}, {"loss": 0.4503, "grad_norm": 1.4189636707305908, "learning_rate": 0.0002, "epoch": 5.684210526315789, "step": 7830}, {"loss": 0.4452, "grad_norm": 1.3551312685012817, "learning_rate": 0.0002, "epoch": 5.691470054446461, "step": 7840}, {"loss": 0.4268, "grad_norm": 1.449987769126892, "learning_rate": 0.0002, "epoch": 5.6987295825771325, "step": 7850}, {"loss": 0.4141, "grad_norm": 1.1225156784057617, "learning_rate": 0.0002, "epoch": 5.7059891107078045, "step": 7860}, {"loss": 0.41, "grad_norm": 1.4734594821929932, "learning_rate": 0.0002, "epoch": 5.713248638838476, "step": 7870}, {"loss": 0.4013, "grad_norm": 1.3793359994888306, "learning_rate": 0.0002, "epoch": 5.720508166969147, "step": 7880}, {"loss": 0.4065, "grad_norm": 1.2431834936141968, "learning_rate": 0.0002, "epoch": 5.727767695099819, "step": 7890}, {"loss": 0.4595, "grad_norm": 1.1158313751220703, "learning_rate": 0.0002, "epoch": 5.73502722323049, "step": 7900}, {"loss": 0.4342, "grad_norm": 1.212248682975769, "learning_rate": 0.0002, "epoch": 5.742286751361162, "step": 7910}, {"loss": 0.4611, "grad_norm": 1.5259995460510254, "learning_rate": 0.0002, "epoch": 5.749546279491833, "step": 7920}, {"loss": 0.4483, "grad_norm": 1.3909121751785278, "learning_rate": 0.0002, "epoch": 5.756805807622505, "step": 7930}, {"loss": 0.4325, "grad_norm": 1.2511249780654907, "learning_rate": 0.0002, "epoch": 5.764065335753176, "step": 7940}, {"loss": 0.4048, "grad_norm": 1.2511906623840332, "learning_rate": 0.0002, "epoch": 5.771324863883848, "step": 7950}, {"loss": 0.3715, "grad_norm": 1.1489921808242798, "learning_rate": 0.0002, "epoch": 5.778584392014519, "step": 7960}, {"loss": 0.4196, "grad_norm": 1.028943419456482, "learning_rate": 0.0002, "epoch": 5.78584392014519, "step": 7970}, {"loss": 0.4334, "grad_norm": 1.0820423364639282, "learning_rate": 0.0002, "epoch": 5.793103448275862, "step": 7980}, {"loss": 0.3917, "grad_norm": 1.296520471572876, "learning_rate": 0.0002, "epoch": 5.800362976406533, "step": 7990}, {"loss": 0.4509, "grad_norm": 1.3597749471664429, "learning_rate": 0.0002, "epoch": 5.807622504537205, "step": 8000}, {"loss": 0.4535, "grad_norm": 0.8741790652275085, "learning_rate": 0.0002, "epoch": 5.814882032667876, "step": 8010}, {"loss": 0.4239, "grad_norm": 1.1471822261810303, "learning_rate": 0.0002, "epoch": 5.822141560798548, "step": 8020}, {"loss": 0.5042, "grad_norm": 1.2997334003448486, "learning_rate": 0.0002, "epoch": 5.829401088929219, "step": 8030}, {"loss": 0.4758, "grad_norm": 1.1027175188064575, "learning_rate": 0.0002, "epoch": 5.836660617059891, "step": 8040}, {"loss": 0.4192, "grad_norm": 1.2695307731628418, "learning_rate": 0.0002, "epoch": 5.8439201451905625, "step": 8050}, {"loss": 0.5173, "grad_norm": 1.5275461673736572, "learning_rate": 0.0002, "epoch": 5.8511796733212345, "step": 8060}, {"loss": 0.5012, "grad_norm": 1.3059501647949219, "learning_rate": 0.0002, "epoch": 5.8584392014519056, "step": 8070}, {"loss": 0.4425, "grad_norm": 1.57442045211792, "learning_rate": 0.0002, "epoch": 5.8656987295825775, "step": 8080}, {"loss": 0.4261, "grad_norm": 1.119564414024353, "learning_rate": 0.0002, "epoch": 5.872958257713249, "step": 8090}, {"loss": 0.465, "grad_norm": 1.6517373323440552, "learning_rate": 0.0002, "epoch": 5.88021778584392, "step": 8100}, {"loss": 0.4406, "grad_norm": 1.4093554019927979, "learning_rate": 0.0002, "epoch": 5.887477313974592, "step": 8110}, {"loss": 0.4433, "grad_norm": 1.278843641281128, "learning_rate": 0.0002, "epoch": 5.894736842105263, "step": 8120}, {"loss": 0.4007, "grad_norm": 1.2042944431304932, "learning_rate": 0.0002, "epoch": 5.901996370235935, "step": 8130}, {"loss": 0.3972, "grad_norm": 1.1788326501846313, "learning_rate": 0.0002, "epoch": 5.909255898366606, "step": 8140}, {"loss": 0.4506, "grad_norm": 1.4364569187164307, "learning_rate": 0.0002, "epoch": 5.916515426497278, "step": 8150}, {"loss": 0.4651, "grad_norm": 1.1704283952713013, "learning_rate": 0.0002, "epoch": 5.923774954627949, "step": 8160}, {"loss": 0.3972, "grad_norm": 1.040814995765686, "learning_rate": 0.0002, "epoch": 5.931034482758621, "step": 8170}, {"loss": 0.4038, "grad_norm": 1.1367416381835938, "learning_rate": 0.0002, "epoch": 5.938294010889292, "step": 8180}, {"loss": 0.4387, "grad_norm": 1.3401511907577515, "learning_rate": 0.0002, "epoch": 5.945553539019964, "step": 8190}, {"loss": 0.4396, "grad_norm": 1.1154041290283203, "learning_rate": 0.0002, "epoch": 5.952813067150635, "step": 8200}, {"loss": 0.4744, "grad_norm": 1.426089882850647, "learning_rate": 0.0002, "epoch": 5.960072595281307, "step": 8210}, {"loss": 0.4105, "grad_norm": 1.3170222043991089, "learning_rate": 0.0002, "epoch": 5.967332123411978, "step": 8220}, {"loss": 0.4137, "grad_norm": 1.1960029602050781, "learning_rate": 0.0002, "epoch": 5.974591651542649, "step": 8230}, {"loss": 0.423, "grad_norm": 1.0843931436538696, "learning_rate": 0.0002, "epoch": 5.981851179673321, "step": 8240}, {"loss": 0.459, "grad_norm": 1.050421118736267, "learning_rate": 0.0002, "epoch": 5.9891107078039925, "step": 8250}, {"loss": 0.3993, "grad_norm": 1.0183138847351074, "learning_rate": 0.0002, "epoch": 5.9963702359346644, "step": 8260}]} +{"epoch": 6.999637023593467, "step": 9642, "epoch_duration": 2054.809324026108, "total_accumulated_duration": 20458.157337903976, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 24486.0}, "peak_memory_reserved": {"GPU_0": 24752.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.4936, "grad_norm": 0.46010470390319824, "learning_rate": 0.0002, "epoch": 0.007259528130671506, "step": 10}, {"loss": 1.8442, "grad_norm": 0.5103179216384888, "learning_rate": 0.0002, "epoch": 0.014519056261343012, "step": 20}, {"loss": 1.559, "grad_norm": 0.6282716989517212, "learning_rate": 0.0002, "epoch": 0.021778584392014518, "step": 30}, {"loss": 1.3618, "grad_norm": 1.2480497360229492, "learning_rate": 0.0002, "epoch": 0.029038112522686024, "step": 40}, {"loss": 1.3874, "grad_norm": 0.4114855229854584, "learning_rate": 0.0002, "epoch": 0.036297640653357534, "step": 50}, {"loss": 1.2836, "grad_norm": 0.49482840299606323, "learning_rate": 0.0002, "epoch": 0.043557168784029036, "step": 60}, {"loss": 1.1728, "grad_norm": 0.4536272883415222, "learning_rate": 0.0002, "epoch": 0.050816696914700546, "step": 70}, {"loss": 1.1125, "grad_norm": 0.32328274846076965, "learning_rate": 0.0002, "epoch": 0.05807622504537205, "step": 80}, {"loss": 1.2447, "grad_norm": 0.40990468859672546, "learning_rate": 0.0002, "epoch": 0.06533575317604355, "step": 90}, {"loss": 1.2216, "grad_norm": 0.37273502349853516, "learning_rate": 0.0002, "epoch": 0.07259528130671507, "step": 100}, {"loss": 1.2259, "grad_norm": 0.3903871476650238, "learning_rate": 0.0002, "epoch": 0.07985480943738657, "step": 110}, {"loss": 1.3718, "grad_norm": 0.3272787928581238, "learning_rate": 0.0002, "epoch": 0.08711433756805807, "step": 120}, {"loss": 1.1832, "grad_norm": 0.3622824251651764, "learning_rate": 0.0002, "epoch": 0.09437386569872959, "step": 130}, {"loss": 1.2966, "grad_norm": 0.3503916561603546, "learning_rate": 0.0002, "epoch": 0.10163339382940109, "step": 140}, {"loss": 1.0913, "grad_norm": 0.32787832617759705, "learning_rate": 0.0002, "epoch": 0.1088929219600726, "step": 150}, {"loss": 1.2143, "grad_norm": 0.5822657942771912, "learning_rate": 0.0002, "epoch": 0.1161524500907441, "step": 160}, {"loss": 1.2091, "grad_norm": 0.28028249740600586, "learning_rate": 0.0002, "epoch": 0.12341197822141561, "step": 170}, {"loss": 1.1731, "grad_norm": 0.33602750301361084, "learning_rate": 0.0002, "epoch": 0.1306715063520871, "step": 180}, {"loss": 1.127, "grad_norm": 0.29106274247169495, "learning_rate": 0.0002, "epoch": 0.13793103448275862, "step": 190}, {"loss": 1.3261, "grad_norm": 0.38753363490104675, "learning_rate": 0.0002, "epoch": 0.14519056261343014, "step": 200}, {"loss": 1.1536, "grad_norm": 0.361009418964386, "learning_rate": 0.0002, "epoch": 0.15245009074410162, "step": 210}, {"loss": 1.1483, "grad_norm": 0.6743836402893066, "learning_rate": 0.0002, "epoch": 0.15970961887477314, "step": 220}, {"loss": 1.264, "grad_norm": 0.3923613727092743, "learning_rate": 0.0002, "epoch": 0.16696914700544466, "step": 230}, {"loss": 1.0437, "grad_norm": 0.2809699773788452, "learning_rate": 0.0002, "epoch": 0.17422867513611615, "step": 240}, {"loss": 1.1826, "grad_norm": 0.3631494641304016, "learning_rate": 0.0002, "epoch": 0.18148820326678766, "step": 250}, {"loss": 1.1584, "grad_norm": 0.24658171832561493, "learning_rate": 0.0002, "epoch": 0.18874773139745918, "step": 260}, {"loss": 1.0065, "grad_norm": 0.5780664682388306, "learning_rate": 0.0002, "epoch": 0.19600725952813067, "step": 270}, {"loss": 1.1251, "grad_norm": 0.3056720495223999, "learning_rate": 0.0002, "epoch": 0.20326678765880218, "step": 280}, {"loss": 1.1503, "grad_norm": 0.2857084572315216, "learning_rate": 0.0002, "epoch": 0.21052631578947367, "step": 290}, {"loss": 1.1215, "grad_norm": 0.6645345687866211, "learning_rate": 0.0002, "epoch": 0.2177858439201452, "step": 300}, {"loss": 1.127, "grad_norm": 0.5966078639030457, "learning_rate": 0.0002, "epoch": 0.2250453720508167, "step": 310}, {"loss": 1.1974, "grad_norm": 0.40937140583992004, "learning_rate": 0.0002, "epoch": 0.2323049001814882, "step": 320}, {"loss": 1.2385, "grad_norm": 0.5642806887626648, "learning_rate": 0.0002, "epoch": 0.2395644283121597, "step": 330}, {"loss": 1.1703, "grad_norm": 0.2750748097896576, "learning_rate": 0.0002, "epoch": 0.24682395644283123, "step": 340}, {"loss": 1.2598, "grad_norm": 0.34350234270095825, "learning_rate": 0.0002, "epoch": 0.2540834845735027, "step": 350}, {"loss": 1.1942, "grad_norm": 0.6767239570617676, "learning_rate": 0.0002, "epoch": 0.2613430127041742, "step": 360}, {"loss": 1.1436, "grad_norm": 0.31006959080696106, "learning_rate": 0.0002, "epoch": 0.26860254083484575, "step": 370}, {"loss": 1.2084, "grad_norm": 0.3825474679470062, "learning_rate": 0.0002, "epoch": 0.27586206896551724, "step": 380}, {"loss": 1.1523, "grad_norm": 0.30241551995277405, "learning_rate": 0.0002, "epoch": 0.2831215970961887, "step": 390}, {"loss": 1.1298, "grad_norm": 0.2962397336959839, "learning_rate": 0.0002, "epoch": 0.29038112522686027, "step": 400}, {"loss": 1.1299, "grad_norm": 0.2600369155406952, "learning_rate": 0.0002, "epoch": 0.29764065335753176, "step": 410}, {"loss": 1.1366, "grad_norm": 0.3675060272216797, "learning_rate": 0.0002, "epoch": 0.30490018148820325, "step": 420}, {"loss": 1.156, "grad_norm": 0.3429498076438904, "learning_rate": 0.0002, "epoch": 0.3121597096188748, "step": 430}, {"loss": 1.2741, "grad_norm": 0.34311825037002563, "learning_rate": 0.0002, "epoch": 0.3194192377495463, "step": 440}, {"loss": 1.3523, "grad_norm": 0.37872210144996643, "learning_rate": 0.0002, "epoch": 0.32667876588021777, "step": 450}, {"loss": 1.1365, "grad_norm": 0.33271121978759766, "learning_rate": 0.0002, "epoch": 0.3339382940108893, "step": 460}, {"loss": 1.1572, "grad_norm": 0.34605276584625244, "learning_rate": 0.0002, "epoch": 0.3411978221415608, "step": 470}, {"loss": 1.2755, "grad_norm": 0.41050562262535095, "learning_rate": 0.0002, "epoch": 0.3484573502722323, "step": 480}, {"loss": 1.075, "grad_norm": 0.2066836953163147, "learning_rate": 0.0002, "epoch": 0.35571687840290384, "step": 490}, {"loss": 1.2072, "grad_norm": 0.2859014868736267, "learning_rate": 0.0002, "epoch": 0.3629764065335753, "step": 500}, {"loss": 1.1435, "grad_norm": 0.28763777017593384, "learning_rate": 0.0002, "epoch": 0.3702359346642468, "step": 510}, {"loss": 1.1341, "grad_norm": 0.2730471193790436, "learning_rate": 0.0002, "epoch": 0.37749546279491836, "step": 520}, {"loss": 1.348, "grad_norm": 0.3968936800956726, "learning_rate": 0.0002, "epoch": 0.38475499092558985, "step": 530}, {"loss": 1.2363, "grad_norm": 0.3624701201915741, "learning_rate": 0.0002, "epoch": 0.39201451905626133, "step": 540}, {"loss": 1.1206, "grad_norm": 0.3303608298301697, "learning_rate": 0.0002, "epoch": 0.3992740471869328, "step": 550}, {"loss": 1.203, "grad_norm": 0.33507466316223145, "learning_rate": 0.0002, "epoch": 0.40653357531760437, "step": 560}, {"loss": 1.2294, "grad_norm": 0.3297670781612396, "learning_rate": 0.0002, "epoch": 0.41379310344827586, "step": 570}, {"loss": 1.2, "grad_norm": 0.32334890961647034, "learning_rate": 0.0002, "epoch": 0.42105263157894735, "step": 580}, {"loss": 1.112, "grad_norm": 0.30281195044517517, "learning_rate": 0.0002, "epoch": 0.4283121597096189, "step": 590}, {"loss": 1.1352, "grad_norm": 0.5900027751922607, "learning_rate": 0.0002, "epoch": 0.4355716878402904, "step": 600}, {"loss": 1.1575, "grad_norm": 0.28437477350234985, "learning_rate": 0.0002, "epoch": 0.44283121597096187, "step": 610}, {"loss": 1.2009, "grad_norm": 0.39601704478263855, "learning_rate": 0.0002, "epoch": 0.4500907441016334, "step": 620}, {"loss": 1.2302, "grad_norm": 0.41971510648727417, "learning_rate": 0.0002, "epoch": 0.4573502722323049, "step": 630}, {"loss": 1.236, "grad_norm": 0.33814409375190735, "learning_rate": 0.0002, "epoch": 0.4646098003629764, "step": 640}, {"loss": 1.2471, "grad_norm": 0.575718104839325, "learning_rate": 0.0002, "epoch": 0.47186932849364793, "step": 650}, {"loss": 1.1012, "grad_norm": 0.37927401065826416, "learning_rate": 0.0002, "epoch": 0.4791288566243194, "step": 660}, {"loss": 1.1552, "grad_norm": 0.3224332630634308, "learning_rate": 0.0002, "epoch": 0.4863883847549909, "step": 670}, {"loss": 1.1354, "grad_norm": 0.32683515548706055, "learning_rate": 0.0002, "epoch": 0.49364791288566245, "step": 680}, {"loss": 1.207, "grad_norm": 0.4316163659095764, "learning_rate": 0.0002, "epoch": 0.5009074410163339, "step": 690}, {"loss": 1.2116, "grad_norm": 0.342602401971817, "learning_rate": 0.0002, "epoch": 0.5081669691470054, "step": 700}, {"loss": 1.0433, "grad_norm": 0.2794898748397827, "learning_rate": 0.0002, "epoch": 0.515426497277677, "step": 710}, {"loss": 1.2395, "grad_norm": 0.3322339951992035, "learning_rate": 0.0002, "epoch": 0.5226860254083484, "step": 720}, {"loss": 1.0762, "grad_norm": 0.3088509142398834, "learning_rate": 0.0002, "epoch": 0.52994555353902, "step": 730}, {"loss": 1.1495, "grad_norm": 0.24444378912448883, "learning_rate": 0.0002, "epoch": 0.5372050816696915, "step": 740}, {"loss": 1.2176, "grad_norm": 0.3483171761035919, "learning_rate": 0.0002, "epoch": 0.5444646098003629, "step": 750}, {"loss": 1.2248, "grad_norm": 0.3425690531730652, "learning_rate": 0.0002, "epoch": 0.5517241379310345, "step": 760}, {"loss": 1.1127, "grad_norm": 0.31841927766799927, "learning_rate": 0.0002, "epoch": 0.558983666061706, "step": 770}, {"loss": 1.1841, "grad_norm": 0.39423868060112, "learning_rate": 0.0002, "epoch": 0.5662431941923775, "step": 780}, {"loss": 1.0583, "grad_norm": 0.30328479409217834, "learning_rate": 0.0002, "epoch": 0.573502722323049, "step": 790}, {"loss": 1.0718, "grad_norm": 0.24475938081741333, "learning_rate": 0.0002, "epoch": 0.5807622504537205, "step": 800}, {"loss": 1.2006, "grad_norm": 0.37132805585861206, "learning_rate": 0.0002, "epoch": 0.588021778584392, "step": 810}, {"loss": 1.1544, "grad_norm": 0.32195979356765747, "learning_rate": 0.0002, "epoch": 0.5952813067150635, "step": 820}, {"loss": 0.9937, "grad_norm": 0.2848738729953766, "learning_rate": 0.0002, "epoch": 0.6025408348457351, "step": 830}, {"loss": 1.0937, "grad_norm": 0.28015264868736267, "learning_rate": 0.0002, "epoch": 0.6098003629764065, "step": 840}, {"loss": 1.1234, "grad_norm": 0.37796008586883545, "learning_rate": 0.0002, "epoch": 0.617059891107078, "step": 850}, {"loss": 1.1455, "grad_norm": 0.39311841130256653, "learning_rate": 0.0002, "epoch": 0.6243194192377496, "step": 860}, {"loss": 1.1066, "grad_norm": 0.2761685252189636, "learning_rate": 0.0002, "epoch": 0.631578947368421, "step": 870}, {"loss": 1.0859, "grad_norm": 0.3826720118522644, "learning_rate": 0.0002, "epoch": 0.6388384754990926, "step": 880}, {"loss": 1.1182, "grad_norm": 0.30076679587364197, "learning_rate": 0.0002, "epoch": 0.6460980036297641, "step": 890}, {"loss": 1.0927, "grad_norm": 0.21997687220573425, "learning_rate": 0.0002, "epoch": 0.6533575317604355, "step": 900}, {"loss": 1.1603, "grad_norm": 0.32593777775764465, "learning_rate": 0.0002, "epoch": 0.6606170598911071, "step": 910}, {"loss": 1.1578, "grad_norm": 0.30347898602485657, "learning_rate": 0.0002, "epoch": 0.6678765880217786, "step": 920}, {"loss": 1.1642, "grad_norm": 0.44173774123191833, "learning_rate": 0.0002, "epoch": 0.6751361161524501, "step": 930}, {"loss": 1.1832, "grad_norm": 0.2507467269897461, "learning_rate": 0.0002, "epoch": 0.6823956442831216, "step": 940}, {"loss": 1.3001, "grad_norm": 0.29463833570480347, "learning_rate": 0.0002, "epoch": 0.6896551724137931, "step": 950}, {"loss": 1.1651, "grad_norm": 0.9363154172897339, "learning_rate": 0.0002, "epoch": 0.6969147005444646, "step": 960}, {"loss": 1.158, "grad_norm": 0.3236212134361267, "learning_rate": 0.0002, "epoch": 0.7041742286751361, "step": 970}, {"loss": 1.1078, "grad_norm": 0.3123254179954529, "learning_rate": 0.0002, "epoch": 0.7114337568058077, "step": 980}, {"loss": 1.1578, "grad_norm": 0.3395805060863495, "learning_rate": 0.0002, "epoch": 0.7186932849364791, "step": 990}, {"loss": 1.0754, "grad_norm": 0.3240964412689209, "learning_rate": 0.0002, "epoch": 0.7259528130671506, "step": 1000}, {"loss": 1.0313, "grad_norm": 0.31902948021888733, "learning_rate": 0.0002, "epoch": 0.7332123411978222, "step": 1010}, {"loss": 1.1664, "grad_norm": 0.4848408102989197, "learning_rate": 0.0002, "epoch": 0.7404718693284936, "step": 1020}, {"loss": 1.0838, "grad_norm": 0.33006033301353455, "learning_rate": 0.0002, "epoch": 0.7477313974591652, "step": 1030}, {"loss": 1.0979, "grad_norm": 0.2928730547428131, "learning_rate": 0.0002, "epoch": 0.7549909255898367, "step": 1040}, {"loss": 1.1351, "grad_norm": 0.3529164791107178, "learning_rate": 0.0002, "epoch": 0.7622504537205081, "step": 1050}, {"loss": 1.1274, "grad_norm": 0.2736213803291321, "learning_rate": 0.0002, "epoch": 0.7695099818511797, "step": 1060}, {"loss": 1.1169, "grad_norm": 0.7200686931610107, "learning_rate": 0.0002, "epoch": 0.7767695099818511, "step": 1070}, {"loss": 1.1171, "grad_norm": 0.33396708965301514, "learning_rate": 0.0002, "epoch": 0.7840290381125227, "step": 1080}, {"loss": 1.2447, "grad_norm": 1.5760449171066284, "learning_rate": 0.0002, "epoch": 0.7912885662431942, "step": 1090}, {"loss": 1.1126, "grad_norm": 0.28138381242752075, "learning_rate": 0.0002, "epoch": 0.7985480943738656, "step": 1100}, {"loss": 1.2012, "grad_norm": 0.2597472369670868, "learning_rate": 0.0002, "epoch": 0.8058076225045372, "step": 1110}, {"loss": 1.1177, "grad_norm": 0.3305445611476898, "learning_rate": 0.0002, "epoch": 0.8130671506352087, "step": 1120}, {"loss": 1.0849, "grad_norm": 0.3934599459171295, "learning_rate": 0.0002, "epoch": 0.8203266787658802, "step": 1130}, {"loss": 1.0889, "grad_norm": 0.3472191393375397, "learning_rate": 0.0002, "epoch": 0.8275862068965517, "step": 1140}, {"loss": 1.265, "grad_norm": 0.2857365906238556, "learning_rate": 0.0002, "epoch": 0.8348457350272233, "step": 1150}, {"loss": 1.03, "grad_norm": 0.3207702934741974, "learning_rate": 0.0002, "epoch": 0.8421052631578947, "step": 1160}, {"loss": 1.1669, "grad_norm": 0.3176484704017639, "learning_rate": 0.0002, "epoch": 0.8493647912885662, "step": 1170}, {"loss": 1.1386, "grad_norm": 0.40685558319091797, "learning_rate": 0.0002, "epoch": 0.8566243194192378, "step": 1180}, {"loss": 1.1383, "grad_norm": 0.31125199794769287, "learning_rate": 0.0002, "epoch": 0.8638838475499092, "step": 1190}, {"loss": 1.1512, "grad_norm": 0.7361181378364563, "learning_rate": 0.0002, "epoch": 0.8711433756805808, "step": 1200}, {"loss": 0.9983, "grad_norm": 0.33699527382850647, "learning_rate": 0.0002, "epoch": 0.8784029038112523, "step": 1210}, {"loss": 1.11, "grad_norm": 0.3315220773220062, "learning_rate": 0.0002, "epoch": 0.8856624319419237, "step": 1220}, {"loss": 1.0266, "grad_norm": 0.6256054043769836, "learning_rate": 0.0002, "epoch": 0.8929219600725953, "step": 1230}, {"loss": 1.1738, "grad_norm": 0.3692137897014618, "learning_rate": 0.0002, "epoch": 0.9001814882032668, "step": 1240}, {"loss": 1.1112, "grad_norm": 0.3538484573364258, "learning_rate": 0.0002, "epoch": 0.9074410163339383, "step": 1250}, {"loss": 1.0878, "grad_norm": 0.27863040566444397, "learning_rate": 0.0002, "epoch": 0.9147005444646098, "step": 1260}, {"loss": 1.0826, "grad_norm": 0.3322528302669525, "learning_rate": 0.0002, "epoch": 0.9219600725952813, "step": 1270}, {"loss": 1.0158, "grad_norm": 0.7553173303604126, "learning_rate": 0.0002, "epoch": 0.9292196007259528, "step": 1280}, {"loss": 1.198, "grad_norm": 0.4856191575527191, "learning_rate": 0.0002, "epoch": 0.9364791288566243, "step": 1290}, {"loss": 1.1678, "grad_norm": 0.3668074905872345, "learning_rate": 0.0002, "epoch": 0.9437386569872959, "step": 1300}, {"loss": 1.1809, "grad_norm": 0.29851067066192627, "learning_rate": 0.0002, "epoch": 0.9509981851179673, "step": 1310}, {"loss": 1.1299, "grad_norm": 0.276664674282074, "learning_rate": 0.0002, "epoch": 0.9582577132486388, "step": 1320}, {"loss": 1.0597, "grad_norm": 0.2941018342971802, "learning_rate": 0.0002, "epoch": 0.9655172413793104, "step": 1330}, {"loss": 1.1731, "grad_norm": 0.3505859076976776, "learning_rate": 0.0002, "epoch": 0.9727767695099818, "step": 1340}, {"loss": 1.1809, "grad_norm": 0.3067687451839447, "learning_rate": 0.0002, "epoch": 0.9800362976406534, "step": 1350}, {"loss": 1.1511, "grad_norm": 0.27151066064834595, "learning_rate": 0.0002, "epoch": 0.9872958257713249, "step": 1360}, {"loss": 1.1919, "grad_norm": 0.36370083689689636, "learning_rate": 0.0002, "epoch": 0.9945553539019963, "step": 1370}, {"eval_loss": 1.1381088495254517, "eval_runtime": 96.0848, "eval_samples_per_second": 4.538, "eval_steps_per_second": 0.572, "epoch": 0.9996370235934664, "step": 1377}, {"loss": 1.0082, "grad_norm": 0.27980583906173706, "learning_rate": 0.0002, "epoch": 1.0018148820326678, "step": 1380}, {"loss": 1.0565, "grad_norm": 0.26713913679122925, "learning_rate": 0.0002, "epoch": 1.0090744101633393, "step": 1390}, {"loss": 1.0241, "grad_norm": 0.3089541494846344, "learning_rate": 0.0002, "epoch": 1.0163339382940109, "step": 1400}, {"loss": 0.9649, "grad_norm": 0.4188242256641388, "learning_rate": 0.0002, "epoch": 1.0235934664246824, "step": 1410}, {"loss": 0.9821, "grad_norm": 0.5246463418006897, "learning_rate": 0.0002, "epoch": 1.030852994555354, "step": 1420}, {"loss": 1.0143, "grad_norm": 0.2728777825832367, "learning_rate": 0.0002, "epoch": 1.0381125226860255, "step": 1430}, {"loss": 1.075, "grad_norm": 0.38167616724967957, "learning_rate": 0.0002, "epoch": 1.0453720508166968, "step": 1440}, {"loss": 1.0229, "grad_norm": 0.4439380168914795, "learning_rate": 0.0002, "epoch": 1.0526315789473684, "step": 1450}, {"loss": 1.0451, "grad_norm": 0.30954182147979736, "learning_rate": 0.0002, "epoch": 1.05989110707804, "step": 1460}, {"loss": 1.0762, "grad_norm": 0.4022280275821686, "learning_rate": 0.0002, "epoch": 1.0671506352087115, "step": 1470}, {"loss": 1.0754, "grad_norm": 0.7390811443328857, "learning_rate": 0.0002, "epoch": 1.074410163339383, "step": 1480}, {"loss": 1.1378, "grad_norm": 0.3885486423969269, "learning_rate": 0.0002, "epoch": 1.0816696914700545, "step": 1490}, {"loss": 1.005, "grad_norm": 0.5275560617446899, "learning_rate": 0.0002, "epoch": 1.0889292196007259, "step": 1500}, {"loss": 1.0147, "grad_norm": 0.35112282633781433, "learning_rate": 0.0002, "epoch": 1.0961887477313974, "step": 1510}, {"loss": 1.0331, "grad_norm": 0.33714351058006287, "learning_rate": 0.0002, "epoch": 1.103448275862069, "step": 1520}, {"loss": 0.8723, "grad_norm": 0.31221693754196167, "learning_rate": 0.0002, "epoch": 1.1107078039927405, "step": 1530}, {"loss": 1.0701, "grad_norm": 0.27549654245376587, "learning_rate": 0.0002, "epoch": 1.117967332123412, "step": 1540}, {"loss": 1.0114, "grad_norm": 0.8465521335601807, "learning_rate": 0.0002, "epoch": 1.1252268602540836, "step": 1550}, {"loss": 0.9643, "grad_norm": 0.36125949025154114, "learning_rate": 0.0002, "epoch": 1.132486388384755, "step": 1560}, {"loss": 1.0744, "grad_norm": 0.37420371174812317, "learning_rate": 0.0002, "epoch": 1.1397459165154264, "step": 1570}, {"loss": 1.165, "grad_norm": 0.3294760584831238, "learning_rate": 0.0002, "epoch": 1.147005444646098, "step": 1580}, {"loss": 0.9207, "grad_norm": 0.3881238102912903, "learning_rate": 0.0002, "epoch": 1.1542649727767695, "step": 1590}, {"loss": 1.0983, "grad_norm": 0.4766491651535034, "learning_rate": 0.0002, "epoch": 1.161524500907441, "step": 1600}, {"loss": 1.0092, "grad_norm": 0.395530104637146, "learning_rate": 0.0002, "epoch": 1.1687840290381124, "step": 1610}, {"loss": 0.9895, "grad_norm": 0.3297106623649597, "learning_rate": 0.0002, "epoch": 1.176043557168784, "step": 1620}, {"loss": 1.1121, "grad_norm": 0.39528271555900574, "learning_rate": 0.0002, "epoch": 1.1833030852994555, "step": 1630}, {"loss": 0.9202, "grad_norm": 0.3370221257209778, "learning_rate": 0.0002, "epoch": 1.190562613430127, "step": 1640}, {"loss": 1.102, "grad_norm": 0.31922030448913574, "learning_rate": 0.0002, "epoch": 1.1978221415607986, "step": 1650}, {"loss": 1.0565, "grad_norm": 0.6142027378082275, "learning_rate": 0.0002, "epoch": 1.2050816696914701, "step": 1660}, {"loss": 0.9541, "grad_norm": 0.44769710302352905, "learning_rate": 0.0002, "epoch": 1.2123411978221417, "step": 1670}, {"loss": 1.2501, "grad_norm": 0.41233646869659424, "learning_rate": 0.0002, "epoch": 1.219600725952813, "step": 1680}, {"loss": 1.0147, "grad_norm": 0.2928866147994995, "learning_rate": 0.0002, "epoch": 1.2268602540834845, "step": 1690}, {"loss": 0.9074, "grad_norm": 0.36913734674453735, "learning_rate": 0.0002, "epoch": 1.234119782214156, "step": 1700}, {"loss": 1.0847, "grad_norm": 0.5281891226768494, "learning_rate": 0.0002, "epoch": 1.2413793103448276, "step": 1710}, {"loss": 0.9482, "grad_norm": 0.3374697268009186, "learning_rate": 0.0002, "epoch": 1.2486388384754992, "step": 1720}, {"loss": 0.9826, "grad_norm": 0.3802020847797394, "learning_rate": 0.0002, "epoch": 1.2558983666061705, "step": 1730}, {"loss": 1.0191, "grad_norm": 0.38048651814460754, "learning_rate": 0.0002, "epoch": 1.263157894736842, "step": 1740}, {"loss": 1.0142, "grad_norm": 0.6676169633865356, "learning_rate": 0.0002, "epoch": 1.2704174228675136, "step": 1750}, {"loss": 0.992, "grad_norm": 0.4075961410999298, "learning_rate": 0.0002, "epoch": 1.2776769509981851, "step": 1760}, {"loss": 1.0301, "grad_norm": 0.4374721348285675, "learning_rate": 0.0002, "epoch": 1.2849364791288567, "step": 1770}, {"loss": 1.0601, "grad_norm": 0.4638824164867401, "learning_rate": 0.0002, "epoch": 1.2921960072595282, "step": 1780}, {"loss": 0.9593, "grad_norm": 0.38631564378738403, "learning_rate": 0.0002, "epoch": 1.2994555353901998, "step": 1790}, {"loss": 1.0042, "grad_norm": 0.35873809456825256, "learning_rate": 0.0002, "epoch": 1.306715063520871, "step": 1800}, {"loss": 0.9835, "grad_norm": 0.33602237701416016, "learning_rate": 0.0002, "epoch": 1.3139745916515426, "step": 1810}, {"loss": 1.1032, "grad_norm": 0.46696463227272034, "learning_rate": 0.0002, "epoch": 1.3212341197822142, "step": 1820}, {"loss": 1.0123, "grad_norm": 0.368958979845047, "learning_rate": 0.0002, "epoch": 1.3284936479128857, "step": 1830}, {"loss": 1.0456, "grad_norm": 0.3160957396030426, "learning_rate": 0.0002, "epoch": 1.335753176043557, "step": 1840}, {"loss": 1.0264, "grad_norm": 0.4511511027812958, "learning_rate": 0.0002, "epoch": 1.3430127041742286, "step": 1850}, {"loss": 1.0179, "grad_norm": 0.6769845485687256, "learning_rate": 0.0002, "epoch": 1.3502722323049001, "step": 1860}, {"loss": 1.011, "grad_norm": 0.3749309480190277, "learning_rate": 0.0002, "epoch": 1.3575317604355717, "step": 1870}, {"loss": 1.0223, "grad_norm": 0.553798496723175, "learning_rate": 0.0002, "epoch": 1.3647912885662432, "step": 1880}, {"loss": 0.9634, "grad_norm": 0.3538985550403595, "learning_rate": 0.0002, "epoch": 1.3720508166969148, "step": 1890}, {"loss": 1.0275, "grad_norm": 0.5501534938812256, "learning_rate": 0.0002, "epoch": 1.3793103448275863, "step": 1900}, {"loss": 1.0461, "grad_norm": 0.4432051181793213, "learning_rate": 0.0002, "epoch": 1.3865698729582578, "step": 1910}, {"loss": 0.9082, "grad_norm": 0.41755786538124084, "learning_rate": 0.0002, "epoch": 1.3938294010889292, "step": 1920}, {"loss": 1.0858, "grad_norm": 0.5626114010810852, "learning_rate": 0.0002, "epoch": 1.4010889292196007, "step": 1930}, {"loss": 1.0687, "grad_norm": 0.44038185477256775, "learning_rate": 0.0002, "epoch": 1.4083484573502723, "step": 1940}, {"loss": 0.9454, "grad_norm": 0.3397001624107361, "learning_rate": 0.0002, "epoch": 1.4156079854809438, "step": 1950}, {"loss": 0.943, "grad_norm": 0.4325368106365204, "learning_rate": 0.0002, "epoch": 1.4228675136116151, "step": 1960}, {"loss": 1.0488, "grad_norm": 0.3900907039642334, "learning_rate": 0.0002, "epoch": 1.4301270417422867, "step": 1970}, {"loss": 0.9699, "grad_norm": 0.369612455368042, "learning_rate": 0.0002, "epoch": 1.4373865698729582, "step": 1980}, {"loss": 1.0609, "grad_norm": 0.4389338791370392, "learning_rate": 0.0002, "epoch": 1.4446460980036298, "step": 1990}, {"loss": 1.042, "grad_norm": 1.694450855255127, "learning_rate": 0.0002, "epoch": 1.4519056261343013, "step": 2000}, {"loss": 0.997, "grad_norm": 0.516957700252533, "learning_rate": 0.0002, "epoch": 1.4591651542649728, "step": 2010}, {"loss": 1.0979, "grad_norm": 0.45515501499176025, "learning_rate": 0.0002, "epoch": 1.4664246823956444, "step": 2020}, {"loss": 1.0188, "grad_norm": 0.4153139591217041, "learning_rate": 0.0002, "epoch": 1.4736842105263157, "step": 2030}, {"loss": 1.1521, "grad_norm": 0.44353052973747253, "learning_rate": 0.0002, "epoch": 1.4809437386569873, "step": 2040}, {"loss": 0.9653, "grad_norm": 0.570554256439209, "learning_rate": 0.0002, "epoch": 1.4882032667876588, "step": 2050}, {"loss": 1.0352, "grad_norm": 0.5742740035057068, "learning_rate": 0.0002, "epoch": 1.4954627949183303, "step": 2060}, {"loss": 1.0963, "grad_norm": 0.5890517830848694, "learning_rate": 0.0002, "epoch": 1.5027223230490017, "step": 2070}, {"loss": 1.0639, "grad_norm": 0.4162650406360626, "learning_rate": 0.0002, "epoch": 1.5099818511796732, "step": 2080}, {"loss": 1.071, "grad_norm": 0.7334717512130737, "learning_rate": 0.0002, "epoch": 1.5172413793103448, "step": 2090}, {"loss": 0.8957, "grad_norm": 0.2963249683380127, "learning_rate": 0.0002, "epoch": 1.5245009074410163, "step": 2100}, {"loss": 1.0446, "grad_norm": 0.30676454305648804, "learning_rate": 0.0002, "epoch": 1.5317604355716878, "step": 2110}, {"loss": 0.9967, "grad_norm": 0.35984641313552856, "learning_rate": 0.0002, "epoch": 1.5390199637023594, "step": 2120}, {"loss": 1.0352, "grad_norm": 0.3384549617767334, "learning_rate": 0.0002, "epoch": 1.546279491833031, "step": 2130}, {"loss": 0.9874, "grad_norm": 0.4725518226623535, "learning_rate": 0.0002, "epoch": 1.5535390199637025, "step": 2140}, {"loss": 1.1166, "grad_norm": 0.3252685070037842, "learning_rate": 0.0002, "epoch": 1.560798548094374, "step": 2150}, {"loss": 0.9127, "grad_norm": 0.45043081045150757, "learning_rate": 0.0002, "epoch": 1.5680580762250453, "step": 2160}, {"loss": 1.0767, "grad_norm": 0.374208003282547, "learning_rate": 0.0002, "epoch": 1.5753176043557169, "step": 2170}, {"loss": 1.0255, "grad_norm": 0.5118404030799866, "learning_rate": 0.0002, "epoch": 1.5825771324863884, "step": 2180}, {"loss": 1.0062, "grad_norm": 0.482170969247818, "learning_rate": 0.0002, "epoch": 1.5898366606170597, "step": 2190}, {"loss": 1.0872, "grad_norm": 0.5337533950805664, "learning_rate": 0.0002, "epoch": 1.5970961887477313, "step": 2200}, {"loss": 1.0405, "grad_norm": 0.5195064544677734, "learning_rate": 0.0002, "epoch": 1.6043557168784028, "step": 2210}, {"loss": 1.0454, "grad_norm": 0.30807098746299744, "learning_rate": 0.0002, "epoch": 1.6116152450090744, "step": 2220}, {"loss": 1.0293, "grad_norm": 0.3962925672531128, "learning_rate": 0.0002, "epoch": 1.618874773139746, "step": 2230}, {"loss": 1.0137, "grad_norm": 0.7636962532997131, "learning_rate": 0.0002, "epoch": 1.6261343012704175, "step": 2240}, {"loss": 0.991, "grad_norm": 0.32380592823028564, "learning_rate": 0.0002, "epoch": 1.633393829401089, "step": 2250}, {"loss": 0.9471, "grad_norm": 0.5767741799354553, "learning_rate": 0.0002, "epoch": 1.6406533575317606, "step": 2260}, {"loss": 0.9524, "grad_norm": 0.39964812994003296, "learning_rate": 0.0002, "epoch": 1.647912885662432, "step": 2270}, {"loss": 0.9866, "grad_norm": 0.622629702091217, "learning_rate": 0.0002, "epoch": 1.6551724137931034, "step": 2280}, {"loss": 0.9609, "grad_norm": 0.40202152729034424, "learning_rate": 0.0002, "epoch": 1.662431941923775, "step": 2290}, {"loss": 0.9859, "grad_norm": 0.4467349052429199, "learning_rate": 0.0002, "epoch": 1.6696914700544465, "step": 2300}, {"loss": 1.0312, "grad_norm": 0.5026949048042297, "learning_rate": 0.0002, "epoch": 1.6769509981851178, "step": 2310}, {"loss": 1.0776, "grad_norm": 0.43754541873931885, "learning_rate": 0.0002, "epoch": 1.6842105263157894, "step": 2320}, {"loss": 0.9786, "grad_norm": 0.42869430780410767, "learning_rate": 0.0002, "epoch": 1.691470054446461, "step": 2330}, {"loss": 1.0747, "grad_norm": 0.4192679524421692, "learning_rate": 0.0002, "epoch": 1.6987295825771325, "step": 2340}, {"loss": 1.0054, "grad_norm": 0.3243155777454376, "learning_rate": 0.0002, "epoch": 1.705989110707804, "step": 2350}, {"loss": 0.9165, "grad_norm": 1.0514075756072998, "learning_rate": 0.0002, "epoch": 1.7132486388384756, "step": 2360}, {"loss": 1.1353, "grad_norm": 0.4719122648239136, "learning_rate": 0.0002, "epoch": 1.720508166969147, "step": 2370}, {"loss": 0.9913, "grad_norm": 0.3846144676208496, "learning_rate": 0.0002, "epoch": 1.7277676950998186, "step": 2380}, {"loss": 0.9935, "grad_norm": 0.7266581058502197, "learning_rate": 0.0002, "epoch": 1.73502722323049, "step": 2390}, {"loss": 0.9509, "grad_norm": 0.6183241605758667, "learning_rate": 0.0002, "epoch": 1.7422867513611615, "step": 2400}, {"loss": 0.9713, "grad_norm": 0.3658260405063629, "learning_rate": 0.0002, "epoch": 1.749546279491833, "step": 2410}, {"loss": 1.0033, "grad_norm": 0.6036322712898254, "learning_rate": 0.0002, "epoch": 1.7568058076225044, "step": 2420}, {"loss": 1.0526, "grad_norm": 0.7872936129570007, "learning_rate": 0.0002, "epoch": 1.764065335753176, "step": 2430}, {"loss": 0.99, "grad_norm": 0.35946124792099, "learning_rate": 0.0002, "epoch": 1.7713248638838475, "step": 2440}, {"loss": 0.8845, "grad_norm": 0.3740338981151581, "learning_rate": 0.0002, "epoch": 1.778584392014519, "step": 2450}, {"loss": 0.9333, "grad_norm": 0.6150230169296265, "learning_rate": 0.0002, "epoch": 1.7858439201451906, "step": 2460}, {"loss": 1.0373, "grad_norm": 0.4726075530052185, "learning_rate": 0.0002, "epoch": 1.793103448275862, "step": 2470}, {"loss": 0.974, "grad_norm": 0.31292253732681274, "learning_rate": 0.0002, "epoch": 1.8003629764065336, "step": 2480}, {"loss": 1.0205, "grad_norm": 0.4463104009628296, "learning_rate": 0.0002, "epoch": 1.8076225045372052, "step": 2490}, {"loss": 1.0771, "grad_norm": 0.7848200798034668, "learning_rate": 0.0002, "epoch": 1.8148820326678767, "step": 2500}, {"loss": 0.9508, "grad_norm": 0.5562082529067993, "learning_rate": 0.0002, "epoch": 1.822141560798548, "step": 2510}, {"loss": 0.9343, "grad_norm": 0.39892756938934326, "learning_rate": 0.0002, "epoch": 1.8294010889292196, "step": 2520}, {"loss": 1.0489, "grad_norm": 1.2923320531845093, "learning_rate": 0.0002, "epoch": 1.8366606170598911, "step": 2530}, {"loss": 1.1203, "grad_norm": 0.6316490769386292, "learning_rate": 0.0002, "epoch": 1.8439201451905625, "step": 2540}, {"loss": 0.9131, "grad_norm": 0.46100497245788574, "learning_rate": 0.0002, "epoch": 1.851179673321234, "step": 2550}, {"loss": 1.0977, "grad_norm": 0.7902987003326416, "learning_rate": 0.0002, "epoch": 1.8584392014519056, "step": 2560}, {"loss": 0.9702, "grad_norm": 0.4596365690231323, "learning_rate": 0.0002, "epoch": 1.865698729582577, "step": 2570}, {"loss": 0.9322, "grad_norm": 0.6592172384262085, "learning_rate": 0.0002, "epoch": 1.8729582577132486, "step": 2580}, {"loss": 0.9549, "grad_norm": 0.5764662027359009, "learning_rate": 0.0002, "epoch": 1.8802177858439202, "step": 2590}, {"loss": 0.9485, "grad_norm": 0.8421637415885925, "learning_rate": 0.0002, "epoch": 1.8874773139745917, "step": 2600}, {"loss": 1.012, "grad_norm": 0.5635305047035217, "learning_rate": 0.0002, "epoch": 1.8947368421052633, "step": 2610}, {"loss": 0.8907, "grad_norm": 0.46231237053871155, "learning_rate": 0.0002, "epoch": 1.9019963702359348, "step": 2620}, {"loss": 0.9543, "grad_norm": 0.3944607079029083, "learning_rate": 0.0002, "epoch": 1.9092558983666061, "step": 2630}, {"loss": 0.9964, "grad_norm": 0.4753907322883606, "learning_rate": 0.0002, "epoch": 1.9165154264972777, "step": 2640}, {"loss": 1.0217, "grad_norm": 0.4151090979576111, "learning_rate": 0.0002, "epoch": 1.9237749546279492, "step": 2650}, {"loss": 1.0397, "grad_norm": 0.6793725490570068, "learning_rate": 0.0002, "epoch": 1.9310344827586206, "step": 2660}, {"loss": 1.0401, "grad_norm": 0.339755117893219, "learning_rate": 0.0002, "epoch": 1.938294010889292, "step": 2670}, {"loss": 1.082, "grad_norm": 0.40789374709129333, "learning_rate": 0.0002, "epoch": 1.9455535390199636, "step": 2680}, {"loss": 0.9973, "grad_norm": 0.3750005066394806, "learning_rate": 0.0002, "epoch": 1.9528130671506352, "step": 2690}, {"loss": 0.9837, "grad_norm": 0.39684441685676575, "learning_rate": 0.0002, "epoch": 1.9600725952813067, "step": 2700}, {"loss": 1.1204, "grad_norm": 0.378287672996521, "learning_rate": 0.0002, "epoch": 1.9673321234119783, "step": 2710}, {"loss": 0.9284, "grad_norm": 0.3668482005596161, "learning_rate": 0.0002, "epoch": 1.9745916515426498, "step": 2720}, {"loss": 1.0512, "grad_norm": 0.49997565150260925, "learning_rate": 0.0002, "epoch": 1.9818511796733214, "step": 2730}, {"loss": 0.9311, "grad_norm": 0.36852124333381653, "learning_rate": 0.0002, "epoch": 1.989110707803993, "step": 2740}, {"loss": 1.0898, "grad_norm": 0.5203380584716797, "learning_rate": 0.0002, "epoch": 1.9963702359346642, "step": 2750}, {"eval_loss": 1.1086540222167969, "eval_runtime": 95.6866, "eval_samples_per_second": 4.557, "eval_steps_per_second": 0.575, "epoch": 2.0, "step": 2755}, {"loss": 0.9791, "grad_norm": 0.35921573638916016, "learning_rate": 0.0002, "epoch": 2.0036297640653356, "step": 2760}, {"loss": 0.8968, "grad_norm": 1.013861894607544, "learning_rate": 0.0002, "epoch": 2.010889292196007, "step": 2770}, {"loss": 0.7873, "grad_norm": 0.4425240159034729, "learning_rate": 0.0002, "epoch": 2.0181488203266786, "step": 2780}, {"loss": 0.8758, "grad_norm": 0.60174161195755, "learning_rate": 0.0002, "epoch": 2.02540834845735, "step": 2790}, {"loss": 0.7989, "grad_norm": 0.47582098841667175, "learning_rate": 0.0002, "epoch": 2.0326678765880217, "step": 2800}, {"loss": 0.8898, "grad_norm": 0.6012811660766602, "learning_rate": 0.0002, "epoch": 2.0399274047186933, "step": 2810}, {"loss": 0.8109, "grad_norm": 0.4444408118724823, "learning_rate": 0.0002, "epoch": 2.047186932849365, "step": 2820}, {"loss": 0.8621, "grad_norm": 0.4864003360271454, "learning_rate": 0.0002, "epoch": 2.0544464609800364, "step": 2830}, {"loss": 0.8817, "grad_norm": 0.5104215741157532, "learning_rate": 0.0002, "epoch": 2.061705989110708, "step": 2840}, {"loss": 0.7789, "grad_norm": 0.6218489408493042, "learning_rate": 0.0002, "epoch": 2.0689655172413794, "step": 2850}, {"loss": 0.8778, "grad_norm": 0.705784261226654, "learning_rate": 0.0002, "epoch": 2.076225045372051, "step": 2860}, {"loss": 0.7851, "grad_norm": 0.48091503977775574, "learning_rate": 0.0002, "epoch": 2.0834845735027225, "step": 2870}, {"loss": 0.8444, "grad_norm": 0.5062456727027893, "learning_rate": 0.0002, "epoch": 2.0907441016333936, "step": 2880}, {"loss": 0.7898, "grad_norm": 0.4862022399902344, "learning_rate": 0.0002, "epoch": 2.098003629764065, "step": 2890}, {"loss": 0.8593, "grad_norm": 0.48264044523239136, "learning_rate": 0.0002, "epoch": 2.1052631578947367, "step": 2900}, {"loss": 0.8325, "grad_norm": 0.43744346499443054, "learning_rate": 0.0002, "epoch": 2.1125226860254083, "step": 2910}, {"loss": 0.9099, "grad_norm": 0.5480492115020752, "learning_rate": 0.0002, "epoch": 2.11978221415608, "step": 2920}, {"loss": 0.7727, "grad_norm": 0.5068560838699341, "learning_rate": 0.0002, "epoch": 2.1270417422867514, "step": 2930}, {"loss": 0.8403, "grad_norm": 0.4650852680206299, "learning_rate": 0.0002, "epoch": 2.134301270417423, "step": 2940}, {"loss": 0.8467, "grad_norm": 0.4929981231689453, "learning_rate": 0.0002, "epoch": 2.1415607985480944, "step": 2950}, {"loss": 0.8747, "grad_norm": 0.6537389755249023, "learning_rate": 0.0002, "epoch": 2.148820326678766, "step": 2960}, {"loss": 0.7663, "grad_norm": 0.8032940626144409, "learning_rate": 0.0002, "epoch": 2.1560798548094375, "step": 2970}, {"loss": 0.7604, "grad_norm": 0.7131643891334534, "learning_rate": 0.0002, "epoch": 2.163339382940109, "step": 2980}, {"loss": 0.8424, "grad_norm": 0.6034275889396667, "learning_rate": 0.0002, "epoch": 2.1705989110707806, "step": 2990}, {"loss": 0.8838, "grad_norm": 0.6081095933914185, "learning_rate": 0.0002, "epoch": 2.1778584392014517, "step": 3000}, {"loss": 0.9013, "grad_norm": 0.5706912875175476, "learning_rate": 0.0002, "epoch": 2.1851179673321233, "step": 3010}, {"loss": 0.8473, "grad_norm": 0.6742380261421204, "learning_rate": 0.0002, "epoch": 2.192377495462795, "step": 3020}, {"loss": 0.8372, "grad_norm": 0.4847496449947357, "learning_rate": 0.0002, "epoch": 2.1996370235934664, "step": 3030}, {"loss": 0.8487, "grad_norm": 0.5529342889785767, "learning_rate": 0.0002, "epoch": 2.206896551724138, "step": 3040}, {"loss": 0.9057, "grad_norm": 0.6108783483505249, "learning_rate": 0.0002, "epoch": 2.2141560798548094, "step": 3050}, {"loss": 0.9659, "grad_norm": 0.8841571807861328, "learning_rate": 0.0002, "epoch": 2.221415607985481, "step": 3060}, {"loss": 0.8538, "grad_norm": 0.4227530360221863, "learning_rate": 0.0002, "epoch": 2.2286751361161525, "step": 3070}, {"loss": 0.7925, "grad_norm": 0.461935818195343, "learning_rate": 0.0002, "epoch": 2.235934664246824, "step": 3080}, {"loss": 0.8542, "grad_norm": 0.5407412648200989, "learning_rate": 0.0002, "epoch": 2.2431941923774956, "step": 3090}, {"loss": 0.8392, "grad_norm": 0.5057998895645142, "learning_rate": 0.0002, "epoch": 2.250453720508167, "step": 3100}, {"loss": 0.7898, "grad_norm": 0.530057966709137, "learning_rate": 0.0002, "epoch": 2.2577132486388383, "step": 3110}, {"loss": 0.8507, "grad_norm": 0.5066842436790466, "learning_rate": 0.0002, "epoch": 2.26497277676951, "step": 3120}, {"loss": 0.7903, "grad_norm": 0.5069178342819214, "learning_rate": 0.0002, "epoch": 2.2722323049001814, "step": 3130}, {"loss": 0.8458, "grad_norm": 0.6095499396324158, "learning_rate": 0.0002, "epoch": 2.279491833030853, "step": 3140}, {"loss": 0.8171, "grad_norm": 0.49524766206741333, "learning_rate": 0.0002, "epoch": 2.2867513611615244, "step": 3150}, {"loss": 0.7568, "grad_norm": 0.5334409475326538, "learning_rate": 0.0002, "epoch": 2.294010889292196, "step": 3160}, {"loss": 0.7533, "grad_norm": 1.681748867034912, "learning_rate": 0.0002, "epoch": 2.3012704174228675, "step": 3170}, {"loss": 0.8155, "grad_norm": 0.7225565314292908, "learning_rate": 0.0002, "epoch": 2.308529945553539, "step": 3180}, {"loss": 0.8034, "grad_norm": 0.5379388928413391, "learning_rate": 0.0002, "epoch": 2.3157894736842106, "step": 3190}, {"loss": 0.8833, "grad_norm": 0.45770326256752014, "learning_rate": 0.0002, "epoch": 2.323049001814882, "step": 3200}, {"loss": 0.8419, "grad_norm": 0.6984533071517944, "learning_rate": 0.0002, "epoch": 2.3303085299455537, "step": 3210}, {"loss": 0.7414, "grad_norm": 0.6725744605064392, "learning_rate": 0.0002, "epoch": 2.337568058076225, "step": 3220}, {"loss": 0.8657, "grad_norm": 1.1247471570968628, "learning_rate": 0.0002, "epoch": 2.344827586206897, "step": 3230}, {"loss": 0.7782, "grad_norm": 1.0240263938903809, "learning_rate": 0.0002, "epoch": 2.352087114337568, "step": 3240}, {"loss": 0.826, "grad_norm": 0.5608096122741699, "learning_rate": 0.0002, "epoch": 2.3593466424682394, "step": 3250}, {"loss": 0.8191, "grad_norm": 0.8294990062713623, "learning_rate": 0.0002, "epoch": 2.366606170598911, "step": 3260}, {"loss": 0.9845, "grad_norm": 0.6734224557876587, "learning_rate": 0.0002, "epoch": 2.3738656987295825, "step": 3270}, {"loss": 0.7921, "grad_norm": 0.6862800717353821, "learning_rate": 0.0002, "epoch": 2.381125226860254, "step": 3280}, {"loss": 0.7752, "grad_norm": 0.5442930459976196, "learning_rate": 0.0002, "epoch": 2.3883847549909256, "step": 3290}, {"loss": 0.8543, "grad_norm": 0.5745394229888916, "learning_rate": 0.0002, "epoch": 2.395644283121597, "step": 3300}, {"loss": 0.833, "grad_norm": 0.6257799863815308, "learning_rate": 0.0002, "epoch": 2.4029038112522687, "step": 3310}, {"loss": 0.8191, "grad_norm": 0.5608420968055725, "learning_rate": 0.0002, "epoch": 2.4101633393829403, "step": 3320}, {"loss": 0.8745, "grad_norm": 0.5512017011642456, "learning_rate": 0.0002, "epoch": 2.417422867513612, "step": 3330}, {"loss": 0.8642, "grad_norm": 0.7452999353408813, "learning_rate": 0.0002, "epoch": 2.4246823956442833, "step": 3340}, {"loss": 0.8417, "grad_norm": 0.4604301452636719, "learning_rate": 0.0002, "epoch": 2.4319419237749544, "step": 3350}, {"loss": 0.9148, "grad_norm": 0.8225823640823364, "learning_rate": 0.0002, "epoch": 2.439201451905626, "step": 3360}, {"loss": 0.8144, "grad_norm": 0.8604981303215027, "learning_rate": 0.0002, "epoch": 2.4464609800362975, "step": 3370}, {"loss": 0.9276, "grad_norm": 0.6620925664901733, "learning_rate": 0.0002, "epoch": 2.453720508166969, "step": 3380}, {"loss": 0.8381, "grad_norm": 0.4750158488750458, "learning_rate": 0.0002, "epoch": 2.4609800362976406, "step": 3390}, {"loss": 0.8249, "grad_norm": 0.6061418056488037, "learning_rate": 0.0002, "epoch": 2.468239564428312, "step": 3400}, {"loss": 0.8944, "grad_norm": 0.5977247357368469, "learning_rate": 0.0002, "epoch": 2.4754990925589837, "step": 3410}, {"loss": 0.8366, "grad_norm": 0.5004227757453918, "learning_rate": 0.0002, "epoch": 2.4827586206896552, "step": 3420}, {"loss": 0.9133, "grad_norm": 0.46428972482681274, "learning_rate": 0.0002, "epoch": 2.490018148820327, "step": 3430}, {"loss": 0.8534, "grad_norm": 0.7658395171165466, "learning_rate": 0.0002, "epoch": 2.4972776769509983, "step": 3440}, {"loss": 0.9066, "grad_norm": 0.621819794178009, "learning_rate": 0.0002, "epoch": 2.50453720508167, "step": 3450}, {"loss": 0.8481, "grad_norm": 0.4556088149547577, "learning_rate": 0.0002, "epoch": 2.511796733212341, "step": 3460}, {"loss": 0.8522, "grad_norm": 0.6124140024185181, "learning_rate": 0.0002, "epoch": 2.519056261343013, "step": 3470}, {"loss": 0.8158, "grad_norm": 0.6256231665611267, "learning_rate": 0.0002, "epoch": 2.526315789473684, "step": 3480}, {"loss": 0.9097, "grad_norm": 0.5464141964912415, "learning_rate": 0.0002, "epoch": 2.5335753176043556, "step": 3490}, {"loss": 0.8708, "grad_norm": 0.51471346616745, "learning_rate": 0.0002, "epoch": 2.540834845735027, "step": 3500}, {"loss": 0.9042, "grad_norm": 0.5326165556907654, "learning_rate": 0.0002, "epoch": 2.5480943738656987, "step": 3510}, {"loss": 0.8749, "grad_norm": 0.4750378429889679, "learning_rate": 0.0002, "epoch": 2.5553539019963702, "step": 3520}, {"loss": 0.8248, "grad_norm": 0.5292279124259949, "learning_rate": 0.0002, "epoch": 2.562613430127042, "step": 3530}, {"loss": 0.8808, "grad_norm": 0.6145227551460266, "learning_rate": 0.0002, "epoch": 2.5698729582577133, "step": 3540}, {"loss": 0.846, "grad_norm": 0.8275189399719238, "learning_rate": 0.0002, "epoch": 2.577132486388385, "step": 3550}, {"loss": 0.875, "grad_norm": 0.5037438273429871, "learning_rate": 0.0002, "epoch": 2.5843920145190564, "step": 3560}, {"loss": 0.8324, "grad_norm": 0.5838707685470581, "learning_rate": 0.0002, "epoch": 2.5916515426497275, "step": 3570}, {"loss": 0.8716, "grad_norm": 0.5398710370063782, "learning_rate": 0.0002, "epoch": 2.5989110707803995, "step": 3580}, {"loss": 0.7843, "grad_norm": 0.6115376949310303, "learning_rate": 0.0002, "epoch": 2.6061705989110706, "step": 3590}, {"loss": 0.8142, "grad_norm": 1.118809461593628, "learning_rate": 0.0002, "epoch": 2.613430127041742, "step": 3600}, {"loss": 0.8282, "grad_norm": 0.7811771631240845, "learning_rate": 0.0002, "epoch": 2.6206896551724137, "step": 3610}, {"loss": 0.7997, "grad_norm": 0.5753175020217896, "learning_rate": 0.0002, "epoch": 2.6279491833030852, "step": 3620}, {"loss": 0.7705, "grad_norm": 0.550829291343689, "learning_rate": 0.0002, "epoch": 2.635208711433757, "step": 3630}, {"loss": 0.8564, "grad_norm": 0.5360019207000732, "learning_rate": 0.0002, "epoch": 2.6424682395644283, "step": 3640}, {"loss": 0.7648, "grad_norm": 0.63050377368927, "learning_rate": 0.0002, "epoch": 2.6497277676951, "step": 3650}, {"loss": 0.9156, "grad_norm": 0.5833110213279724, "learning_rate": 0.0002, "epoch": 2.6569872958257714, "step": 3660}, {"loss": 0.8033, "grad_norm": 0.5543047189712524, "learning_rate": 0.0002, "epoch": 2.664246823956443, "step": 3670}, {"loss": 0.8867, "grad_norm": 0.6842212080955505, "learning_rate": 0.0002, "epoch": 2.671506352087114, "step": 3680}, {"loss": 0.8569, "grad_norm": 0.6859333515167236, "learning_rate": 0.0002, "epoch": 2.678765880217786, "step": 3690}, {"loss": 0.8081, "grad_norm": 0.7038410902023315, "learning_rate": 0.0002, "epoch": 2.686025408348457, "step": 3700}, {"loss": 0.8686, "grad_norm": 0.447233647108078, "learning_rate": 0.0002, "epoch": 2.6932849364791287, "step": 3710}, {"loss": 0.8951, "grad_norm": 0.6453872919082642, "learning_rate": 0.0002, "epoch": 2.7005444646098002, "step": 3720}, {"loss": 0.8292, "grad_norm": 0.8025672435760498, "learning_rate": 0.0002, "epoch": 2.707803992740472, "step": 3730}, {"loss": 0.8488, "grad_norm": 0.5997087955474854, "learning_rate": 0.0002, "epoch": 2.7150635208711433, "step": 3740}, {"loss": 0.8308, "grad_norm": 0.6901142001152039, "learning_rate": 0.0002, "epoch": 2.722323049001815, "step": 3750}, {"loss": 0.8517, "grad_norm": 1.036145567893982, "learning_rate": 0.0002, "epoch": 2.7295825771324864, "step": 3760}, {"loss": 0.8455, "grad_norm": 0.7207072377204895, "learning_rate": 0.0002, "epoch": 2.736842105263158, "step": 3770}, {"loss": 0.8009, "grad_norm": 1.0452989339828491, "learning_rate": 0.0002, "epoch": 2.7441016333938295, "step": 3780}, {"loss": 0.8868, "grad_norm": 0.5615278482437134, "learning_rate": 0.0002, "epoch": 2.751361161524501, "step": 3790}, {"loss": 0.9047, "grad_norm": 0.46439215540885925, "learning_rate": 0.0002, "epoch": 2.7586206896551726, "step": 3800}, {"loss": 0.7847, "grad_norm": 0.7134917974472046, "learning_rate": 0.0002, "epoch": 2.7658802177858437, "step": 3810}, {"loss": 0.8315, "grad_norm": 0.5139115452766418, "learning_rate": 0.0002, "epoch": 2.7731397459165157, "step": 3820}, {"loss": 0.8995, "grad_norm": 0.8595781326293945, "learning_rate": 0.0002, "epoch": 2.780399274047187, "step": 3830}, {"loss": 0.9079, "grad_norm": 0.544614851474762, "learning_rate": 0.0002, "epoch": 2.7876588021778583, "step": 3840}, {"loss": 0.8322, "grad_norm": 0.6073850393295288, "learning_rate": 0.0002, "epoch": 2.79491833030853, "step": 3850}, {"loss": 0.8277, "grad_norm": 0.8224069476127625, "learning_rate": 0.0002, "epoch": 2.8021778584392014, "step": 3860}, {"loss": 0.815, "grad_norm": 0.5347970128059387, "learning_rate": 0.0002, "epoch": 2.809437386569873, "step": 3870}, {"loss": 0.8219, "grad_norm": 0.6799601912498474, "learning_rate": 0.0002, "epoch": 2.8166969147005445, "step": 3880}, {"loss": 0.8796, "grad_norm": 0.5219197869300842, "learning_rate": 0.0002, "epoch": 2.823956442831216, "step": 3890}, {"loss": 0.8649, "grad_norm": 0.5710130333900452, "learning_rate": 0.0002, "epoch": 2.8312159709618876, "step": 3900}, {"loss": 0.8741, "grad_norm": 0.5857213139533997, "learning_rate": 0.0002, "epoch": 2.838475499092559, "step": 3910}, {"loss": 0.8154, "grad_norm": 0.5206644535064697, "learning_rate": 0.0002, "epoch": 2.8457350272232302, "step": 3920}, {"loss": 0.9328, "grad_norm": 1.2902015447616577, "learning_rate": 0.0002, "epoch": 2.8529945553539022, "step": 3930}, {"loss": 0.8192, "grad_norm": 0.5252797603607178, "learning_rate": 0.0002, "epoch": 2.8602540834845733, "step": 3940}, {"loss": 0.9047, "grad_norm": 0.5925108790397644, "learning_rate": 0.0002, "epoch": 2.867513611615245, "step": 3950}, {"loss": 0.8611, "grad_norm": 0.9719610810279846, "learning_rate": 0.0002, "epoch": 2.8747731397459164, "step": 3960}, {"loss": 0.8531, "grad_norm": 0.6834747195243835, "learning_rate": 0.0002, "epoch": 2.882032667876588, "step": 3970}, {"loss": 0.8124, "grad_norm": 0.6952353119850159, "learning_rate": 0.0002, "epoch": 2.8892921960072595, "step": 3980}, {"loss": 0.8365, "grad_norm": 0.49889910221099854, "learning_rate": 0.0002, "epoch": 2.896551724137931, "step": 3990}, {"loss": 0.8197, "grad_norm": 0.5007026791572571, "learning_rate": 0.0002, "epoch": 2.9038112522686026, "step": 4000}, {"loss": 0.7752, "grad_norm": 0.5474239587783813, "learning_rate": 0.0002, "epoch": 2.911070780399274, "step": 4010}, {"loss": 0.8579, "grad_norm": 0.6617428660392761, "learning_rate": 0.0002, "epoch": 2.9183303085299457, "step": 4020}, {"loss": 0.8583, "grad_norm": 0.6097776293754578, "learning_rate": 0.0002, "epoch": 2.925589836660617, "step": 4030}, {"loss": 0.8296, "grad_norm": 0.5985828638076782, "learning_rate": 0.0002, "epoch": 2.9328493647912888, "step": 4040}, {"loss": 0.9134, "grad_norm": 0.769488513469696, "learning_rate": 0.0002, "epoch": 2.94010889292196, "step": 4050}, {"loss": 0.8148, "grad_norm": 0.5167055130004883, "learning_rate": 0.0002, "epoch": 2.9473684210526314, "step": 4060}, {"loss": 0.8665, "grad_norm": 0.6265496015548706, "learning_rate": 0.0002, "epoch": 2.954627949183303, "step": 4070}, {"loss": 0.9218, "grad_norm": 1.2644082307815552, "learning_rate": 0.0002, "epoch": 2.9618874773139745, "step": 4080}, {"loss": 0.8026, "grad_norm": 0.6007561087608337, "learning_rate": 0.0002, "epoch": 2.969147005444646, "step": 4090}, {"loss": 0.8262, "grad_norm": 0.47984927892684937, "learning_rate": 0.0002, "epoch": 2.9764065335753176, "step": 4100}, {"loss": 0.7897, "grad_norm": 1.128198504447937, "learning_rate": 0.0002, "epoch": 2.983666061705989, "step": 4110}, {"loss": 0.7883, "grad_norm": 0.526292085647583, "learning_rate": 0.0002, "epoch": 2.9909255898366607, "step": 4120}, {"loss": 0.7801, "grad_norm": 0.5462674498558044, "learning_rate": 0.0002, "epoch": 2.9981851179673322, "step": 4130}, {"eval_loss": 1.1093357801437378, "eval_runtime": 46.2498, "eval_samples_per_second": 9.427, "eval_steps_per_second": 1.189, "epoch": 2.9996370235934666, "step": 4132}, {"loss": 0.721, "grad_norm": 0.48288026452064514, "learning_rate": 0.0002, "epoch": 3.0054446460980038, "step": 4140}, {"loss": 0.6769, "grad_norm": 1.0181782245635986, "learning_rate": 0.0002, "epoch": 3.0127041742286753, "step": 4150}, {"loss": 0.7185, "grad_norm": 0.7718019485473633, "learning_rate": 0.0002, "epoch": 3.019963702359347, "step": 4160}, {"loss": 0.6552, "grad_norm": 0.7492219805717468, "learning_rate": 0.0002, "epoch": 3.027223230490018, "step": 4170}, {"loss": 0.6678, "grad_norm": 0.9363632798194885, "learning_rate": 0.0002, "epoch": 3.0344827586206895, "step": 4180}, {"loss": 0.7187, "grad_norm": 0.6888533234596252, "learning_rate": 0.0002, "epoch": 3.041742286751361, "step": 4190}, {"loss": 0.6469, "grad_norm": 0.7072834968566895, "learning_rate": 0.0002, "epoch": 3.0490018148820326, "step": 4200}, {"loss": 0.6387, "grad_norm": 0.7182047963142395, "learning_rate": 0.0002, "epoch": 3.056261343012704, "step": 4210}, {"loss": 0.6385, "grad_norm": 0.7194355130195618, "learning_rate": 0.0002, "epoch": 3.0635208711433757, "step": 4220}, {"loss": 0.5812, "grad_norm": 0.9454023838043213, "learning_rate": 0.0002, "epoch": 3.0707803992740472, "step": 4230}, {"loss": 0.6036, "grad_norm": 0.838657557964325, "learning_rate": 0.0002, "epoch": 3.0780399274047188, "step": 4240}, {"loss": 0.646, "grad_norm": 0.740113377571106, "learning_rate": 0.0002, "epoch": 3.0852994555353903, "step": 4250}, {"loss": 0.604, "grad_norm": 0.6616561412811279, "learning_rate": 0.0002, "epoch": 3.092558983666062, "step": 4260}, {"loss": 0.6462, "grad_norm": 0.8846506476402283, "learning_rate": 0.0002, "epoch": 3.0998185117967334, "step": 4270}, {"loss": 0.6037, "grad_norm": 0.6322125792503357, "learning_rate": 0.0002, "epoch": 3.107078039927405, "step": 4280}, {"loss": 0.5953, "grad_norm": 0.7461467385292053, "learning_rate": 0.0002, "epoch": 3.114337568058076, "step": 4290}, {"loss": 0.6099, "grad_norm": 0.8251287341117859, "learning_rate": 0.0002, "epoch": 3.1215970961887476, "step": 4300}, {"loss": 0.6284, "grad_norm": 0.8767673373222351, "learning_rate": 0.0002, "epoch": 3.128856624319419, "step": 4310}, {"loss": 0.7535, "grad_norm": 0.7758759260177612, "learning_rate": 0.0002, "epoch": 3.1361161524500907, "step": 4320}, {"loss": 0.6624, "grad_norm": 1.1056879758834839, "learning_rate": 0.0002, "epoch": 3.143375680580762, "step": 4330}, {"loss": 0.691, "grad_norm": 0.8259835243225098, "learning_rate": 0.0002, "epoch": 3.1506352087114338, "step": 4340}, {"loss": 0.6635, "grad_norm": 0.6607027053833008, "learning_rate": 0.0002, "epoch": 3.1578947368421053, "step": 4350}, {"loss": 0.5911, "grad_norm": 0.7983301281929016, "learning_rate": 0.0002, "epoch": 3.165154264972777, "step": 4360}, {"loss": 0.6496, "grad_norm": 0.6725239157676697, "learning_rate": 0.0002, "epoch": 3.1724137931034484, "step": 4370}, {"loss": 0.5966, "grad_norm": 0.9052095413208008, "learning_rate": 0.0002, "epoch": 3.17967332123412, "step": 4380}, {"loss": 0.6877, "grad_norm": 0.8131307363510132, "learning_rate": 0.0002, "epoch": 3.1869328493647915, "step": 4390}, {"loss": 0.6384, "grad_norm": 0.6435626149177551, "learning_rate": 0.0002, "epoch": 3.1941923774954626, "step": 4400}, {"loss": 0.5819, "grad_norm": 0.84367436170578, "learning_rate": 0.0002, "epoch": 3.201451905626134, "step": 4410}, {"loss": 0.6104, "grad_norm": 1.5018867254257202, "learning_rate": 0.0002, "epoch": 3.2087114337568057, "step": 4420}, {"loss": 0.6838, "grad_norm": 0.7019091844558716, "learning_rate": 0.0002, "epoch": 3.215970961887477, "step": 4430}, {"loss": 0.6153, "grad_norm": 0.9164197444915771, "learning_rate": 0.0002, "epoch": 3.2232304900181488, "step": 4440}, {"loss": 0.6618, "grad_norm": 0.7890861630439758, "learning_rate": 0.0002, "epoch": 3.2304900181488203, "step": 4450}, {"loss": 0.6401, "grad_norm": 0.6517660617828369, "learning_rate": 0.0002, "epoch": 3.237749546279492, "step": 4460}, {"loss": 0.6699, "grad_norm": 1.10188889503479, "learning_rate": 0.0002, "epoch": 3.2450090744101634, "step": 4470}, {"loss": 0.6356, "grad_norm": 0.8158330917358398, "learning_rate": 0.0002, "epoch": 3.252268602540835, "step": 4480}, {"loss": 0.7757, "grad_norm": 0.7663109302520752, "learning_rate": 0.0002, "epoch": 3.2595281306715065, "step": 4490}, {"loss": 0.6539, "grad_norm": 0.8473444581031799, "learning_rate": 0.0002, "epoch": 3.266787658802178, "step": 4500}, {"loss": 0.6511, "grad_norm": 0.9724768996238708, "learning_rate": 0.0002, "epoch": 3.274047186932849, "step": 4510}, {"loss": 0.5464, "grad_norm": 0.8516759276390076, "learning_rate": 0.0002, "epoch": 3.281306715063521, "step": 4520}, {"loss": 0.6534, "grad_norm": 0.7543437480926514, "learning_rate": 0.0002, "epoch": 3.288566243194192, "step": 4530}, {"loss": 0.6095, "grad_norm": 1.0472029447555542, "learning_rate": 0.0002, "epoch": 3.2958257713248638, "step": 4540}, {"loss": 0.6216, "grad_norm": 0.6240826845169067, "learning_rate": 0.0002, "epoch": 3.3030852994555353, "step": 4550}, {"loss": 0.6223, "grad_norm": 0.9957774877548218, "learning_rate": 0.0002, "epoch": 3.310344827586207, "step": 4560}, {"loss": 0.618, "grad_norm": 0.6448912620544434, "learning_rate": 0.0002, "epoch": 3.3176043557168784, "step": 4570}, {"loss": 0.6188, "grad_norm": 0.7519692778587341, "learning_rate": 0.0002, "epoch": 3.32486388384755, "step": 4580}, {"loss": 0.6672, "grad_norm": 0.7367453575134277, "learning_rate": 0.0002, "epoch": 3.3321234119782215, "step": 4590}, {"loss": 0.6517, "grad_norm": 0.8064960837364197, "learning_rate": 0.0002, "epoch": 3.339382940108893, "step": 4600}, {"loss": 0.6062, "grad_norm": 0.7664631009101868, "learning_rate": 0.0002, "epoch": 3.3466424682395646, "step": 4610}, {"loss": 0.6834, "grad_norm": 0.7803396582603455, "learning_rate": 0.0002, "epoch": 3.353901996370236, "step": 4620}, {"loss": 0.6961, "grad_norm": 0.9141599535942078, "learning_rate": 0.0002, "epoch": 3.3611615245009077, "step": 4630}, {"loss": 0.6889, "grad_norm": 0.9719856381416321, "learning_rate": 0.0002, "epoch": 3.3684210526315788, "step": 4640}, {"loss": 0.6914, "grad_norm": 0.9223218560218811, "learning_rate": 0.0002, "epoch": 3.3756805807622503, "step": 4650}, {"loss": 0.5981, "grad_norm": 0.7289277911186218, "learning_rate": 0.0002, "epoch": 3.382940108892922, "step": 4660}, {"loss": 0.595, "grad_norm": 1.039724349975586, "learning_rate": 0.0002, "epoch": 3.3901996370235934, "step": 4670}, {"loss": 0.8121, "grad_norm": 1.397438883781433, "learning_rate": 0.0002, "epoch": 3.397459165154265, "step": 4680}, {"loss": 0.6334, "grad_norm": 1.0069999694824219, "learning_rate": 0.0002, "epoch": 3.4047186932849365, "step": 4690}, {"loss": 0.6598, "grad_norm": 0.816291332244873, "learning_rate": 0.0002, "epoch": 3.411978221415608, "step": 4700}, {"loss": 0.6748, "grad_norm": 1.2831530570983887, "learning_rate": 0.0002, "epoch": 3.4192377495462796, "step": 4710}, {"loss": 0.6625, "grad_norm": 0.9573889970779419, "learning_rate": 0.0002, "epoch": 3.426497277676951, "step": 4720}, {"loss": 0.7279, "grad_norm": 0.7685632705688477, "learning_rate": 0.0002, "epoch": 3.4337568058076227, "step": 4730}, {"loss": 0.6104, "grad_norm": 0.7019195556640625, "learning_rate": 0.0002, "epoch": 3.441016333938294, "step": 4740}, {"loss": 0.7606, "grad_norm": 0.7244833707809448, "learning_rate": 0.0002, "epoch": 3.4482758620689653, "step": 4750}, {"loss": 0.6951, "grad_norm": 1.3468551635742188, "learning_rate": 0.0002, "epoch": 3.455535390199637, "step": 4760}, {"loss": 0.6945, "grad_norm": 0.822846531867981, "learning_rate": 0.0002, "epoch": 3.4627949183303084, "step": 4770}, {"loss": 0.6431, "grad_norm": 0.7311608195304871, "learning_rate": 0.0002, "epoch": 3.47005444646098, "step": 4780}, {"loss": 0.7019, "grad_norm": 0.9466770887374878, "learning_rate": 0.0002, "epoch": 3.4773139745916515, "step": 4790}, {"loss": 0.7767, "grad_norm": 1.1527155637741089, "learning_rate": 0.0002, "epoch": 3.484573502722323, "step": 4800}, {"loss": 0.6882, "grad_norm": 1.1288906335830688, "learning_rate": 0.0002, "epoch": 3.4918330308529946, "step": 4810}, {"loss": 0.6564, "grad_norm": 0.9096164107322693, "learning_rate": 0.0002, "epoch": 3.499092558983666, "step": 4820}, {"loss": 0.6127, "grad_norm": 0.7988565564155579, "learning_rate": 0.0002, "epoch": 3.5063520871143377, "step": 4830}, {"loss": 0.7004, "grad_norm": 0.7183415293693542, "learning_rate": 0.0002, "epoch": 3.513611615245009, "step": 4840}, {"loss": 0.74, "grad_norm": 0.6614915132522583, "learning_rate": 0.0002, "epoch": 3.5208711433756807, "step": 4850}, {"loss": 0.7271, "grad_norm": 0.8609521985054016, "learning_rate": 0.0002, "epoch": 3.528130671506352, "step": 4860}, {"loss": 0.6664, "grad_norm": 0.86552894115448, "learning_rate": 0.0002, "epoch": 3.535390199637024, "step": 4870}, {"loss": 0.6432, "grad_norm": 0.6926496028900146, "learning_rate": 0.0002, "epoch": 3.542649727767695, "step": 4880}, {"loss": 0.7117, "grad_norm": 0.8157467246055603, "learning_rate": 0.0002, "epoch": 3.5499092558983665, "step": 4890}, {"loss": 0.6201, "grad_norm": 0.9085357189178467, "learning_rate": 0.0002, "epoch": 3.557168784029038, "step": 4900}, {"loss": 0.6521, "grad_norm": 0.6322644948959351, "learning_rate": 0.0002, "epoch": 3.5644283121597096, "step": 4910}, {"loss": 0.6607, "grad_norm": 1.263205885887146, "learning_rate": 0.0002, "epoch": 3.571687840290381, "step": 4920}, {"loss": 0.6657, "grad_norm": 0.8901070356369019, "learning_rate": 0.0002, "epoch": 3.5789473684210527, "step": 4930}, {"loss": 0.6434, "grad_norm": 0.7983952164649963, "learning_rate": 0.0002, "epoch": 3.586206896551724, "step": 4940}, {"loss": 0.6861, "grad_norm": 0.9887813925743103, "learning_rate": 0.0002, "epoch": 3.5934664246823957, "step": 4950}, {"loss": 0.6502, "grad_norm": 0.7895187735557556, "learning_rate": 0.0002, "epoch": 3.6007259528130673, "step": 4960}, {"loss": 0.7111, "grad_norm": 0.9685819745063782, "learning_rate": 0.0002, "epoch": 3.6079854809437384, "step": 4970}, {"loss": 0.6915, "grad_norm": 0.6576591730117798, "learning_rate": 0.0002, "epoch": 3.6152450090744104, "step": 4980}, {"loss": 0.6195, "grad_norm": 0.856985330581665, "learning_rate": 0.0002, "epoch": 3.6225045372050815, "step": 4990}, {"loss": 0.6318, "grad_norm": 0.7230252623558044, "learning_rate": 0.0002, "epoch": 3.629764065335753, "step": 5000}, {"loss": 0.742, "grad_norm": 0.8260893821716309, "learning_rate": 0.0002, "epoch": 3.6370235934664246, "step": 5010}, {"loss": 0.7223, "grad_norm": 0.7635950446128845, "learning_rate": 0.0002, "epoch": 3.644283121597096, "step": 5020}, {"loss": 0.6837, "grad_norm": 0.7060768604278564, "learning_rate": 0.0002, "epoch": 3.6515426497277677, "step": 5030}, {"loss": 0.6921, "grad_norm": 0.8020303249359131, "learning_rate": 0.0002, "epoch": 3.658802177858439, "step": 5040}, {"loss": 0.6446, "grad_norm": 0.8530341386795044, "learning_rate": 0.0002, "epoch": 3.6660617059891107, "step": 5050}, {"loss": 0.7222, "grad_norm": 0.6667101979255676, "learning_rate": 0.0002, "epoch": 3.6733212341197823, "step": 5060}, {"loss": 0.7081, "grad_norm": 0.7385406494140625, "learning_rate": 0.0002, "epoch": 3.680580762250454, "step": 5070}, {"loss": 0.7035, "grad_norm": 0.7753380537033081, "learning_rate": 0.0002, "epoch": 3.6878402903811254, "step": 5080}, {"loss": 0.6491, "grad_norm": 0.7516207098960876, "learning_rate": 0.0002, "epoch": 3.695099818511797, "step": 5090}, {"loss": 0.672, "grad_norm": 0.8171586394309998, "learning_rate": 0.0002, "epoch": 3.702359346642468, "step": 5100}, {"loss": 0.7459, "grad_norm": 1.0796279907226562, "learning_rate": 0.0002, "epoch": 3.70961887477314, "step": 5110}, {"loss": 0.5948, "grad_norm": 0.6957688927650452, "learning_rate": 0.0002, "epoch": 3.716878402903811, "step": 5120}, {"loss": 0.7515, "grad_norm": 0.8550161719322205, "learning_rate": 0.0002, "epoch": 3.7241379310344827, "step": 5130}, {"loss": 0.7286, "grad_norm": 0.9396728277206421, "learning_rate": 0.0002, "epoch": 3.731397459165154, "step": 5140}, {"loss": 0.7594, "grad_norm": 1.4264805316925049, "learning_rate": 0.0002, "epoch": 3.7386569872958257, "step": 5150}, {"loss": 0.6575, "grad_norm": 0.8725108504295349, "learning_rate": 0.0002, "epoch": 3.7459165154264973, "step": 5160}, {"loss": 0.6778, "grad_norm": 1.0346195697784424, "learning_rate": 0.0002, "epoch": 3.753176043557169, "step": 5170}, {"loss": 0.6371, "grad_norm": 0.5395554304122925, "learning_rate": 0.0002, "epoch": 3.7604355716878404, "step": 5180}, {"loss": 0.7308, "grad_norm": 1.3153616189956665, "learning_rate": 0.0002, "epoch": 3.767695099818512, "step": 5190}, {"loss": 0.78, "grad_norm": 0.9879828691482544, "learning_rate": 0.0002, "epoch": 3.7749546279491835, "step": 5200}, {"loss": 0.7068, "grad_norm": 0.8876672983169556, "learning_rate": 0.0002, "epoch": 3.7822141560798546, "step": 5210}, {"loss": 0.6283, "grad_norm": 0.8363267779350281, "learning_rate": 0.0002, "epoch": 3.7894736842105265, "step": 5220}, {"loss": 0.6255, "grad_norm": 0.637294590473175, "learning_rate": 0.0002, "epoch": 3.7967332123411976, "step": 5230}, {"loss": 0.6685, "grad_norm": 1.1408970355987549, "learning_rate": 0.0002, "epoch": 3.803992740471869, "step": 5240}, {"loss": 0.6761, "grad_norm": 1.0128360986709595, "learning_rate": 0.0002, "epoch": 3.8112522686025407, "step": 5250}, {"loss": 0.6764, "grad_norm": 0.8061144351959229, "learning_rate": 0.0002, "epoch": 3.8185117967332123, "step": 5260}, {"loss": 0.7254, "grad_norm": 0.9626626968383789, "learning_rate": 0.0002, "epoch": 3.825771324863884, "step": 5270}, {"loss": 0.7367, "grad_norm": 0.9013627171516418, "learning_rate": 0.0002, "epoch": 3.8330308529945554, "step": 5280}, {"loss": 0.6806, "grad_norm": 0.8411344289779663, "learning_rate": 0.0002, "epoch": 3.840290381125227, "step": 5290}, {"loss": 0.6818, "grad_norm": 0.7426059246063232, "learning_rate": 0.0002, "epoch": 3.8475499092558985, "step": 5300}, {"loss": 0.6748, "grad_norm": 1.003413438796997, "learning_rate": 0.0002, "epoch": 3.85480943738657, "step": 5310}, {"loss": 0.8554, "grad_norm": 0.7527840733528137, "learning_rate": 0.0002, "epoch": 3.862068965517241, "step": 5320}, {"loss": 0.7521, "grad_norm": 0.738610565662384, "learning_rate": 0.0002, "epoch": 3.869328493647913, "step": 5330}, {"loss": 0.7266, "grad_norm": 0.7277999520301819, "learning_rate": 0.0002, "epoch": 3.876588021778584, "step": 5340}, {"loss": 0.7503, "grad_norm": 0.5951359272003174, "learning_rate": 0.0002, "epoch": 3.8838475499092557, "step": 5350}, {"loss": 0.7447, "grad_norm": 1.043884038925171, "learning_rate": 0.0002, "epoch": 3.8911070780399273, "step": 5360}, {"loss": 0.6862, "grad_norm": 0.8436498045921326, "learning_rate": 0.0002, "epoch": 3.898366606170599, "step": 5370}, {"loss": 0.665, "grad_norm": 0.5603365302085876, "learning_rate": 0.0002, "epoch": 3.9056261343012704, "step": 5380}, {"loss": 0.7098, "grad_norm": 1.0128886699676514, "learning_rate": 0.0002, "epoch": 3.912885662431942, "step": 5390}, {"loss": 0.6707, "grad_norm": 0.7970930337905884, "learning_rate": 0.0002, "epoch": 3.9201451905626135, "step": 5400}, {"loss": 0.637, "grad_norm": 0.7699369192123413, "learning_rate": 0.0002, "epoch": 3.927404718693285, "step": 5410}, {"loss": 0.6742, "grad_norm": 0.800561249256134, "learning_rate": 0.0002, "epoch": 3.9346642468239565, "step": 5420}, {"loss": 0.7208, "grad_norm": 0.8020331859588623, "learning_rate": 0.0002, "epoch": 3.941923774954628, "step": 5430}, {"loss": 0.7294, "grad_norm": 0.7461140155792236, "learning_rate": 0.0002, "epoch": 3.9491833030852996, "step": 5440}, {"loss": 0.7013, "grad_norm": 0.8346918821334839, "learning_rate": 0.0002, "epoch": 3.9564428312159707, "step": 5450}, {"loss": 0.6289, "grad_norm": 0.9723302125930786, "learning_rate": 0.0002, "epoch": 3.9637023593466427, "step": 5460}, {"loss": 0.8029, "grad_norm": 0.6809740662574768, "learning_rate": 0.0002, "epoch": 3.970961887477314, "step": 5470}, {"loss": 0.6896, "grad_norm": 0.7353498339653015, "learning_rate": 0.0002, "epoch": 3.9782214156079854, "step": 5480}, {"loss": 0.6722, "grad_norm": 0.748009443283081, "learning_rate": 0.0002, "epoch": 3.985480943738657, "step": 5490}, {"loss": 0.6866, "grad_norm": 1.3656195402145386, "learning_rate": 0.0002, "epoch": 3.9927404718693285, "step": 5500}, {"loss": 0.7368, "grad_norm": 0.8402108550071716, "learning_rate": 0.0002, "epoch": 4.0, "step": 5510}, {"eval_loss": 1.17229425907135, "eval_runtime": 46.2554, "eval_samples_per_second": 9.426, "eval_steps_per_second": 1.189, "epoch": 4.0, "step": 5510}, {"loss": 0.4637, "grad_norm": 0.8601235747337341, "learning_rate": 0.0002, "epoch": 4.007259528130671, "step": 5520}, {"loss": 0.4717, "grad_norm": 1.2635200023651123, "learning_rate": 0.0002, "epoch": 4.014519056261343, "step": 5530}, {"loss": 0.503, "grad_norm": 1.0257477760314941, "learning_rate": 0.0002, "epoch": 4.021778584392014, "step": 5540}, {"loss": 0.4547, "grad_norm": 0.9436745047569275, "learning_rate": 0.0002, "epoch": 4.029038112522686, "step": 5550}, {"loss": 0.459, "grad_norm": 0.9443606734275818, "learning_rate": 0.0002, "epoch": 4.036297640653357, "step": 5560}, {"loss": 0.5386, "grad_norm": 1.3965742588043213, "learning_rate": 0.0002, "epoch": 4.043557168784029, "step": 5570}, {"loss": 0.4248, "grad_norm": 0.8973520398139954, "learning_rate": 0.0002, "epoch": 4.0508166969147, "step": 5580}, {"loss": 0.4111, "grad_norm": 0.9998409748077393, "learning_rate": 0.0002, "epoch": 4.058076225045372, "step": 5590}, {"loss": 0.4828, "grad_norm": 1.1213387250900269, "learning_rate": 0.0002, "epoch": 4.0653357531760435, "step": 5600}, {"loss": 0.439, "grad_norm": 0.7064558863639832, "learning_rate": 0.0002, "epoch": 4.072595281306715, "step": 5610}, {"loss": 0.4607, "grad_norm": 1.2390803098678589, "learning_rate": 0.0002, "epoch": 4.0798548094373865, "step": 5620}, {"loss": 0.5014, "grad_norm": 1.123469591140747, "learning_rate": 0.0002, "epoch": 4.087114337568058, "step": 5630}, {"loss": 0.513, "grad_norm": 1.229573369026184, "learning_rate": 0.0002, "epoch": 4.09437386569873, "step": 5640}, {"loss": 0.5258, "grad_norm": 1.7182831764221191, "learning_rate": 0.0002, "epoch": 4.101633393829401, "step": 5650}, {"loss": 0.5371, "grad_norm": 0.894903302192688, "learning_rate": 0.0002, "epoch": 4.108892921960073, "step": 5660}, {"loss": 0.4813, "grad_norm": 0.8754552006721497, "learning_rate": 0.0002, "epoch": 4.116152450090744, "step": 5670}, {"loss": 0.491, "grad_norm": 1.2401553392410278, "learning_rate": 0.0002, "epoch": 4.123411978221416, "step": 5680}, {"loss": 0.4549, "grad_norm": 0.8631148934364319, "learning_rate": 0.0002, "epoch": 4.130671506352087, "step": 5690}, {"loss": 0.487, "grad_norm": 1.1798022985458374, "learning_rate": 0.0002, "epoch": 4.137931034482759, "step": 5700}, {"loss": 0.4522, "grad_norm": 0.8344549536705017, "learning_rate": 0.0002, "epoch": 4.14519056261343, "step": 5710}, {"loss": 0.4559, "grad_norm": 1.2342697381973267, "learning_rate": 0.0002, "epoch": 4.152450090744102, "step": 5720}, {"loss": 0.53, "grad_norm": 1.1601094007492065, "learning_rate": 0.0002, "epoch": 4.159709618874773, "step": 5730}, {"loss": 0.4755, "grad_norm": 1.2925703525543213, "learning_rate": 0.0002, "epoch": 4.166969147005445, "step": 5740}, {"loss": 0.4973, "grad_norm": 1.0870997905731201, "learning_rate": 0.0002, "epoch": 4.174228675136116, "step": 5750}, {"loss": 0.5184, "grad_norm": 0.9077792763710022, "learning_rate": 0.0002, "epoch": 4.181488203266787, "step": 5760}, {"loss": 0.4606, "grad_norm": 1.009273886680603, "learning_rate": 0.0002, "epoch": 4.188747731397459, "step": 5770}, {"loss": 0.5383, "grad_norm": 1.2465530633926392, "learning_rate": 0.0002, "epoch": 4.19600725952813, "step": 5780}, {"loss": 0.4938, "grad_norm": 1.2261253595352173, "learning_rate": 0.0002, "epoch": 4.203266787658802, "step": 5790}, {"loss": 0.5255, "grad_norm": 1.1498041152954102, "learning_rate": 0.0002, "epoch": 4.2105263157894735, "step": 5800}, {"loss": 0.5269, "grad_norm": 1.1966725587844849, "learning_rate": 0.0002, "epoch": 4.217785843920145, "step": 5810}, {"loss": 0.5626, "grad_norm": 1.2651296854019165, "learning_rate": 0.0002, "epoch": 4.2250453720508165, "step": 5820}, {"loss": 0.5213, "grad_norm": 1.0388574600219727, "learning_rate": 0.0002, "epoch": 4.2323049001814885, "step": 5830}, {"loss": 0.4965, "grad_norm": 1.3042771816253662, "learning_rate": 0.0002, "epoch": 4.23956442831216, "step": 5840}, {"loss": 0.5116, "grad_norm": 1.1127727031707764, "learning_rate": 0.0002, "epoch": 4.246823956442832, "step": 5850}, {"loss": 0.5197, "grad_norm": 0.9653958082199097, "learning_rate": 0.0002, "epoch": 4.254083484573503, "step": 5860}, {"loss": 0.4414, "grad_norm": 1.0500504970550537, "learning_rate": 0.0002, "epoch": 4.261343012704174, "step": 5870}, {"loss": 0.544, "grad_norm": 1.1476165056228638, "learning_rate": 0.0002, "epoch": 4.268602540834846, "step": 5880}, {"loss": 0.4667, "grad_norm": 0.9424414038658142, "learning_rate": 0.0002, "epoch": 4.275862068965517, "step": 5890}, {"loss": 0.5039, "grad_norm": 1.3309166431427002, "learning_rate": 0.0002, "epoch": 4.283121597096189, "step": 5900}, {"loss": 0.5472, "grad_norm": 1.3025873899459839, "learning_rate": 0.0002, "epoch": 4.29038112522686, "step": 5910}, {"loss": 0.4644, "grad_norm": 1.1442325115203857, "learning_rate": 0.0002, "epoch": 4.297640653357532, "step": 5920}, {"loss": 0.5066, "grad_norm": 0.9820859432220459, "learning_rate": 0.0002, "epoch": 4.304900181488203, "step": 5930}, {"loss": 0.5224, "grad_norm": 0.9615740180015564, "learning_rate": 0.0002, "epoch": 4.312159709618875, "step": 5940}, {"loss": 0.5665, "grad_norm": 1.1627109050750732, "learning_rate": 0.0002, "epoch": 4.319419237749546, "step": 5950}, {"loss": 0.4852, "grad_norm": 0.9381322860717773, "learning_rate": 0.0002, "epoch": 4.326678765880218, "step": 5960}, {"loss": 0.4532, "grad_norm": 0.8154335618019104, "learning_rate": 0.0002, "epoch": 4.333938294010889, "step": 5970}, {"loss": 0.5237, "grad_norm": 0.877671480178833, "learning_rate": 0.0002, "epoch": 4.341197822141561, "step": 5980}, {"loss": 0.6113, "grad_norm": 1.1742031574249268, "learning_rate": 0.0002, "epoch": 4.348457350272232, "step": 5990}, {"loss": 0.5704, "grad_norm": 1.0352917909622192, "learning_rate": 0.0002, "epoch": 4.3557168784029034, "step": 6000}, {"loss": 0.4996, "grad_norm": 0.9963878989219666, "learning_rate": 0.0002, "epoch": 4.362976406533575, "step": 6010}, {"loss": 0.4813, "grad_norm": 1.1892237663269043, "learning_rate": 0.0002, "epoch": 4.3702359346642465, "step": 6020}, {"loss": 0.5224, "grad_norm": 1.2516111135482788, "learning_rate": 0.0002, "epoch": 4.3774954627949185, "step": 6030}, {"loss": 0.5952, "grad_norm": 1.2111951112747192, "learning_rate": 0.0002, "epoch": 4.38475499092559, "step": 6040}, {"loss": 0.4275, "grad_norm": 1.0820083618164062, "learning_rate": 0.0002, "epoch": 4.392014519056262, "step": 6050}, {"loss": 0.5117, "grad_norm": 1.033915638923645, "learning_rate": 0.0002, "epoch": 4.399274047186933, "step": 6060}, {"loss": 0.5431, "grad_norm": 1.0635870695114136, "learning_rate": 0.0002, "epoch": 4.406533575317605, "step": 6070}, {"loss": 0.5341, "grad_norm": 1.0520414113998413, "learning_rate": 0.0002, "epoch": 4.413793103448276, "step": 6080}, {"loss": 0.512, "grad_norm": 1.0821926593780518, "learning_rate": 0.0002, "epoch": 4.421052631578947, "step": 6090}, {"loss": 0.5065, "grad_norm": 1.0533246994018555, "learning_rate": 0.0002, "epoch": 4.428312159709619, "step": 6100}, {"loss": 0.4577, "grad_norm": 0.9231932759284973, "learning_rate": 0.0002, "epoch": 4.43557168784029, "step": 6110}, {"loss": 0.583, "grad_norm": 0.9910260438919067, "learning_rate": 0.0002, "epoch": 4.442831215970962, "step": 6120}, {"loss": 0.4717, "grad_norm": 1.061949372291565, "learning_rate": 0.0002, "epoch": 4.450090744101633, "step": 6130}, {"loss": 0.5893, "grad_norm": 1.2927039861679077, "learning_rate": 0.0002, "epoch": 4.457350272232305, "step": 6140}, {"loss": 0.4684, "grad_norm": 1.3966081142425537, "learning_rate": 0.0002, "epoch": 4.464609800362976, "step": 6150}, {"loss": 0.5507, "grad_norm": 1.3835992813110352, "learning_rate": 0.0002, "epoch": 4.471869328493648, "step": 6160}, {"loss": 0.5911, "grad_norm": 1.0892692804336548, "learning_rate": 0.0002, "epoch": 4.479128856624319, "step": 6170}, {"loss": 0.478, "grad_norm": 1.0318800210952759, "learning_rate": 0.0002, "epoch": 4.486388384754991, "step": 6180}, {"loss": 0.5198, "grad_norm": 0.8174677491188049, "learning_rate": 0.0002, "epoch": 4.493647912885662, "step": 6190}, {"loss": 0.5387, "grad_norm": 1.4157509803771973, "learning_rate": 0.0002, "epoch": 4.500907441016334, "step": 6200}, {"loss": 0.5868, "grad_norm": 1.5244114398956299, "learning_rate": 0.0002, "epoch": 4.508166969147005, "step": 6210}, {"loss": 0.4642, "grad_norm": 0.8164850473403931, "learning_rate": 0.0002, "epoch": 4.5154264972776765, "step": 6220}, {"loss": 0.522, "grad_norm": 1.2904746532440186, "learning_rate": 0.0002, "epoch": 4.5226860254083485, "step": 6230}, {"loss": 0.5103, "grad_norm": 0.7987732887268066, "learning_rate": 0.0002, "epoch": 4.52994555353902, "step": 6240}, {"loss": 0.4615, "grad_norm": 0.831040620803833, "learning_rate": 0.0002, "epoch": 4.537205081669692, "step": 6250}, {"loss": 0.5065, "grad_norm": 0.9545485973358154, "learning_rate": 0.0002, "epoch": 4.544464609800363, "step": 6260}, {"loss": 0.5515, "grad_norm": 0.9291793704032898, "learning_rate": 0.0002, "epoch": 4.551724137931035, "step": 6270}, {"loss": 0.4535, "grad_norm": 0.8977208733558655, "learning_rate": 0.0002, "epoch": 4.558983666061706, "step": 6280}, {"loss": 0.544, "grad_norm": 1.1768537759780884, "learning_rate": 0.0002, "epoch": 4.566243194192378, "step": 6290}, {"loss": 0.5925, "grad_norm": 1.0688952207565308, "learning_rate": 0.0002, "epoch": 4.573502722323049, "step": 6300}, {"loss": 0.5207, "grad_norm": 0.8800966739654541, "learning_rate": 0.0002, "epoch": 4.580762250453721, "step": 6310}, {"loss": 0.6106, "grad_norm": 1.0911834239959717, "learning_rate": 0.0002, "epoch": 4.588021778584392, "step": 6320}, {"loss": 0.5109, "grad_norm": 1.1420872211456299, "learning_rate": 0.0002, "epoch": 4.595281306715064, "step": 6330}, {"loss": 0.5147, "grad_norm": 1.0215224027633667, "learning_rate": 0.0002, "epoch": 4.602540834845735, "step": 6340}, {"loss": 0.592, "grad_norm": 0.9685489535331726, "learning_rate": 0.0002, "epoch": 4.609800362976406, "step": 6350}, {"loss": 0.5775, "grad_norm": 1.12773597240448, "learning_rate": 0.0002, "epoch": 4.617059891107078, "step": 6360}, {"loss": 0.5966, "grad_norm": 1.0663973093032837, "learning_rate": 0.0002, "epoch": 4.624319419237749, "step": 6370}, {"loss": 0.512, "grad_norm": 1.1707262992858887, "learning_rate": 0.0002, "epoch": 4.631578947368421, "step": 6380}, {"loss": 0.5497, "grad_norm": 1.0672980546951294, "learning_rate": 0.0002, "epoch": 4.638838475499092, "step": 6390}, {"loss": 0.5699, "grad_norm": 1.1464333534240723, "learning_rate": 0.0002, "epoch": 4.646098003629764, "step": 6400}, {"loss": 0.5514, "grad_norm": 1.070230484008789, "learning_rate": 0.0002, "epoch": 4.653357531760435, "step": 6410}, {"loss": 0.5013, "grad_norm": 0.9673764109611511, "learning_rate": 0.0002, "epoch": 4.660617059891107, "step": 6420}, {"loss": 0.5901, "grad_norm": 1.0189043283462524, "learning_rate": 0.0002, "epoch": 4.6678765880217785, "step": 6430}, {"loss": 0.5193, "grad_norm": 1.185896396636963, "learning_rate": 0.0002, "epoch": 4.67513611615245, "step": 6440}, {"loss": 0.5318, "grad_norm": 1.0682812929153442, "learning_rate": 0.0002, "epoch": 4.682395644283122, "step": 6450}, {"loss": 0.5773, "grad_norm": 1.3586071729660034, "learning_rate": 0.0002, "epoch": 4.689655172413794, "step": 6460}, {"loss": 0.5482, "grad_norm": 0.6561792492866516, "learning_rate": 0.0002, "epoch": 4.696914700544465, "step": 6470}, {"loss": 0.5711, "grad_norm": 1.1394113302230835, "learning_rate": 0.0002, "epoch": 4.704174228675136, "step": 6480}, {"loss": 0.5325, "grad_norm": 0.9683151245117188, "learning_rate": 0.0002, "epoch": 4.711433756805808, "step": 6490}, {"loss": 0.5139, "grad_norm": 1.0247553586959839, "learning_rate": 0.0002, "epoch": 4.718693284936479, "step": 6500}, {"loss": 0.5794, "grad_norm": 0.8046169281005859, "learning_rate": 0.0002, "epoch": 4.725952813067151, "step": 6510}, {"loss": 0.5471, "grad_norm": 1.0710240602493286, "learning_rate": 0.0002, "epoch": 4.733212341197822, "step": 6520}, {"loss": 0.4805, "grad_norm": 0.9438924193382263, "learning_rate": 0.0002, "epoch": 4.740471869328494, "step": 6530}, {"loss": 0.5404, "grad_norm": 0.869162380695343, "learning_rate": 0.0002, "epoch": 4.747731397459165, "step": 6540}, {"loss": 0.6379, "grad_norm": 0.9776787161827087, "learning_rate": 0.0002, "epoch": 4.754990925589837, "step": 6550}, {"loss": 0.5288, "grad_norm": 1.1990505456924438, "learning_rate": 0.0002, "epoch": 4.762250453720508, "step": 6560}, {"loss": 0.5539, "grad_norm": 1.0582209825515747, "learning_rate": 0.0002, "epoch": 4.769509981851179, "step": 6570}, {"loss": 0.489, "grad_norm": 0.9966367483139038, "learning_rate": 0.0002, "epoch": 4.776769509981851, "step": 6580}, {"loss": 0.5514, "grad_norm": 0.9130612015724182, "learning_rate": 0.0002, "epoch": 4.784029038112522, "step": 6590}, {"loss": 0.5864, "grad_norm": 1.0950500965118408, "learning_rate": 0.0002, "epoch": 4.791288566243194, "step": 6600}, {"loss": 0.5266, "grad_norm": 1.108681321144104, "learning_rate": 0.0002, "epoch": 4.798548094373865, "step": 6610}, {"loss": 0.5875, "grad_norm": 1.1873763799667358, "learning_rate": 0.0002, "epoch": 4.805807622504537, "step": 6620}, {"loss": 0.5736, "grad_norm": 1.305367112159729, "learning_rate": 0.0002, "epoch": 4.8130671506352085, "step": 6630}, {"loss": 0.5636, "grad_norm": 1.2801482677459717, "learning_rate": 0.0002, "epoch": 4.8203266787658805, "step": 6640}, {"loss": 0.582, "grad_norm": 1.26764976978302, "learning_rate": 0.0002, "epoch": 4.827586206896552, "step": 6650}, {"loss": 0.5259, "grad_norm": 1.0018208026885986, "learning_rate": 0.0002, "epoch": 4.834845735027224, "step": 6660}, {"loss": 0.548, "grad_norm": 1.2326326370239258, "learning_rate": 0.0002, "epoch": 4.842105263157895, "step": 6670}, {"loss": 0.5933, "grad_norm": 0.9707282781600952, "learning_rate": 0.0002, "epoch": 4.849364791288567, "step": 6680}, {"loss": 0.5612, "grad_norm": 1.2772048711776733, "learning_rate": 0.0002, "epoch": 4.856624319419238, "step": 6690}, {"loss": 0.5346, "grad_norm": 2.6652262210845947, "learning_rate": 0.0002, "epoch": 4.863883847549909, "step": 6700}, {"loss": 0.5428, "grad_norm": 1.215828537940979, "learning_rate": 0.0002, "epoch": 4.871143375680581, "step": 6710}, {"loss": 0.6571, "grad_norm": 1.3704510927200317, "learning_rate": 0.0002, "epoch": 4.878402903811252, "step": 6720}, {"loss": 0.4963, "grad_norm": 0.7781757116317749, "learning_rate": 0.0002, "epoch": 4.885662431941924, "step": 6730}, {"loss": 0.5989, "grad_norm": 1.1883646249771118, "learning_rate": 0.0002, "epoch": 4.892921960072595, "step": 6740}, {"loss": 0.6067, "grad_norm": 0.9216066002845764, "learning_rate": 0.0002, "epoch": 4.900181488203267, "step": 6750}, {"loss": 0.5085, "grad_norm": 1.0558464527130127, "learning_rate": 0.0002, "epoch": 4.907441016333938, "step": 6760}, {"loss": 0.5216, "grad_norm": 1.032656192779541, "learning_rate": 0.0002, "epoch": 4.91470054446461, "step": 6770}, {"loss": 0.5426, "grad_norm": 1.1261441707611084, "learning_rate": 0.0002, "epoch": 4.921960072595281, "step": 6780}, {"loss": 0.5295, "grad_norm": 1.2178640365600586, "learning_rate": 0.0002, "epoch": 4.929219600725952, "step": 6790}, {"loss": 0.5476, "grad_norm": 1.5369361639022827, "learning_rate": 0.0002, "epoch": 4.936479128856624, "step": 6800}, {"loss": 0.5358, "grad_norm": 1.1188377141952515, "learning_rate": 0.0002, "epoch": 4.943738656987296, "step": 6810}, {"loss": 0.5483, "grad_norm": 1.2506113052368164, "learning_rate": 0.0002, "epoch": 4.950998185117967, "step": 6820}, {"loss": 0.567, "grad_norm": 0.8776047825813293, "learning_rate": 0.0002, "epoch": 4.9582577132486385, "step": 6830}, {"loss": 0.5764, "grad_norm": 0.9700555205345154, "learning_rate": 0.0002, "epoch": 4.9655172413793105, "step": 6840}, {"loss": 0.5396, "grad_norm": 1.2713534832000732, "learning_rate": 0.0002, "epoch": 4.972776769509982, "step": 6850}, {"loss": 0.5451, "grad_norm": 0.9855955243110657, "learning_rate": 0.0002, "epoch": 4.980036297640654, "step": 6860}, {"loss": 0.5884, "grad_norm": 0.8734853863716125, "learning_rate": 0.0002, "epoch": 4.987295825771325, "step": 6870}, {"loss": 0.5189, "grad_norm": 0.8065403699874878, "learning_rate": 0.0002, "epoch": 4.994555353901997, "step": 6880}, {"eval_loss": 1.3302682638168335, "eval_runtime": 46.2496, "eval_samples_per_second": 9.427, "eval_steps_per_second": 1.189, "epoch": 4.999637023593467, "step": 6887}, {"loss": 0.4889, "grad_norm": 0.5163813829421997, "learning_rate": 0.0002, "epoch": 5.001814882032668, "step": 6890}, {"loss": 0.3545, "grad_norm": 1.1496137380599976, "learning_rate": 0.0002, "epoch": 5.00907441016334, "step": 6900}, {"loss": 0.39, "grad_norm": 1.0133885145187378, "learning_rate": 0.0002, "epoch": 5.016333938294011, "step": 6910}, {"loss": 0.3693, "grad_norm": 0.9479621052742004, "learning_rate": 0.0002, "epoch": 5.023593466424682, "step": 6920}, {"loss": 0.4012, "grad_norm": 0.8587583303451538, "learning_rate": 0.0002, "epoch": 5.030852994555354, "step": 6930}, {"loss": 0.3428, "grad_norm": 1.3314697742462158, "learning_rate": 0.0002, "epoch": 5.038112522686025, "step": 6940}, {"loss": 0.3909, "grad_norm": 1.195448875427246, "learning_rate": 0.0002, "epoch": 5.045372050816697, "step": 6950}, {"loss": 0.3322, "grad_norm": 1.2482256889343262, "learning_rate": 0.0002, "epoch": 5.052631578947368, "step": 6960}, {"loss": 0.3893, "grad_norm": 1.2011528015136719, "learning_rate": 0.0002, "epoch": 5.05989110707804, "step": 6970}, {"loss": 0.3265, "grad_norm": 1.3997188806533813, "learning_rate": 0.0002, "epoch": 5.067150635208711, "step": 6980}, {"loss": 0.3716, "grad_norm": 1.2147513628005981, "learning_rate": 0.0002, "epoch": 5.074410163339383, "step": 6990}, {"loss": 0.4053, "grad_norm": 1.6030137538909912, "learning_rate": 0.0002, "epoch": 5.081669691470054, "step": 7000}, {"loss": 0.3665, "grad_norm": 0.9466970562934875, "learning_rate": 0.0002, "epoch": 5.088929219600726, "step": 7010}, {"loss": 0.3451, "grad_norm": 1.4593411684036255, "learning_rate": 0.0002, "epoch": 5.096188747731397, "step": 7020}, {"loss": 0.3843, "grad_norm": 1.2196033000946045, "learning_rate": 0.0002, "epoch": 5.103448275862069, "step": 7030}, {"loss": 0.3896, "grad_norm": 1.1341328620910645, "learning_rate": 0.0002, "epoch": 5.1107078039927405, "step": 7040}, {"loss": 0.3627, "grad_norm": 1.2248976230621338, "learning_rate": 0.0002, "epoch": 5.117967332123412, "step": 7050}, {"loss": 0.3784, "grad_norm": 1.1620593070983887, "learning_rate": 0.0002, "epoch": 5.125226860254084, "step": 7060}, {"loss": 0.3678, "grad_norm": 0.9300723671913147, "learning_rate": 0.0002, "epoch": 5.132486388384755, "step": 7070}, {"loss": 0.3756, "grad_norm": 1.2265169620513916, "learning_rate": 0.0002, "epoch": 5.139745916515427, "step": 7080}, {"loss": 0.3595, "grad_norm": 1.4430373907089233, "learning_rate": 0.0002, "epoch": 5.147005444646098, "step": 7090}, {"loss": 0.3788, "grad_norm": 1.0821576118469238, "learning_rate": 0.0002, "epoch": 5.15426497277677, "step": 7100}, {"loss": 0.383, "grad_norm": 1.2574739456176758, "learning_rate": 0.0002, "epoch": 5.161524500907441, "step": 7110}, {"loss": 0.3692, "grad_norm": 1.1806069612503052, "learning_rate": 0.0002, "epoch": 5.168784029038113, "step": 7120}, {"loss": 0.3978, "grad_norm": 0.9900956153869629, "learning_rate": 0.0002, "epoch": 5.176043557168784, "step": 7130}, {"loss": 0.4358, "grad_norm": 1.2414425611495972, "learning_rate": 0.0002, "epoch": 5.183303085299456, "step": 7140}, {"loss": 0.3485, "grad_norm": 0.8220699429512024, "learning_rate": 0.0002, "epoch": 5.190562613430127, "step": 7150}, {"loss": 0.3517, "grad_norm": 1.29408860206604, "learning_rate": 0.0002, "epoch": 5.197822141560798, "step": 7160}, {"loss": 0.3405, "grad_norm": 0.8510639071464539, "learning_rate": 0.0002, "epoch": 5.20508166969147, "step": 7170}, {"loss": 0.4233, "grad_norm": 1.3448902368545532, "learning_rate": 0.0002, "epoch": 5.212341197822141, "step": 7180}, {"loss": 0.3808, "grad_norm": 1.054451584815979, "learning_rate": 0.0002, "epoch": 5.219600725952813, "step": 7190}, {"loss": 0.368, "grad_norm": 1.3752713203430176, "learning_rate": 0.0002, "epoch": 5.226860254083484, "step": 7200}, {"loss": 0.3844, "grad_norm": 1.4848095178604126, "learning_rate": 0.0002, "epoch": 5.234119782214156, "step": 7210}, {"loss": 0.4187, "grad_norm": 1.428842544555664, "learning_rate": 0.0002, "epoch": 5.241379310344827, "step": 7220}, {"loss": 0.3778, "grad_norm": 1.1703591346740723, "learning_rate": 0.0002, "epoch": 5.248638838475499, "step": 7230}, {"loss": 0.417, "grad_norm": 1.2180451154708862, "learning_rate": 0.0002, "epoch": 5.2558983666061705, "step": 7240}, {"loss": 0.3656, "grad_norm": 1.094045877456665, "learning_rate": 0.0002, "epoch": 5.2631578947368425, "step": 7250}, {"loss": 0.4331, "grad_norm": 0.9545766115188599, "learning_rate": 0.0002, "epoch": 5.270417422867514, "step": 7260}, {"loss": 0.3642, "grad_norm": 0.8356652855873108, "learning_rate": 0.0002, "epoch": 5.277676950998185, "step": 7270}, {"loss": 0.3576, "grad_norm": 1.148160457611084, "learning_rate": 0.0002, "epoch": 5.284936479128857, "step": 7280}, {"loss": 0.4178, "grad_norm": 1.2009977102279663, "learning_rate": 0.0002, "epoch": 5.292196007259528, "step": 7290}, {"loss": 0.3977, "grad_norm": 1.3283873796463013, "learning_rate": 0.0002, "epoch": 5.2994555353902, "step": 7300}, {"loss": 0.3853, "grad_norm": 0.9850481748580933, "learning_rate": 0.0002, "epoch": 5.306715063520871, "step": 7310}, {"loss": 0.3645, "grad_norm": 1.367550015449524, "learning_rate": 0.0002, "epoch": 5.313974591651543, "step": 7320}, {"loss": 0.3898, "grad_norm": 0.8602936863899231, "learning_rate": 0.0002, "epoch": 5.321234119782214, "step": 7330}, {"loss": 0.4173, "grad_norm": 1.1130679845809937, "learning_rate": 0.0002, "epoch": 5.328493647912886, "step": 7340}, {"loss": 0.3642, "grad_norm": 1.3002253770828247, "learning_rate": 0.0002, "epoch": 5.335753176043557, "step": 7350}, {"loss": 0.4138, "grad_norm": 1.6235289573669434, "learning_rate": 0.0002, "epoch": 5.343012704174229, "step": 7360}, {"loss": 0.4779, "grad_norm": 1.156379222869873, "learning_rate": 0.0002, "epoch": 5.3502722323049, "step": 7370}, {"loss": 0.3222, "grad_norm": 1.0569308996200562, "learning_rate": 0.0002, "epoch": 5.357531760435572, "step": 7380}, {"loss": 0.3573, "grad_norm": 1.6674021482467651, "learning_rate": 0.0002, "epoch": 5.364791288566243, "step": 7390}, {"loss": 0.4325, "grad_norm": 1.2962018251419067, "learning_rate": 0.0002, "epoch": 5.372050816696914, "step": 7400}, {"loss": 0.3809, "grad_norm": 1.1904195547103882, "learning_rate": 0.0002, "epoch": 5.379310344827586, "step": 7410}, {"loss": 0.3728, "grad_norm": 1.316245675086975, "learning_rate": 0.0002, "epoch": 5.386569872958257, "step": 7420}, {"loss": 0.4096, "grad_norm": 1.127570390701294, "learning_rate": 0.0002, "epoch": 5.393829401088929, "step": 7430}, {"loss": 0.3933, "grad_norm": 1.3895777463912964, "learning_rate": 0.0002, "epoch": 5.4010889292196005, "step": 7440}, {"loss": 0.4085, "grad_norm": 1.626830816268921, "learning_rate": 0.0002, "epoch": 5.4083484573502725, "step": 7450}, {"loss": 0.4186, "grad_norm": 1.3703926801681519, "learning_rate": 0.0002, "epoch": 5.415607985480944, "step": 7460}, {"loss": 0.3517, "grad_norm": 1.3854840993881226, "learning_rate": 0.0002, "epoch": 5.422867513611616, "step": 7470}, {"loss": 0.3714, "grad_norm": 1.107065200805664, "learning_rate": 0.0002, "epoch": 5.430127041742287, "step": 7480}, {"loss": 0.3855, "grad_norm": 0.7843456268310547, "learning_rate": 0.0002, "epoch": 5.437386569872959, "step": 7490}, {"loss": 0.4159, "grad_norm": 1.6692372560501099, "learning_rate": 0.0002, "epoch": 5.44464609800363, "step": 7500}, {"loss": 0.4185, "grad_norm": 1.2583858966827393, "learning_rate": 0.0002, "epoch": 5.451905626134302, "step": 7510}, {"loss": 0.4401, "grad_norm": 1.6827000379562378, "learning_rate": 0.0002, "epoch": 5.459165154264973, "step": 7520}, {"loss": 0.397, "grad_norm": 1.6680560111999512, "learning_rate": 0.0002, "epoch": 5.466424682395644, "step": 7530}, {"loss": 0.4193, "grad_norm": 1.3696072101593018, "learning_rate": 0.0002, "epoch": 5.473684210526316, "step": 7540}, {"loss": 0.4244, "grad_norm": 1.4523496627807617, "learning_rate": 0.0002, "epoch": 5.480943738656987, "step": 7550}, {"loss": 0.3609, "grad_norm": 1.3432692289352417, "learning_rate": 0.0002, "epoch": 5.488203266787659, "step": 7560}, {"loss": 0.3675, "grad_norm": 1.363818645477295, "learning_rate": 0.0002, "epoch": 5.49546279491833, "step": 7570}, {"loss": 0.3726, "grad_norm": 1.0176721811294556, "learning_rate": 0.0002, "epoch": 5.502722323049002, "step": 7580}, {"loss": 0.3751, "grad_norm": 1.1625547409057617, "learning_rate": 0.0002, "epoch": 5.509981851179673, "step": 7590}, {"loss": 0.433, "grad_norm": 1.2480388879776, "learning_rate": 0.0002, "epoch": 5.517241379310345, "step": 7600}, {"loss": 0.4511, "grad_norm": 1.341509222984314, "learning_rate": 0.0002, "epoch": 5.524500907441016, "step": 7610}, {"loss": 0.4642, "grad_norm": 1.7048436403274536, "learning_rate": 0.0002, "epoch": 5.531760435571687, "step": 7620}, {"loss": 0.4509, "grad_norm": 1.1435480117797852, "learning_rate": 0.0002, "epoch": 5.539019963702359, "step": 7630}, {"loss": 0.4528, "grad_norm": 1.2381842136383057, "learning_rate": 0.0002, "epoch": 5.5462794918330305, "step": 7640}, {"loss": 0.4496, "grad_norm": 1.50786292552948, "learning_rate": 0.0002, "epoch": 5.5535390199637025, "step": 7650}, {"loss": 0.4242, "grad_norm": 1.2263519763946533, "learning_rate": 0.0002, "epoch": 5.560798548094374, "step": 7660}, {"loss": 0.418, "grad_norm": 1.2864696979522705, "learning_rate": 0.0002, "epoch": 5.568058076225046, "step": 7670}, {"loss": 0.3832, "grad_norm": 1.4443191289901733, "learning_rate": 0.0002, "epoch": 5.575317604355717, "step": 7680}, {"loss": 0.3964, "grad_norm": 1.3360971212387085, "learning_rate": 0.0002, "epoch": 5.582577132486389, "step": 7690}, {"loss": 0.4639, "grad_norm": 1.391828179359436, "learning_rate": 0.0002, "epoch": 5.58983666061706, "step": 7700}, {"loss": 0.4722, "grad_norm": 1.3699384927749634, "learning_rate": 0.0002, "epoch": 5.597096188747732, "step": 7710}, {"loss": 0.4302, "grad_norm": 1.3778468370437622, "learning_rate": 0.0002, "epoch": 5.604355716878403, "step": 7720}, {"loss": 0.4179, "grad_norm": 1.1009501218795776, "learning_rate": 0.0002, "epoch": 5.611615245009075, "step": 7730}, {"loss": 0.4104, "grad_norm": 1.0410021543502808, "learning_rate": 0.0002, "epoch": 5.618874773139746, "step": 7740}, {"loss": 0.4489, "grad_norm": 1.1012226343154907, "learning_rate": 0.0002, "epoch": 5.626134301270417, "step": 7750}, {"loss": 0.4544, "grad_norm": 1.3246384859085083, "learning_rate": 0.0002, "epoch": 5.633393829401089, "step": 7760}, {"loss": 0.4381, "grad_norm": 1.4301716089248657, "learning_rate": 0.0002, "epoch": 5.64065335753176, "step": 7770}, {"loss": 0.4297, "grad_norm": 1.1368978023529053, "learning_rate": 0.0002, "epoch": 5.647912885662432, "step": 7780}, {"loss": 0.4063, "grad_norm": 1.3493064641952515, "learning_rate": 0.0002, "epoch": 5.655172413793103, "step": 7790}, {"loss": 0.4562, "grad_norm": 1.3328721523284912, "learning_rate": 0.0002, "epoch": 5.662431941923775, "step": 7800}, {"loss": 0.4075, "grad_norm": 1.3235671520233154, "learning_rate": 0.0002, "epoch": 5.669691470054446, "step": 7810}, {"loss": 0.4589, "grad_norm": 1.1961841583251953, "learning_rate": 0.0002, "epoch": 5.676950998185118, "step": 7820}, {"loss": 0.4503, "grad_norm": 1.4189636707305908, "learning_rate": 0.0002, "epoch": 5.684210526315789, "step": 7830}, {"loss": 0.4452, "grad_norm": 1.3551312685012817, "learning_rate": 0.0002, "epoch": 5.691470054446461, "step": 7840}, {"loss": 0.4268, "grad_norm": 1.449987769126892, "learning_rate": 0.0002, "epoch": 5.6987295825771325, "step": 7850}, {"loss": 0.4141, "grad_norm": 1.1225156784057617, "learning_rate": 0.0002, "epoch": 5.7059891107078045, "step": 7860}, {"loss": 0.41, "grad_norm": 1.4734594821929932, "learning_rate": 0.0002, "epoch": 5.713248638838476, "step": 7870}, {"loss": 0.4013, "grad_norm": 1.3793359994888306, "learning_rate": 0.0002, "epoch": 5.720508166969147, "step": 7880}, {"loss": 0.4065, "grad_norm": 1.2431834936141968, "learning_rate": 0.0002, "epoch": 5.727767695099819, "step": 7890}, {"loss": 0.4595, "grad_norm": 1.1158313751220703, "learning_rate": 0.0002, "epoch": 5.73502722323049, "step": 7900}, {"loss": 0.4342, "grad_norm": 1.212248682975769, "learning_rate": 0.0002, "epoch": 5.742286751361162, "step": 7910}, {"loss": 0.4611, "grad_norm": 1.5259995460510254, "learning_rate": 0.0002, "epoch": 5.749546279491833, "step": 7920}, {"loss": 0.4483, "grad_norm": 1.3909121751785278, "learning_rate": 0.0002, "epoch": 5.756805807622505, "step": 7930}, {"loss": 0.4325, "grad_norm": 1.2511249780654907, "learning_rate": 0.0002, "epoch": 5.764065335753176, "step": 7940}, {"loss": 0.4048, "grad_norm": 1.2511906623840332, "learning_rate": 0.0002, "epoch": 5.771324863883848, "step": 7950}, {"loss": 0.3715, "grad_norm": 1.1489921808242798, "learning_rate": 0.0002, "epoch": 5.778584392014519, "step": 7960}, {"loss": 0.4196, "grad_norm": 1.028943419456482, "learning_rate": 0.0002, "epoch": 5.78584392014519, "step": 7970}, {"loss": 0.4334, "grad_norm": 1.0820423364639282, "learning_rate": 0.0002, "epoch": 5.793103448275862, "step": 7980}, {"loss": 0.3917, "grad_norm": 1.296520471572876, "learning_rate": 0.0002, "epoch": 5.800362976406533, "step": 7990}, {"loss": 0.4509, "grad_norm": 1.3597749471664429, "learning_rate": 0.0002, "epoch": 5.807622504537205, "step": 8000}, {"loss": 0.4535, "grad_norm": 0.8741790652275085, "learning_rate": 0.0002, "epoch": 5.814882032667876, "step": 8010}, {"loss": 0.4239, "grad_norm": 1.1471822261810303, "learning_rate": 0.0002, "epoch": 5.822141560798548, "step": 8020}, {"loss": 0.5042, "grad_norm": 1.2997334003448486, "learning_rate": 0.0002, "epoch": 5.829401088929219, "step": 8030}, {"loss": 0.4758, "grad_norm": 1.1027175188064575, "learning_rate": 0.0002, "epoch": 5.836660617059891, "step": 8040}, {"loss": 0.4192, "grad_norm": 1.2695307731628418, "learning_rate": 0.0002, "epoch": 5.8439201451905625, "step": 8050}, {"loss": 0.5173, "grad_norm": 1.5275461673736572, "learning_rate": 0.0002, "epoch": 5.8511796733212345, "step": 8060}, {"loss": 0.5012, "grad_norm": 1.3059501647949219, "learning_rate": 0.0002, "epoch": 5.8584392014519056, "step": 8070}, {"loss": 0.4425, "grad_norm": 1.57442045211792, "learning_rate": 0.0002, "epoch": 5.8656987295825775, "step": 8080}, {"loss": 0.4261, "grad_norm": 1.119564414024353, "learning_rate": 0.0002, "epoch": 5.872958257713249, "step": 8090}, {"loss": 0.465, "grad_norm": 1.6517373323440552, "learning_rate": 0.0002, "epoch": 5.88021778584392, "step": 8100}, {"loss": 0.4406, "grad_norm": 1.4093554019927979, "learning_rate": 0.0002, "epoch": 5.887477313974592, "step": 8110}, {"loss": 0.4433, "grad_norm": 1.278843641281128, "learning_rate": 0.0002, "epoch": 5.894736842105263, "step": 8120}, {"loss": 0.4007, "grad_norm": 1.2042944431304932, "learning_rate": 0.0002, "epoch": 5.901996370235935, "step": 8130}, {"loss": 0.3972, "grad_norm": 1.1788326501846313, "learning_rate": 0.0002, "epoch": 5.909255898366606, "step": 8140}, {"loss": 0.4506, "grad_norm": 1.4364569187164307, "learning_rate": 0.0002, "epoch": 5.916515426497278, "step": 8150}, {"loss": 0.4651, "grad_norm": 1.1704283952713013, "learning_rate": 0.0002, "epoch": 5.923774954627949, "step": 8160}, {"loss": 0.3972, "grad_norm": 1.040814995765686, "learning_rate": 0.0002, "epoch": 5.931034482758621, "step": 8170}, {"loss": 0.4038, "grad_norm": 1.1367416381835938, "learning_rate": 0.0002, "epoch": 5.938294010889292, "step": 8180}, {"loss": 0.4387, "grad_norm": 1.3401511907577515, "learning_rate": 0.0002, "epoch": 5.945553539019964, "step": 8190}, {"loss": 0.4396, "grad_norm": 1.1154041290283203, "learning_rate": 0.0002, "epoch": 5.952813067150635, "step": 8200}, {"loss": 0.4744, "grad_norm": 1.426089882850647, "learning_rate": 0.0002, "epoch": 5.960072595281307, "step": 8210}, {"loss": 0.4105, "grad_norm": 1.3170222043991089, "learning_rate": 0.0002, "epoch": 5.967332123411978, "step": 8220}, {"loss": 0.4137, "grad_norm": 1.1960029602050781, "learning_rate": 0.0002, "epoch": 5.974591651542649, "step": 8230}, {"loss": 0.423, "grad_norm": 1.0843931436538696, "learning_rate": 0.0002, "epoch": 5.981851179673321, "step": 8240}, {"loss": 0.459, "grad_norm": 1.050421118736267, "learning_rate": 0.0002, "epoch": 5.9891107078039925, "step": 8250}, {"loss": 0.3993, "grad_norm": 1.0183138847351074, "learning_rate": 0.0002, "epoch": 5.9963702359346644, "step": 8260}, {"eval_loss": 1.4677470922470093, "eval_runtime": 46.2504, "eval_samples_per_second": 9.427, "eval_steps_per_second": 1.189, "epoch": 6.0, "step": 8265}, {"loss": 0.3947, "grad_norm": 1.1702998876571655, "learning_rate": 0.0002, "epoch": 6.0036297640653356, "step": 8270}, {"loss": 0.2854, "grad_norm": 1.5389727354049683, "learning_rate": 0.0002, "epoch": 6.0108892921960075, "step": 8280}, {"loss": 0.2603, "grad_norm": 1.502568244934082, "learning_rate": 0.0002, "epoch": 6.018148820326679, "step": 8290}, {"loss": 0.3329, "grad_norm": 1.3846043348312378, "learning_rate": 0.0002, "epoch": 6.025408348457351, "step": 8300}, {"loss": 0.2651, "grad_norm": 1.173553228378296, "learning_rate": 0.0002, "epoch": 6.032667876588022, "step": 8310}, {"loss": 0.3142, "grad_norm": 1.5325932502746582, "learning_rate": 0.0002, "epoch": 6.039927404718694, "step": 8320}, {"loss": 0.2511, "grad_norm": 1.303783655166626, "learning_rate": 0.0002, "epoch": 6.047186932849365, "step": 8330}, {"loss": 0.2352, "grad_norm": 0.9408994913101196, "learning_rate": 0.0002, "epoch": 6.054446460980036, "step": 8340}, {"loss": 0.2548, "grad_norm": 1.5430388450622559, "learning_rate": 0.0002, "epoch": 6.061705989110708, "step": 8350}, {"loss": 0.2682, "grad_norm": 0.8765342235565186, "learning_rate": 0.0002, "epoch": 6.068965517241379, "step": 8360}, {"loss": 0.2614, "grad_norm": 1.2363157272338867, "learning_rate": 0.0002, "epoch": 6.076225045372051, "step": 8370}, {"loss": 0.294, "grad_norm": 1.21284818649292, "learning_rate": 0.0002, "epoch": 6.083484573502722, "step": 8380}, {"loss": 0.2498, "grad_norm": 1.3261712789535522, "learning_rate": 0.0002, "epoch": 6.090744101633394, "step": 8390}, {"loss": 0.2649, "grad_norm": 1.077317714691162, "learning_rate": 0.0002, "epoch": 6.098003629764065, "step": 8400}, {"loss": 0.269, "grad_norm": 0.9873808026313782, "learning_rate": 0.0002, "epoch": 6.105263157894737, "step": 8410}, {"loss": 0.2736, "grad_norm": 1.032258152961731, "learning_rate": 0.0002, "epoch": 6.112522686025408, "step": 8420}, {"loss": 0.2854, "grad_norm": 1.1014811992645264, "learning_rate": 0.0002, "epoch": 6.11978221415608, "step": 8430}, {"loss": 0.2924, "grad_norm": 1.4264203310012817, "learning_rate": 0.0002, "epoch": 6.127041742286751, "step": 8440}, {"loss": 0.3388, "grad_norm": 1.4086531400680542, "learning_rate": 0.0002, "epoch": 6.1343012704174225, "step": 8450}, {"loss": 0.2786, "grad_norm": 1.3842453956604004, "learning_rate": 0.0002, "epoch": 6.1415607985480944, "step": 8460}, {"loss": 0.3201, "grad_norm": 1.4356757402420044, "learning_rate": 0.0002, "epoch": 6.1488203266787655, "step": 8470}, {"loss": 0.2908, "grad_norm": 1.193315029144287, "learning_rate": 0.0002, "epoch": 6.1560798548094375, "step": 8480}, {"loss": 0.342, "grad_norm": 1.0623924732208252, "learning_rate": 0.0002, "epoch": 6.163339382940109, "step": 8490}, {"loss": 0.3257, "grad_norm": 1.5484434366226196, "learning_rate": 0.0002, "epoch": 6.170598911070781, "step": 8500}, {"loss": 0.2861, "grad_norm": 1.3520029783248901, "learning_rate": 0.0002, "epoch": 6.177858439201452, "step": 8510}, {"loss": 0.3242, "grad_norm": 1.2773103713989258, "learning_rate": 0.0002, "epoch": 6.185117967332124, "step": 8520}, {"loss": 0.3108, "grad_norm": 1.4675105810165405, "learning_rate": 0.0002, "epoch": 6.192377495462795, "step": 8530}, {"loss": 0.3044, "grad_norm": 1.2118732929229736, "learning_rate": 0.0002, "epoch": 6.199637023593467, "step": 8540}, {"loss": 0.2726, "grad_norm": 1.264024257659912, "learning_rate": 0.0002, "epoch": 6.206896551724138, "step": 8550}, {"loss": 0.306, "grad_norm": 1.406931757926941, "learning_rate": 0.0002, "epoch": 6.21415607985481, "step": 8560}, {"loss": 0.2904, "grad_norm": 1.385459542274475, "learning_rate": 0.0002, "epoch": 6.221415607985481, "step": 8570}, {"loss": 0.3413, "grad_norm": 1.9336168766021729, "learning_rate": 0.0002, "epoch": 6.228675136116152, "step": 8580}, {"loss": 0.2769, "grad_norm": 0.9880136847496033, "learning_rate": 0.0002, "epoch": 6.235934664246824, "step": 8590}, {"loss": 0.3035, "grad_norm": 1.3870339393615723, "learning_rate": 0.0002, "epoch": 6.243194192377495, "step": 8600}, {"loss": 0.286, "grad_norm": 1.2303647994995117, "learning_rate": 0.0002, "epoch": 6.250453720508167, "step": 8610}, {"loss": 0.3113, "grad_norm": 1.5406211614608765, "learning_rate": 0.0002, "epoch": 6.257713248638838, "step": 8620}, {"loss": 0.292, "grad_norm": 1.2436790466308594, "learning_rate": 0.0002, "epoch": 6.26497277676951, "step": 8630}, {"loss": 0.3102, "grad_norm": 0.8844212293624878, "learning_rate": 0.0002, "epoch": 6.272232304900181, "step": 8640}, {"loss": 0.3373, "grad_norm": 1.2846336364746094, "learning_rate": 0.0002, "epoch": 6.279491833030853, "step": 8650}, {"loss": 0.3535, "grad_norm": 1.593814730644226, "learning_rate": 0.0002, "epoch": 6.286751361161524, "step": 8660}, {"loss": 0.3413, "grad_norm": 1.2277469635009766, "learning_rate": 0.0002, "epoch": 6.2940108892921955, "step": 8670}, {"loss": 0.2958, "grad_norm": 1.2574384212493896, "learning_rate": 0.0002, "epoch": 6.3012704174228675, "step": 8680}, {"loss": 0.3251, "grad_norm": 1.335150957107544, "learning_rate": 0.0002, "epoch": 6.308529945553539, "step": 8690}, {"loss": 0.3009, "grad_norm": 1.3140437602996826, "learning_rate": 0.0002, "epoch": 6.315789473684211, "step": 8700}, {"loss": 0.2783, "grad_norm": 1.1689209938049316, "learning_rate": 0.0002, "epoch": 6.323049001814882, "step": 8710}, {"loss": 0.3476, "grad_norm": 1.6448503732681274, "learning_rate": 0.0002, "epoch": 6.330308529945554, "step": 8720}, {"loss": 0.2934, "grad_norm": 0.9944145679473877, "learning_rate": 0.0002, "epoch": 6.337568058076225, "step": 8730}, {"loss": 0.3315, "grad_norm": 1.1775634288787842, "learning_rate": 0.0002, "epoch": 6.344827586206897, "step": 8740}, {"loss": 0.3514, "grad_norm": 1.8438639640808105, "learning_rate": 0.0002, "epoch": 6.352087114337568, "step": 8750}, {"loss": 0.303, "grad_norm": 1.062495470046997, "learning_rate": 0.0002, "epoch": 6.35934664246824, "step": 8760}, {"loss": 0.2737, "grad_norm": 1.3224315643310547, "learning_rate": 0.0002, "epoch": 6.366606170598911, "step": 8770}, {"loss": 0.3445, "grad_norm": 1.399844765663147, "learning_rate": 0.0002, "epoch": 6.373865698729583, "step": 8780}, {"loss": 0.3277, "grad_norm": 1.0409915447235107, "learning_rate": 0.0002, "epoch": 6.381125226860254, "step": 8790}, {"loss": 0.3218, "grad_norm": 1.5657726526260376, "learning_rate": 0.0002, "epoch": 6.388384754990925, "step": 8800}, {"loss": 0.3031, "grad_norm": 1.4098644256591797, "learning_rate": 0.0002, "epoch": 6.395644283121597, "step": 8810}, {"loss": 0.3133, "grad_norm": 1.5154732465744019, "learning_rate": 0.0002, "epoch": 6.402903811252268, "step": 8820}, {"loss": 0.3111, "grad_norm": 1.1139698028564453, "learning_rate": 0.0002, "epoch": 6.41016333938294, "step": 8830}, {"loss": 0.3553, "grad_norm": 1.4149729013442993, "learning_rate": 0.0002, "epoch": 6.417422867513611, "step": 8840}, {"loss": 0.287, "grad_norm": 1.2632299661636353, "learning_rate": 0.0002, "epoch": 6.424682395644283, "step": 8850}, {"loss": 0.3198, "grad_norm": 1.6636109352111816, "learning_rate": 0.0002, "epoch": 6.431941923774954, "step": 8860}, {"loss": 0.3749, "grad_norm": 1.4149386882781982, "learning_rate": 0.0002, "epoch": 6.439201451905626, "step": 8870}, {"loss": 0.3504, "grad_norm": 1.1396206617355347, "learning_rate": 0.0002, "epoch": 6.4464609800362975, "step": 8880}, {"loss": 0.3328, "grad_norm": 1.2188775539398193, "learning_rate": 0.0002, "epoch": 6.4537205081669695, "step": 8890}, {"loss": 0.3427, "grad_norm": 0.9740369319915771, "learning_rate": 0.0002, "epoch": 6.460980036297641, "step": 8900}, {"loss": 0.3223, "grad_norm": 1.228569746017456, "learning_rate": 0.0002, "epoch": 6.468239564428313, "step": 8910}, {"loss": 0.3151, "grad_norm": 1.5019789934158325, "learning_rate": 0.0002, "epoch": 6.475499092558984, "step": 8920}, {"loss": 0.2916, "grad_norm": 1.3320101499557495, "learning_rate": 0.0002, "epoch": 6.482758620689655, "step": 8930}, {"loss": 0.298, "grad_norm": 1.5551502704620361, "learning_rate": 0.0002, "epoch": 6.490018148820327, "step": 8940}, {"loss": 0.3238, "grad_norm": 1.470131754875183, "learning_rate": 0.0002, "epoch": 6.497277676950998, "step": 8950}, {"loss": 0.2808, "grad_norm": 1.1803025007247925, "learning_rate": 0.0002, "epoch": 6.50453720508167, "step": 8960}, {"loss": 0.3025, "grad_norm": 1.3505640029907227, "learning_rate": 0.0002, "epoch": 6.511796733212341, "step": 8970}, {"loss": 0.3124, "grad_norm": 1.13093900680542, "learning_rate": 0.0002, "epoch": 6.519056261343013, "step": 8980}, {"loss": 0.3454, "grad_norm": 1.347386121749878, "learning_rate": 0.0002, "epoch": 6.526315789473684, "step": 8990}, {"loss": 0.3532, "grad_norm": 1.7879165410995483, "learning_rate": 0.0002, "epoch": 6.533575317604356, "step": 9000}, {"loss": 0.3382, "grad_norm": 1.2168169021606445, "learning_rate": 0.0002, "epoch": 6.540834845735027, "step": 9010}, {"loss": 0.3413, "grad_norm": 1.1758877038955688, "learning_rate": 0.0002, "epoch": 6.548094373865698, "step": 9020}, {"loss": 0.2806, "grad_norm": 1.7366445064544678, "learning_rate": 0.0002, "epoch": 6.55535390199637, "step": 9030}, {"loss": 0.3437, "grad_norm": 1.5919222831726074, "learning_rate": 0.0002, "epoch": 6.562613430127042, "step": 9040}, {"loss": 0.3261, "grad_norm": 1.336863398551941, "learning_rate": 0.0002, "epoch": 6.569872958257713, "step": 9050}, {"loss": 0.3103, "grad_norm": 1.1769421100616455, "learning_rate": 0.0002, "epoch": 6.577132486388384, "step": 9060}, {"loss": 0.3295, "grad_norm": 1.0048751831054688, "learning_rate": 0.0002, "epoch": 6.584392014519056, "step": 9070}, {"loss": 0.3156, "grad_norm": 1.5268515348434448, "learning_rate": 0.0002, "epoch": 6.5916515426497275, "step": 9080}, {"loss": 0.3752, "grad_norm": 1.434610366821289, "learning_rate": 0.0002, "epoch": 6.5989110707803995, "step": 9090}, {"loss": 0.3375, "grad_norm": 1.1151410341262817, "learning_rate": 0.0002, "epoch": 6.606170598911071, "step": 9100}, {"loss": 0.363, "grad_norm": 1.6690642833709717, "learning_rate": 0.0002, "epoch": 6.613430127041743, "step": 9110}, {"loss": 0.3703, "grad_norm": 1.4495552778244019, "learning_rate": 0.0002, "epoch": 6.620689655172414, "step": 9120}, {"loss": 0.3648, "grad_norm": 1.377621054649353, "learning_rate": 0.0002, "epoch": 6.627949183303086, "step": 9130}, {"loss": 0.3766, "grad_norm": 1.5459434986114502, "learning_rate": 0.0002, "epoch": 6.635208711433757, "step": 9140}, {"loss": 0.3196, "grad_norm": 1.0920850038528442, "learning_rate": 0.0002, "epoch": 6.642468239564428, "step": 9150}, {"loss": 0.3505, "grad_norm": 1.6708381175994873, "learning_rate": 0.0002, "epoch": 6.6497277676951, "step": 9160}, {"loss": 0.368, "grad_norm": 1.747514009475708, "learning_rate": 0.0002, "epoch": 6.656987295825771, "step": 9170}, {"loss": 0.3099, "grad_norm": 1.133466362953186, "learning_rate": 0.0002, "epoch": 6.664246823956443, "step": 9180}, {"loss": 0.3175, "grad_norm": 1.394358515739441, "learning_rate": 0.0002, "epoch": 6.671506352087114, "step": 9190}, {"loss": 0.2981, "grad_norm": 0.9258374571800232, "learning_rate": 0.0002, "epoch": 6.678765880217786, "step": 9200}, {"loss": 0.3723, "grad_norm": 1.3750739097595215, "learning_rate": 0.0002, "epoch": 6.686025408348457, "step": 9210}, {"loss": 0.3441, "grad_norm": 0.8604967594146729, "learning_rate": 0.0002, "epoch": 6.693284936479129, "step": 9220}, {"loss": 0.3775, "grad_norm": 1.6074559688568115, "learning_rate": 0.0002, "epoch": 6.7005444646098, "step": 9230}, {"loss": 0.3139, "grad_norm": 0.9576877355575562, "learning_rate": 0.0002, "epoch": 6.707803992740472, "step": 9240}, {"loss": 0.3633, "grad_norm": 1.7193048000335693, "learning_rate": 0.0002, "epoch": 6.715063520871143, "step": 9250}, {"loss": 0.3139, "grad_norm": 1.3131844997406006, "learning_rate": 0.0002, "epoch": 6.722323049001815, "step": 9260}, {"loss": 0.3121, "grad_norm": 1.2978184223175049, "learning_rate": 0.0002, "epoch": 6.729582577132486, "step": 9270}, {"loss": 0.3534, "grad_norm": 1.4792617559432983, "learning_rate": 0.0002, "epoch": 6.7368421052631575, "step": 9280}, {"loss": 0.3429, "grad_norm": 1.1265567541122437, "learning_rate": 0.0002, "epoch": 6.7441016333938295, "step": 9290}, {"loss": 0.3526, "grad_norm": 1.8553377389907837, "learning_rate": 0.0002, "epoch": 6.751361161524501, "step": 9300}, {"loss": 0.3666, "grad_norm": 1.3602519035339355, "learning_rate": 0.0002, "epoch": 6.758620689655173, "step": 9310}, {"loss": 0.2922, "grad_norm": 1.2874794006347656, "learning_rate": 0.0002, "epoch": 6.765880217785844, "step": 9320}, {"loss": 0.3816, "grad_norm": 1.4834712743759155, "learning_rate": 0.0002, "epoch": 6.773139745916516, "step": 9330}, {"loss": 0.3557, "grad_norm": 2.0824034214019775, "learning_rate": 0.0002, "epoch": 6.780399274047187, "step": 9340}, {"loss": 0.3174, "grad_norm": 1.2267698049545288, "learning_rate": 0.0002, "epoch": 6.787658802177859, "step": 9350}, {"loss": 0.3665, "grad_norm": 1.4485498666763306, "learning_rate": 0.0002, "epoch": 6.79491833030853, "step": 9360}, {"loss": 0.3676, "grad_norm": 1.3199396133422852, "learning_rate": 0.0002, "epoch": 6.802177858439201, "step": 9370}, {"loss": 0.298, "grad_norm": 1.2552456855773926, "learning_rate": 0.0002, "epoch": 6.809437386569873, "step": 9380}, {"loss": 0.3152, "grad_norm": 1.3895127773284912, "learning_rate": 0.0002, "epoch": 6.816696914700545, "step": 9390}, {"loss": 0.3375, "grad_norm": 1.7637823820114136, "learning_rate": 0.0002, "epoch": 6.823956442831216, "step": 9400}, {"loss": 0.3234, "grad_norm": 1.6004475355148315, "learning_rate": 0.0002, "epoch": 6.831215970961887, "step": 9410}, {"loss": 0.3364, "grad_norm": 1.4133695363998413, "learning_rate": 0.0002, "epoch": 6.838475499092559, "step": 9420}, {"loss": 0.3656, "grad_norm": 1.1583502292633057, "learning_rate": 0.0002, "epoch": 6.84573502722323, "step": 9430}, {"loss": 0.3499, "grad_norm": 1.3769075870513916, "learning_rate": 0.0002, "epoch": 6.852994555353902, "step": 9440}, {"loss": 0.3333, "grad_norm": 1.1831218004226685, "learning_rate": 0.0002, "epoch": 6.860254083484573, "step": 9450}, {"loss": 0.3501, "grad_norm": 1.6092621088027954, "learning_rate": 0.0002, "epoch": 6.867513611615245, "step": 9460}, {"loss": 0.3933, "grad_norm": 1.3850210905075073, "learning_rate": 0.0002, "epoch": 6.874773139745916, "step": 9470}, {"loss": 0.3868, "grad_norm": 1.4119619131088257, "learning_rate": 0.0002, "epoch": 6.882032667876588, "step": 9480}, {"loss": 0.3939, "grad_norm": 1.3494242429733276, "learning_rate": 0.0002, "epoch": 6.8892921960072595, "step": 9490}, {"loss": 0.3217, "grad_norm": 1.3130041360855103, "learning_rate": 0.0002, "epoch": 6.896551724137931, "step": 9500}, {"loss": 0.3738, "grad_norm": 1.169256329536438, "learning_rate": 0.0002, "epoch": 6.903811252268603, "step": 9510}, {"loss": 0.408, "grad_norm": 1.7475035190582275, "learning_rate": 0.0002, "epoch": 6.911070780399274, "step": 9520}, {"loss": 0.3407, "grad_norm": 1.440434217453003, "learning_rate": 0.0002, "epoch": 6.918330308529946, "step": 9530}, {"loss": 0.3707, "grad_norm": 1.6768704652786255, "learning_rate": 0.0002, "epoch": 6.925589836660617, "step": 9540}, {"loss": 0.3283, "grad_norm": 1.3720577955245972, "learning_rate": 0.0002, "epoch": 6.932849364791289, "step": 9550}, {"loss": 0.3257, "grad_norm": 1.8140523433685303, "learning_rate": 0.0002, "epoch": 6.94010889292196, "step": 9560}, {"loss": 0.3308, "grad_norm": 1.1828241348266602, "learning_rate": 0.0002, "epoch": 6.947368421052632, "step": 9570}, {"loss": 0.3536, "grad_norm": 1.2755135297775269, "learning_rate": 0.0002, "epoch": 6.954627949183303, "step": 9580}, {"loss": 0.3711, "grad_norm": 1.622009038925171, "learning_rate": 0.0002, "epoch": 6.961887477313975, "step": 9590}, {"loss": 0.3529, "grad_norm": 1.1543664932250977, "learning_rate": 0.0002, "epoch": 6.969147005444646, "step": 9600}, {"loss": 0.416, "grad_norm": 1.6755319833755493, "learning_rate": 0.0002, "epoch": 6.976406533575318, "step": 9610}, {"loss": 0.339, "grad_norm": 1.3726437091827393, "learning_rate": 0.0002, "epoch": 6.983666061705989, "step": 9620}, {"loss": 0.3709, "grad_norm": 1.1605958938598633, "learning_rate": 0.0002, "epoch": 6.99092558983666, "step": 9630}, {"loss": 0.371, "grad_norm": 1.5371781587600708, "learning_rate": 0.0002, "epoch": 6.998185117967332, "step": 9640}]} +{"epoch": 7.997096188747731, "step": 11016, "epoch_duration": 2048.3427917957306, "total_accumulated_duration": 22506.500129699707, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7751.47119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 24486.0}, "peak_memory_reserved": {"GPU_0": 24752.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_mmlu-routerbench-0shot-full-by-task_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.9-num-8828-sd-42/checkpoint-2755", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 2.4936, "grad_norm": 0.46010470390319824, "learning_rate": 0.0002, "epoch": 0.007259528130671506, "step": 10}, {"loss": 1.8442, "grad_norm": 0.5103179216384888, "learning_rate": 0.0002, "epoch": 0.014519056261343012, "step": 20}, {"loss": 1.559, "grad_norm": 0.6282716989517212, "learning_rate": 0.0002, "epoch": 0.021778584392014518, "step": 30}, {"loss": 1.3618, "grad_norm": 1.2480497360229492, "learning_rate": 0.0002, "epoch": 0.029038112522686024, "step": 40}, {"loss": 1.3874, "grad_norm": 0.4114855229854584, "learning_rate": 0.0002, "epoch": 0.036297640653357534, "step": 50}, {"loss": 1.2836, "grad_norm": 0.49482840299606323, "learning_rate": 0.0002, "epoch": 0.043557168784029036, "step": 60}, {"loss": 1.1728, "grad_norm": 0.4536272883415222, "learning_rate": 0.0002, "epoch": 0.050816696914700546, "step": 70}, {"loss": 1.1125, "grad_norm": 0.32328274846076965, "learning_rate": 0.0002, "epoch": 0.05807622504537205, "step": 80}, {"loss": 1.2447, "grad_norm": 0.40990468859672546, "learning_rate": 0.0002, "epoch": 0.06533575317604355, "step": 90}, {"loss": 1.2216, "grad_norm": 0.37273502349853516, "learning_rate": 0.0002, "epoch": 0.07259528130671507, "step": 100}, {"loss": 1.2259, "grad_norm": 0.3903871476650238, "learning_rate": 0.0002, "epoch": 0.07985480943738657, "step": 110}, {"loss": 1.3718, "grad_norm": 0.3272787928581238, "learning_rate": 0.0002, "epoch": 0.08711433756805807, "step": 120}, {"loss": 1.1832, "grad_norm": 0.3622824251651764, "learning_rate": 0.0002, "epoch": 0.09437386569872959, "step": 130}, {"loss": 1.2966, "grad_norm": 0.3503916561603546, "learning_rate": 0.0002, "epoch": 0.10163339382940109, "step": 140}, {"loss": 1.0913, "grad_norm": 0.32787832617759705, "learning_rate": 0.0002, "epoch": 0.1088929219600726, "step": 150}, {"loss": 1.2143, "grad_norm": 0.5822657942771912, "learning_rate": 0.0002, "epoch": 0.1161524500907441, "step": 160}, {"loss": 1.2091, "grad_norm": 0.28028249740600586, "learning_rate": 0.0002, "epoch": 0.12341197822141561, "step": 170}, {"loss": 1.1731, "grad_norm": 0.33602750301361084, "learning_rate": 0.0002, "epoch": 0.1306715063520871, "step": 180}, {"loss": 1.127, "grad_norm": 0.29106274247169495, "learning_rate": 0.0002, "epoch": 0.13793103448275862, "step": 190}, {"loss": 1.3261, "grad_norm": 0.38753363490104675, "learning_rate": 0.0002, "epoch": 0.14519056261343014, "step": 200}, {"loss": 1.1536, "grad_norm": 0.361009418964386, "learning_rate": 0.0002, "epoch": 0.15245009074410162, "step": 210}, {"loss": 1.1483, "grad_norm": 0.6743836402893066, "learning_rate": 0.0002, "epoch": 0.15970961887477314, "step": 220}, {"loss": 1.264, "grad_norm": 0.3923613727092743, "learning_rate": 0.0002, "epoch": 0.16696914700544466, "step": 230}, {"loss": 1.0437, "grad_norm": 0.2809699773788452, "learning_rate": 0.0002, "epoch": 0.17422867513611615, "step": 240}, {"loss": 1.1826, "grad_norm": 0.3631494641304016, "learning_rate": 0.0002, "epoch": 0.18148820326678766, "step": 250}, {"loss": 1.1584, "grad_norm": 0.24658171832561493, "learning_rate": 0.0002, "epoch": 0.18874773139745918, "step": 260}, {"loss": 1.0065, "grad_norm": 0.5780664682388306, "learning_rate": 0.0002, "epoch": 0.19600725952813067, "step": 270}, {"loss": 1.1251, "grad_norm": 0.3056720495223999, "learning_rate": 0.0002, "epoch": 0.20326678765880218, "step": 280}, {"loss": 1.1503, "grad_norm": 0.2857084572315216, "learning_rate": 0.0002, "epoch": 0.21052631578947367, "step": 290}, {"loss": 1.1215, "grad_norm": 0.6645345687866211, "learning_rate": 0.0002, "epoch": 0.2177858439201452, "step": 300}, {"loss": 1.127, "grad_norm": 0.5966078639030457, "learning_rate": 0.0002, "epoch": 0.2250453720508167, "step": 310}, {"loss": 1.1974, "grad_norm": 0.40937140583992004, "learning_rate": 0.0002, "epoch": 0.2323049001814882, "step": 320}, {"loss": 1.2385, "grad_norm": 0.5642806887626648, "learning_rate": 0.0002, "epoch": 0.2395644283121597, "step": 330}, {"loss": 1.1703, "grad_norm": 0.2750748097896576, "learning_rate": 0.0002, "epoch": 0.24682395644283123, "step": 340}, {"loss": 1.2598, "grad_norm": 0.34350234270095825, "learning_rate": 0.0002, "epoch": 0.2540834845735027, "step": 350}, {"loss": 1.1942, "grad_norm": 0.6767239570617676, "learning_rate": 0.0002, "epoch": 0.2613430127041742, "step": 360}, {"loss": 1.1436, "grad_norm": 0.31006959080696106, "learning_rate": 0.0002, "epoch": 0.26860254083484575, "step": 370}, {"loss": 1.2084, "grad_norm": 0.3825474679470062, "learning_rate": 0.0002, "epoch": 0.27586206896551724, "step": 380}, {"loss": 1.1523, "grad_norm": 0.30241551995277405, "learning_rate": 0.0002, "epoch": 0.2831215970961887, "step": 390}, {"loss": 1.1298, "grad_norm": 0.2962397336959839, "learning_rate": 0.0002, "epoch": 0.29038112522686027, "step": 400}, {"loss": 1.1299, "grad_norm": 0.2600369155406952, "learning_rate": 0.0002, "epoch": 0.29764065335753176, "step": 410}, {"loss": 1.1366, "grad_norm": 0.3675060272216797, "learning_rate": 0.0002, "epoch": 0.30490018148820325, "step": 420}, {"loss": 1.156, "grad_norm": 0.3429498076438904, "learning_rate": 0.0002, "epoch": 0.3121597096188748, "step": 430}, {"loss": 1.2741, "grad_norm": 0.34311825037002563, "learning_rate": 0.0002, "epoch": 0.3194192377495463, "step": 440}, {"loss": 1.3523, "grad_norm": 0.37872210144996643, "learning_rate": 0.0002, "epoch": 0.32667876588021777, "step": 450}, {"loss": 1.1365, "grad_norm": 0.33271121978759766, "learning_rate": 0.0002, "epoch": 0.3339382940108893, "step": 460}, {"loss": 1.1572, "grad_norm": 0.34605276584625244, "learning_rate": 0.0002, "epoch": 0.3411978221415608, "step": 470}, {"loss": 1.2755, "grad_norm": 0.41050562262535095, "learning_rate": 0.0002, "epoch": 0.3484573502722323, "step": 480}, {"loss": 1.075, "grad_norm": 0.2066836953163147, "learning_rate": 0.0002, "epoch": 0.35571687840290384, "step": 490}, {"loss": 1.2072, "grad_norm": 0.2859014868736267, "learning_rate": 0.0002, "epoch": 0.3629764065335753, "step": 500}, {"loss": 1.1435, "grad_norm": 0.28763777017593384, "learning_rate": 0.0002, "epoch": 0.3702359346642468, "step": 510}, {"loss": 1.1341, "grad_norm": 0.2730471193790436, "learning_rate": 0.0002, "epoch": 0.37749546279491836, "step": 520}, {"loss": 1.348, "grad_norm": 0.3968936800956726, "learning_rate": 0.0002, "epoch": 0.38475499092558985, "step": 530}, {"loss": 1.2363, "grad_norm": 0.3624701201915741, "learning_rate": 0.0002, "epoch": 0.39201451905626133, "step": 540}, {"loss": 1.1206, "grad_norm": 0.3303608298301697, "learning_rate": 0.0002, "epoch": 0.3992740471869328, "step": 550}, {"loss": 1.203, "grad_norm": 0.33507466316223145, "learning_rate": 0.0002, "epoch": 0.40653357531760437, "step": 560}, {"loss": 1.2294, "grad_norm": 0.3297670781612396, "learning_rate": 0.0002, "epoch": 0.41379310344827586, "step": 570}, {"loss": 1.2, "grad_norm": 0.32334890961647034, "learning_rate": 0.0002, "epoch": 0.42105263157894735, "step": 580}, {"loss": 1.112, "grad_norm": 0.30281195044517517, "learning_rate": 0.0002, "epoch": 0.4283121597096189, "step": 590}, {"loss": 1.1352, "grad_norm": 0.5900027751922607, "learning_rate": 0.0002, "epoch": 0.4355716878402904, "step": 600}, {"loss": 1.1575, "grad_norm": 0.28437477350234985, "learning_rate": 0.0002, "epoch": 0.44283121597096187, "step": 610}, {"loss": 1.2009, "grad_norm": 0.39601704478263855, "learning_rate": 0.0002, "epoch": 0.4500907441016334, "step": 620}, {"loss": 1.2302, "grad_norm": 0.41971510648727417, "learning_rate": 0.0002, "epoch": 0.4573502722323049, "step": 630}, {"loss": 1.236, "grad_norm": 0.33814409375190735, "learning_rate": 0.0002, "epoch": 0.4646098003629764, "step": 640}, {"loss": 1.2471, "grad_norm": 0.575718104839325, "learning_rate": 0.0002, "epoch": 0.47186932849364793, "step": 650}, {"loss": 1.1012, "grad_norm": 0.37927401065826416, "learning_rate": 0.0002, "epoch": 0.4791288566243194, "step": 660}, {"loss": 1.1552, "grad_norm": 0.3224332630634308, "learning_rate": 0.0002, "epoch": 0.4863883847549909, "step": 670}, {"loss": 1.1354, "grad_norm": 0.32683515548706055, "learning_rate": 0.0002, "epoch": 0.49364791288566245, "step": 680}, {"loss": 1.207, "grad_norm": 0.4316163659095764, "learning_rate": 0.0002, "epoch": 0.5009074410163339, "step": 690}, {"loss": 1.2116, "grad_norm": 0.342602401971817, "learning_rate": 0.0002, "epoch": 0.5081669691470054, "step": 700}, {"loss": 1.0433, "grad_norm": 0.2794898748397827, "learning_rate": 0.0002, "epoch": 0.515426497277677, "step": 710}, {"loss": 1.2395, "grad_norm": 0.3322339951992035, "learning_rate": 0.0002, "epoch": 0.5226860254083484, "step": 720}, {"loss": 1.0762, "grad_norm": 0.3088509142398834, "learning_rate": 0.0002, "epoch": 0.52994555353902, "step": 730}, {"loss": 1.1495, "grad_norm": 0.24444378912448883, "learning_rate": 0.0002, "epoch": 0.5372050816696915, "step": 740}, {"loss": 1.2176, "grad_norm": 0.3483171761035919, "learning_rate": 0.0002, "epoch": 0.5444646098003629, "step": 750}, {"loss": 1.2248, "grad_norm": 0.3425690531730652, "learning_rate": 0.0002, "epoch": 0.5517241379310345, "step": 760}, {"loss": 1.1127, "grad_norm": 0.31841927766799927, "learning_rate": 0.0002, "epoch": 0.558983666061706, "step": 770}, {"loss": 1.1841, "grad_norm": 0.39423868060112, "learning_rate": 0.0002, "epoch": 0.5662431941923775, "step": 780}, {"loss": 1.0583, "grad_norm": 0.30328479409217834, "learning_rate": 0.0002, "epoch": 0.573502722323049, "step": 790}, {"loss": 1.0718, "grad_norm": 0.24475938081741333, "learning_rate": 0.0002, "epoch": 0.5807622504537205, "step": 800}, {"loss": 1.2006, "grad_norm": 0.37132805585861206, "learning_rate": 0.0002, "epoch": 0.588021778584392, "step": 810}, {"loss": 1.1544, "grad_norm": 0.32195979356765747, "learning_rate": 0.0002, "epoch": 0.5952813067150635, "step": 820}, {"loss": 0.9937, "grad_norm": 0.2848738729953766, "learning_rate": 0.0002, "epoch": 0.6025408348457351, "step": 830}, {"loss": 1.0937, "grad_norm": 0.28015264868736267, "learning_rate": 0.0002, "epoch": 0.6098003629764065, "step": 840}, {"loss": 1.1234, "grad_norm": 0.37796008586883545, "learning_rate": 0.0002, "epoch": 0.617059891107078, "step": 850}, {"loss": 1.1455, "grad_norm": 0.39311841130256653, "learning_rate": 0.0002, "epoch": 0.6243194192377496, "step": 860}, {"loss": 1.1066, "grad_norm": 0.2761685252189636, "learning_rate": 0.0002, "epoch": 0.631578947368421, "step": 870}, {"loss": 1.0859, "grad_norm": 0.3826720118522644, "learning_rate": 0.0002, "epoch": 0.6388384754990926, "step": 880}, {"loss": 1.1182, "grad_norm": 0.30076679587364197, "learning_rate": 0.0002, "epoch": 0.6460980036297641, "step": 890}, {"loss": 1.0927, "grad_norm": 0.21997687220573425, "learning_rate": 0.0002, "epoch": 0.6533575317604355, "step": 900}, {"loss": 1.1603, "grad_norm": 0.32593777775764465, "learning_rate": 0.0002, "epoch": 0.6606170598911071, "step": 910}, {"loss": 1.1578, "grad_norm": 0.30347898602485657, "learning_rate": 0.0002, "epoch": 0.6678765880217786, "step": 920}, {"loss": 1.1642, "grad_norm": 0.44173774123191833, "learning_rate": 0.0002, "epoch": 0.6751361161524501, "step": 930}, {"loss": 1.1832, "grad_norm": 0.2507467269897461, "learning_rate": 0.0002, "epoch": 0.6823956442831216, "step": 940}, {"loss": 1.3001, "grad_norm": 0.29463833570480347, "learning_rate": 0.0002, "epoch": 0.6896551724137931, "step": 950}, {"loss": 1.1651, "grad_norm": 0.9363154172897339, "learning_rate": 0.0002, "epoch": 0.6969147005444646, "step": 960}, {"loss": 1.158, "grad_norm": 0.3236212134361267, "learning_rate": 0.0002, "epoch": 0.7041742286751361, "step": 970}, {"loss": 1.1078, "grad_norm": 0.3123254179954529, "learning_rate": 0.0002, "epoch": 0.7114337568058077, "step": 980}, {"loss": 1.1578, "grad_norm": 0.3395805060863495, "learning_rate": 0.0002, "epoch": 0.7186932849364791, "step": 990}, {"loss": 1.0754, "grad_norm": 0.3240964412689209, "learning_rate": 0.0002, "epoch": 0.7259528130671506, "step": 1000}, {"loss": 1.0313, "grad_norm": 0.31902948021888733, "learning_rate": 0.0002, "epoch": 0.7332123411978222, "step": 1010}, {"loss": 1.1664, "grad_norm": 0.4848408102989197, "learning_rate": 0.0002, "epoch": 0.7404718693284936, "step": 1020}, {"loss": 1.0838, "grad_norm": 0.33006033301353455, "learning_rate": 0.0002, "epoch": 0.7477313974591652, "step": 1030}, {"loss": 1.0979, "grad_norm": 0.2928730547428131, "learning_rate": 0.0002, "epoch": 0.7549909255898367, "step": 1040}, {"loss": 1.1351, "grad_norm": 0.3529164791107178, "learning_rate": 0.0002, "epoch": 0.7622504537205081, "step": 1050}, {"loss": 1.1274, "grad_norm": 0.2736213803291321, "learning_rate": 0.0002, "epoch": 0.7695099818511797, "step": 1060}, {"loss": 1.1169, "grad_norm": 0.7200686931610107, "learning_rate": 0.0002, "epoch": 0.7767695099818511, "step": 1070}, {"loss": 1.1171, "grad_norm": 0.33396708965301514, "learning_rate": 0.0002, "epoch": 0.7840290381125227, "step": 1080}, {"loss": 1.2447, "grad_norm": 1.5760449171066284, "learning_rate": 0.0002, "epoch": 0.7912885662431942, "step": 1090}, {"loss": 1.1126, "grad_norm": 0.28138381242752075, "learning_rate": 0.0002, "epoch": 0.7985480943738656, "step": 1100}, {"loss": 1.2012, "grad_norm": 0.2597472369670868, "learning_rate": 0.0002, "epoch": 0.8058076225045372, "step": 1110}, {"loss": 1.1177, "grad_norm": 0.3305445611476898, "learning_rate": 0.0002, "epoch": 0.8130671506352087, "step": 1120}, {"loss": 1.0849, "grad_norm": 0.3934599459171295, "learning_rate": 0.0002, "epoch": 0.8203266787658802, "step": 1130}, {"loss": 1.0889, "grad_norm": 0.3472191393375397, "learning_rate": 0.0002, "epoch": 0.8275862068965517, "step": 1140}, {"loss": 1.265, "grad_norm": 0.2857365906238556, "learning_rate": 0.0002, "epoch": 0.8348457350272233, "step": 1150}, {"loss": 1.03, "grad_norm": 0.3207702934741974, "learning_rate": 0.0002, "epoch": 0.8421052631578947, "step": 1160}, {"loss": 1.1669, "grad_norm": 0.3176484704017639, "learning_rate": 0.0002, "epoch": 0.8493647912885662, "step": 1170}, {"loss": 1.1386, "grad_norm": 0.40685558319091797, "learning_rate": 0.0002, "epoch": 0.8566243194192378, "step": 1180}, {"loss": 1.1383, "grad_norm": 0.31125199794769287, "learning_rate": 0.0002, "epoch": 0.8638838475499092, "step": 1190}, {"loss": 1.1512, "grad_norm": 0.7361181378364563, "learning_rate": 0.0002, "epoch": 0.8711433756805808, "step": 1200}, {"loss": 0.9983, "grad_norm": 0.33699527382850647, "learning_rate": 0.0002, "epoch": 0.8784029038112523, "step": 1210}, {"loss": 1.11, "grad_norm": 0.3315220773220062, "learning_rate": 0.0002, "epoch": 0.8856624319419237, "step": 1220}, {"loss": 1.0266, "grad_norm": 0.6256054043769836, "learning_rate": 0.0002, "epoch": 0.8929219600725953, "step": 1230}, {"loss": 1.1738, "grad_norm": 0.3692137897014618, "learning_rate": 0.0002, "epoch": 0.9001814882032668, "step": 1240}, {"loss": 1.1112, "grad_norm": 0.3538484573364258, "learning_rate": 0.0002, "epoch": 0.9074410163339383, "step": 1250}, {"loss": 1.0878, "grad_norm": 0.27863040566444397, "learning_rate": 0.0002, "epoch": 0.9147005444646098, "step": 1260}, {"loss": 1.0826, "grad_norm": 0.3322528302669525, "learning_rate": 0.0002, "epoch": 0.9219600725952813, "step": 1270}, {"loss": 1.0158, "grad_norm": 0.7553173303604126, "learning_rate": 0.0002, "epoch": 0.9292196007259528, "step": 1280}, {"loss": 1.198, "grad_norm": 0.4856191575527191, "learning_rate": 0.0002, "epoch": 0.9364791288566243, "step": 1290}, {"loss": 1.1678, "grad_norm": 0.3668074905872345, "learning_rate": 0.0002, "epoch": 0.9437386569872959, "step": 1300}, {"loss": 1.1809, "grad_norm": 0.29851067066192627, "learning_rate": 0.0002, "epoch": 0.9509981851179673, "step": 1310}, {"loss": 1.1299, "grad_norm": 0.276664674282074, "learning_rate": 0.0002, "epoch": 0.9582577132486388, "step": 1320}, {"loss": 1.0597, "grad_norm": 0.2941018342971802, "learning_rate": 0.0002, "epoch": 0.9655172413793104, "step": 1330}, {"loss": 1.1731, "grad_norm": 0.3505859076976776, "learning_rate": 0.0002, "epoch": 0.9727767695099818, "step": 1340}, {"loss": 1.1809, "grad_norm": 0.3067687451839447, "learning_rate": 0.0002, "epoch": 0.9800362976406534, "step": 1350}, {"loss": 1.1511, "grad_norm": 0.27151066064834595, "learning_rate": 0.0002, "epoch": 0.9872958257713249, "step": 1360}, {"loss": 1.1919, "grad_norm": 0.36370083689689636, "learning_rate": 0.0002, "epoch": 0.9945553539019963, "step": 1370}, {"eval_loss": 1.1381088495254517, "eval_runtime": 96.0848, "eval_samples_per_second": 4.538, "eval_steps_per_second": 0.572, "epoch": 0.9996370235934664, "step": 1377}, {"loss": 1.0082, "grad_norm": 0.27980583906173706, "learning_rate": 0.0002, "epoch": 1.0018148820326678, "step": 1380}, {"loss": 1.0565, "grad_norm": 0.26713913679122925, "learning_rate": 0.0002, "epoch": 1.0090744101633393, "step": 1390}, {"loss": 1.0241, "grad_norm": 0.3089541494846344, "learning_rate": 0.0002, "epoch": 1.0163339382940109, "step": 1400}, {"loss": 0.9649, "grad_norm": 0.4188242256641388, "learning_rate": 0.0002, "epoch": 1.0235934664246824, "step": 1410}, {"loss": 0.9821, "grad_norm": 0.5246463418006897, "learning_rate": 0.0002, "epoch": 1.030852994555354, "step": 1420}, {"loss": 1.0143, "grad_norm": 0.2728777825832367, "learning_rate": 0.0002, "epoch": 1.0381125226860255, "step": 1430}, {"loss": 1.075, "grad_norm": 0.38167616724967957, "learning_rate": 0.0002, "epoch": 1.0453720508166968, "step": 1440}, {"loss": 1.0229, "grad_norm": 0.4439380168914795, "learning_rate": 0.0002, "epoch": 1.0526315789473684, "step": 1450}, {"loss": 1.0451, "grad_norm": 0.30954182147979736, "learning_rate": 0.0002, "epoch": 1.05989110707804, "step": 1460}, {"loss": 1.0762, "grad_norm": 0.4022280275821686, "learning_rate": 0.0002, "epoch": 1.0671506352087115, "step": 1470}, {"loss": 1.0754, "grad_norm": 0.7390811443328857, "learning_rate": 0.0002, "epoch": 1.074410163339383, "step": 1480}, {"loss": 1.1378, "grad_norm": 0.3885486423969269, "learning_rate": 0.0002, "epoch": 1.0816696914700545, "step": 1490}, {"loss": 1.005, "grad_norm": 0.5275560617446899, "learning_rate": 0.0002, "epoch": 1.0889292196007259, "step": 1500}, {"loss": 1.0147, "grad_norm": 0.35112282633781433, "learning_rate": 0.0002, "epoch": 1.0961887477313974, "step": 1510}, {"loss": 1.0331, "grad_norm": 0.33714351058006287, "learning_rate": 0.0002, "epoch": 1.103448275862069, "step": 1520}, {"loss": 0.8723, "grad_norm": 0.31221693754196167, "learning_rate": 0.0002, "epoch": 1.1107078039927405, "step": 1530}, {"loss": 1.0701, "grad_norm": 0.27549654245376587, "learning_rate": 0.0002, "epoch": 1.117967332123412, "step": 1540}, {"loss": 1.0114, "grad_norm": 0.8465521335601807, "learning_rate": 0.0002, "epoch": 1.1252268602540836, "step": 1550}, {"loss": 0.9643, "grad_norm": 0.36125949025154114, "learning_rate": 0.0002, "epoch": 1.132486388384755, "step": 1560}, {"loss": 1.0744, "grad_norm": 0.37420371174812317, "learning_rate": 0.0002, "epoch": 1.1397459165154264, "step": 1570}, {"loss": 1.165, "grad_norm": 0.3294760584831238, "learning_rate": 0.0002, "epoch": 1.147005444646098, "step": 1580}, {"loss": 0.9207, "grad_norm": 0.3881238102912903, "learning_rate": 0.0002, "epoch": 1.1542649727767695, "step": 1590}, {"loss": 1.0983, "grad_norm": 0.4766491651535034, "learning_rate": 0.0002, "epoch": 1.161524500907441, "step": 1600}, {"loss": 1.0092, "grad_norm": 0.395530104637146, "learning_rate": 0.0002, "epoch": 1.1687840290381124, "step": 1610}, {"loss": 0.9895, "grad_norm": 0.3297106623649597, "learning_rate": 0.0002, "epoch": 1.176043557168784, "step": 1620}, {"loss": 1.1121, "grad_norm": 0.39528271555900574, "learning_rate": 0.0002, "epoch": 1.1833030852994555, "step": 1630}, {"loss": 0.9202, "grad_norm": 0.3370221257209778, "learning_rate": 0.0002, "epoch": 1.190562613430127, "step": 1640}, {"loss": 1.102, "grad_norm": 0.31922030448913574, "learning_rate": 0.0002, "epoch": 1.1978221415607986, "step": 1650}, {"loss": 1.0565, "grad_norm": 0.6142027378082275, "learning_rate": 0.0002, "epoch": 1.2050816696914701, "step": 1660}, {"loss": 0.9541, "grad_norm": 0.44769710302352905, "learning_rate": 0.0002, "epoch": 1.2123411978221417, "step": 1670}, {"loss": 1.2501, "grad_norm": 0.41233646869659424, "learning_rate": 0.0002, "epoch": 1.219600725952813, "step": 1680}, {"loss": 1.0147, "grad_norm": 0.2928866147994995, "learning_rate": 0.0002, "epoch": 1.2268602540834845, "step": 1690}, {"loss": 0.9074, "grad_norm": 0.36913734674453735, "learning_rate": 0.0002, "epoch": 1.234119782214156, "step": 1700}, {"loss": 1.0847, "grad_norm": 0.5281891226768494, "learning_rate": 0.0002, "epoch": 1.2413793103448276, "step": 1710}, {"loss": 0.9482, "grad_norm": 0.3374697268009186, "learning_rate": 0.0002, "epoch": 1.2486388384754992, "step": 1720}, {"loss": 0.9826, "grad_norm": 0.3802020847797394, "learning_rate": 0.0002, "epoch": 1.2558983666061705, "step": 1730}, {"loss": 1.0191, "grad_norm": 0.38048651814460754, "learning_rate": 0.0002, "epoch": 1.263157894736842, "step": 1740}, {"loss": 1.0142, "grad_norm": 0.6676169633865356, "learning_rate": 0.0002, "epoch": 1.2704174228675136, "step": 1750}, {"loss": 0.992, "grad_norm": 0.4075961410999298, "learning_rate": 0.0002, "epoch": 1.2776769509981851, "step": 1760}, {"loss": 1.0301, "grad_norm": 0.4374721348285675, "learning_rate": 0.0002, "epoch": 1.2849364791288567, "step": 1770}, {"loss": 1.0601, "grad_norm": 0.4638824164867401, "learning_rate": 0.0002, "epoch": 1.2921960072595282, "step": 1780}, {"loss": 0.9593, "grad_norm": 0.38631564378738403, "learning_rate": 0.0002, "epoch": 1.2994555353901998, "step": 1790}, {"loss": 1.0042, "grad_norm": 0.35873809456825256, "learning_rate": 0.0002, "epoch": 1.306715063520871, "step": 1800}, {"loss": 0.9835, "grad_norm": 0.33602237701416016, "learning_rate": 0.0002, "epoch": 1.3139745916515426, "step": 1810}, {"loss": 1.1032, "grad_norm": 0.46696463227272034, "learning_rate": 0.0002, "epoch": 1.3212341197822142, "step": 1820}, {"loss": 1.0123, "grad_norm": 0.368958979845047, "learning_rate": 0.0002, "epoch": 1.3284936479128857, "step": 1830}, {"loss": 1.0456, "grad_norm": 0.3160957396030426, "learning_rate": 0.0002, "epoch": 1.335753176043557, "step": 1840}, {"loss": 1.0264, "grad_norm": 0.4511511027812958, "learning_rate": 0.0002, "epoch": 1.3430127041742286, "step": 1850}, {"loss": 1.0179, "grad_norm": 0.6769845485687256, "learning_rate": 0.0002, "epoch": 1.3502722323049001, "step": 1860}, {"loss": 1.011, "grad_norm": 0.3749309480190277, "learning_rate": 0.0002, "epoch": 1.3575317604355717, "step": 1870}, {"loss": 1.0223, "grad_norm": 0.553798496723175, "learning_rate": 0.0002, "epoch": 1.3647912885662432, "step": 1880}, {"loss": 0.9634, "grad_norm": 0.3538985550403595, "learning_rate": 0.0002, "epoch": 1.3720508166969148, "step": 1890}, {"loss": 1.0275, "grad_norm": 0.5501534938812256, "learning_rate": 0.0002, "epoch": 1.3793103448275863, "step": 1900}, {"loss": 1.0461, "grad_norm": 0.4432051181793213, "learning_rate": 0.0002, "epoch": 1.3865698729582578, "step": 1910}, {"loss": 0.9082, "grad_norm": 0.41755786538124084, "learning_rate": 0.0002, "epoch": 1.3938294010889292, "step": 1920}, {"loss": 1.0858, "grad_norm": 0.5626114010810852, "learning_rate": 0.0002, "epoch": 1.4010889292196007, "step": 1930}, {"loss": 1.0687, "grad_norm": 0.44038185477256775, "learning_rate": 0.0002, "epoch": 1.4083484573502723, "step": 1940}, {"loss": 0.9454, "grad_norm": 0.3397001624107361, "learning_rate": 0.0002, "epoch": 1.4156079854809438, "step": 1950}, {"loss": 0.943, "grad_norm": 0.4325368106365204, "learning_rate": 0.0002, "epoch": 1.4228675136116151, "step": 1960}, {"loss": 1.0488, "grad_norm": 0.3900907039642334, "learning_rate": 0.0002, "epoch": 1.4301270417422867, "step": 1970}, {"loss": 0.9699, "grad_norm": 0.369612455368042, "learning_rate": 0.0002, "epoch": 1.4373865698729582, "step": 1980}, {"loss": 1.0609, "grad_norm": 0.4389338791370392, "learning_rate": 0.0002, "epoch": 1.4446460980036298, "step": 1990}, {"loss": 1.042, "grad_norm": 1.694450855255127, "learning_rate": 0.0002, "epoch": 1.4519056261343013, "step": 2000}, {"loss": 0.997, "grad_norm": 0.516957700252533, "learning_rate": 0.0002, "epoch": 1.4591651542649728, "step": 2010}, {"loss": 1.0979, "grad_norm": 0.45515501499176025, "learning_rate": 0.0002, "epoch": 1.4664246823956444, "step": 2020}, {"loss": 1.0188, "grad_norm": 0.4153139591217041, "learning_rate": 0.0002, "epoch": 1.4736842105263157, "step": 2030}, {"loss": 1.1521, "grad_norm": 0.44353052973747253, "learning_rate": 0.0002, "epoch": 1.4809437386569873, "step": 2040}, {"loss": 0.9653, "grad_norm": 0.570554256439209, "learning_rate": 0.0002, "epoch": 1.4882032667876588, "step": 2050}, {"loss": 1.0352, "grad_norm": 0.5742740035057068, "learning_rate": 0.0002, "epoch": 1.4954627949183303, "step": 2060}, {"loss": 1.0963, "grad_norm": 0.5890517830848694, "learning_rate": 0.0002, "epoch": 1.5027223230490017, "step": 2070}, {"loss": 1.0639, "grad_norm": 0.4162650406360626, "learning_rate": 0.0002, "epoch": 1.5099818511796732, "step": 2080}, {"loss": 1.071, "grad_norm": 0.7334717512130737, "learning_rate": 0.0002, "epoch": 1.5172413793103448, "step": 2090}, {"loss": 0.8957, "grad_norm": 0.2963249683380127, "learning_rate": 0.0002, "epoch": 1.5245009074410163, "step": 2100}, {"loss": 1.0446, "grad_norm": 0.30676454305648804, "learning_rate": 0.0002, "epoch": 1.5317604355716878, "step": 2110}, {"loss": 0.9967, "grad_norm": 0.35984641313552856, "learning_rate": 0.0002, "epoch": 1.5390199637023594, "step": 2120}, {"loss": 1.0352, "grad_norm": 0.3384549617767334, "learning_rate": 0.0002, "epoch": 1.546279491833031, "step": 2130}, {"loss": 0.9874, "grad_norm": 0.4725518226623535, "learning_rate": 0.0002, "epoch": 1.5535390199637025, "step": 2140}, {"loss": 1.1166, "grad_norm": 0.3252685070037842, "learning_rate": 0.0002, "epoch": 1.560798548094374, "step": 2150}, {"loss": 0.9127, "grad_norm": 0.45043081045150757, "learning_rate": 0.0002, "epoch": 1.5680580762250453, "step": 2160}, {"loss": 1.0767, "grad_norm": 0.374208003282547, "learning_rate": 0.0002, "epoch": 1.5753176043557169, "step": 2170}, {"loss": 1.0255, "grad_norm": 0.5118404030799866, "learning_rate": 0.0002, "epoch": 1.5825771324863884, "step": 2180}, {"loss": 1.0062, "grad_norm": 0.482170969247818, "learning_rate": 0.0002, "epoch": 1.5898366606170597, "step": 2190}, {"loss": 1.0872, "grad_norm": 0.5337533950805664, "learning_rate": 0.0002, "epoch": 1.5970961887477313, "step": 2200}, {"loss": 1.0405, "grad_norm": 0.5195064544677734, "learning_rate": 0.0002, "epoch": 1.6043557168784028, "step": 2210}, {"loss": 1.0454, "grad_norm": 0.30807098746299744, "learning_rate": 0.0002, "epoch": 1.6116152450090744, "step": 2220}, {"loss": 1.0293, "grad_norm": 0.3962925672531128, "learning_rate": 0.0002, "epoch": 1.618874773139746, "step": 2230}, {"loss": 1.0137, "grad_norm": 0.7636962532997131, "learning_rate": 0.0002, "epoch": 1.6261343012704175, "step": 2240}, {"loss": 0.991, "grad_norm": 0.32380592823028564, "learning_rate": 0.0002, "epoch": 1.633393829401089, "step": 2250}, {"loss": 0.9471, "grad_norm": 0.5767741799354553, "learning_rate": 0.0002, "epoch": 1.6406533575317606, "step": 2260}, {"loss": 0.9524, "grad_norm": 0.39964812994003296, "learning_rate": 0.0002, "epoch": 1.647912885662432, "step": 2270}, {"loss": 0.9866, "grad_norm": 0.622629702091217, "learning_rate": 0.0002, "epoch": 1.6551724137931034, "step": 2280}, {"loss": 0.9609, "grad_norm": 0.40202152729034424, "learning_rate": 0.0002, "epoch": 1.662431941923775, "step": 2290}, {"loss": 0.9859, "grad_norm": 0.4467349052429199, "learning_rate": 0.0002, "epoch": 1.6696914700544465, "step": 2300}, {"loss": 1.0312, "grad_norm": 0.5026949048042297, "learning_rate": 0.0002, "epoch": 1.6769509981851178, "step": 2310}, {"loss": 1.0776, "grad_norm": 0.43754541873931885, "learning_rate": 0.0002, "epoch": 1.6842105263157894, "step": 2320}, {"loss": 0.9786, "grad_norm": 0.42869430780410767, "learning_rate": 0.0002, "epoch": 1.691470054446461, "step": 2330}, {"loss": 1.0747, "grad_norm": 0.4192679524421692, "learning_rate": 0.0002, "epoch": 1.6987295825771325, "step": 2340}, {"loss": 1.0054, "grad_norm": 0.3243155777454376, "learning_rate": 0.0002, "epoch": 1.705989110707804, "step": 2350}, {"loss": 0.9165, "grad_norm": 1.0514075756072998, "learning_rate": 0.0002, "epoch": 1.7132486388384756, "step": 2360}, {"loss": 1.1353, "grad_norm": 0.4719122648239136, "learning_rate": 0.0002, "epoch": 1.720508166969147, "step": 2370}, {"loss": 0.9913, "grad_norm": 0.3846144676208496, "learning_rate": 0.0002, "epoch": 1.7277676950998186, "step": 2380}, {"loss": 0.9935, "grad_norm": 0.7266581058502197, "learning_rate": 0.0002, "epoch": 1.73502722323049, "step": 2390}, {"loss": 0.9509, "grad_norm": 0.6183241605758667, "learning_rate": 0.0002, "epoch": 1.7422867513611615, "step": 2400}, {"loss": 0.9713, "grad_norm": 0.3658260405063629, "learning_rate": 0.0002, "epoch": 1.749546279491833, "step": 2410}, {"loss": 1.0033, "grad_norm": 0.6036322712898254, "learning_rate": 0.0002, "epoch": 1.7568058076225044, "step": 2420}, {"loss": 1.0526, "grad_norm": 0.7872936129570007, "learning_rate": 0.0002, "epoch": 1.764065335753176, "step": 2430}, {"loss": 0.99, "grad_norm": 0.35946124792099, "learning_rate": 0.0002, "epoch": 1.7713248638838475, "step": 2440}, {"loss": 0.8845, "grad_norm": 0.3740338981151581, "learning_rate": 0.0002, "epoch": 1.778584392014519, "step": 2450}, {"loss": 0.9333, "grad_norm": 0.6150230169296265, "learning_rate": 0.0002, "epoch": 1.7858439201451906, "step": 2460}, {"loss": 1.0373, "grad_norm": 0.4726075530052185, "learning_rate": 0.0002, "epoch": 1.793103448275862, "step": 2470}, {"loss": 0.974, "grad_norm": 0.31292253732681274, "learning_rate": 0.0002, "epoch": 1.8003629764065336, "step": 2480}, {"loss": 1.0205, "grad_norm": 0.4463104009628296, "learning_rate": 0.0002, "epoch": 1.8076225045372052, "step": 2490}, {"loss": 1.0771, "grad_norm": 0.7848200798034668, "learning_rate": 0.0002, "epoch": 1.8148820326678767, "step": 2500}, {"loss": 0.9508, "grad_norm": 0.5562082529067993, "learning_rate": 0.0002, "epoch": 1.822141560798548, "step": 2510}, {"loss": 0.9343, "grad_norm": 0.39892756938934326, "learning_rate": 0.0002, "epoch": 1.8294010889292196, "step": 2520}, {"loss": 1.0489, "grad_norm": 1.2923320531845093, "learning_rate": 0.0002, "epoch": 1.8366606170598911, "step": 2530}, {"loss": 1.1203, "grad_norm": 0.6316490769386292, "learning_rate": 0.0002, "epoch": 1.8439201451905625, "step": 2540}, {"loss": 0.9131, "grad_norm": 0.46100497245788574, "learning_rate": 0.0002, "epoch": 1.851179673321234, "step": 2550}, {"loss": 1.0977, "grad_norm": 0.7902987003326416, "learning_rate": 0.0002, "epoch": 1.8584392014519056, "step": 2560}, {"loss": 0.9702, "grad_norm": 0.4596365690231323, "learning_rate": 0.0002, "epoch": 1.865698729582577, "step": 2570}, {"loss": 0.9322, "grad_norm": 0.6592172384262085, "learning_rate": 0.0002, "epoch": 1.8729582577132486, "step": 2580}, {"loss": 0.9549, "grad_norm": 0.5764662027359009, "learning_rate": 0.0002, "epoch": 1.8802177858439202, "step": 2590}, {"loss": 0.9485, "grad_norm": 0.8421637415885925, "learning_rate": 0.0002, "epoch": 1.8874773139745917, "step": 2600}, {"loss": 1.012, "grad_norm": 0.5635305047035217, "learning_rate": 0.0002, "epoch": 1.8947368421052633, "step": 2610}, {"loss": 0.8907, "grad_norm": 0.46231237053871155, "learning_rate": 0.0002, "epoch": 1.9019963702359348, "step": 2620}, {"loss": 0.9543, "grad_norm": 0.3944607079029083, "learning_rate": 0.0002, "epoch": 1.9092558983666061, "step": 2630}, {"loss": 0.9964, "grad_norm": 0.4753907322883606, "learning_rate": 0.0002, "epoch": 1.9165154264972777, "step": 2640}, {"loss": 1.0217, "grad_norm": 0.4151090979576111, "learning_rate": 0.0002, "epoch": 1.9237749546279492, "step": 2650}, {"loss": 1.0397, "grad_norm": 0.6793725490570068, "learning_rate": 0.0002, "epoch": 1.9310344827586206, "step": 2660}, {"loss": 1.0401, "grad_norm": 0.339755117893219, "learning_rate": 0.0002, "epoch": 1.938294010889292, "step": 2670}, {"loss": 1.082, "grad_norm": 0.40789374709129333, "learning_rate": 0.0002, "epoch": 1.9455535390199636, "step": 2680}, {"loss": 0.9973, "grad_norm": 0.3750005066394806, "learning_rate": 0.0002, "epoch": 1.9528130671506352, "step": 2690}, {"loss": 0.9837, "grad_norm": 0.39684441685676575, "learning_rate": 0.0002, "epoch": 1.9600725952813067, "step": 2700}, {"loss": 1.1204, "grad_norm": 0.378287672996521, "learning_rate": 0.0002, "epoch": 1.9673321234119783, "step": 2710}, {"loss": 0.9284, "grad_norm": 0.3668482005596161, "learning_rate": 0.0002, "epoch": 1.9745916515426498, "step": 2720}, {"loss": 1.0512, "grad_norm": 0.49997565150260925, "learning_rate": 0.0002, "epoch": 1.9818511796733214, "step": 2730}, {"loss": 0.9311, "grad_norm": 0.36852124333381653, "learning_rate": 0.0002, "epoch": 1.989110707803993, "step": 2740}, {"loss": 1.0898, "grad_norm": 0.5203380584716797, "learning_rate": 0.0002, "epoch": 1.9963702359346642, "step": 2750}, {"eval_loss": 1.1086540222167969, "eval_runtime": 95.6866, "eval_samples_per_second": 4.557, "eval_steps_per_second": 0.575, "epoch": 2.0, "step": 2755}, {"loss": 0.9791, "grad_norm": 0.35921573638916016, "learning_rate": 0.0002, "epoch": 2.0036297640653356, "step": 2760}, {"loss": 0.8968, "grad_norm": 1.013861894607544, "learning_rate": 0.0002, "epoch": 2.010889292196007, "step": 2770}, {"loss": 0.7873, "grad_norm": 0.4425240159034729, "learning_rate": 0.0002, "epoch": 2.0181488203266786, "step": 2780}, {"loss": 0.8758, "grad_norm": 0.60174161195755, "learning_rate": 0.0002, "epoch": 2.02540834845735, "step": 2790}, {"loss": 0.7989, "grad_norm": 0.47582098841667175, "learning_rate": 0.0002, "epoch": 2.0326678765880217, "step": 2800}, {"loss": 0.8898, "grad_norm": 0.6012811660766602, "learning_rate": 0.0002, "epoch": 2.0399274047186933, "step": 2810}, {"loss": 0.8109, "grad_norm": 0.4444408118724823, "learning_rate": 0.0002, "epoch": 2.047186932849365, "step": 2820}, {"loss": 0.8621, "grad_norm": 0.4864003360271454, "learning_rate": 0.0002, "epoch": 2.0544464609800364, "step": 2830}, {"loss": 0.8817, "grad_norm": 0.5104215741157532, "learning_rate": 0.0002, "epoch": 2.061705989110708, "step": 2840}, {"loss": 0.7789, "grad_norm": 0.6218489408493042, "learning_rate": 0.0002, "epoch": 2.0689655172413794, "step": 2850}, {"loss": 0.8778, "grad_norm": 0.705784261226654, "learning_rate": 0.0002, "epoch": 2.076225045372051, "step": 2860}, {"loss": 0.7851, "grad_norm": 0.48091503977775574, "learning_rate": 0.0002, "epoch": 2.0834845735027225, "step": 2870}, {"loss": 0.8444, "grad_norm": 0.5062456727027893, "learning_rate": 0.0002, "epoch": 2.0907441016333936, "step": 2880}, {"loss": 0.7898, "grad_norm": 0.4862022399902344, "learning_rate": 0.0002, "epoch": 2.098003629764065, "step": 2890}, {"loss": 0.8593, "grad_norm": 0.48264044523239136, "learning_rate": 0.0002, "epoch": 2.1052631578947367, "step": 2900}, {"loss": 0.8325, "grad_norm": 0.43744346499443054, "learning_rate": 0.0002, "epoch": 2.1125226860254083, "step": 2910}, {"loss": 0.9099, "grad_norm": 0.5480492115020752, "learning_rate": 0.0002, "epoch": 2.11978221415608, "step": 2920}, {"loss": 0.7727, "grad_norm": 0.5068560838699341, "learning_rate": 0.0002, "epoch": 2.1270417422867514, "step": 2930}, {"loss": 0.8403, "grad_norm": 0.4650852680206299, "learning_rate": 0.0002, "epoch": 2.134301270417423, "step": 2940}, {"loss": 0.8467, "grad_norm": 0.4929981231689453, "learning_rate": 0.0002, "epoch": 2.1415607985480944, "step": 2950}, {"loss": 0.8747, "grad_norm": 0.6537389755249023, "learning_rate": 0.0002, "epoch": 2.148820326678766, "step": 2960}, {"loss": 0.7663, "grad_norm": 0.8032940626144409, "learning_rate": 0.0002, "epoch": 2.1560798548094375, "step": 2970}, {"loss": 0.7604, "grad_norm": 0.7131643891334534, "learning_rate": 0.0002, "epoch": 2.163339382940109, "step": 2980}, {"loss": 0.8424, "grad_norm": 0.6034275889396667, "learning_rate": 0.0002, "epoch": 2.1705989110707806, "step": 2990}, {"loss": 0.8838, "grad_norm": 0.6081095933914185, "learning_rate": 0.0002, "epoch": 2.1778584392014517, "step": 3000}, {"loss": 0.9013, "grad_norm": 0.5706912875175476, "learning_rate": 0.0002, "epoch": 2.1851179673321233, "step": 3010}, {"loss": 0.8473, "grad_norm": 0.6742380261421204, "learning_rate": 0.0002, "epoch": 2.192377495462795, "step": 3020}, {"loss": 0.8372, "grad_norm": 0.4847496449947357, "learning_rate": 0.0002, "epoch": 2.1996370235934664, "step": 3030}, {"loss": 0.8487, "grad_norm": 0.5529342889785767, "learning_rate": 0.0002, "epoch": 2.206896551724138, "step": 3040}, {"loss": 0.9057, "grad_norm": 0.6108783483505249, "learning_rate": 0.0002, "epoch": 2.2141560798548094, "step": 3050}, {"loss": 0.9659, "grad_norm": 0.8841571807861328, "learning_rate": 0.0002, "epoch": 2.221415607985481, "step": 3060}, {"loss": 0.8538, "grad_norm": 0.4227530360221863, "learning_rate": 0.0002, "epoch": 2.2286751361161525, "step": 3070}, {"loss": 0.7925, "grad_norm": 0.461935818195343, "learning_rate": 0.0002, "epoch": 2.235934664246824, "step": 3080}, {"loss": 0.8542, "grad_norm": 0.5407412648200989, "learning_rate": 0.0002, "epoch": 2.2431941923774956, "step": 3090}, {"loss": 0.8392, "grad_norm": 0.5057998895645142, "learning_rate": 0.0002, "epoch": 2.250453720508167, "step": 3100}, {"loss": 0.7898, "grad_norm": 0.530057966709137, "learning_rate": 0.0002, "epoch": 2.2577132486388383, "step": 3110}, {"loss": 0.8507, "grad_norm": 0.5066842436790466, "learning_rate": 0.0002, "epoch": 2.26497277676951, "step": 3120}, {"loss": 0.7903, "grad_norm": 0.5069178342819214, "learning_rate": 0.0002, "epoch": 2.2722323049001814, "step": 3130}, {"loss": 0.8458, "grad_norm": 0.6095499396324158, "learning_rate": 0.0002, "epoch": 2.279491833030853, "step": 3140}, {"loss": 0.8171, "grad_norm": 0.49524766206741333, "learning_rate": 0.0002, "epoch": 2.2867513611615244, "step": 3150}, {"loss": 0.7568, "grad_norm": 0.5334409475326538, "learning_rate": 0.0002, "epoch": 2.294010889292196, "step": 3160}, {"loss": 0.7533, "grad_norm": 1.681748867034912, "learning_rate": 0.0002, "epoch": 2.3012704174228675, "step": 3170}, {"loss": 0.8155, "grad_norm": 0.7225565314292908, "learning_rate": 0.0002, "epoch": 2.308529945553539, "step": 3180}, {"loss": 0.8034, "grad_norm": 0.5379388928413391, "learning_rate": 0.0002, "epoch": 2.3157894736842106, "step": 3190}, {"loss": 0.8833, "grad_norm": 0.45770326256752014, "learning_rate": 0.0002, "epoch": 2.323049001814882, "step": 3200}, {"loss": 0.8419, "grad_norm": 0.6984533071517944, "learning_rate": 0.0002, "epoch": 2.3303085299455537, "step": 3210}, {"loss": 0.7414, "grad_norm": 0.6725744605064392, "learning_rate": 0.0002, "epoch": 2.337568058076225, "step": 3220}, {"loss": 0.8657, "grad_norm": 1.1247471570968628, "learning_rate": 0.0002, "epoch": 2.344827586206897, "step": 3230}, {"loss": 0.7782, "grad_norm": 1.0240263938903809, "learning_rate": 0.0002, "epoch": 2.352087114337568, "step": 3240}, {"loss": 0.826, "grad_norm": 0.5608096122741699, "learning_rate": 0.0002, "epoch": 2.3593466424682394, "step": 3250}, {"loss": 0.8191, "grad_norm": 0.8294990062713623, "learning_rate": 0.0002, "epoch": 2.366606170598911, "step": 3260}, {"loss": 0.9845, "grad_norm": 0.6734224557876587, "learning_rate": 0.0002, "epoch": 2.3738656987295825, "step": 3270}, {"loss": 0.7921, "grad_norm": 0.6862800717353821, "learning_rate": 0.0002, "epoch": 2.381125226860254, "step": 3280}, {"loss": 0.7752, "grad_norm": 0.5442930459976196, "learning_rate": 0.0002, "epoch": 2.3883847549909256, "step": 3290}, {"loss": 0.8543, "grad_norm": 0.5745394229888916, "learning_rate": 0.0002, "epoch": 2.395644283121597, "step": 3300}, {"loss": 0.833, "grad_norm": 0.6257799863815308, "learning_rate": 0.0002, "epoch": 2.4029038112522687, "step": 3310}, {"loss": 0.8191, "grad_norm": 0.5608420968055725, "learning_rate": 0.0002, "epoch": 2.4101633393829403, "step": 3320}, {"loss": 0.8745, "grad_norm": 0.5512017011642456, "learning_rate": 0.0002, "epoch": 2.417422867513612, "step": 3330}, {"loss": 0.8642, "grad_norm": 0.7452999353408813, "learning_rate": 0.0002, "epoch": 2.4246823956442833, "step": 3340}, {"loss": 0.8417, "grad_norm": 0.4604301452636719, "learning_rate": 0.0002, "epoch": 2.4319419237749544, "step": 3350}, {"loss": 0.9148, "grad_norm": 0.8225823640823364, "learning_rate": 0.0002, "epoch": 2.439201451905626, "step": 3360}, {"loss": 0.8144, "grad_norm": 0.8604981303215027, "learning_rate": 0.0002, "epoch": 2.4464609800362975, "step": 3370}, {"loss": 0.9276, "grad_norm": 0.6620925664901733, "learning_rate": 0.0002, "epoch": 2.453720508166969, "step": 3380}, {"loss": 0.8381, "grad_norm": 0.4750158488750458, "learning_rate": 0.0002, "epoch": 2.4609800362976406, "step": 3390}, {"loss": 0.8249, "grad_norm": 0.6061418056488037, "learning_rate": 0.0002, "epoch": 2.468239564428312, "step": 3400}, {"loss": 0.8944, "grad_norm": 0.5977247357368469, "learning_rate": 0.0002, "epoch": 2.4754990925589837, "step": 3410}, {"loss": 0.8366, "grad_norm": 0.5004227757453918, "learning_rate": 0.0002, "epoch": 2.4827586206896552, "step": 3420}, {"loss": 0.9133, "grad_norm": 0.46428972482681274, "learning_rate": 0.0002, "epoch": 2.490018148820327, "step": 3430}, {"loss": 0.8534, "grad_norm": 0.7658395171165466, "learning_rate": 0.0002, "epoch": 2.4972776769509983, "step": 3440}, {"loss": 0.9066, "grad_norm": 0.621819794178009, "learning_rate": 0.0002, "epoch": 2.50453720508167, "step": 3450}, {"loss": 0.8481, "grad_norm": 0.4556088149547577, "learning_rate": 0.0002, "epoch": 2.511796733212341, "step": 3460}, {"loss": 0.8522, "grad_norm": 0.6124140024185181, "learning_rate": 0.0002, "epoch": 2.519056261343013, "step": 3470}, {"loss": 0.8158, "grad_norm": 0.6256231665611267, "learning_rate": 0.0002, "epoch": 2.526315789473684, "step": 3480}, {"loss": 0.9097, "grad_norm": 0.5464141964912415, "learning_rate": 0.0002, "epoch": 2.5335753176043556, "step": 3490}, {"loss": 0.8708, "grad_norm": 0.51471346616745, "learning_rate": 0.0002, "epoch": 2.540834845735027, "step": 3500}, {"loss": 0.9042, "grad_norm": 0.5326165556907654, "learning_rate": 0.0002, "epoch": 2.5480943738656987, "step": 3510}, {"loss": 0.8749, "grad_norm": 0.4750378429889679, "learning_rate": 0.0002, "epoch": 2.5553539019963702, "step": 3520}, {"loss": 0.8248, "grad_norm": 0.5292279124259949, "learning_rate": 0.0002, "epoch": 2.562613430127042, "step": 3530}, {"loss": 0.8808, "grad_norm": 0.6145227551460266, "learning_rate": 0.0002, "epoch": 2.5698729582577133, "step": 3540}, {"loss": 0.846, "grad_norm": 0.8275189399719238, "learning_rate": 0.0002, "epoch": 2.577132486388385, "step": 3550}, {"loss": 0.875, "grad_norm": 0.5037438273429871, "learning_rate": 0.0002, "epoch": 2.5843920145190564, "step": 3560}, {"loss": 0.8324, "grad_norm": 0.5838707685470581, "learning_rate": 0.0002, "epoch": 2.5916515426497275, "step": 3570}, {"loss": 0.8716, "grad_norm": 0.5398710370063782, "learning_rate": 0.0002, "epoch": 2.5989110707803995, "step": 3580}, {"loss": 0.7843, "grad_norm": 0.6115376949310303, "learning_rate": 0.0002, "epoch": 2.6061705989110706, "step": 3590}, {"loss": 0.8142, "grad_norm": 1.118809461593628, "learning_rate": 0.0002, "epoch": 2.613430127041742, "step": 3600}, {"loss": 0.8282, "grad_norm": 0.7811771631240845, "learning_rate": 0.0002, "epoch": 2.6206896551724137, "step": 3610}, {"loss": 0.7997, "grad_norm": 0.5753175020217896, "learning_rate": 0.0002, "epoch": 2.6279491833030852, "step": 3620}, {"loss": 0.7705, "grad_norm": 0.550829291343689, "learning_rate": 0.0002, "epoch": 2.635208711433757, "step": 3630}, {"loss": 0.8564, "grad_norm": 0.5360019207000732, "learning_rate": 0.0002, "epoch": 2.6424682395644283, "step": 3640}, {"loss": 0.7648, "grad_norm": 0.63050377368927, "learning_rate": 0.0002, "epoch": 2.6497277676951, "step": 3650}, {"loss": 0.9156, "grad_norm": 0.5833110213279724, "learning_rate": 0.0002, "epoch": 2.6569872958257714, "step": 3660}, {"loss": 0.8033, "grad_norm": 0.5543047189712524, "learning_rate": 0.0002, "epoch": 2.664246823956443, "step": 3670}, {"loss": 0.8867, "grad_norm": 0.6842212080955505, "learning_rate": 0.0002, "epoch": 2.671506352087114, "step": 3680}, {"loss": 0.8569, "grad_norm": 0.6859333515167236, "learning_rate": 0.0002, "epoch": 2.678765880217786, "step": 3690}, {"loss": 0.8081, "grad_norm": 0.7038410902023315, "learning_rate": 0.0002, "epoch": 2.686025408348457, "step": 3700}, {"loss": 0.8686, "grad_norm": 0.447233647108078, "learning_rate": 0.0002, "epoch": 2.6932849364791287, "step": 3710}, {"loss": 0.8951, "grad_norm": 0.6453872919082642, "learning_rate": 0.0002, "epoch": 2.7005444646098002, "step": 3720}, {"loss": 0.8292, "grad_norm": 0.8025672435760498, "learning_rate": 0.0002, "epoch": 2.707803992740472, "step": 3730}, {"loss": 0.8488, "grad_norm": 0.5997087955474854, "learning_rate": 0.0002, "epoch": 2.7150635208711433, "step": 3740}, {"loss": 0.8308, "grad_norm": 0.6901142001152039, "learning_rate": 0.0002, "epoch": 2.722323049001815, "step": 3750}, {"loss": 0.8517, "grad_norm": 1.036145567893982, "learning_rate": 0.0002, "epoch": 2.7295825771324864, "step": 3760}, {"loss": 0.8455, "grad_norm": 0.7207072377204895, "learning_rate": 0.0002, "epoch": 2.736842105263158, "step": 3770}, {"loss": 0.8009, "grad_norm": 1.0452989339828491, "learning_rate": 0.0002, "epoch": 2.7441016333938295, "step": 3780}, {"loss": 0.8868, "grad_norm": 0.5615278482437134, "learning_rate": 0.0002, "epoch": 2.751361161524501, "step": 3790}, {"loss": 0.9047, "grad_norm": 0.46439215540885925, "learning_rate": 0.0002, "epoch": 2.7586206896551726, "step": 3800}, {"loss": 0.7847, "grad_norm": 0.7134917974472046, "learning_rate": 0.0002, "epoch": 2.7658802177858437, "step": 3810}, {"loss": 0.8315, "grad_norm": 0.5139115452766418, "learning_rate": 0.0002, "epoch": 2.7731397459165157, "step": 3820}, {"loss": 0.8995, "grad_norm": 0.8595781326293945, "learning_rate": 0.0002, "epoch": 2.780399274047187, "step": 3830}, {"loss": 0.9079, "grad_norm": 0.544614851474762, "learning_rate": 0.0002, "epoch": 2.7876588021778583, "step": 3840}, {"loss": 0.8322, "grad_norm": 0.6073850393295288, "learning_rate": 0.0002, "epoch": 2.79491833030853, "step": 3850}, {"loss": 0.8277, "grad_norm": 0.8224069476127625, "learning_rate": 0.0002, "epoch": 2.8021778584392014, "step": 3860}, {"loss": 0.815, "grad_norm": 0.5347970128059387, "learning_rate": 0.0002, "epoch": 2.809437386569873, "step": 3870}, {"loss": 0.8219, "grad_norm": 0.6799601912498474, "learning_rate": 0.0002, "epoch": 2.8166969147005445, "step": 3880}, {"loss": 0.8796, "grad_norm": 0.5219197869300842, "learning_rate": 0.0002, "epoch": 2.823956442831216, "step": 3890}, {"loss": 0.8649, "grad_norm": 0.5710130333900452, "learning_rate": 0.0002, "epoch": 2.8312159709618876, "step": 3900}, {"loss": 0.8741, "grad_norm": 0.5857213139533997, "learning_rate": 0.0002, "epoch": 2.838475499092559, "step": 3910}, {"loss": 0.8154, "grad_norm": 0.5206644535064697, "learning_rate": 0.0002, "epoch": 2.8457350272232302, "step": 3920}, {"loss": 0.9328, "grad_norm": 1.2902015447616577, "learning_rate": 0.0002, "epoch": 2.8529945553539022, "step": 3930}, {"loss": 0.8192, "grad_norm": 0.5252797603607178, "learning_rate": 0.0002, "epoch": 2.8602540834845733, "step": 3940}, {"loss": 0.9047, "grad_norm": 0.5925108790397644, "learning_rate": 0.0002, "epoch": 2.867513611615245, "step": 3950}, {"loss": 0.8611, "grad_norm": 0.9719610810279846, "learning_rate": 0.0002, "epoch": 2.8747731397459164, "step": 3960}, {"loss": 0.8531, "grad_norm": 0.6834747195243835, "learning_rate": 0.0002, "epoch": 2.882032667876588, "step": 3970}, {"loss": 0.8124, "grad_norm": 0.6952353119850159, "learning_rate": 0.0002, "epoch": 2.8892921960072595, "step": 3980}, {"loss": 0.8365, "grad_norm": 0.49889910221099854, "learning_rate": 0.0002, "epoch": 2.896551724137931, "step": 3990}, {"loss": 0.8197, "grad_norm": 0.5007026791572571, "learning_rate": 0.0002, "epoch": 2.9038112522686026, "step": 4000}, {"loss": 0.7752, "grad_norm": 0.5474239587783813, "learning_rate": 0.0002, "epoch": 2.911070780399274, "step": 4010}, {"loss": 0.8579, "grad_norm": 0.6617428660392761, "learning_rate": 0.0002, "epoch": 2.9183303085299457, "step": 4020}, {"loss": 0.8583, "grad_norm": 0.6097776293754578, "learning_rate": 0.0002, "epoch": 2.925589836660617, "step": 4030}, {"loss": 0.8296, "grad_norm": 0.5985828638076782, "learning_rate": 0.0002, "epoch": 2.9328493647912888, "step": 4040}, {"loss": 0.9134, "grad_norm": 0.769488513469696, "learning_rate": 0.0002, "epoch": 2.94010889292196, "step": 4050}, {"loss": 0.8148, "grad_norm": 0.5167055130004883, "learning_rate": 0.0002, "epoch": 2.9473684210526314, "step": 4060}, {"loss": 0.8665, "grad_norm": 0.6265496015548706, "learning_rate": 0.0002, "epoch": 2.954627949183303, "step": 4070}, {"loss": 0.9218, "grad_norm": 1.2644082307815552, "learning_rate": 0.0002, "epoch": 2.9618874773139745, "step": 4080}, {"loss": 0.8026, "grad_norm": 0.6007561087608337, "learning_rate": 0.0002, "epoch": 2.969147005444646, "step": 4090}, {"loss": 0.8262, "grad_norm": 0.47984927892684937, "learning_rate": 0.0002, "epoch": 2.9764065335753176, "step": 4100}, {"loss": 0.7897, "grad_norm": 1.128198504447937, "learning_rate": 0.0002, "epoch": 2.983666061705989, "step": 4110}, {"loss": 0.7883, "grad_norm": 0.526292085647583, "learning_rate": 0.0002, "epoch": 2.9909255898366607, "step": 4120}, {"loss": 0.7801, "grad_norm": 0.5462674498558044, "learning_rate": 0.0002, "epoch": 2.9981851179673322, "step": 4130}, {"eval_loss": 1.1093357801437378, "eval_runtime": 46.2498, "eval_samples_per_second": 9.427, "eval_steps_per_second": 1.189, "epoch": 2.9996370235934666, "step": 4132}, {"loss": 0.721, "grad_norm": 0.48288026452064514, "learning_rate": 0.0002, "epoch": 3.0054446460980038, "step": 4140}, {"loss": 0.6769, "grad_norm": 1.0181782245635986, "learning_rate": 0.0002, "epoch": 3.0127041742286753, "step": 4150}, {"loss": 0.7185, "grad_norm": 0.7718019485473633, "learning_rate": 0.0002, "epoch": 3.019963702359347, "step": 4160}, {"loss": 0.6552, "grad_norm": 0.7492219805717468, "learning_rate": 0.0002, "epoch": 3.027223230490018, "step": 4170}, {"loss": 0.6678, "grad_norm": 0.9363632798194885, "learning_rate": 0.0002, "epoch": 3.0344827586206895, "step": 4180}, {"loss": 0.7187, "grad_norm": 0.6888533234596252, "learning_rate": 0.0002, "epoch": 3.041742286751361, "step": 4190}, {"loss": 0.6469, "grad_norm": 0.7072834968566895, "learning_rate": 0.0002, "epoch": 3.0490018148820326, "step": 4200}, {"loss": 0.6387, "grad_norm": 0.7182047963142395, "learning_rate": 0.0002, "epoch": 3.056261343012704, "step": 4210}, {"loss": 0.6385, "grad_norm": 0.7194355130195618, "learning_rate": 0.0002, "epoch": 3.0635208711433757, "step": 4220}, {"loss": 0.5812, "grad_norm": 0.9454023838043213, "learning_rate": 0.0002, "epoch": 3.0707803992740472, "step": 4230}, {"loss": 0.6036, "grad_norm": 0.838657557964325, "learning_rate": 0.0002, "epoch": 3.0780399274047188, "step": 4240}, {"loss": 0.646, "grad_norm": 0.740113377571106, "learning_rate": 0.0002, "epoch": 3.0852994555353903, "step": 4250}, {"loss": 0.604, "grad_norm": 0.6616561412811279, "learning_rate": 0.0002, "epoch": 3.092558983666062, "step": 4260}, {"loss": 0.6462, "grad_norm": 0.8846506476402283, "learning_rate": 0.0002, "epoch": 3.0998185117967334, "step": 4270}, {"loss": 0.6037, "grad_norm": 0.6322125792503357, "learning_rate": 0.0002, "epoch": 3.107078039927405, "step": 4280}, {"loss": 0.5953, "grad_norm": 0.7461467385292053, "learning_rate": 0.0002, "epoch": 3.114337568058076, "step": 4290}, {"loss": 0.6099, "grad_norm": 0.8251287341117859, "learning_rate": 0.0002, "epoch": 3.1215970961887476, "step": 4300}, {"loss": 0.6284, "grad_norm": 0.8767673373222351, "learning_rate": 0.0002, "epoch": 3.128856624319419, "step": 4310}, {"loss": 0.7535, "grad_norm": 0.7758759260177612, "learning_rate": 0.0002, "epoch": 3.1361161524500907, "step": 4320}, {"loss": 0.6624, "grad_norm": 1.1056879758834839, "learning_rate": 0.0002, "epoch": 3.143375680580762, "step": 4330}, {"loss": 0.691, "grad_norm": 0.8259835243225098, "learning_rate": 0.0002, "epoch": 3.1506352087114338, "step": 4340}, {"loss": 0.6635, "grad_norm": 0.6607027053833008, "learning_rate": 0.0002, "epoch": 3.1578947368421053, "step": 4350}, {"loss": 0.5911, "grad_norm": 0.7983301281929016, "learning_rate": 0.0002, "epoch": 3.165154264972777, "step": 4360}, {"loss": 0.6496, "grad_norm": 0.6725239157676697, "learning_rate": 0.0002, "epoch": 3.1724137931034484, "step": 4370}, {"loss": 0.5966, "grad_norm": 0.9052095413208008, "learning_rate": 0.0002, "epoch": 3.17967332123412, "step": 4380}, {"loss": 0.6877, "grad_norm": 0.8131307363510132, "learning_rate": 0.0002, "epoch": 3.1869328493647915, "step": 4390}, {"loss": 0.6384, "grad_norm": 0.6435626149177551, "learning_rate": 0.0002, "epoch": 3.1941923774954626, "step": 4400}, {"loss": 0.5819, "grad_norm": 0.84367436170578, "learning_rate": 0.0002, "epoch": 3.201451905626134, "step": 4410}, {"loss": 0.6104, "grad_norm": 1.5018867254257202, "learning_rate": 0.0002, "epoch": 3.2087114337568057, "step": 4420}, {"loss": 0.6838, "grad_norm": 0.7019091844558716, "learning_rate": 0.0002, "epoch": 3.215970961887477, "step": 4430}, {"loss": 0.6153, "grad_norm": 0.9164197444915771, "learning_rate": 0.0002, "epoch": 3.2232304900181488, "step": 4440}, {"loss": 0.6618, "grad_norm": 0.7890861630439758, "learning_rate": 0.0002, "epoch": 3.2304900181488203, "step": 4450}, {"loss": 0.6401, "grad_norm": 0.6517660617828369, "learning_rate": 0.0002, "epoch": 3.237749546279492, "step": 4460}, {"loss": 0.6699, "grad_norm": 1.10188889503479, "learning_rate": 0.0002, "epoch": 3.2450090744101634, "step": 4470}, {"loss": 0.6356, "grad_norm": 0.8158330917358398, "learning_rate": 0.0002, "epoch": 3.252268602540835, "step": 4480}, {"loss": 0.7757, "grad_norm": 0.7663109302520752, "learning_rate": 0.0002, "epoch": 3.2595281306715065, "step": 4490}, {"loss": 0.6539, "grad_norm": 0.8473444581031799, "learning_rate": 0.0002, "epoch": 3.266787658802178, "step": 4500}, {"loss": 0.6511, "grad_norm": 0.9724768996238708, "learning_rate": 0.0002, "epoch": 3.274047186932849, "step": 4510}, {"loss": 0.5464, "grad_norm": 0.8516759276390076, "learning_rate": 0.0002, "epoch": 3.281306715063521, "step": 4520}, {"loss": 0.6534, "grad_norm": 0.7543437480926514, "learning_rate": 0.0002, "epoch": 3.288566243194192, "step": 4530}, {"loss": 0.6095, "grad_norm": 1.0472029447555542, "learning_rate": 0.0002, "epoch": 3.2958257713248638, "step": 4540}, {"loss": 0.6216, "grad_norm": 0.6240826845169067, "learning_rate": 0.0002, "epoch": 3.3030852994555353, "step": 4550}, {"loss": 0.6223, "grad_norm": 0.9957774877548218, "learning_rate": 0.0002, "epoch": 3.310344827586207, "step": 4560}, {"loss": 0.618, "grad_norm": 0.6448912620544434, "learning_rate": 0.0002, "epoch": 3.3176043557168784, "step": 4570}, {"loss": 0.6188, "grad_norm": 0.7519692778587341, "learning_rate": 0.0002, "epoch": 3.32486388384755, "step": 4580}, {"loss": 0.6672, "grad_norm": 0.7367453575134277, "learning_rate": 0.0002, "epoch": 3.3321234119782215, "step": 4590}, {"loss": 0.6517, "grad_norm": 0.8064960837364197, "learning_rate": 0.0002, "epoch": 3.339382940108893, "step": 4600}, {"loss": 0.6062, "grad_norm": 0.7664631009101868, "learning_rate": 0.0002, "epoch": 3.3466424682395646, "step": 4610}, {"loss": 0.6834, "grad_norm": 0.7803396582603455, "learning_rate": 0.0002, "epoch": 3.353901996370236, "step": 4620}, {"loss": 0.6961, "grad_norm": 0.9141599535942078, "learning_rate": 0.0002, "epoch": 3.3611615245009077, "step": 4630}, {"loss": 0.6889, "grad_norm": 0.9719856381416321, "learning_rate": 0.0002, "epoch": 3.3684210526315788, "step": 4640}, {"loss": 0.6914, "grad_norm": 0.9223218560218811, "learning_rate": 0.0002, "epoch": 3.3756805807622503, "step": 4650}, {"loss": 0.5981, "grad_norm": 0.7289277911186218, "learning_rate": 0.0002, "epoch": 3.382940108892922, "step": 4660}, {"loss": 0.595, "grad_norm": 1.039724349975586, "learning_rate": 0.0002, "epoch": 3.3901996370235934, "step": 4670}, {"loss": 0.8121, "grad_norm": 1.397438883781433, "learning_rate": 0.0002, "epoch": 3.397459165154265, "step": 4680}, {"loss": 0.6334, "grad_norm": 1.0069999694824219, "learning_rate": 0.0002, "epoch": 3.4047186932849365, "step": 4690}, {"loss": 0.6598, "grad_norm": 0.816291332244873, "learning_rate": 0.0002, "epoch": 3.411978221415608, "step": 4700}, {"loss": 0.6748, "grad_norm": 1.2831530570983887, "learning_rate": 0.0002, "epoch": 3.4192377495462796, "step": 4710}, {"loss": 0.6625, "grad_norm": 0.9573889970779419, "learning_rate": 0.0002, "epoch": 3.426497277676951, "step": 4720}, {"loss": 0.7279, "grad_norm": 0.7685632705688477, "learning_rate": 0.0002, "epoch": 3.4337568058076227, "step": 4730}, {"loss": 0.6104, "grad_norm": 0.7019195556640625, "learning_rate": 0.0002, "epoch": 3.441016333938294, "step": 4740}, {"loss": 0.7606, "grad_norm": 0.7244833707809448, "learning_rate": 0.0002, "epoch": 3.4482758620689653, "step": 4750}, {"loss": 0.6951, "grad_norm": 1.3468551635742188, "learning_rate": 0.0002, "epoch": 3.455535390199637, "step": 4760}, {"loss": 0.6945, "grad_norm": 0.822846531867981, "learning_rate": 0.0002, "epoch": 3.4627949183303084, "step": 4770}, {"loss": 0.6431, "grad_norm": 0.7311608195304871, "learning_rate": 0.0002, "epoch": 3.47005444646098, "step": 4780}, {"loss": 0.7019, "grad_norm": 0.9466770887374878, "learning_rate": 0.0002, "epoch": 3.4773139745916515, "step": 4790}, {"loss": 0.7767, "grad_norm": 1.1527155637741089, "learning_rate": 0.0002, "epoch": 3.484573502722323, "step": 4800}, {"loss": 0.6882, "grad_norm": 1.1288906335830688, "learning_rate": 0.0002, "epoch": 3.4918330308529946, "step": 4810}, {"loss": 0.6564, "grad_norm": 0.9096164107322693, "learning_rate": 0.0002, "epoch": 3.499092558983666, "step": 4820}, {"loss": 0.6127, "grad_norm": 0.7988565564155579, "learning_rate": 0.0002, "epoch": 3.5063520871143377, "step": 4830}, {"loss": 0.7004, "grad_norm": 0.7183415293693542, "learning_rate": 0.0002, "epoch": 3.513611615245009, "step": 4840}, {"loss": 0.74, "grad_norm": 0.6614915132522583, "learning_rate": 0.0002, "epoch": 3.5208711433756807, "step": 4850}, {"loss": 0.7271, "grad_norm": 0.8609521985054016, "learning_rate": 0.0002, "epoch": 3.528130671506352, "step": 4860}, {"loss": 0.6664, "grad_norm": 0.86552894115448, "learning_rate": 0.0002, "epoch": 3.535390199637024, "step": 4870}, {"loss": 0.6432, "grad_norm": 0.6926496028900146, "learning_rate": 0.0002, "epoch": 3.542649727767695, "step": 4880}, {"loss": 0.7117, "grad_norm": 0.8157467246055603, "learning_rate": 0.0002, "epoch": 3.5499092558983665, "step": 4890}, {"loss": 0.6201, "grad_norm": 0.9085357189178467, "learning_rate": 0.0002, "epoch": 3.557168784029038, "step": 4900}, {"loss": 0.6521, "grad_norm": 0.6322644948959351, "learning_rate": 0.0002, "epoch": 3.5644283121597096, "step": 4910}, {"loss": 0.6607, "grad_norm": 1.263205885887146, "learning_rate": 0.0002, "epoch": 3.571687840290381, "step": 4920}, {"loss": 0.6657, "grad_norm": 0.8901070356369019, "learning_rate": 0.0002, "epoch": 3.5789473684210527, "step": 4930}, {"loss": 0.6434, "grad_norm": 0.7983952164649963, "learning_rate": 0.0002, "epoch": 3.586206896551724, "step": 4940}, {"loss": 0.6861, "grad_norm": 0.9887813925743103, "learning_rate": 0.0002, "epoch": 3.5934664246823957, "step": 4950}, {"loss": 0.6502, "grad_norm": 0.7895187735557556, "learning_rate": 0.0002, "epoch": 3.6007259528130673, "step": 4960}, {"loss": 0.7111, "grad_norm": 0.9685819745063782, "learning_rate": 0.0002, "epoch": 3.6079854809437384, "step": 4970}, {"loss": 0.6915, "grad_norm": 0.6576591730117798, "learning_rate": 0.0002, "epoch": 3.6152450090744104, "step": 4980}, {"loss": 0.6195, "grad_norm": 0.856985330581665, "learning_rate": 0.0002, "epoch": 3.6225045372050815, "step": 4990}, {"loss": 0.6318, "grad_norm": 0.7230252623558044, "learning_rate": 0.0002, "epoch": 3.629764065335753, "step": 5000}, {"loss": 0.742, "grad_norm": 0.8260893821716309, "learning_rate": 0.0002, "epoch": 3.6370235934664246, "step": 5010}, {"loss": 0.7223, "grad_norm": 0.7635950446128845, "learning_rate": 0.0002, "epoch": 3.644283121597096, "step": 5020}, {"loss": 0.6837, "grad_norm": 0.7060768604278564, "learning_rate": 0.0002, "epoch": 3.6515426497277677, "step": 5030}, {"loss": 0.6921, "grad_norm": 0.8020303249359131, "learning_rate": 0.0002, "epoch": 3.658802177858439, "step": 5040}, {"loss": 0.6446, "grad_norm": 0.8530341386795044, "learning_rate": 0.0002, "epoch": 3.6660617059891107, "step": 5050}, {"loss": 0.7222, "grad_norm": 0.6667101979255676, "learning_rate": 0.0002, "epoch": 3.6733212341197823, "step": 5060}, {"loss": 0.7081, "grad_norm": 0.7385406494140625, "learning_rate": 0.0002, "epoch": 3.680580762250454, "step": 5070}, {"loss": 0.7035, "grad_norm": 0.7753380537033081, "learning_rate": 0.0002, "epoch": 3.6878402903811254, "step": 5080}, {"loss": 0.6491, "grad_norm": 0.7516207098960876, "learning_rate": 0.0002, "epoch": 3.695099818511797, "step": 5090}, {"loss": 0.672, "grad_norm": 0.8171586394309998, "learning_rate": 0.0002, "epoch": 3.702359346642468, "step": 5100}, {"loss": 0.7459, "grad_norm": 1.0796279907226562, "learning_rate": 0.0002, "epoch": 3.70961887477314, "step": 5110}, {"loss": 0.5948, "grad_norm": 0.6957688927650452, "learning_rate": 0.0002, "epoch": 3.716878402903811, "step": 5120}, {"loss": 0.7515, "grad_norm": 0.8550161719322205, "learning_rate": 0.0002, "epoch": 3.7241379310344827, "step": 5130}, {"loss": 0.7286, "grad_norm": 0.9396728277206421, "learning_rate": 0.0002, "epoch": 3.731397459165154, "step": 5140}, {"loss": 0.7594, "grad_norm": 1.4264805316925049, "learning_rate": 0.0002, "epoch": 3.7386569872958257, "step": 5150}, {"loss": 0.6575, "grad_norm": 0.8725108504295349, "learning_rate": 0.0002, "epoch": 3.7459165154264973, "step": 5160}, {"loss": 0.6778, "grad_norm": 1.0346195697784424, "learning_rate": 0.0002, "epoch": 3.753176043557169, "step": 5170}, {"loss": 0.6371, "grad_norm": 0.5395554304122925, "learning_rate": 0.0002, "epoch": 3.7604355716878404, "step": 5180}, {"loss": 0.7308, "grad_norm": 1.3153616189956665, "learning_rate": 0.0002, "epoch": 3.767695099818512, "step": 5190}, {"loss": 0.78, "grad_norm": 0.9879828691482544, "learning_rate": 0.0002, "epoch": 3.7749546279491835, "step": 5200}, {"loss": 0.7068, "grad_norm": 0.8876672983169556, "learning_rate": 0.0002, "epoch": 3.7822141560798546, "step": 5210}, {"loss": 0.6283, "grad_norm": 0.8363267779350281, "learning_rate": 0.0002, "epoch": 3.7894736842105265, "step": 5220}, {"loss": 0.6255, "grad_norm": 0.637294590473175, "learning_rate": 0.0002, "epoch": 3.7967332123411976, "step": 5230}, {"loss": 0.6685, "grad_norm": 1.1408970355987549, "learning_rate": 0.0002, "epoch": 3.803992740471869, "step": 5240}, {"loss": 0.6761, "grad_norm": 1.0128360986709595, "learning_rate": 0.0002, "epoch": 3.8112522686025407, "step": 5250}, {"loss": 0.6764, "grad_norm": 0.8061144351959229, "learning_rate": 0.0002, "epoch": 3.8185117967332123, "step": 5260}, {"loss": 0.7254, "grad_norm": 0.9626626968383789, "learning_rate": 0.0002, "epoch": 3.825771324863884, "step": 5270}, {"loss": 0.7367, "grad_norm": 0.9013627171516418, "learning_rate": 0.0002, "epoch": 3.8330308529945554, "step": 5280}, {"loss": 0.6806, "grad_norm": 0.8411344289779663, "learning_rate": 0.0002, "epoch": 3.840290381125227, "step": 5290}, {"loss": 0.6818, "grad_norm": 0.7426059246063232, "learning_rate": 0.0002, "epoch": 3.8475499092558985, "step": 5300}, {"loss": 0.6748, "grad_norm": 1.003413438796997, "learning_rate": 0.0002, "epoch": 3.85480943738657, "step": 5310}, {"loss": 0.8554, "grad_norm": 0.7527840733528137, "learning_rate": 0.0002, "epoch": 3.862068965517241, "step": 5320}, {"loss": 0.7521, "grad_norm": 0.738610565662384, "learning_rate": 0.0002, "epoch": 3.869328493647913, "step": 5330}, {"loss": 0.7266, "grad_norm": 0.7277999520301819, "learning_rate": 0.0002, "epoch": 3.876588021778584, "step": 5340}, {"loss": 0.7503, "grad_norm": 0.5951359272003174, "learning_rate": 0.0002, "epoch": 3.8838475499092557, "step": 5350}, {"loss": 0.7447, "grad_norm": 1.043884038925171, "learning_rate": 0.0002, "epoch": 3.8911070780399273, "step": 5360}, {"loss": 0.6862, "grad_norm": 0.8436498045921326, "learning_rate": 0.0002, "epoch": 3.898366606170599, "step": 5370}, {"loss": 0.665, "grad_norm": 0.5603365302085876, "learning_rate": 0.0002, "epoch": 3.9056261343012704, "step": 5380}, {"loss": 0.7098, "grad_norm": 1.0128886699676514, "learning_rate": 0.0002, "epoch": 3.912885662431942, "step": 5390}, {"loss": 0.6707, "grad_norm": 0.7970930337905884, "learning_rate": 0.0002, "epoch": 3.9201451905626135, "step": 5400}, {"loss": 0.637, "grad_norm": 0.7699369192123413, "learning_rate": 0.0002, "epoch": 3.927404718693285, "step": 5410}, {"loss": 0.6742, "grad_norm": 0.800561249256134, "learning_rate": 0.0002, "epoch": 3.9346642468239565, "step": 5420}, {"loss": 0.7208, "grad_norm": 0.8020331859588623, "learning_rate": 0.0002, "epoch": 3.941923774954628, "step": 5430}, {"loss": 0.7294, "grad_norm": 0.7461140155792236, "learning_rate": 0.0002, "epoch": 3.9491833030852996, "step": 5440}, {"loss": 0.7013, "grad_norm": 0.8346918821334839, "learning_rate": 0.0002, "epoch": 3.9564428312159707, "step": 5450}, {"loss": 0.6289, "grad_norm": 0.9723302125930786, "learning_rate": 0.0002, "epoch": 3.9637023593466427, "step": 5460}, {"loss": 0.8029, "grad_norm": 0.6809740662574768, "learning_rate": 0.0002, "epoch": 3.970961887477314, "step": 5470}, {"loss": 0.6896, "grad_norm": 0.7353498339653015, "learning_rate": 0.0002, "epoch": 3.9782214156079854, "step": 5480}, {"loss": 0.6722, "grad_norm": 0.748009443283081, "learning_rate": 0.0002, "epoch": 3.985480943738657, "step": 5490}, {"loss": 0.6866, "grad_norm": 1.3656195402145386, "learning_rate": 0.0002, "epoch": 3.9927404718693285, "step": 5500}, {"loss": 0.7368, "grad_norm": 0.8402108550071716, "learning_rate": 0.0002, "epoch": 4.0, "step": 5510}, {"eval_loss": 1.17229425907135, "eval_runtime": 46.2554, "eval_samples_per_second": 9.426, "eval_steps_per_second": 1.189, "epoch": 4.0, "step": 5510}, {"loss": 0.4637, "grad_norm": 0.8601235747337341, "learning_rate": 0.0002, "epoch": 4.007259528130671, "step": 5520}, {"loss": 0.4717, "grad_norm": 1.2635200023651123, "learning_rate": 0.0002, "epoch": 4.014519056261343, "step": 5530}, {"loss": 0.503, "grad_norm": 1.0257477760314941, "learning_rate": 0.0002, "epoch": 4.021778584392014, "step": 5540}, {"loss": 0.4547, "grad_norm": 0.9436745047569275, "learning_rate": 0.0002, "epoch": 4.029038112522686, "step": 5550}, {"loss": 0.459, "grad_norm": 0.9443606734275818, "learning_rate": 0.0002, "epoch": 4.036297640653357, "step": 5560}, {"loss": 0.5386, "grad_norm": 1.3965742588043213, "learning_rate": 0.0002, "epoch": 4.043557168784029, "step": 5570}, {"loss": 0.4248, "grad_norm": 0.8973520398139954, "learning_rate": 0.0002, "epoch": 4.0508166969147, "step": 5580}, {"loss": 0.4111, "grad_norm": 0.9998409748077393, "learning_rate": 0.0002, "epoch": 4.058076225045372, "step": 5590}, {"loss": 0.4828, "grad_norm": 1.1213387250900269, "learning_rate": 0.0002, "epoch": 4.0653357531760435, "step": 5600}, {"loss": 0.439, "grad_norm": 0.7064558863639832, "learning_rate": 0.0002, "epoch": 4.072595281306715, "step": 5610}, {"loss": 0.4607, "grad_norm": 1.2390803098678589, "learning_rate": 0.0002, "epoch": 4.0798548094373865, "step": 5620}, {"loss": 0.5014, "grad_norm": 1.123469591140747, "learning_rate": 0.0002, "epoch": 4.087114337568058, "step": 5630}, {"loss": 0.513, "grad_norm": 1.229573369026184, "learning_rate": 0.0002, "epoch": 4.09437386569873, "step": 5640}, {"loss": 0.5258, "grad_norm": 1.7182831764221191, "learning_rate": 0.0002, "epoch": 4.101633393829401, "step": 5650}, {"loss": 0.5371, "grad_norm": 0.894903302192688, "learning_rate": 0.0002, "epoch": 4.108892921960073, "step": 5660}, {"loss": 0.4813, "grad_norm": 0.8754552006721497, "learning_rate": 0.0002, "epoch": 4.116152450090744, "step": 5670}, {"loss": 0.491, "grad_norm": 1.2401553392410278, "learning_rate": 0.0002, "epoch": 4.123411978221416, "step": 5680}, {"loss": 0.4549, "grad_norm": 0.8631148934364319, "learning_rate": 0.0002, "epoch": 4.130671506352087, "step": 5690}, {"loss": 0.487, "grad_norm": 1.1798022985458374, "learning_rate": 0.0002, "epoch": 4.137931034482759, "step": 5700}, {"loss": 0.4522, "grad_norm": 0.8344549536705017, "learning_rate": 0.0002, "epoch": 4.14519056261343, "step": 5710}, {"loss": 0.4559, "grad_norm": 1.2342697381973267, "learning_rate": 0.0002, "epoch": 4.152450090744102, "step": 5720}, {"loss": 0.53, "grad_norm": 1.1601094007492065, "learning_rate": 0.0002, "epoch": 4.159709618874773, "step": 5730}, {"loss": 0.4755, "grad_norm": 1.2925703525543213, "learning_rate": 0.0002, "epoch": 4.166969147005445, "step": 5740}, {"loss": 0.4973, "grad_norm": 1.0870997905731201, "learning_rate": 0.0002, "epoch": 4.174228675136116, "step": 5750}, {"loss": 0.5184, "grad_norm": 0.9077792763710022, "learning_rate": 0.0002, "epoch": 4.181488203266787, "step": 5760}, {"loss": 0.4606, "grad_norm": 1.009273886680603, "learning_rate": 0.0002, "epoch": 4.188747731397459, "step": 5770}, {"loss": 0.5383, "grad_norm": 1.2465530633926392, "learning_rate": 0.0002, "epoch": 4.19600725952813, "step": 5780}, {"loss": 0.4938, "grad_norm": 1.2261253595352173, "learning_rate": 0.0002, "epoch": 4.203266787658802, "step": 5790}, {"loss": 0.5255, "grad_norm": 1.1498041152954102, "learning_rate": 0.0002, "epoch": 4.2105263157894735, "step": 5800}, {"loss": 0.5269, "grad_norm": 1.1966725587844849, "learning_rate": 0.0002, "epoch": 4.217785843920145, "step": 5810}, {"loss": 0.5626, "grad_norm": 1.2651296854019165, "learning_rate": 0.0002, "epoch": 4.2250453720508165, "step": 5820}, {"loss": 0.5213, "grad_norm": 1.0388574600219727, "learning_rate": 0.0002, "epoch": 4.2323049001814885, "step": 5830}, {"loss": 0.4965, "grad_norm": 1.3042771816253662, "learning_rate": 0.0002, "epoch": 4.23956442831216, "step": 5840}, {"loss": 0.5116, "grad_norm": 1.1127727031707764, "learning_rate": 0.0002, "epoch": 4.246823956442832, "step": 5850}, {"loss": 0.5197, "grad_norm": 0.9653958082199097, "learning_rate": 0.0002, "epoch": 4.254083484573503, "step": 5860}, {"loss": 0.4414, "grad_norm": 1.0500504970550537, "learning_rate": 0.0002, "epoch": 4.261343012704174, "step": 5870}, {"loss": 0.544, "grad_norm": 1.1476165056228638, "learning_rate": 0.0002, "epoch": 4.268602540834846, "step": 5880}, {"loss": 0.4667, "grad_norm": 0.9424414038658142, "learning_rate": 0.0002, "epoch": 4.275862068965517, "step": 5890}, {"loss": 0.5039, "grad_norm": 1.3309166431427002, "learning_rate": 0.0002, "epoch": 4.283121597096189, "step": 5900}, {"loss": 0.5472, "grad_norm": 1.3025873899459839, "learning_rate": 0.0002, "epoch": 4.29038112522686, "step": 5910}, {"loss": 0.4644, "grad_norm": 1.1442325115203857, "learning_rate": 0.0002, "epoch": 4.297640653357532, "step": 5920}, {"loss": 0.5066, "grad_norm": 0.9820859432220459, "learning_rate": 0.0002, "epoch": 4.304900181488203, "step": 5930}, {"loss": 0.5224, "grad_norm": 0.9615740180015564, "learning_rate": 0.0002, "epoch": 4.312159709618875, "step": 5940}, {"loss": 0.5665, "grad_norm": 1.1627109050750732, "learning_rate": 0.0002, "epoch": 4.319419237749546, "step": 5950}, {"loss": 0.4852, "grad_norm": 0.9381322860717773, "learning_rate": 0.0002, "epoch": 4.326678765880218, "step": 5960}, {"loss": 0.4532, "grad_norm": 0.8154335618019104, "learning_rate": 0.0002, "epoch": 4.333938294010889, "step": 5970}, {"loss": 0.5237, "grad_norm": 0.877671480178833, "learning_rate": 0.0002, "epoch": 4.341197822141561, "step": 5980}, {"loss": 0.6113, "grad_norm": 1.1742031574249268, "learning_rate": 0.0002, "epoch": 4.348457350272232, "step": 5990}, {"loss": 0.5704, "grad_norm": 1.0352917909622192, "learning_rate": 0.0002, "epoch": 4.3557168784029034, "step": 6000}, {"loss": 0.4996, "grad_norm": 0.9963878989219666, "learning_rate": 0.0002, "epoch": 4.362976406533575, "step": 6010}, {"loss": 0.4813, "grad_norm": 1.1892237663269043, "learning_rate": 0.0002, "epoch": 4.3702359346642465, "step": 6020}, {"loss": 0.5224, "grad_norm": 1.2516111135482788, "learning_rate": 0.0002, "epoch": 4.3774954627949185, "step": 6030}, {"loss": 0.5952, "grad_norm": 1.2111951112747192, "learning_rate": 0.0002, "epoch": 4.38475499092559, "step": 6040}, {"loss": 0.4275, "grad_norm": 1.0820083618164062, "learning_rate": 0.0002, "epoch": 4.392014519056262, "step": 6050}, {"loss": 0.5117, "grad_norm": 1.033915638923645, "learning_rate": 0.0002, "epoch": 4.399274047186933, "step": 6060}, {"loss": 0.5431, "grad_norm": 1.0635870695114136, "learning_rate": 0.0002, "epoch": 4.406533575317605, "step": 6070}, {"loss": 0.5341, "grad_norm": 1.0520414113998413, "learning_rate": 0.0002, "epoch": 4.413793103448276, "step": 6080}, {"loss": 0.512, "grad_norm": 1.0821926593780518, "learning_rate": 0.0002, "epoch": 4.421052631578947, "step": 6090}, {"loss": 0.5065, "grad_norm": 1.0533246994018555, "learning_rate": 0.0002, "epoch": 4.428312159709619, "step": 6100}, {"loss": 0.4577, "grad_norm": 0.9231932759284973, "learning_rate": 0.0002, "epoch": 4.43557168784029, "step": 6110}, {"loss": 0.583, "grad_norm": 0.9910260438919067, "learning_rate": 0.0002, "epoch": 4.442831215970962, "step": 6120}, {"loss": 0.4717, "grad_norm": 1.061949372291565, "learning_rate": 0.0002, "epoch": 4.450090744101633, "step": 6130}, {"loss": 0.5893, "grad_norm": 1.2927039861679077, "learning_rate": 0.0002, "epoch": 4.457350272232305, "step": 6140}, {"loss": 0.4684, "grad_norm": 1.3966081142425537, "learning_rate": 0.0002, "epoch": 4.464609800362976, "step": 6150}, {"loss": 0.5507, "grad_norm": 1.3835992813110352, "learning_rate": 0.0002, "epoch": 4.471869328493648, "step": 6160}, {"loss": 0.5911, "grad_norm": 1.0892692804336548, "learning_rate": 0.0002, "epoch": 4.479128856624319, "step": 6170}, {"loss": 0.478, "grad_norm": 1.0318800210952759, "learning_rate": 0.0002, "epoch": 4.486388384754991, "step": 6180}, {"loss": 0.5198, "grad_norm": 0.8174677491188049, "learning_rate": 0.0002, "epoch": 4.493647912885662, "step": 6190}, {"loss": 0.5387, "grad_norm": 1.4157509803771973, "learning_rate": 0.0002, "epoch": 4.500907441016334, "step": 6200}, {"loss": 0.5868, "grad_norm": 1.5244114398956299, "learning_rate": 0.0002, "epoch": 4.508166969147005, "step": 6210}, {"loss": 0.4642, "grad_norm": 0.8164850473403931, "learning_rate": 0.0002, "epoch": 4.5154264972776765, "step": 6220}, {"loss": 0.522, "grad_norm": 1.2904746532440186, "learning_rate": 0.0002, "epoch": 4.5226860254083485, "step": 6230}, {"loss": 0.5103, "grad_norm": 0.7987732887268066, "learning_rate": 0.0002, "epoch": 4.52994555353902, "step": 6240}, {"loss": 0.4615, "grad_norm": 0.831040620803833, "learning_rate": 0.0002, "epoch": 4.537205081669692, "step": 6250}, {"loss": 0.5065, "grad_norm": 0.9545485973358154, "learning_rate": 0.0002, "epoch": 4.544464609800363, "step": 6260}, {"loss": 0.5515, "grad_norm": 0.9291793704032898, "learning_rate": 0.0002, "epoch": 4.551724137931035, "step": 6270}, {"loss": 0.4535, "grad_norm": 0.8977208733558655, "learning_rate": 0.0002, "epoch": 4.558983666061706, "step": 6280}, {"loss": 0.544, "grad_norm": 1.1768537759780884, "learning_rate": 0.0002, "epoch": 4.566243194192378, "step": 6290}, {"loss": 0.5925, "grad_norm": 1.0688952207565308, "learning_rate": 0.0002, "epoch": 4.573502722323049, "step": 6300}, {"loss": 0.5207, "grad_norm": 0.8800966739654541, "learning_rate": 0.0002, "epoch": 4.580762250453721, "step": 6310}, {"loss": 0.6106, "grad_norm": 1.0911834239959717, "learning_rate": 0.0002, "epoch": 4.588021778584392, "step": 6320}, {"loss": 0.5109, "grad_norm": 1.1420872211456299, "learning_rate": 0.0002, "epoch": 4.595281306715064, "step": 6330}, {"loss": 0.5147, "grad_norm": 1.0215224027633667, "learning_rate": 0.0002, "epoch": 4.602540834845735, "step": 6340}, {"loss": 0.592, "grad_norm": 0.9685489535331726, "learning_rate": 0.0002, "epoch": 4.609800362976406, "step": 6350}, {"loss": 0.5775, "grad_norm": 1.12773597240448, "learning_rate": 0.0002, "epoch": 4.617059891107078, "step": 6360}, {"loss": 0.5966, "grad_norm": 1.0663973093032837, "learning_rate": 0.0002, "epoch": 4.624319419237749, "step": 6370}, {"loss": 0.512, "grad_norm": 1.1707262992858887, "learning_rate": 0.0002, "epoch": 4.631578947368421, "step": 6380}, {"loss": 0.5497, "grad_norm": 1.0672980546951294, "learning_rate": 0.0002, "epoch": 4.638838475499092, "step": 6390}, {"loss": 0.5699, "grad_norm": 1.1464333534240723, "learning_rate": 0.0002, "epoch": 4.646098003629764, "step": 6400}, {"loss": 0.5514, "grad_norm": 1.070230484008789, "learning_rate": 0.0002, "epoch": 4.653357531760435, "step": 6410}, {"loss": 0.5013, "grad_norm": 0.9673764109611511, "learning_rate": 0.0002, "epoch": 4.660617059891107, "step": 6420}, {"loss": 0.5901, "grad_norm": 1.0189043283462524, "learning_rate": 0.0002, "epoch": 4.6678765880217785, "step": 6430}, {"loss": 0.5193, "grad_norm": 1.185896396636963, "learning_rate": 0.0002, "epoch": 4.67513611615245, "step": 6440}, {"loss": 0.5318, "grad_norm": 1.0682812929153442, "learning_rate": 0.0002, "epoch": 4.682395644283122, "step": 6450}, {"loss": 0.5773, "grad_norm": 1.3586071729660034, "learning_rate": 0.0002, "epoch": 4.689655172413794, "step": 6460}, {"loss": 0.5482, "grad_norm": 0.6561792492866516, "learning_rate": 0.0002, "epoch": 4.696914700544465, "step": 6470}, {"loss": 0.5711, "grad_norm": 1.1394113302230835, "learning_rate": 0.0002, "epoch": 4.704174228675136, "step": 6480}, {"loss": 0.5325, "grad_norm": 0.9683151245117188, "learning_rate": 0.0002, "epoch": 4.711433756805808, "step": 6490}, {"loss": 0.5139, "grad_norm": 1.0247553586959839, "learning_rate": 0.0002, "epoch": 4.718693284936479, "step": 6500}, {"loss": 0.5794, "grad_norm": 0.8046169281005859, "learning_rate": 0.0002, "epoch": 4.725952813067151, "step": 6510}, {"loss": 0.5471, "grad_norm": 1.0710240602493286, "learning_rate": 0.0002, "epoch": 4.733212341197822, "step": 6520}, {"loss": 0.4805, "grad_norm": 0.9438924193382263, "learning_rate": 0.0002, "epoch": 4.740471869328494, "step": 6530}, {"loss": 0.5404, "grad_norm": 0.869162380695343, "learning_rate": 0.0002, "epoch": 4.747731397459165, "step": 6540}, {"loss": 0.6379, "grad_norm": 0.9776787161827087, "learning_rate": 0.0002, "epoch": 4.754990925589837, "step": 6550}, {"loss": 0.5288, "grad_norm": 1.1990505456924438, "learning_rate": 0.0002, "epoch": 4.762250453720508, "step": 6560}, {"loss": 0.5539, "grad_norm": 1.0582209825515747, "learning_rate": 0.0002, "epoch": 4.769509981851179, "step": 6570}, {"loss": 0.489, "grad_norm": 0.9966367483139038, "learning_rate": 0.0002, "epoch": 4.776769509981851, "step": 6580}, {"loss": 0.5514, "grad_norm": 0.9130612015724182, "learning_rate": 0.0002, "epoch": 4.784029038112522, "step": 6590}, {"loss": 0.5864, "grad_norm": 1.0950500965118408, "learning_rate": 0.0002, "epoch": 4.791288566243194, "step": 6600}, {"loss": 0.5266, "grad_norm": 1.108681321144104, "learning_rate": 0.0002, "epoch": 4.798548094373865, "step": 6610}, {"loss": 0.5875, "grad_norm": 1.1873763799667358, "learning_rate": 0.0002, "epoch": 4.805807622504537, "step": 6620}, {"loss": 0.5736, "grad_norm": 1.305367112159729, "learning_rate": 0.0002, "epoch": 4.8130671506352085, "step": 6630}, {"loss": 0.5636, "grad_norm": 1.2801482677459717, "learning_rate": 0.0002, "epoch": 4.8203266787658805, "step": 6640}, {"loss": 0.582, "grad_norm": 1.26764976978302, "learning_rate": 0.0002, "epoch": 4.827586206896552, "step": 6650}, {"loss": 0.5259, "grad_norm": 1.0018208026885986, "learning_rate": 0.0002, "epoch": 4.834845735027224, "step": 6660}, {"loss": 0.548, "grad_norm": 1.2326326370239258, "learning_rate": 0.0002, "epoch": 4.842105263157895, "step": 6670}, {"loss": 0.5933, "grad_norm": 0.9707282781600952, "learning_rate": 0.0002, "epoch": 4.849364791288567, "step": 6680}, {"loss": 0.5612, "grad_norm": 1.2772048711776733, "learning_rate": 0.0002, "epoch": 4.856624319419238, "step": 6690}, {"loss": 0.5346, "grad_norm": 2.6652262210845947, "learning_rate": 0.0002, "epoch": 4.863883847549909, "step": 6700}, {"loss": 0.5428, "grad_norm": 1.215828537940979, "learning_rate": 0.0002, "epoch": 4.871143375680581, "step": 6710}, {"loss": 0.6571, "grad_norm": 1.3704510927200317, "learning_rate": 0.0002, "epoch": 4.878402903811252, "step": 6720}, {"loss": 0.4963, "grad_norm": 0.7781757116317749, "learning_rate": 0.0002, "epoch": 4.885662431941924, "step": 6730}, {"loss": 0.5989, "grad_norm": 1.1883646249771118, "learning_rate": 0.0002, "epoch": 4.892921960072595, "step": 6740}, {"loss": 0.6067, "grad_norm": 0.9216066002845764, "learning_rate": 0.0002, "epoch": 4.900181488203267, "step": 6750}, {"loss": 0.5085, "grad_norm": 1.0558464527130127, "learning_rate": 0.0002, "epoch": 4.907441016333938, "step": 6760}, {"loss": 0.5216, "grad_norm": 1.032656192779541, "learning_rate": 0.0002, "epoch": 4.91470054446461, "step": 6770}, {"loss": 0.5426, "grad_norm": 1.1261441707611084, "learning_rate": 0.0002, "epoch": 4.921960072595281, "step": 6780}, {"loss": 0.5295, "grad_norm": 1.2178640365600586, "learning_rate": 0.0002, "epoch": 4.929219600725952, "step": 6790}, {"loss": 0.5476, "grad_norm": 1.5369361639022827, "learning_rate": 0.0002, "epoch": 4.936479128856624, "step": 6800}, {"loss": 0.5358, "grad_norm": 1.1188377141952515, "learning_rate": 0.0002, "epoch": 4.943738656987296, "step": 6810}, {"loss": 0.5483, "grad_norm": 1.2506113052368164, "learning_rate": 0.0002, "epoch": 4.950998185117967, "step": 6820}, {"loss": 0.567, "grad_norm": 0.8776047825813293, "learning_rate": 0.0002, "epoch": 4.9582577132486385, "step": 6830}, {"loss": 0.5764, "grad_norm": 0.9700555205345154, "learning_rate": 0.0002, "epoch": 4.9655172413793105, "step": 6840}, {"loss": 0.5396, "grad_norm": 1.2713534832000732, "learning_rate": 0.0002, "epoch": 4.972776769509982, "step": 6850}, {"loss": 0.5451, "grad_norm": 0.9855955243110657, "learning_rate": 0.0002, "epoch": 4.980036297640654, "step": 6860}, {"loss": 0.5884, "grad_norm": 0.8734853863716125, "learning_rate": 0.0002, "epoch": 4.987295825771325, "step": 6870}, {"loss": 0.5189, "grad_norm": 0.8065403699874878, "learning_rate": 0.0002, "epoch": 4.994555353901997, "step": 6880}, {"eval_loss": 1.3302682638168335, "eval_runtime": 46.2496, "eval_samples_per_second": 9.427, "eval_steps_per_second": 1.189, "epoch": 4.999637023593467, "step": 6887}, {"loss": 0.4889, "grad_norm": 0.5163813829421997, "learning_rate": 0.0002, "epoch": 5.001814882032668, "step": 6890}, {"loss": 0.3545, "grad_norm": 1.1496137380599976, "learning_rate": 0.0002, "epoch": 5.00907441016334, "step": 6900}, {"loss": 0.39, "grad_norm": 1.0133885145187378, "learning_rate": 0.0002, "epoch": 5.016333938294011, "step": 6910}, {"loss": 0.3693, "grad_norm": 0.9479621052742004, "learning_rate": 0.0002, "epoch": 5.023593466424682, "step": 6920}, {"loss": 0.4012, "grad_norm": 0.8587583303451538, "learning_rate": 0.0002, "epoch": 5.030852994555354, "step": 6930}, {"loss": 0.3428, "grad_norm": 1.3314697742462158, "learning_rate": 0.0002, "epoch": 5.038112522686025, "step": 6940}, {"loss": 0.3909, "grad_norm": 1.195448875427246, "learning_rate": 0.0002, "epoch": 5.045372050816697, "step": 6950}, {"loss": 0.3322, "grad_norm": 1.2482256889343262, "learning_rate": 0.0002, "epoch": 5.052631578947368, "step": 6960}, {"loss": 0.3893, "grad_norm": 1.2011528015136719, "learning_rate": 0.0002, "epoch": 5.05989110707804, "step": 6970}, {"loss": 0.3265, "grad_norm": 1.3997188806533813, "learning_rate": 0.0002, "epoch": 5.067150635208711, "step": 6980}, {"loss": 0.3716, "grad_norm": 1.2147513628005981, "learning_rate": 0.0002, "epoch": 5.074410163339383, "step": 6990}, {"loss": 0.4053, "grad_norm": 1.6030137538909912, "learning_rate": 0.0002, "epoch": 5.081669691470054, "step": 7000}, {"loss": 0.3665, "grad_norm": 0.9466970562934875, "learning_rate": 0.0002, "epoch": 5.088929219600726, "step": 7010}, {"loss": 0.3451, "grad_norm": 1.4593411684036255, "learning_rate": 0.0002, "epoch": 5.096188747731397, "step": 7020}, {"loss": 0.3843, "grad_norm": 1.2196033000946045, "learning_rate": 0.0002, "epoch": 5.103448275862069, "step": 7030}, {"loss": 0.3896, "grad_norm": 1.1341328620910645, "learning_rate": 0.0002, "epoch": 5.1107078039927405, "step": 7040}, {"loss": 0.3627, "grad_norm": 1.2248976230621338, "learning_rate": 0.0002, "epoch": 5.117967332123412, "step": 7050}, {"loss": 0.3784, "grad_norm": 1.1620593070983887, "learning_rate": 0.0002, "epoch": 5.125226860254084, "step": 7060}, {"loss": 0.3678, "grad_norm": 0.9300723671913147, "learning_rate": 0.0002, "epoch": 5.132486388384755, "step": 7070}, {"loss": 0.3756, "grad_norm": 1.2265169620513916, "learning_rate": 0.0002, "epoch": 5.139745916515427, "step": 7080}, {"loss": 0.3595, "grad_norm": 1.4430373907089233, "learning_rate": 0.0002, "epoch": 5.147005444646098, "step": 7090}, {"loss": 0.3788, "grad_norm": 1.0821576118469238, "learning_rate": 0.0002, "epoch": 5.15426497277677, "step": 7100}, {"loss": 0.383, "grad_norm": 1.2574739456176758, "learning_rate": 0.0002, "epoch": 5.161524500907441, "step": 7110}, {"loss": 0.3692, "grad_norm": 1.1806069612503052, "learning_rate": 0.0002, "epoch": 5.168784029038113, "step": 7120}, {"loss": 0.3978, "grad_norm": 0.9900956153869629, "learning_rate": 0.0002, "epoch": 5.176043557168784, "step": 7130}, {"loss": 0.4358, "grad_norm": 1.2414425611495972, "learning_rate": 0.0002, "epoch": 5.183303085299456, "step": 7140}, {"loss": 0.3485, "grad_norm": 0.8220699429512024, "learning_rate": 0.0002, "epoch": 5.190562613430127, "step": 7150}, {"loss": 0.3517, "grad_norm": 1.29408860206604, "learning_rate": 0.0002, "epoch": 5.197822141560798, "step": 7160}, {"loss": 0.3405, "grad_norm": 0.8510639071464539, "learning_rate": 0.0002, "epoch": 5.20508166969147, "step": 7170}, {"loss": 0.4233, "grad_norm": 1.3448902368545532, "learning_rate": 0.0002, "epoch": 5.212341197822141, "step": 7180}, {"loss": 0.3808, "grad_norm": 1.054451584815979, "learning_rate": 0.0002, "epoch": 5.219600725952813, "step": 7190}, {"loss": 0.368, "grad_norm": 1.3752713203430176, "learning_rate": 0.0002, "epoch": 5.226860254083484, "step": 7200}, {"loss": 0.3844, "grad_norm": 1.4848095178604126, "learning_rate": 0.0002, "epoch": 5.234119782214156, "step": 7210}, {"loss": 0.4187, "grad_norm": 1.428842544555664, "learning_rate": 0.0002, "epoch": 5.241379310344827, "step": 7220}, {"loss": 0.3778, "grad_norm": 1.1703591346740723, "learning_rate": 0.0002, "epoch": 5.248638838475499, "step": 7230}, {"loss": 0.417, "grad_norm": 1.2180451154708862, "learning_rate": 0.0002, "epoch": 5.2558983666061705, "step": 7240}, {"loss": 0.3656, "grad_norm": 1.094045877456665, "learning_rate": 0.0002, "epoch": 5.2631578947368425, "step": 7250}, {"loss": 0.4331, "grad_norm": 0.9545766115188599, "learning_rate": 0.0002, "epoch": 5.270417422867514, "step": 7260}, {"loss": 0.3642, "grad_norm": 0.8356652855873108, "learning_rate": 0.0002, "epoch": 5.277676950998185, "step": 7270}, {"loss": 0.3576, "grad_norm": 1.148160457611084, "learning_rate": 0.0002, "epoch": 5.284936479128857, "step": 7280}, {"loss": 0.4178, "grad_norm": 1.2009977102279663, "learning_rate": 0.0002, "epoch": 5.292196007259528, "step": 7290}, {"loss": 0.3977, "grad_norm": 1.3283873796463013, "learning_rate": 0.0002, "epoch": 5.2994555353902, "step": 7300}, {"loss": 0.3853, "grad_norm": 0.9850481748580933, "learning_rate": 0.0002, "epoch": 5.306715063520871, "step": 7310}, {"loss": 0.3645, "grad_norm": 1.367550015449524, "learning_rate": 0.0002, "epoch": 5.313974591651543, "step": 7320}, {"loss": 0.3898, "grad_norm": 0.8602936863899231, "learning_rate": 0.0002, "epoch": 5.321234119782214, "step": 7330}, {"loss": 0.4173, "grad_norm": 1.1130679845809937, "learning_rate": 0.0002, "epoch": 5.328493647912886, "step": 7340}, {"loss": 0.3642, "grad_norm": 1.3002253770828247, "learning_rate": 0.0002, "epoch": 5.335753176043557, "step": 7350}, {"loss": 0.4138, "grad_norm": 1.6235289573669434, "learning_rate": 0.0002, "epoch": 5.343012704174229, "step": 7360}, {"loss": 0.4779, "grad_norm": 1.156379222869873, "learning_rate": 0.0002, "epoch": 5.3502722323049, "step": 7370}, {"loss": 0.3222, "grad_norm": 1.0569308996200562, "learning_rate": 0.0002, "epoch": 5.357531760435572, "step": 7380}, {"loss": 0.3573, "grad_norm": 1.6674021482467651, "learning_rate": 0.0002, "epoch": 5.364791288566243, "step": 7390}, {"loss": 0.4325, "grad_norm": 1.2962018251419067, "learning_rate": 0.0002, "epoch": 5.372050816696914, "step": 7400}, {"loss": 0.3809, "grad_norm": 1.1904195547103882, "learning_rate": 0.0002, "epoch": 5.379310344827586, "step": 7410}, {"loss": 0.3728, "grad_norm": 1.316245675086975, "learning_rate": 0.0002, "epoch": 5.386569872958257, "step": 7420}, {"loss": 0.4096, "grad_norm": 1.127570390701294, "learning_rate": 0.0002, "epoch": 5.393829401088929, "step": 7430}, {"loss": 0.3933, "grad_norm": 1.3895777463912964, "learning_rate": 0.0002, "epoch": 5.4010889292196005, "step": 7440}, {"loss": 0.4085, "grad_norm": 1.626830816268921, "learning_rate": 0.0002, "epoch": 5.4083484573502725, "step": 7450}, {"loss": 0.4186, "grad_norm": 1.3703926801681519, "learning_rate": 0.0002, "epoch": 5.415607985480944, "step": 7460}, {"loss": 0.3517, "grad_norm": 1.3854840993881226, "learning_rate": 0.0002, "epoch": 5.422867513611616, "step": 7470}, {"loss": 0.3714, "grad_norm": 1.107065200805664, "learning_rate": 0.0002, "epoch": 5.430127041742287, "step": 7480}, {"loss": 0.3855, "grad_norm": 0.7843456268310547, "learning_rate": 0.0002, "epoch": 5.437386569872959, "step": 7490}, {"loss": 0.4159, "grad_norm": 1.6692372560501099, "learning_rate": 0.0002, "epoch": 5.44464609800363, "step": 7500}, {"loss": 0.4185, "grad_norm": 1.2583858966827393, "learning_rate": 0.0002, "epoch": 5.451905626134302, "step": 7510}, {"loss": 0.4401, "grad_norm": 1.6827000379562378, "learning_rate": 0.0002, "epoch": 5.459165154264973, "step": 7520}, {"loss": 0.397, "grad_norm": 1.6680560111999512, "learning_rate": 0.0002, "epoch": 5.466424682395644, "step": 7530}, {"loss": 0.4193, "grad_norm": 1.3696072101593018, "learning_rate": 0.0002, "epoch": 5.473684210526316, "step": 7540}, {"loss": 0.4244, "grad_norm": 1.4523496627807617, "learning_rate": 0.0002, "epoch": 5.480943738656987, "step": 7550}, {"loss": 0.3609, "grad_norm": 1.3432692289352417, "learning_rate": 0.0002, "epoch": 5.488203266787659, "step": 7560}, {"loss": 0.3675, "grad_norm": 1.363818645477295, "learning_rate": 0.0002, "epoch": 5.49546279491833, "step": 7570}, {"loss": 0.3726, "grad_norm": 1.0176721811294556, "learning_rate": 0.0002, "epoch": 5.502722323049002, "step": 7580}, {"loss": 0.3751, "grad_norm": 1.1625547409057617, "learning_rate": 0.0002, "epoch": 5.509981851179673, "step": 7590}, {"loss": 0.433, "grad_norm": 1.2480388879776, "learning_rate": 0.0002, "epoch": 5.517241379310345, "step": 7600}, {"loss": 0.4511, "grad_norm": 1.341509222984314, "learning_rate": 0.0002, "epoch": 5.524500907441016, "step": 7610}, {"loss": 0.4642, "grad_norm": 1.7048436403274536, "learning_rate": 0.0002, "epoch": 5.531760435571687, "step": 7620}, {"loss": 0.4509, "grad_norm": 1.1435480117797852, "learning_rate": 0.0002, "epoch": 5.539019963702359, "step": 7630}, {"loss": 0.4528, "grad_norm": 1.2381842136383057, "learning_rate": 0.0002, "epoch": 5.5462794918330305, "step": 7640}, {"loss": 0.4496, "grad_norm": 1.50786292552948, "learning_rate": 0.0002, "epoch": 5.5535390199637025, "step": 7650}, {"loss": 0.4242, "grad_norm": 1.2263519763946533, "learning_rate": 0.0002, "epoch": 5.560798548094374, "step": 7660}, {"loss": 0.418, "grad_norm": 1.2864696979522705, "learning_rate": 0.0002, "epoch": 5.568058076225046, "step": 7670}, {"loss": 0.3832, "grad_norm": 1.4443191289901733, "learning_rate": 0.0002, "epoch": 5.575317604355717, "step": 7680}, {"loss": 0.3964, "grad_norm": 1.3360971212387085, "learning_rate": 0.0002, "epoch": 5.582577132486389, "step": 7690}, {"loss": 0.4639, "grad_norm": 1.391828179359436, "learning_rate": 0.0002, "epoch": 5.58983666061706, "step": 7700}, {"loss": 0.4722, "grad_norm": 1.3699384927749634, "learning_rate": 0.0002, "epoch": 5.597096188747732, "step": 7710}, {"loss": 0.4302, "grad_norm": 1.3778468370437622, "learning_rate": 0.0002, "epoch": 5.604355716878403, "step": 7720}, {"loss": 0.4179, "grad_norm": 1.1009501218795776, "learning_rate": 0.0002, "epoch": 5.611615245009075, "step": 7730}, {"loss": 0.4104, "grad_norm": 1.0410021543502808, "learning_rate": 0.0002, "epoch": 5.618874773139746, "step": 7740}, {"loss": 0.4489, "grad_norm": 1.1012226343154907, "learning_rate": 0.0002, "epoch": 5.626134301270417, "step": 7750}, {"loss": 0.4544, "grad_norm": 1.3246384859085083, "learning_rate": 0.0002, "epoch": 5.633393829401089, "step": 7760}, {"loss": 0.4381, "grad_norm": 1.4301716089248657, "learning_rate": 0.0002, "epoch": 5.64065335753176, "step": 7770}, {"loss": 0.4297, "grad_norm": 1.1368978023529053, "learning_rate": 0.0002, "epoch": 5.647912885662432, "step": 7780}, {"loss": 0.4063, "grad_norm": 1.3493064641952515, "learning_rate": 0.0002, "epoch": 5.655172413793103, "step": 7790}, {"loss": 0.4562, "grad_norm": 1.3328721523284912, "learning_rate": 0.0002, "epoch": 5.662431941923775, "step": 7800}, {"loss": 0.4075, "grad_norm": 1.3235671520233154, "learning_rate": 0.0002, "epoch": 5.669691470054446, "step": 7810}, {"loss": 0.4589, "grad_norm": 1.1961841583251953, "learning_rate": 0.0002, "epoch": 5.676950998185118, "step": 7820}, {"loss": 0.4503, "grad_norm": 1.4189636707305908, "learning_rate": 0.0002, "epoch": 5.684210526315789, "step": 7830}, {"loss": 0.4452, "grad_norm": 1.3551312685012817, "learning_rate": 0.0002, "epoch": 5.691470054446461, "step": 7840}, {"loss": 0.4268, "grad_norm": 1.449987769126892, "learning_rate": 0.0002, "epoch": 5.6987295825771325, "step": 7850}, {"loss": 0.4141, "grad_norm": 1.1225156784057617, "learning_rate": 0.0002, "epoch": 5.7059891107078045, "step": 7860}, {"loss": 0.41, "grad_norm": 1.4734594821929932, "learning_rate": 0.0002, "epoch": 5.713248638838476, "step": 7870}, {"loss": 0.4013, "grad_norm": 1.3793359994888306, "learning_rate": 0.0002, "epoch": 5.720508166969147, "step": 7880}, {"loss": 0.4065, "grad_norm": 1.2431834936141968, "learning_rate": 0.0002, "epoch": 5.727767695099819, "step": 7890}, {"loss": 0.4595, "grad_norm": 1.1158313751220703, "learning_rate": 0.0002, "epoch": 5.73502722323049, "step": 7900}, {"loss": 0.4342, "grad_norm": 1.212248682975769, "learning_rate": 0.0002, "epoch": 5.742286751361162, "step": 7910}, {"loss": 0.4611, "grad_norm": 1.5259995460510254, "learning_rate": 0.0002, "epoch": 5.749546279491833, "step": 7920}, {"loss": 0.4483, "grad_norm": 1.3909121751785278, "learning_rate": 0.0002, "epoch": 5.756805807622505, "step": 7930}, {"loss": 0.4325, "grad_norm": 1.2511249780654907, "learning_rate": 0.0002, "epoch": 5.764065335753176, "step": 7940}, {"loss": 0.4048, "grad_norm": 1.2511906623840332, "learning_rate": 0.0002, "epoch": 5.771324863883848, "step": 7950}, {"loss": 0.3715, "grad_norm": 1.1489921808242798, "learning_rate": 0.0002, "epoch": 5.778584392014519, "step": 7960}, {"loss": 0.4196, "grad_norm": 1.028943419456482, "learning_rate": 0.0002, "epoch": 5.78584392014519, "step": 7970}, {"loss": 0.4334, "grad_norm": 1.0820423364639282, "learning_rate": 0.0002, "epoch": 5.793103448275862, "step": 7980}, {"loss": 0.3917, "grad_norm": 1.296520471572876, "learning_rate": 0.0002, "epoch": 5.800362976406533, "step": 7990}, {"loss": 0.4509, "grad_norm": 1.3597749471664429, "learning_rate": 0.0002, "epoch": 5.807622504537205, "step": 8000}, {"loss": 0.4535, "grad_norm": 0.8741790652275085, "learning_rate": 0.0002, "epoch": 5.814882032667876, "step": 8010}, {"loss": 0.4239, "grad_norm": 1.1471822261810303, "learning_rate": 0.0002, "epoch": 5.822141560798548, "step": 8020}, {"loss": 0.5042, "grad_norm": 1.2997334003448486, "learning_rate": 0.0002, "epoch": 5.829401088929219, "step": 8030}, {"loss": 0.4758, "grad_norm": 1.1027175188064575, "learning_rate": 0.0002, "epoch": 5.836660617059891, "step": 8040}, {"loss": 0.4192, "grad_norm": 1.2695307731628418, "learning_rate": 0.0002, "epoch": 5.8439201451905625, "step": 8050}, {"loss": 0.5173, "grad_norm": 1.5275461673736572, "learning_rate": 0.0002, "epoch": 5.8511796733212345, "step": 8060}, {"loss": 0.5012, "grad_norm": 1.3059501647949219, "learning_rate": 0.0002, "epoch": 5.8584392014519056, "step": 8070}, {"loss": 0.4425, "grad_norm": 1.57442045211792, "learning_rate": 0.0002, "epoch": 5.8656987295825775, "step": 8080}, {"loss": 0.4261, "grad_norm": 1.119564414024353, "learning_rate": 0.0002, "epoch": 5.872958257713249, "step": 8090}, {"loss": 0.465, "grad_norm": 1.6517373323440552, "learning_rate": 0.0002, "epoch": 5.88021778584392, "step": 8100}, {"loss": 0.4406, "grad_norm": 1.4093554019927979, "learning_rate": 0.0002, "epoch": 5.887477313974592, "step": 8110}, {"loss": 0.4433, "grad_norm": 1.278843641281128, "learning_rate": 0.0002, "epoch": 5.894736842105263, "step": 8120}, {"loss": 0.4007, "grad_norm": 1.2042944431304932, "learning_rate": 0.0002, "epoch": 5.901996370235935, "step": 8130}, {"loss": 0.3972, "grad_norm": 1.1788326501846313, "learning_rate": 0.0002, "epoch": 5.909255898366606, "step": 8140}, {"loss": 0.4506, "grad_norm": 1.4364569187164307, "learning_rate": 0.0002, "epoch": 5.916515426497278, "step": 8150}, {"loss": 0.4651, "grad_norm": 1.1704283952713013, "learning_rate": 0.0002, "epoch": 5.923774954627949, "step": 8160}, {"loss": 0.3972, "grad_norm": 1.040814995765686, "learning_rate": 0.0002, "epoch": 5.931034482758621, "step": 8170}, {"loss": 0.4038, "grad_norm": 1.1367416381835938, "learning_rate": 0.0002, "epoch": 5.938294010889292, "step": 8180}, {"loss": 0.4387, "grad_norm": 1.3401511907577515, "learning_rate": 0.0002, "epoch": 5.945553539019964, "step": 8190}, {"loss": 0.4396, "grad_norm": 1.1154041290283203, "learning_rate": 0.0002, "epoch": 5.952813067150635, "step": 8200}, {"loss": 0.4744, "grad_norm": 1.426089882850647, "learning_rate": 0.0002, "epoch": 5.960072595281307, "step": 8210}, {"loss": 0.4105, "grad_norm": 1.3170222043991089, "learning_rate": 0.0002, "epoch": 5.967332123411978, "step": 8220}, {"loss": 0.4137, "grad_norm": 1.1960029602050781, "learning_rate": 0.0002, "epoch": 5.974591651542649, "step": 8230}, {"loss": 0.423, "grad_norm": 1.0843931436538696, "learning_rate": 0.0002, "epoch": 5.981851179673321, "step": 8240}, {"loss": 0.459, "grad_norm": 1.050421118736267, "learning_rate": 0.0002, "epoch": 5.9891107078039925, "step": 8250}, {"loss": 0.3993, "grad_norm": 1.0183138847351074, "learning_rate": 0.0002, "epoch": 5.9963702359346644, "step": 8260}, {"eval_loss": 1.4677470922470093, "eval_runtime": 46.2504, "eval_samples_per_second": 9.427, "eval_steps_per_second": 1.189, "epoch": 6.0, "step": 8265}, {"loss": 0.3947, "grad_norm": 1.1702998876571655, "learning_rate": 0.0002, "epoch": 6.0036297640653356, "step": 8270}, {"loss": 0.2854, "grad_norm": 1.5389727354049683, "learning_rate": 0.0002, "epoch": 6.0108892921960075, "step": 8280}, {"loss": 0.2603, "grad_norm": 1.502568244934082, "learning_rate": 0.0002, "epoch": 6.018148820326679, "step": 8290}, {"loss": 0.3329, "grad_norm": 1.3846043348312378, "learning_rate": 0.0002, "epoch": 6.025408348457351, "step": 8300}, {"loss": 0.2651, "grad_norm": 1.173553228378296, "learning_rate": 0.0002, "epoch": 6.032667876588022, "step": 8310}, {"loss": 0.3142, "grad_norm": 1.5325932502746582, "learning_rate": 0.0002, "epoch": 6.039927404718694, "step": 8320}, {"loss": 0.2511, "grad_norm": 1.303783655166626, "learning_rate": 0.0002, "epoch": 6.047186932849365, "step": 8330}, {"loss": 0.2352, "grad_norm": 0.9408994913101196, "learning_rate": 0.0002, "epoch": 6.054446460980036, "step": 8340}, {"loss": 0.2548, "grad_norm": 1.5430388450622559, "learning_rate": 0.0002, "epoch": 6.061705989110708, "step": 8350}, {"loss": 0.2682, "grad_norm": 0.8765342235565186, "learning_rate": 0.0002, "epoch": 6.068965517241379, "step": 8360}, {"loss": 0.2614, "grad_norm": 1.2363157272338867, "learning_rate": 0.0002, "epoch": 6.076225045372051, "step": 8370}, {"loss": 0.294, "grad_norm": 1.21284818649292, "learning_rate": 0.0002, "epoch": 6.083484573502722, "step": 8380}, {"loss": 0.2498, "grad_norm": 1.3261712789535522, "learning_rate": 0.0002, "epoch": 6.090744101633394, "step": 8390}, {"loss": 0.2649, "grad_norm": 1.077317714691162, "learning_rate": 0.0002, "epoch": 6.098003629764065, "step": 8400}, {"loss": 0.269, "grad_norm": 0.9873808026313782, "learning_rate": 0.0002, "epoch": 6.105263157894737, "step": 8410}, {"loss": 0.2736, "grad_norm": 1.032258152961731, "learning_rate": 0.0002, "epoch": 6.112522686025408, "step": 8420}, {"loss": 0.2854, "grad_norm": 1.1014811992645264, "learning_rate": 0.0002, "epoch": 6.11978221415608, "step": 8430}, {"loss": 0.2924, "grad_norm": 1.4264203310012817, "learning_rate": 0.0002, "epoch": 6.127041742286751, "step": 8440}, {"loss": 0.3388, "grad_norm": 1.4086531400680542, "learning_rate": 0.0002, "epoch": 6.1343012704174225, "step": 8450}, {"loss": 0.2786, "grad_norm": 1.3842453956604004, "learning_rate": 0.0002, "epoch": 6.1415607985480944, "step": 8460}, {"loss": 0.3201, "grad_norm": 1.4356757402420044, "learning_rate": 0.0002, "epoch": 6.1488203266787655, "step": 8470}, {"loss": 0.2908, "grad_norm": 1.193315029144287, "learning_rate": 0.0002, "epoch": 6.1560798548094375, "step": 8480}, {"loss": 0.342, "grad_norm": 1.0623924732208252, "learning_rate": 0.0002, "epoch": 6.163339382940109, "step": 8490}, {"loss": 0.3257, "grad_norm": 1.5484434366226196, "learning_rate": 0.0002, "epoch": 6.170598911070781, "step": 8500}, {"loss": 0.2861, "grad_norm": 1.3520029783248901, "learning_rate": 0.0002, "epoch": 6.177858439201452, "step": 8510}, {"loss": 0.3242, "grad_norm": 1.2773103713989258, "learning_rate": 0.0002, "epoch": 6.185117967332124, "step": 8520}, {"loss": 0.3108, "grad_norm": 1.4675105810165405, "learning_rate": 0.0002, "epoch": 6.192377495462795, "step": 8530}, {"loss": 0.3044, "grad_norm": 1.2118732929229736, "learning_rate": 0.0002, "epoch": 6.199637023593467, "step": 8540}, {"loss": 0.2726, "grad_norm": 1.264024257659912, "learning_rate": 0.0002, "epoch": 6.206896551724138, "step": 8550}, {"loss": 0.306, "grad_norm": 1.406931757926941, "learning_rate": 0.0002, "epoch": 6.21415607985481, "step": 8560}, {"loss": 0.2904, "grad_norm": 1.385459542274475, "learning_rate": 0.0002, "epoch": 6.221415607985481, "step": 8570}, {"loss": 0.3413, "grad_norm": 1.9336168766021729, "learning_rate": 0.0002, "epoch": 6.228675136116152, "step": 8580}, {"loss": 0.2769, "grad_norm": 0.9880136847496033, "learning_rate": 0.0002, "epoch": 6.235934664246824, "step": 8590}, {"loss": 0.3035, "grad_norm": 1.3870339393615723, "learning_rate": 0.0002, "epoch": 6.243194192377495, "step": 8600}, {"loss": 0.286, "grad_norm": 1.2303647994995117, "learning_rate": 0.0002, "epoch": 6.250453720508167, "step": 8610}, {"loss": 0.3113, "grad_norm": 1.5406211614608765, "learning_rate": 0.0002, "epoch": 6.257713248638838, "step": 8620}, {"loss": 0.292, "grad_norm": 1.2436790466308594, "learning_rate": 0.0002, "epoch": 6.26497277676951, "step": 8630}, {"loss": 0.3102, "grad_norm": 0.8844212293624878, "learning_rate": 0.0002, "epoch": 6.272232304900181, "step": 8640}, {"loss": 0.3373, "grad_norm": 1.2846336364746094, "learning_rate": 0.0002, "epoch": 6.279491833030853, "step": 8650}, {"loss": 0.3535, "grad_norm": 1.593814730644226, "learning_rate": 0.0002, "epoch": 6.286751361161524, "step": 8660}, {"loss": 0.3413, "grad_norm": 1.2277469635009766, "learning_rate": 0.0002, "epoch": 6.2940108892921955, "step": 8670}, {"loss": 0.2958, "grad_norm": 1.2574384212493896, "learning_rate": 0.0002, "epoch": 6.3012704174228675, "step": 8680}, {"loss": 0.3251, "grad_norm": 1.335150957107544, "learning_rate": 0.0002, "epoch": 6.308529945553539, "step": 8690}, {"loss": 0.3009, "grad_norm": 1.3140437602996826, "learning_rate": 0.0002, "epoch": 6.315789473684211, "step": 8700}, {"loss": 0.2783, "grad_norm": 1.1689209938049316, "learning_rate": 0.0002, "epoch": 6.323049001814882, "step": 8710}, {"loss": 0.3476, "grad_norm": 1.6448503732681274, "learning_rate": 0.0002, "epoch": 6.330308529945554, "step": 8720}, {"loss": 0.2934, "grad_norm": 0.9944145679473877, "learning_rate": 0.0002, "epoch": 6.337568058076225, "step": 8730}, {"loss": 0.3315, "grad_norm": 1.1775634288787842, "learning_rate": 0.0002, "epoch": 6.344827586206897, "step": 8740}, {"loss": 0.3514, "grad_norm": 1.8438639640808105, "learning_rate": 0.0002, "epoch": 6.352087114337568, "step": 8750}, {"loss": 0.303, "grad_norm": 1.062495470046997, "learning_rate": 0.0002, "epoch": 6.35934664246824, "step": 8760}, {"loss": 0.2737, "grad_norm": 1.3224315643310547, "learning_rate": 0.0002, "epoch": 6.366606170598911, "step": 8770}, {"loss": 0.3445, "grad_norm": 1.399844765663147, "learning_rate": 0.0002, "epoch": 6.373865698729583, "step": 8780}, {"loss": 0.3277, "grad_norm": 1.0409915447235107, "learning_rate": 0.0002, "epoch": 6.381125226860254, "step": 8790}, {"loss": 0.3218, "grad_norm": 1.5657726526260376, "learning_rate": 0.0002, "epoch": 6.388384754990925, "step": 8800}, {"loss": 0.3031, "grad_norm": 1.4098644256591797, "learning_rate": 0.0002, "epoch": 6.395644283121597, "step": 8810}, {"loss": 0.3133, "grad_norm": 1.5154732465744019, "learning_rate": 0.0002, "epoch": 6.402903811252268, "step": 8820}, {"loss": 0.3111, "grad_norm": 1.1139698028564453, "learning_rate": 0.0002, "epoch": 6.41016333938294, "step": 8830}, {"loss": 0.3553, "grad_norm": 1.4149729013442993, "learning_rate": 0.0002, "epoch": 6.417422867513611, "step": 8840}, {"loss": 0.287, "grad_norm": 1.2632299661636353, "learning_rate": 0.0002, "epoch": 6.424682395644283, "step": 8850}, {"loss": 0.3198, "grad_norm": 1.6636109352111816, "learning_rate": 0.0002, "epoch": 6.431941923774954, "step": 8860}, {"loss": 0.3749, "grad_norm": 1.4149386882781982, "learning_rate": 0.0002, "epoch": 6.439201451905626, "step": 8870}, {"loss": 0.3504, "grad_norm": 1.1396206617355347, "learning_rate": 0.0002, "epoch": 6.4464609800362975, "step": 8880}, {"loss": 0.3328, "grad_norm": 1.2188775539398193, "learning_rate": 0.0002, "epoch": 6.4537205081669695, "step": 8890}, {"loss": 0.3427, "grad_norm": 0.9740369319915771, "learning_rate": 0.0002, "epoch": 6.460980036297641, "step": 8900}, {"loss": 0.3223, "grad_norm": 1.228569746017456, "learning_rate": 0.0002, "epoch": 6.468239564428313, "step": 8910}, {"loss": 0.3151, "grad_norm": 1.5019789934158325, "learning_rate": 0.0002, "epoch": 6.475499092558984, "step": 8920}, {"loss": 0.2916, "grad_norm": 1.3320101499557495, "learning_rate": 0.0002, "epoch": 6.482758620689655, "step": 8930}, {"loss": 0.298, "grad_norm": 1.5551502704620361, "learning_rate": 0.0002, "epoch": 6.490018148820327, "step": 8940}, {"loss": 0.3238, "grad_norm": 1.470131754875183, "learning_rate": 0.0002, "epoch": 6.497277676950998, "step": 8950}, {"loss": 0.2808, "grad_norm": 1.1803025007247925, "learning_rate": 0.0002, "epoch": 6.50453720508167, "step": 8960}, {"loss": 0.3025, "grad_norm": 1.3505640029907227, "learning_rate": 0.0002, "epoch": 6.511796733212341, "step": 8970}, {"loss": 0.3124, "grad_norm": 1.13093900680542, "learning_rate": 0.0002, "epoch": 6.519056261343013, "step": 8980}, {"loss": 0.3454, "grad_norm": 1.347386121749878, "learning_rate": 0.0002, "epoch": 6.526315789473684, "step": 8990}, {"loss": 0.3532, "grad_norm": 1.7879165410995483, "learning_rate": 0.0002, "epoch": 6.533575317604356, "step": 9000}, {"loss": 0.3382, "grad_norm": 1.2168169021606445, "learning_rate": 0.0002, "epoch": 6.540834845735027, "step": 9010}, {"loss": 0.3413, "grad_norm": 1.1758877038955688, "learning_rate": 0.0002, "epoch": 6.548094373865698, "step": 9020}, {"loss": 0.2806, "grad_norm": 1.7366445064544678, "learning_rate": 0.0002, "epoch": 6.55535390199637, "step": 9030}, {"loss": 0.3437, "grad_norm": 1.5919222831726074, "learning_rate": 0.0002, "epoch": 6.562613430127042, "step": 9040}, {"loss": 0.3261, "grad_norm": 1.336863398551941, "learning_rate": 0.0002, "epoch": 6.569872958257713, "step": 9050}, {"loss": 0.3103, "grad_norm": 1.1769421100616455, "learning_rate": 0.0002, "epoch": 6.577132486388384, "step": 9060}, {"loss": 0.3295, "grad_norm": 1.0048751831054688, "learning_rate": 0.0002, "epoch": 6.584392014519056, "step": 9070}, {"loss": 0.3156, "grad_norm": 1.5268515348434448, "learning_rate": 0.0002, "epoch": 6.5916515426497275, "step": 9080}, {"loss": 0.3752, "grad_norm": 1.434610366821289, "learning_rate": 0.0002, "epoch": 6.5989110707803995, "step": 9090}, {"loss": 0.3375, "grad_norm": 1.1151410341262817, "learning_rate": 0.0002, "epoch": 6.606170598911071, "step": 9100}, {"loss": 0.363, "grad_norm": 1.6690642833709717, "learning_rate": 0.0002, "epoch": 6.613430127041743, "step": 9110}, {"loss": 0.3703, "grad_norm": 1.4495552778244019, "learning_rate": 0.0002, "epoch": 6.620689655172414, "step": 9120}, {"loss": 0.3648, "grad_norm": 1.377621054649353, "learning_rate": 0.0002, "epoch": 6.627949183303086, "step": 9130}, {"loss": 0.3766, "grad_norm": 1.5459434986114502, "learning_rate": 0.0002, "epoch": 6.635208711433757, "step": 9140}, {"loss": 0.3196, "grad_norm": 1.0920850038528442, "learning_rate": 0.0002, "epoch": 6.642468239564428, "step": 9150}, {"loss": 0.3505, "grad_norm": 1.6708381175994873, "learning_rate": 0.0002, "epoch": 6.6497277676951, "step": 9160}, {"loss": 0.368, "grad_norm": 1.747514009475708, "learning_rate": 0.0002, "epoch": 6.656987295825771, "step": 9170}, {"loss": 0.3099, "grad_norm": 1.133466362953186, "learning_rate": 0.0002, "epoch": 6.664246823956443, "step": 9180}, {"loss": 0.3175, "grad_norm": 1.394358515739441, "learning_rate": 0.0002, "epoch": 6.671506352087114, "step": 9190}, {"loss": 0.2981, "grad_norm": 0.9258374571800232, "learning_rate": 0.0002, "epoch": 6.678765880217786, "step": 9200}, {"loss": 0.3723, "grad_norm": 1.3750739097595215, "learning_rate": 0.0002, "epoch": 6.686025408348457, "step": 9210}, {"loss": 0.3441, "grad_norm": 0.8604967594146729, "learning_rate": 0.0002, "epoch": 6.693284936479129, "step": 9220}, {"loss": 0.3775, "grad_norm": 1.6074559688568115, "learning_rate": 0.0002, "epoch": 6.7005444646098, "step": 9230}, {"loss": 0.3139, "grad_norm": 0.9576877355575562, "learning_rate": 0.0002, "epoch": 6.707803992740472, "step": 9240}, {"loss": 0.3633, "grad_norm": 1.7193048000335693, "learning_rate": 0.0002, "epoch": 6.715063520871143, "step": 9250}, {"loss": 0.3139, "grad_norm": 1.3131844997406006, "learning_rate": 0.0002, "epoch": 6.722323049001815, "step": 9260}, {"loss": 0.3121, "grad_norm": 1.2978184223175049, "learning_rate": 0.0002, "epoch": 6.729582577132486, "step": 9270}, {"loss": 0.3534, "grad_norm": 1.4792617559432983, "learning_rate": 0.0002, "epoch": 6.7368421052631575, "step": 9280}, {"loss": 0.3429, "grad_norm": 1.1265567541122437, "learning_rate": 0.0002, "epoch": 6.7441016333938295, "step": 9290}, {"loss": 0.3526, "grad_norm": 1.8553377389907837, "learning_rate": 0.0002, "epoch": 6.751361161524501, "step": 9300}, {"loss": 0.3666, "grad_norm": 1.3602519035339355, "learning_rate": 0.0002, "epoch": 6.758620689655173, "step": 9310}, {"loss": 0.2922, "grad_norm": 1.2874794006347656, "learning_rate": 0.0002, "epoch": 6.765880217785844, "step": 9320}, {"loss": 0.3816, "grad_norm": 1.4834712743759155, "learning_rate": 0.0002, "epoch": 6.773139745916516, "step": 9330}, {"loss": 0.3557, "grad_norm": 2.0824034214019775, "learning_rate": 0.0002, "epoch": 6.780399274047187, "step": 9340}, {"loss": 0.3174, "grad_norm": 1.2267698049545288, "learning_rate": 0.0002, "epoch": 6.787658802177859, "step": 9350}, {"loss": 0.3665, "grad_norm": 1.4485498666763306, "learning_rate": 0.0002, "epoch": 6.79491833030853, "step": 9360}, {"loss": 0.3676, "grad_norm": 1.3199396133422852, "learning_rate": 0.0002, "epoch": 6.802177858439201, "step": 9370}, {"loss": 0.298, "grad_norm": 1.2552456855773926, "learning_rate": 0.0002, "epoch": 6.809437386569873, "step": 9380}, {"loss": 0.3152, "grad_norm": 1.3895127773284912, "learning_rate": 0.0002, "epoch": 6.816696914700545, "step": 9390}, {"loss": 0.3375, "grad_norm": 1.7637823820114136, "learning_rate": 0.0002, "epoch": 6.823956442831216, "step": 9400}, {"loss": 0.3234, "grad_norm": 1.6004475355148315, "learning_rate": 0.0002, "epoch": 6.831215970961887, "step": 9410}, {"loss": 0.3364, "grad_norm": 1.4133695363998413, "learning_rate": 0.0002, "epoch": 6.838475499092559, "step": 9420}, {"loss": 0.3656, "grad_norm": 1.1583502292633057, "learning_rate": 0.0002, "epoch": 6.84573502722323, "step": 9430}, {"loss": 0.3499, "grad_norm": 1.3769075870513916, "learning_rate": 0.0002, "epoch": 6.852994555353902, "step": 9440}, {"loss": 0.3333, "grad_norm": 1.1831218004226685, "learning_rate": 0.0002, "epoch": 6.860254083484573, "step": 9450}, {"loss": 0.3501, "grad_norm": 1.6092621088027954, "learning_rate": 0.0002, "epoch": 6.867513611615245, "step": 9460}, {"loss": 0.3933, "grad_norm": 1.3850210905075073, "learning_rate": 0.0002, "epoch": 6.874773139745916, "step": 9470}, {"loss": 0.3868, "grad_norm": 1.4119619131088257, "learning_rate": 0.0002, "epoch": 6.882032667876588, "step": 9480}, {"loss": 0.3939, "grad_norm": 1.3494242429733276, "learning_rate": 0.0002, "epoch": 6.8892921960072595, "step": 9490}, {"loss": 0.3217, "grad_norm": 1.3130041360855103, "learning_rate": 0.0002, "epoch": 6.896551724137931, "step": 9500}, {"loss": 0.3738, "grad_norm": 1.169256329536438, "learning_rate": 0.0002, "epoch": 6.903811252268603, "step": 9510}, {"loss": 0.408, "grad_norm": 1.7475035190582275, "learning_rate": 0.0002, "epoch": 6.911070780399274, "step": 9520}, {"loss": 0.3407, "grad_norm": 1.440434217453003, "learning_rate": 0.0002, "epoch": 6.918330308529946, "step": 9530}, {"loss": 0.3707, "grad_norm": 1.6768704652786255, "learning_rate": 0.0002, "epoch": 6.925589836660617, "step": 9540}, {"loss": 0.3283, "grad_norm": 1.3720577955245972, "learning_rate": 0.0002, "epoch": 6.932849364791289, "step": 9550}, {"loss": 0.3257, "grad_norm": 1.8140523433685303, "learning_rate": 0.0002, "epoch": 6.94010889292196, "step": 9560}, {"loss": 0.3308, "grad_norm": 1.1828241348266602, "learning_rate": 0.0002, "epoch": 6.947368421052632, "step": 9570}, {"loss": 0.3536, "grad_norm": 1.2755135297775269, "learning_rate": 0.0002, "epoch": 6.954627949183303, "step": 9580}, {"loss": 0.3711, "grad_norm": 1.622009038925171, "learning_rate": 0.0002, "epoch": 6.961887477313975, "step": 9590}, {"loss": 0.3529, "grad_norm": 1.1543664932250977, "learning_rate": 0.0002, "epoch": 6.969147005444646, "step": 9600}, {"loss": 0.416, "grad_norm": 1.6755319833755493, "learning_rate": 0.0002, "epoch": 6.976406533575318, "step": 9610}, {"loss": 0.339, "grad_norm": 1.3726437091827393, "learning_rate": 0.0002, "epoch": 6.983666061705989, "step": 9620}, {"loss": 0.3709, "grad_norm": 1.1605958938598633, "learning_rate": 0.0002, "epoch": 6.99092558983666, "step": 9630}, {"loss": 0.371, "grad_norm": 1.5371781587600708, "learning_rate": 0.0002, "epoch": 6.998185117967332, "step": 9640}, {"eval_loss": 1.6280181407928467, "eval_runtime": 46.1964, "eval_samples_per_second": 9.438, "eval_steps_per_second": 1.191, "epoch": 6.999637023593467, "step": 9642}, {"loss": 0.2449, "grad_norm": 1.1645569801330566, "learning_rate": 0.0002, "epoch": 7.005444646098003, "step": 9650}, {"loss": 0.251, "grad_norm": 0.7663792967796326, "learning_rate": 0.0002, "epoch": 7.012704174228675, "step": 9660}, {"loss": 0.2553, "grad_norm": 1.5808782577514648, "learning_rate": 0.0002, "epoch": 7.019963702359346, "step": 9670}, {"loss": 0.2349, "grad_norm": 1.046607255935669, "learning_rate": 0.0002, "epoch": 7.027223230490018, "step": 9680}, {"loss": 0.2078, "grad_norm": 1.2008668184280396, "learning_rate": 0.0002, "epoch": 7.0344827586206895, "step": 9690}, {"loss": 0.2519, "grad_norm": 1.9596126079559326, "learning_rate": 0.0002, "epoch": 7.0417422867513615, "step": 9700}, {"loss": 0.2275, "grad_norm": 1.0400182008743286, "learning_rate": 0.0002, "epoch": 7.049001814882033, "step": 9710}, {"loss": 0.2136, "grad_norm": 1.3162504434585571, "learning_rate": 0.0002, "epoch": 7.056261343012705, "step": 9720}, {"loss": 0.2227, "grad_norm": 1.180074691772461, "learning_rate": 0.0002, "epoch": 7.063520871143376, "step": 9730}, {"loss": 0.2206, "grad_norm": 1.2093719244003296, "learning_rate": 0.0002, "epoch": 7.070780399274047, "step": 9740}, {"loss": 0.2387, "grad_norm": 1.4838900566101074, "learning_rate": 0.0002, "epoch": 7.078039927404719, "step": 9750}, {"loss": 0.2391, "grad_norm": 1.2319235801696777, "learning_rate": 0.0002, "epoch": 7.08529945553539, "step": 9760}, {"loss": 0.2624, "grad_norm": 1.2346558570861816, "learning_rate": 0.0002, "epoch": 7.092558983666062, "step": 9770}, {"loss": 0.2321, "grad_norm": 1.0748975276947021, "learning_rate": 0.0002, "epoch": 7.099818511796733, "step": 9780}, {"loss": 0.2751, "grad_norm": 1.0162630081176758, "learning_rate": 0.0002, "epoch": 7.107078039927405, "step": 9790}, {"loss": 0.2142, "grad_norm": 1.0014166831970215, "learning_rate": 0.0002, "epoch": 7.114337568058076, "step": 9800}, {"loss": 0.2439, "grad_norm": 1.0928411483764648, "learning_rate": 0.0002, "epoch": 7.121597096188748, "step": 9810}, {"loss": 0.2261, "grad_norm": 1.181496500968933, "learning_rate": 0.0002, "epoch": 7.128856624319419, "step": 9820}, {"loss": 0.2345, "grad_norm": 1.5846176147460938, "learning_rate": 0.0002, "epoch": 7.136116152450091, "step": 9830}, {"loss": 0.2282, "grad_norm": 0.8734912872314453, "learning_rate": 0.0002, "epoch": 7.143375680580762, "step": 9840}, {"loss": 0.2127, "grad_norm": 1.1599528789520264, "learning_rate": 0.0002, "epoch": 7.150635208711433, "step": 9850}, {"loss": 0.2614, "grad_norm": 1.168256402015686, "learning_rate": 0.0002, "epoch": 7.157894736842105, "step": 9860}, {"loss": 0.2418, "grad_norm": 1.4439860582351685, "learning_rate": 0.0002, "epoch": 7.165154264972776, "step": 9870}, {"loss": 0.2183, "grad_norm": 1.3615007400512695, "learning_rate": 0.0002, "epoch": 7.172413793103448, "step": 9880}, {"loss": 0.2366, "grad_norm": 1.1908115148544312, "learning_rate": 0.0002, "epoch": 7.1796733212341195, "step": 9890}, {"loss": 0.2338, "grad_norm": 1.452515959739685, "learning_rate": 0.0002, "epoch": 7.1869328493647915, "step": 9900}, {"loss": 0.2203, "grad_norm": 0.8387667536735535, "learning_rate": 0.0002, "epoch": 7.194192377495463, "step": 9910}, {"loss": 0.2117, "grad_norm": 1.3990435600280762, "learning_rate": 0.0002, "epoch": 7.201451905626135, "step": 9920}, {"loss": 0.2188, "grad_norm": 1.057800531387329, "learning_rate": 0.0002, "epoch": 7.208711433756806, "step": 9930}, {"loss": 0.2516, "grad_norm": 1.3718253374099731, "learning_rate": 0.0002, "epoch": 7.215970961887478, "step": 9940}, {"loss": 0.2084, "grad_norm": 1.2011432647705078, "learning_rate": 0.0002, "epoch": 7.223230490018149, "step": 9950}, {"loss": 0.2063, "grad_norm": 1.1608737707138062, "learning_rate": 0.0002, "epoch": 7.230490018148821, "step": 9960}, {"loss": 0.2275, "grad_norm": 1.7522791624069214, "learning_rate": 0.0002, "epoch": 7.237749546279492, "step": 9970}, {"loss": 0.2358, "grad_norm": 1.0787912607192993, "learning_rate": 0.0002, "epoch": 7.245009074410163, "step": 9980}, {"loss": 0.2361, "grad_norm": 1.8227689266204834, "learning_rate": 0.0002, "epoch": 7.252268602540835, "step": 9990}, {"loss": 0.2217, "grad_norm": 1.1438913345336914, "learning_rate": 0.0002, "epoch": 7.259528130671506, "step": 10000}, {"loss": 0.2438, "grad_norm": 1.331770420074463, "learning_rate": 0.0002, "epoch": 7.266787658802178, "step": 10010}, {"loss": 0.2622, "grad_norm": 1.2809056043624878, "learning_rate": 0.0002, "epoch": 7.274047186932849, "step": 10020}, {"loss": 0.2634, "grad_norm": 1.2245303392410278, "learning_rate": 0.0002, "epoch": 7.281306715063521, "step": 10030}, {"loss": 0.2706, "grad_norm": 1.2359435558319092, "learning_rate": 0.0002, "epoch": 7.288566243194192, "step": 10040}, {"loss": 0.2427, "grad_norm": 1.3707170486450195, "learning_rate": 0.0002, "epoch": 7.295825771324864, "step": 10050}, {"loss": 0.2497, "grad_norm": 1.7405836582183838, "learning_rate": 0.0002, "epoch": 7.303085299455535, "step": 10060}, {"loss": 0.2628, "grad_norm": 1.446069359779358, "learning_rate": 0.0002, "epoch": 7.310344827586207, "step": 10070}, {"loss": 0.2457, "grad_norm": 1.48823082447052, "learning_rate": 0.0002, "epoch": 7.317604355716878, "step": 10080}, {"loss": 0.2391, "grad_norm": 1.1720311641693115, "learning_rate": 0.0002, "epoch": 7.3248638838475495, "step": 10090}, {"loss": 0.2701, "grad_norm": 1.5485225915908813, "learning_rate": 0.0002, "epoch": 7.3321234119782215, "step": 10100}, {"loss": 0.2855, "grad_norm": 1.6018894910812378, "learning_rate": 0.0002, "epoch": 7.339382940108893, "step": 10110}, {"loss": 0.2662, "grad_norm": 1.4753694534301758, "learning_rate": 0.0002, "epoch": 7.346642468239565, "step": 10120}, {"loss": 0.2323, "grad_norm": 1.3604710102081299, "learning_rate": 0.0002, "epoch": 7.353901996370236, "step": 10130}, {"loss": 0.2664, "grad_norm": 1.5755873918533325, "learning_rate": 0.0002, "epoch": 7.361161524500908, "step": 10140}, {"loss": 0.215, "grad_norm": 0.9421692490577698, "learning_rate": 0.0002, "epoch": 7.368421052631579, "step": 10150}, {"loss": 0.297, "grad_norm": 1.3055956363677979, "learning_rate": 0.0002, "epoch": 7.375680580762251, "step": 10160}, {"loss": 0.2385, "grad_norm": 1.4764302968978882, "learning_rate": 0.0002, "epoch": 7.382940108892922, "step": 10170}, {"loss": 0.2724, "grad_norm": 1.3726946115493774, "learning_rate": 0.0002, "epoch": 7.390199637023594, "step": 10180}, {"loss": 0.2599, "grad_norm": 1.446473240852356, "learning_rate": 0.0002, "epoch": 7.397459165154265, "step": 10190}, {"loss": 0.2837, "grad_norm": 1.489094614982605, "learning_rate": 0.0002, "epoch": 7.404718693284936, "step": 10200}, {"loss": 0.2786, "grad_norm": 1.247572898864746, "learning_rate": 0.0002, "epoch": 7.411978221415608, "step": 10210}, {"loss": 0.2498, "grad_norm": 1.2741918563842773, "learning_rate": 0.0002, "epoch": 7.419237749546279, "step": 10220}, {"loss": 0.2649, "grad_norm": 1.0347636938095093, "learning_rate": 0.0002, "epoch": 7.426497277676951, "step": 10230}, {"loss": 0.2419, "grad_norm": 1.3295499086380005, "learning_rate": 0.0002, "epoch": 7.433756805807622, "step": 10240}, {"loss": 0.226, "grad_norm": 1.6056840419769287, "learning_rate": 0.0002, "epoch": 7.441016333938294, "step": 10250}, {"loss": 0.2571, "grad_norm": 1.4824398756027222, "learning_rate": 0.0002, "epoch": 7.448275862068965, "step": 10260}, {"loss": 0.2939, "grad_norm": 1.6259359121322632, "learning_rate": 0.0002, "epoch": 7.455535390199637, "step": 10270}, {"loss": 0.2873, "grad_norm": 1.5065499544143677, "learning_rate": 0.0002, "epoch": 7.462794918330308, "step": 10280}, {"loss": 0.2614, "grad_norm": 1.3505364656448364, "learning_rate": 0.0002, "epoch": 7.47005444646098, "step": 10290}, {"loss": 0.2862, "grad_norm": 1.4457359313964844, "learning_rate": 0.0002, "epoch": 7.4773139745916515, "step": 10300}, {"loss": 0.233, "grad_norm": 1.0782662630081177, "learning_rate": 0.0002, "epoch": 7.4845735027223235, "step": 10310}, {"loss": 0.2506, "grad_norm": 1.4209016561508179, "learning_rate": 0.0002, "epoch": 7.491833030852995, "step": 10320}, {"loss": 0.2741, "grad_norm": 1.4511336088180542, "learning_rate": 0.0002, "epoch": 7.499092558983666, "step": 10330}, {"loss": 0.2736, "grad_norm": 1.306691288948059, "learning_rate": 0.0002, "epoch": 7.506352087114338, "step": 10340}, {"loss": 0.2528, "grad_norm": 1.0647870302200317, "learning_rate": 0.0002, "epoch": 7.513611615245009, "step": 10350}, {"loss": 0.2402, "grad_norm": 1.0374330282211304, "learning_rate": 0.0002, "epoch": 7.520871143375681, "step": 10360}, {"loss": 0.3057, "grad_norm": 0.8428803086280823, "learning_rate": 0.0002, "epoch": 7.528130671506352, "step": 10370}, {"loss": 0.2446, "grad_norm": 1.3868707418441772, "learning_rate": 0.0002, "epoch": 7.535390199637024, "step": 10380}, {"loss": 0.2851, "grad_norm": 1.4324088096618652, "learning_rate": 0.0002, "epoch": 7.542649727767695, "step": 10390}, {"loss": 0.292, "grad_norm": 1.6413776874542236, "learning_rate": 0.0002, "epoch": 7.549909255898367, "step": 10400}, {"loss": 0.2585, "grad_norm": 1.4302188158035278, "learning_rate": 0.0002, "epoch": 7.557168784029038, "step": 10410}, {"loss": 0.306, "grad_norm": 1.3648983240127563, "learning_rate": 0.0002, "epoch": 7.564428312159709, "step": 10420}, {"loss": 0.2927, "grad_norm": 1.4480061531066895, "learning_rate": 0.0002, "epoch": 7.571687840290381, "step": 10430}, {"loss": 0.2711, "grad_norm": 1.0944541692733765, "learning_rate": 0.0002, "epoch": 7.578947368421053, "step": 10440}, {"loss": 0.2698, "grad_norm": 1.4632091522216797, "learning_rate": 0.0002, "epoch": 7.586206896551724, "step": 10450}, {"loss": 0.2994, "grad_norm": 1.562364935874939, "learning_rate": 0.0002, "epoch": 7.593466424682395, "step": 10460}, {"loss": 0.271, "grad_norm": 1.2138582468032837, "learning_rate": 0.0002, "epoch": 7.600725952813067, "step": 10470}, {"loss": 0.2638, "grad_norm": 1.467578411102295, "learning_rate": 0.0002, "epoch": 7.607985480943738, "step": 10480}, {"loss": 0.2949, "grad_norm": 1.3470213413238525, "learning_rate": 0.0002, "epoch": 7.61524500907441, "step": 10490}, {"loss": 0.2518, "grad_norm": 1.5385268926620483, "learning_rate": 0.0002, "epoch": 7.6225045372050815, "step": 10500}, {"loss": 0.2527, "grad_norm": 1.1245018243789673, "learning_rate": 0.0002, "epoch": 7.6297640653357535, "step": 10510}, {"loss": 0.267, "grad_norm": 1.3161317110061646, "learning_rate": 0.0002, "epoch": 7.637023593466425, "step": 10520}, {"loss": 0.2803, "grad_norm": 1.0402427911758423, "learning_rate": 0.0002, "epoch": 7.6442831215970966, "step": 10530}, {"loss": 0.2676, "grad_norm": 1.2699987888336182, "learning_rate": 0.0002, "epoch": 7.651542649727768, "step": 10540}, {"loss": 0.2835, "grad_norm": 1.47243332862854, "learning_rate": 0.0002, "epoch": 7.658802177858439, "step": 10550}, {"loss": 0.2624, "grad_norm": 1.1261144876480103, "learning_rate": 0.0002, "epoch": 7.666061705989111, "step": 10560}, {"loss": 0.282, "grad_norm": 1.5402237176895142, "learning_rate": 0.0002, "epoch": 7.673321234119782, "step": 10570}, {"loss": 0.2866, "grad_norm": 1.1316986083984375, "learning_rate": 0.0002, "epoch": 7.680580762250454, "step": 10580}, {"loss": 0.2593, "grad_norm": 1.2155439853668213, "learning_rate": 0.0002, "epoch": 7.687840290381125, "step": 10590}, {"loss": 0.274, "grad_norm": 1.566380500793457, "learning_rate": 0.0002, "epoch": 7.695099818511797, "step": 10600}, {"loss": 0.2664, "grad_norm": 1.7367318868637085, "learning_rate": 0.0002, "epoch": 7.702359346642468, "step": 10610}, {"loss": 0.3115, "grad_norm": 1.5213567018508911, "learning_rate": 0.0002, "epoch": 7.70961887477314, "step": 10620}, {"loss": 0.2863, "grad_norm": 1.3955585956573486, "learning_rate": 0.0002, "epoch": 7.716878402903811, "step": 10630}, {"loss": 0.2683, "grad_norm": 1.321916937828064, "learning_rate": 0.0002, "epoch": 7.724137931034483, "step": 10640}, {"loss": 0.306, "grad_norm": 1.8494919538497925, "learning_rate": 0.0002, "epoch": 7.731397459165154, "step": 10650}, {"loss": 0.2544, "grad_norm": 1.5309828519821167, "learning_rate": 0.0002, "epoch": 7.738656987295826, "step": 10660}, {"loss": 0.2693, "grad_norm": 1.3796069622039795, "learning_rate": 0.0002, "epoch": 7.745916515426497, "step": 10670}, {"loss": 0.2872, "grad_norm": 1.2416858673095703, "learning_rate": 0.0002, "epoch": 7.753176043557168, "step": 10680}, {"loss": 0.2729, "grad_norm": 1.4447332620620728, "learning_rate": 0.0002, "epoch": 7.76043557168784, "step": 10690}, {"loss": 0.2736, "grad_norm": 1.2003352642059326, "learning_rate": 0.0002, "epoch": 7.7676950998185115, "step": 10700}, {"loss": 0.2771, "grad_norm": 1.3607908487319946, "learning_rate": 0.0002, "epoch": 7.7749546279491835, "step": 10710}, {"loss": 0.2739, "grad_norm": 1.1789227724075317, "learning_rate": 0.0002, "epoch": 7.782214156079855, "step": 10720}, {"loss": 0.2927, "grad_norm": 1.2998148202896118, "learning_rate": 0.0002, "epoch": 7.7894736842105265, "step": 10730}, {"loss": 0.2825, "grad_norm": 1.8224656581878662, "learning_rate": 0.0002, "epoch": 7.796733212341198, "step": 10740}, {"loss": 0.2864, "grad_norm": 1.2510570287704468, "learning_rate": 0.0002, "epoch": 7.80399274047187, "step": 10750}, {"loss": 0.3007, "grad_norm": 1.065926194190979, "learning_rate": 0.0002, "epoch": 7.811252268602541, "step": 10760}, {"loss": 0.277, "grad_norm": 1.0313589572906494, "learning_rate": 0.0002, "epoch": 7.818511796733213, "step": 10770}, {"loss": 0.2954, "grad_norm": 1.10769784450531, "learning_rate": 0.0002, "epoch": 7.825771324863884, "step": 10780}, {"loss": 0.2893, "grad_norm": 1.4168727397918701, "learning_rate": 0.0002, "epoch": 7.833030852994556, "step": 10790}, {"loss": 0.2903, "grad_norm": 1.8239266872406006, "learning_rate": 0.0002, "epoch": 7.840290381125227, "step": 10800}, {"loss": 0.2573, "grad_norm": 1.5748721361160278, "learning_rate": 0.0002, "epoch": 7.847549909255898, "step": 10810}, {"loss": 0.3282, "grad_norm": 1.5762766599655151, "learning_rate": 0.0002, "epoch": 7.85480943738657, "step": 10820}, {"loss": 0.2981, "grad_norm": 1.1119135618209839, "learning_rate": 0.0002, "epoch": 7.862068965517241, "step": 10830}, {"loss": 0.3037, "grad_norm": 1.478314995765686, "learning_rate": 0.0002, "epoch": 7.869328493647913, "step": 10840}, {"loss": 0.2866, "grad_norm": 1.2225514650344849, "learning_rate": 0.0002, "epoch": 7.876588021778584, "step": 10850}, {"loss": 0.2795, "grad_norm": 1.503473162651062, "learning_rate": 0.0002, "epoch": 7.883847549909256, "step": 10860}, {"loss": 0.2732, "grad_norm": 1.0334484577178955, "learning_rate": 0.0002, "epoch": 7.891107078039927, "step": 10870}, {"loss": 0.3206, "grad_norm": 1.2068367004394531, "learning_rate": 0.0002, "epoch": 7.898366606170599, "step": 10880}, {"loss": 0.2936, "grad_norm": 1.3105504512786865, "learning_rate": 0.0002, "epoch": 7.90562613430127, "step": 10890}, {"loss": 0.3063, "grad_norm": 1.2941272258758545, "learning_rate": 0.0002, "epoch": 7.9128856624319415, "step": 10900}, {"loss": 0.2862, "grad_norm": 1.2809823751449585, "learning_rate": 0.0002, "epoch": 7.9201451905626135, "step": 10910}, {"loss": 0.3202, "grad_norm": 1.5362727642059326, "learning_rate": 0.0002, "epoch": 7.927404718693285, "step": 10920}, {"loss": 0.3005, "grad_norm": 1.5019413232803345, "learning_rate": 0.0002, "epoch": 7.9346642468239565, "step": 10930}, {"loss": 0.2976, "grad_norm": 1.5947920083999634, "learning_rate": 0.0002, "epoch": 7.941923774954628, "step": 10940}, {"loss": 0.2976, "grad_norm": 1.9482372999191284, "learning_rate": 0.0002, "epoch": 7.9491833030853, "step": 10950}, {"loss": 0.3297, "grad_norm": 1.8445630073547363, "learning_rate": 0.0002, "epoch": 7.956442831215971, "step": 10960}, {"loss": 0.2843, "grad_norm": 1.4342153072357178, "learning_rate": 0.0002, "epoch": 7.963702359346643, "step": 10970}, {"loss": 0.3066, "grad_norm": 1.3202505111694336, "learning_rate": 0.0002, "epoch": 7.970961887477314, "step": 10980}, {"loss": 0.2785, "grad_norm": 1.186015009880066, "learning_rate": 0.0002, "epoch": 7.978221415607986, "step": 10990}, {"loss": 0.324, "grad_norm": 1.2714571952819824, "learning_rate": 0.0002, "epoch": 7.985480943738657, "step": 11000}, {"loss": 0.2795, "grad_norm": 0.9723673462867737, "learning_rate": 0.0002, "epoch": 7.992740471869329, "step": 11010}]}