diff --git a/.gitattributes b/.gitattributes index f288ec082caec6893590724fb2b297549d8ff9d1..92a43d206d0bf56cc712d552aab22c11d2332380 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1768,3 +1768,12 @@ gemma-2-9b-it_int4_arc_challenge-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a- gemma-2-9b-it_int4_arc_challenge-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-205-sd-42/checkpoint-63/tokenizer.json filter=lfs diff=lfs merge=lfs -text gemma-2-9b-it_int4_arc_challenge-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-205-sd-42/checkpoint-84/tokenizer.json filter=lfs diff=lfs merge=lfs -text gemma-2-9b-it_int4_arc_challenge-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-0.2-num-205-sd-42/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-10272/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-11736/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-2935/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-4402/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-5870/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-7337/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-8805/tokenizer.json filter=lfs diff=lfs merge=lfs -text +gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..71d27bad73e9e55e6e2b87696e1b3b4dbe8f7f9d --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b346eaa9e073c3cd345a6d9f82f197fbea2e95dbd041c1698f40d6d882de0d94 +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-10272/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-10272/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-10272/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-10272/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-10272/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-10272/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-10272/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-10272/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..add57051c280b6e60edad2e89b3d254b9410c183 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-10272/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef891eef9b6fc06275c0d7e78f4344e293e51902f9d39ec7d3f56b6edf8029ad +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-10272/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-10272/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b94cae1ee29f783a9de8e1668b2ed2b58f8b9546 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-10272/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f80e7ce75f729ef10f792677ab949387af41e280d6eb10e66458fc69b4fb0d6 +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-10272/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-10272/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..d2b0392fa2e9a09b4d59c3c7470fa28512978ce5 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-10272/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d26e4354ef33a3b01c352125609d34cd20459307efa8989db92d6417ee8d478 +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-10272/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-10272/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c564362bb3cae555d39597966c49a4680e297bc2 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-10272/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34570a8d1338ece3d5fe78776cbfab909a95faeddbe647cea7756eef5ebfcceb +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-10272/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-10272/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-10272/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-10272/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-10272/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-10272/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-10272/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-10272/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-10272/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-10272/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-10272/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-10272/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-10272/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-10272/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a88f41c29f1519514819a74787efa4fffc5b9b8e --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-10272/trainer_state.json @@ -0,0 +1,7278 @@ +{ + "best_metric": 1.8034634590148926, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467", + "epoch": 6.999659284497445, + "eval_steps": 10, + "global_step": 10272, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0068143100511073255, + "grad_norm": 0.635574460029602, + "learning_rate": 0.0002, + "loss": 3.0988, + "step": 10 + }, + { + "epoch": 0.013628620102214651, + "grad_norm": 1.0401769876480103, + "learning_rate": 0.0002, + "loss": 2.5594, + "step": 20 + }, + { + "epoch": 0.020442930153321975, + "grad_norm": 0.4514131247997284, + "learning_rate": 0.0002, + "loss": 2.3587, + "step": 30 + }, + { + "epoch": 0.027257240204429302, + "grad_norm": 0.6303355693817139, + "learning_rate": 0.0002, + "loss": 2.121, + "step": 40 + }, + { + "epoch": 0.034071550255536626, + "grad_norm": 0.4648270606994629, + "learning_rate": 0.0002, + "loss": 1.9833, + "step": 50 + }, + { + "epoch": 0.04088586030664395, + "grad_norm": 0.42953479290008545, + "learning_rate": 0.0002, + "loss": 1.9384, + "step": 60 + }, + { + "epoch": 0.04770017035775128, + "grad_norm": 0.433614581823349, + "learning_rate": 0.0002, + "loss": 1.9202, + "step": 70 + }, + { + "epoch": 0.054514480408858604, + "grad_norm": 0.45318254828453064, + "learning_rate": 0.0002, + "loss": 1.911, + "step": 80 + }, + { + "epoch": 0.06132879045996593, + "grad_norm": 0.4023568630218506, + "learning_rate": 0.0002, + "loss": 1.8895, + "step": 90 + }, + { + "epoch": 0.06814310051107325, + "grad_norm": 0.43260207772254944, + "learning_rate": 0.0002, + "loss": 1.9257, + "step": 100 + }, + { + "epoch": 0.07495741056218058, + "grad_norm": 0.43389809131622314, + "learning_rate": 0.0002, + "loss": 1.9178, + "step": 110 + }, + { + "epoch": 0.0817717206132879, + "grad_norm": 0.39307987689971924, + "learning_rate": 0.0002, + "loss": 1.9071, + "step": 120 + }, + { + "epoch": 0.08858603066439523, + "grad_norm": 0.7703037261962891, + "learning_rate": 0.0002, + "loss": 1.9482, + "step": 130 + }, + { + "epoch": 0.09540034071550256, + "grad_norm": 0.38547563552856445, + "learning_rate": 0.0002, + "loss": 1.8394, + "step": 140 + }, + { + "epoch": 0.10221465076660988, + "grad_norm": 0.37948688864707947, + "learning_rate": 0.0002, + "loss": 1.9037, + "step": 150 + }, + { + "epoch": 0.10902896081771721, + "grad_norm": 0.33022379875183105, + "learning_rate": 0.0002, + "loss": 1.8664, + "step": 160 + }, + { + "epoch": 0.11584327086882454, + "grad_norm": 0.33703792095184326, + "learning_rate": 0.0002, + "loss": 1.8659, + "step": 170 + }, + { + "epoch": 0.12265758091993186, + "grad_norm": 0.3259912431240082, + "learning_rate": 0.0002, + "loss": 1.8389, + "step": 180 + }, + { + "epoch": 0.12947189097103917, + "grad_norm": 0.3593858778476715, + "learning_rate": 0.0002, + "loss": 1.8424, + "step": 190 + }, + { + "epoch": 0.1362862010221465, + "grad_norm": 0.3401614725589752, + "learning_rate": 0.0002, + "loss": 1.8457, + "step": 200 + }, + { + "epoch": 0.14310051107325383, + "grad_norm": 0.3892078697681427, + "learning_rate": 0.0002, + "loss": 1.7721, + "step": 210 + }, + { + "epoch": 0.14991482112436116, + "grad_norm": 0.315374493598938, + "learning_rate": 0.0002, + "loss": 1.8351, + "step": 220 + }, + { + "epoch": 0.1567291311754685, + "grad_norm": 0.3679497539997101, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 230 + }, + { + "epoch": 0.1635434412265758, + "grad_norm": 0.336730033159256, + "learning_rate": 0.0002, + "loss": 1.816, + "step": 240 + }, + { + "epoch": 0.17035775127768313, + "grad_norm": 0.36472755670547485, + "learning_rate": 0.0002, + "loss": 1.7849, + "step": 250 + }, + { + "epoch": 0.17717206132879046, + "grad_norm": 0.34864023327827454, + "learning_rate": 0.0002, + "loss": 1.7766, + "step": 260 + }, + { + "epoch": 0.1839863713798978, + "grad_norm": 0.3350819945335388, + "learning_rate": 0.0002, + "loss": 1.896, + "step": 270 + }, + { + "epoch": 0.19080068143100512, + "grad_norm": 0.3844246566295624, + "learning_rate": 0.0002, + "loss": 1.8742, + "step": 280 + }, + { + "epoch": 0.19761499148211242, + "grad_norm": 0.38413065671920776, + "learning_rate": 0.0002, + "loss": 1.8531, + "step": 290 + }, + { + "epoch": 0.20442930153321975, + "grad_norm": 0.4080047607421875, + "learning_rate": 0.0002, + "loss": 1.8415, + "step": 300 + }, + { + "epoch": 0.21124361158432708, + "grad_norm": 0.3546800911426544, + "learning_rate": 0.0002, + "loss": 1.9009, + "step": 310 + }, + { + "epoch": 0.21805792163543442, + "grad_norm": 0.38577890396118164, + "learning_rate": 0.0002, + "loss": 1.8092, + "step": 320 + }, + { + "epoch": 0.22487223168654175, + "grad_norm": 0.38979098200798035, + "learning_rate": 0.0002, + "loss": 1.7113, + "step": 330 + }, + { + "epoch": 0.23168654173764908, + "grad_norm": 0.35105520486831665, + "learning_rate": 0.0002, + "loss": 1.8162, + "step": 340 + }, + { + "epoch": 0.23850085178875638, + "grad_norm": 0.31671223044395447, + "learning_rate": 0.0002, + "loss": 1.8408, + "step": 350 + }, + { + "epoch": 0.2453151618398637, + "grad_norm": 0.33034196496009827, + "learning_rate": 0.0002, + "loss": 1.8014, + "step": 360 + }, + { + "epoch": 0.252129471890971, + "grad_norm": 0.2990533709526062, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 370 + }, + { + "epoch": 0.25894378194207834, + "grad_norm": 0.362208753824234, + "learning_rate": 0.0002, + "loss": 1.8612, + "step": 380 + }, + { + "epoch": 0.2657580919931857, + "grad_norm": 0.3269096612930298, + "learning_rate": 0.0002, + "loss": 1.8001, + "step": 390 + }, + { + "epoch": 0.272572402044293, + "grad_norm": 0.30555954575538635, + "learning_rate": 0.0002, + "loss": 1.786, + "step": 400 + }, + { + "epoch": 0.27938671209540034, + "grad_norm": 0.332933247089386, + "learning_rate": 0.0002, + "loss": 1.8018, + "step": 410 + }, + { + "epoch": 0.28620102214650767, + "grad_norm": 0.39454060792922974, + "learning_rate": 0.0002, + "loss": 1.8157, + "step": 420 + }, + { + "epoch": 0.293015332197615, + "grad_norm": 0.34589633345603943, + "learning_rate": 0.0002, + "loss": 1.7862, + "step": 430 + }, + { + "epoch": 0.29982964224872233, + "grad_norm": 0.3747332990169525, + "learning_rate": 0.0002, + "loss": 1.7612, + "step": 440 + }, + { + "epoch": 0.30664395229982966, + "grad_norm": 0.34825369715690613, + "learning_rate": 0.0002, + "loss": 1.8476, + "step": 450 + }, + { + "epoch": 0.313458262350937, + "grad_norm": 0.32906976342201233, + "learning_rate": 0.0002, + "loss": 1.775, + "step": 460 + }, + { + "epoch": 0.3202725724020443, + "grad_norm": 0.33108609914779663, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 470 + }, + { + "epoch": 0.3270868824531516, + "grad_norm": 0.3170463442802429, + "learning_rate": 0.0002, + "loss": 1.8012, + "step": 480 + }, + { + "epoch": 0.3339011925042589, + "grad_norm": 0.30792081356048584, + "learning_rate": 0.0002, + "loss": 1.8264, + "step": 490 + }, + { + "epoch": 0.34071550255536626, + "grad_norm": 0.31772997975349426, + "learning_rate": 0.0002, + "loss": 1.7361, + "step": 500 + }, + { + "epoch": 0.3475298126064736, + "grad_norm": 0.32714012265205383, + "learning_rate": 0.0002, + "loss": 1.8147, + "step": 510 + }, + { + "epoch": 0.3543441226575809, + "grad_norm": 0.3021100163459778, + "learning_rate": 0.0002, + "loss": 1.8332, + "step": 520 + }, + { + "epoch": 0.36115843270868825, + "grad_norm": 0.6045835018157959, + "learning_rate": 0.0002, + "loss": 1.7337, + "step": 530 + }, + { + "epoch": 0.3679727427597956, + "grad_norm": 0.3003896474838257, + "learning_rate": 0.0002, + "loss": 1.8661, + "step": 540 + }, + { + "epoch": 0.3747870528109029, + "grad_norm": 0.3678470551967621, + "learning_rate": 0.0002, + "loss": 1.8359, + "step": 550 + }, + { + "epoch": 0.38160136286201024, + "grad_norm": 0.35787615180015564, + "learning_rate": 0.0002, + "loss": 1.7512, + "step": 560 + }, + { + "epoch": 0.38841567291311757, + "grad_norm": 0.31882143020629883, + "learning_rate": 0.0002, + "loss": 1.8048, + "step": 570 + }, + { + "epoch": 0.39522998296422485, + "grad_norm": 0.3186313509941101, + "learning_rate": 0.0002, + "loss": 1.8108, + "step": 580 + }, + { + "epoch": 0.4020442930153322, + "grad_norm": 0.41443702578544617, + "learning_rate": 0.0002, + "loss": 1.8012, + "step": 590 + }, + { + "epoch": 0.4088586030664395, + "grad_norm": 0.28773069381713867, + "learning_rate": 0.0002, + "loss": 1.7794, + "step": 600 + }, + { + "epoch": 0.41567291311754684, + "grad_norm": 0.35743263363838196, + "learning_rate": 0.0002, + "loss": 1.8231, + "step": 610 + }, + { + "epoch": 0.42248722316865417, + "grad_norm": 0.3360286355018616, + "learning_rate": 0.0002, + "loss": 1.7531, + "step": 620 + }, + { + "epoch": 0.4293015332197615, + "grad_norm": 0.32838866114616394, + "learning_rate": 0.0002, + "loss": 1.7654, + "step": 630 + }, + { + "epoch": 0.43611584327086883, + "grad_norm": 0.2994388937950134, + "learning_rate": 0.0002, + "loss": 1.8176, + "step": 640 + }, + { + "epoch": 0.44293015332197616, + "grad_norm": 0.3306307792663574, + "learning_rate": 0.0002, + "loss": 1.7241, + "step": 650 + }, + { + "epoch": 0.4497444633730835, + "grad_norm": 0.3129560351371765, + "learning_rate": 0.0002, + "loss": 1.8201, + "step": 660 + }, + { + "epoch": 0.4565587734241908, + "grad_norm": 0.3244289457798004, + "learning_rate": 0.0002, + "loss": 1.803, + "step": 670 + }, + { + "epoch": 0.46337308347529815, + "grad_norm": 0.3196892738342285, + "learning_rate": 0.0002, + "loss": 1.8399, + "step": 680 + }, + { + "epoch": 0.47018739352640543, + "grad_norm": 0.3135230243206024, + "learning_rate": 0.0002, + "loss": 1.8291, + "step": 690 + }, + { + "epoch": 0.47700170357751276, + "grad_norm": 0.28677991032600403, + "learning_rate": 0.0002, + "loss": 1.7423, + "step": 700 + }, + { + "epoch": 0.4838160136286201, + "grad_norm": 0.3074065148830414, + "learning_rate": 0.0002, + "loss": 1.6982, + "step": 710 + }, + { + "epoch": 0.4906303236797274, + "grad_norm": 0.3354290723800659, + "learning_rate": 0.0002, + "loss": 1.8078, + "step": 720 + }, + { + "epoch": 0.49744463373083475, + "grad_norm": 0.324370801448822, + "learning_rate": 0.0002, + "loss": 1.7333, + "step": 730 + }, + { + "epoch": 0.504258943781942, + "grad_norm": 0.29496142268180847, + "learning_rate": 0.0002, + "loss": 1.8506, + "step": 740 + }, + { + "epoch": 0.5110732538330494, + "grad_norm": 0.30694130063056946, + "learning_rate": 0.0002, + "loss": 1.7761, + "step": 750 + }, + { + "epoch": 0.5178875638841567, + "grad_norm": 0.36168408393859863, + "learning_rate": 0.0002, + "loss": 1.7854, + "step": 760 + }, + { + "epoch": 0.524701873935264, + "grad_norm": 0.2930343449115753, + "learning_rate": 0.0002, + "loss": 1.7149, + "step": 770 + }, + { + "epoch": 0.5315161839863713, + "grad_norm": 0.3023432493209839, + "learning_rate": 0.0002, + "loss": 1.7924, + "step": 780 + }, + { + "epoch": 0.5383304940374787, + "grad_norm": 0.3272720277309418, + "learning_rate": 0.0002, + "loss": 1.8467, + "step": 790 + }, + { + "epoch": 0.545144804088586, + "grad_norm": 0.2907974421977997, + "learning_rate": 0.0002, + "loss": 1.8639, + "step": 800 + }, + { + "epoch": 0.5519591141396933, + "grad_norm": 0.32267168164253235, + "learning_rate": 0.0002, + "loss": 1.9018, + "step": 810 + }, + { + "epoch": 0.5587734241908007, + "grad_norm": 0.32059940695762634, + "learning_rate": 0.0002, + "loss": 1.8311, + "step": 820 + }, + { + "epoch": 0.565587734241908, + "grad_norm": 0.30951258540153503, + "learning_rate": 0.0002, + "loss": 1.7234, + "step": 830 + }, + { + "epoch": 0.5724020442930153, + "grad_norm": 0.33976122736930847, + "learning_rate": 0.0002, + "loss": 1.8063, + "step": 840 + }, + { + "epoch": 0.5792163543441227, + "grad_norm": 0.3195820450782776, + "learning_rate": 0.0002, + "loss": 1.7021, + "step": 850 + }, + { + "epoch": 0.58603066439523, + "grad_norm": 0.2828562557697296, + "learning_rate": 0.0002, + "loss": 1.8073, + "step": 860 + }, + { + "epoch": 0.5928449744463373, + "grad_norm": 0.29591670632362366, + "learning_rate": 0.0002, + "loss": 1.695, + "step": 870 + }, + { + "epoch": 0.5996592844974447, + "grad_norm": 0.3086104393005371, + "learning_rate": 0.0002, + "loss": 1.8109, + "step": 880 + }, + { + "epoch": 0.606473594548552, + "grad_norm": 0.3592929542064667, + "learning_rate": 0.0002, + "loss": 1.8207, + "step": 890 + }, + { + "epoch": 0.6132879045996593, + "grad_norm": 0.2830186188220978, + "learning_rate": 0.0002, + "loss": 1.7662, + "step": 900 + }, + { + "epoch": 0.6201022146507666, + "grad_norm": 0.3128598630428314, + "learning_rate": 0.0002, + "loss": 1.8344, + "step": 910 + }, + { + "epoch": 0.626916524701874, + "grad_norm": 0.31957563757896423, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 920 + }, + { + "epoch": 0.6337308347529813, + "grad_norm": 0.30994319915771484, + "learning_rate": 0.0002, + "loss": 1.7578, + "step": 930 + }, + { + "epoch": 0.6405451448040886, + "grad_norm": 0.3352845013141632, + "learning_rate": 0.0002, + "loss": 1.8494, + "step": 940 + }, + { + "epoch": 0.6473594548551959, + "grad_norm": 0.2960077226161957, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 950 + }, + { + "epoch": 0.6541737649063032, + "grad_norm": 0.32675081491470337, + "learning_rate": 0.0002, + "loss": 1.7209, + "step": 960 + }, + { + "epoch": 0.6609880749574105, + "grad_norm": 0.30042028427124023, + "learning_rate": 0.0002, + "loss": 1.7776, + "step": 970 + }, + { + "epoch": 0.6678023850085179, + "grad_norm": 0.3288673758506775, + "learning_rate": 0.0002, + "loss": 1.7597, + "step": 980 + }, + { + "epoch": 0.6746166950596252, + "grad_norm": 0.323215126991272, + "learning_rate": 0.0002, + "loss": 1.7962, + "step": 990 + }, + { + "epoch": 0.6814310051107325, + "grad_norm": 0.30041399598121643, + "learning_rate": 0.0002, + "loss": 1.821, + "step": 1000 + }, + { + "epoch": 0.6882453151618398, + "grad_norm": 0.3076179623603821, + "learning_rate": 0.0002, + "loss": 1.7716, + "step": 1010 + }, + { + "epoch": 0.6950596252129472, + "grad_norm": 0.2941909730434418, + "learning_rate": 0.0002, + "loss": 1.7827, + "step": 1020 + }, + { + "epoch": 0.7018739352640545, + "grad_norm": 0.32220420241355896, + "learning_rate": 0.0002, + "loss": 1.7964, + "step": 1030 + }, + { + "epoch": 0.7086882453151618, + "grad_norm": 0.2989702820777893, + "learning_rate": 0.0002, + "loss": 1.7101, + "step": 1040 + }, + { + "epoch": 0.7155025553662692, + "grad_norm": 0.3049640357494354, + "learning_rate": 0.0002, + "loss": 1.8749, + "step": 1050 + }, + { + "epoch": 0.7223168654173765, + "grad_norm": 0.3183284103870392, + "learning_rate": 0.0002, + "loss": 1.782, + "step": 1060 + }, + { + "epoch": 0.7291311754684838, + "grad_norm": 0.3070095181465149, + "learning_rate": 0.0002, + "loss": 1.785, + "step": 1070 + }, + { + "epoch": 0.7359454855195912, + "grad_norm": 0.33263063430786133, + "learning_rate": 0.0002, + "loss": 1.7832, + "step": 1080 + }, + { + "epoch": 0.7427597955706985, + "grad_norm": 0.28774312138557434, + "learning_rate": 0.0002, + "loss": 1.7627, + "step": 1090 + }, + { + "epoch": 0.7495741056218058, + "grad_norm": 0.29304224252700806, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1100 + }, + { + "epoch": 0.7563884156729132, + "grad_norm": 0.27529507875442505, + "learning_rate": 0.0002, + "loss": 1.8082, + "step": 1110 + }, + { + "epoch": 0.7632027257240205, + "grad_norm": 0.32319945096969604, + "learning_rate": 0.0002, + "loss": 1.8071, + "step": 1120 + }, + { + "epoch": 0.7700170357751278, + "grad_norm": 0.33777597546577454, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 1130 + }, + { + "epoch": 0.7768313458262351, + "grad_norm": 0.29847201704978943, + "learning_rate": 0.0002, + "loss": 1.8488, + "step": 1140 + }, + { + "epoch": 0.7836456558773425, + "grad_norm": 0.31370633840560913, + "learning_rate": 0.0002, + "loss": 1.7014, + "step": 1150 + }, + { + "epoch": 0.7904599659284497, + "grad_norm": 0.31633856892585754, + "learning_rate": 0.0002, + "loss": 1.8175, + "step": 1160 + }, + { + "epoch": 0.797274275979557, + "grad_norm": 0.33849263191223145, + "learning_rate": 0.0002, + "loss": 1.7637, + "step": 1170 + }, + { + "epoch": 0.8040885860306644, + "grad_norm": 0.3306333124637604, + "learning_rate": 0.0002, + "loss": 1.8264, + "step": 1180 + }, + { + "epoch": 0.8109028960817717, + "grad_norm": 0.3249678313732147, + "learning_rate": 0.0002, + "loss": 1.777, + "step": 1190 + }, + { + "epoch": 0.817717206132879, + "grad_norm": 0.3252817690372467, + "learning_rate": 0.0002, + "loss": 1.7631, + "step": 1200 + }, + { + "epoch": 0.8245315161839863, + "grad_norm": 0.31772559881210327, + "learning_rate": 0.0002, + "loss": 1.7877, + "step": 1210 + }, + { + "epoch": 0.8313458262350937, + "grad_norm": 0.2803371846675873, + "learning_rate": 0.0002, + "loss": 1.8068, + "step": 1220 + }, + { + "epoch": 0.838160136286201, + "grad_norm": 0.26700571179389954, + "learning_rate": 0.0002, + "loss": 1.738, + "step": 1230 + }, + { + "epoch": 0.8449744463373083, + "grad_norm": 0.3060092031955719, + "learning_rate": 0.0002, + "loss": 1.8301, + "step": 1240 + }, + { + "epoch": 0.8517887563884157, + "grad_norm": 0.28831684589385986, + "learning_rate": 0.0002, + "loss": 1.7993, + "step": 1250 + }, + { + "epoch": 0.858603066439523, + "grad_norm": 0.30708742141723633, + "learning_rate": 0.0002, + "loss": 1.6909, + "step": 1260 + }, + { + "epoch": 0.8654173764906303, + "grad_norm": 0.2915987968444824, + "learning_rate": 0.0002, + "loss": 1.8506, + "step": 1270 + }, + { + "epoch": 0.8722316865417377, + "grad_norm": 0.2893589735031128, + "learning_rate": 0.0002, + "loss": 1.7536, + "step": 1280 + }, + { + "epoch": 0.879045996592845, + "grad_norm": 0.29545632004737854, + "learning_rate": 0.0002, + "loss": 1.7437, + "step": 1290 + }, + { + "epoch": 0.8858603066439523, + "grad_norm": 0.3354771137237549, + "learning_rate": 0.0002, + "loss": 1.859, + "step": 1300 + }, + { + "epoch": 0.8926746166950597, + "grad_norm": 0.37715399265289307, + "learning_rate": 0.0002, + "loss": 1.7644, + "step": 1310 + }, + { + "epoch": 0.899488926746167, + "grad_norm": 0.28847193717956543, + "learning_rate": 0.0002, + "loss": 1.7731, + "step": 1320 + }, + { + "epoch": 0.9063032367972743, + "grad_norm": 0.2780889868736267, + "learning_rate": 0.0002, + "loss": 1.9077, + "step": 1330 + }, + { + "epoch": 0.9131175468483816, + "grad_norm": 0.2714342176914215, + "learning_rate": 0.0002, + "loss": 1.7861, + "step": 1340 + }, + { + "epoch": 0.919931856899489, + "grad_norm": 0.2950133979320526, + "learning_rate": 0.0002, + "loss": 1.8, + "step": 1350 + }, + { + "epoch": 0.9267461669505963, + "grad_norm": 0.29097145795822144, + "learning_rate": 0.0002, + "loss": 1.7368, + "step": 1360 + }, + { + "epoch": 0.9335604770017035, + "grad_norm": 0.32540133595466614, + "learning_rate": 0.0002, + "loss": 1.7864, + "step": 1370 + }, + { + "epoch": 0.9403747870528109, + "grad_norm": 0.3076636493206024, + "learning_rate": 0.0002, + "loss": 1.7571, + "step": 1380 + }, + { + "epoch": 0.9471890971039182, + "grad_norm": 0.2962130308151245, + "learning_rate": 0.0002, + "loss": 1.8312, + "step": 1390 + }, + { + "epoch": 0.9540034071550255, + "grad_norm": 0.30086860060691833, + "learning_rate": 0.0002, + "loss": 1.7858, + "step": 1400 + }, + { + "epoch": 0.9608177172061328, + "grad_norm": 0.28634947538375854, + "learning_rate": 0.0002, + "loss": 1.7991, + "step": 1410 + }, + { + "epoch": 0.9676320272572402, + "grad_norm": 0.35314416885375977, + "learning_rate": 0.0002, + "loss": 1.8385, + "step": 1420 + }, + { + "epoch": 0.9744463373083475, + "grad_norm": 0.2939317524433136, + "learning_rate": 0.0002, + "loss": 1.8054, + "step": 1430 + }, + { + "epoch": 0.9812606473594548, + "grad_norm": 0.3010196089744568, + "learning_rate": 0.0002, + "loss": 1.7582, + "step": 1440 + }, + { + "epoch": 0.9880749574105622, + "grad_norm": 0.30816152691841125, + "learning_rate": 0.0002, + "loss": 1.758, + "step": 1450 + }, + { + "epoch": 0.9948892674616695, + "grad_norm": 0.28152793645858765, + "learning_rate": 0.0002, + "loss": 1.8159, + "step": 1460 + }, + { + "epoch": 0.9996592844974447, + "eval_loss": 1.8034634590148926, + "eval_runtime": 53.6017, + "eval_samples_per_second": 9.459, + "eval_steps_per_second": 1.194, + "step": 1467 + }, + { + "epoch": 1.0017035775127767, + "grad_norm": 0.29246416687965393, + "learning_rate": 0.0002, + "loss": 1.7072, + "step": 1470 + }, + { + "epoch": 1.008517887563884, + "grad_norm": 0.2668602168560028, + "learning_rate": 0.0002, + "loss": 1.8159, + "step": 1480 + }, + { + "epoch": 1.0153321976149914, + "grad_norm": 0.2998567819595337, + "learning_rate": 0.0002, + "loss": 1.6868, + "step": 1490 + }, + { + "epoch": 1.0221465076660987, + "grad_norm": 0.3284934461116791, + "learning_rate": 0.0002, + "loss": 1.7331, + "step": 1500 + }, + { + "epoch": 1.028960817717206, + "grad_norm": 0.3275827169418335, + "learning_rate": 0.0002, + "loss": 1.689, + "step": 1510 + }, + { + "epoch": 1.0357751277683134, + "grad_norm": 0.3382718563079834, + "learning_rate": 0.0002, + "loss": 1.7092, + "step": 1520 + }, + { + "epoch": 1.0425894378194207, + "grad_norm": 0.36737215518951416, + "learning_rate": 0.0002, + "loss": 1.7215, + "step": 1530 + }, + { + "epoch": 1.049403747870528, + "grad_norm": 0.3442603647708893, + "learning_rate": 0.0002, + "loss": 1.6993, + "step": 1540 + }, + { + "epoch": 1.0562180579216354, + "grad_norm": 0.3323381245136261, + "learning_rate": 0.0002, + "loss": 1.6996, + "step": 1550 + }, + { + "epoch": 1.0630323679727427, + "grad_norm": 0.341227650642395, + "learning_rate": 0.0002, + "loss": 1.7444, + "step": 1560 + }, + { + "epoch": 1.06984667802385, + "grad_norm": 0.3361579477787018, + "learning_rate": 0.0002, + "loss": 1.7419, + "step": 1570 + }, + { + "epoch": 1.0766609880749574, + "grad_norm": 0.3556230962276459, + "learning_rate": 0.0002, + "loss": 1.7337, + "step": 1580 + }, + { + "epoch": 1.0834752981260647, + "grad_norm": 0.27130424976348877, + "learning_rate": 0.0002, + "loss": 1.6978, + "step": 1590 + }, + { + "epoch": 1.090289608177172, + "grad_norm": 0.29366323351860046, + "learning_rate": 0.0002, + "loss": 1.6087, + "step": 1600 + }, + { + "epoch": 1.0971039182282794, + "grad_norm": 0.3581245541572571, + "learning_rate": 0.0002, + "loss": 1.6721, + "step": 1610 + }, + { + "epoch": 1.1039182282793867, + "grad_norm": 0.3021670877933502, + "learning_rate": 0.0002, + "loss": 1.7639, + "step": 1620 + }, + { + "epoch": 1.110732538330494, + "grad_norm": 0.3145572543144226, + "learning_rate": 0.0002, + "loss": 1.7314, + "step": 1630 + }, + { + "epoch": 1.1175468483816013, + "grad_norm": 0.35362154245376587, + "learning_rate": 0.0002, + "loss": 1.7408, + "step": 1640 + }, + { + "epoch": 1.1243611584327087, + "grad_norm": 0.5413113236427307, + "learning_rate": 0.0002, + "loss": 1.7071, + "step": 1650 + }, + { + "epoch": 1.131175468483816, + "grad_norm": 0.6858654022216797, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 1660 + }, + { + "epoch": 1.1379897785349233, + "grad_norm": 0.3781903386116028, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 1670 + }, + { + "epoch": 1.1448040885860307, + "grad_norm": 0.404864102602005, + "learning_rate": 0.0002, + "loss": 1.659, + "step": 1680 + }, + { + "epoch": 1.151618398637138, + "grad_norm": 0.3595100939273834, + "learning_rate": 0.0002, + "loss": 1.7212, + "step": 1690 + }, + { + "epoch": 1.1584327086882453, + "grad_norm": 0.33682283759117126, + "learning_rate": 0.0002, + "loss": 1.7023, + "step": 1700 + }, + { + "epoch": 1.1652470187393527, + "grad_norm": 0.3877373933792114, + "learning_rate": 0.0002, + "loss": 1.7336, + "step": 1710 + }, + { + "epoch": 1.17206132879046, + "grad_norm": 0.34606897830963135, + "learning_rate": 0.0002, + "loss": 1.7676, + "step": 1720 + }, + { + "epoch": 1.1788756388415673, + "grad_norm": 0.3122918903827667, + "learning_rate": 0.0002, + "loss": 1.6889, + "step": 1730 + }, + { + "epoch": 1.1856899488926746, + "grad_norm": 0.34081900119781494, + "learning_rate": 0.0002, + "loss": 1.6585, + "step": 1740 + }, + { + "epoch": 1.192504258943782, + "grad_norm": 0.3418807089328766, + "learning_rate": 0.0002, + "loss": 1.7794, + "step": 1750 + }, + { + "epoch": 1.1993185689948893, + "grad_norm": 0.3495500981807709, + "learning_rate": 0.0002, + "loss": 1.7773, + "step": 1760 + }, + { + "epoch": 1.2061328790459966, + "grad_norm": 0.557288408279419, + "learning_rate": 0.0002, + "loss": 1.6578, + "step": 1770 + }, + { + "epoch": 1.212947189097104, + "grad_norm": 0.3193778693675995, + "learning_rate": 0.0002, + "loss": 1.7488, + "step": 1780 + }, + { + "epoch": 1.2197614991482113, + "grad_norm": 0.3306216895580292, + "learning_rate": 0.0002, + "loss": 1.6815, + "step": 1790 + }, + { + "epoch": 1.2265758091993186, + "grad_norm": 0.37998732924461365, + "learning_rate": 0.0002, + "loss": 1.6772, + "step": 1800 + }, + { + "epoch": 1.233390119250426, + "grad_norm": 0.3255669176578522, + "learning_rate": 0.0002, + "loss": 1.7094, + "step": 1810 + }, + { + "epoch": 1.2402044293015333, + "grad_norm": 0.3741287291049957, + "learning_rate": 0.0002, + "loss": 1.7096, + "step": 1820 + }, + { + "epoch": 1.2470187393526406, + "grad_norm": 0.36727291345596313, + "learning_rate": 0.0002, + "loss": 1.7317, + "step": 1830 + }, + { + "epoch": 1.253833049403748, + "grad_norm": 0.3479527235031128, + "learning_rate": 0.0002, + "loss": 1.7418, + "step": 1840 + }, + { + "epoch": 1.2606473594548553, + "grad_norm": 0.3472636640071869, + "learning_rate": 0.0002, + "loss": 1.7062, + "step": 1850 + }, + { + "epoch": 1.2674616695059626, + "grad_norm": 0.3702869415283203, + "learning_rate": 0.0002, + "loss": 1.6263, + "step": 1860 + }, + { + "epoch": 1.27427597955707, + "grad_norm": 0.3934040069580078, + "learning_rate": 0.0002, + "loss": 1.6222, + "step": 1870 + }, + { + "epoch": 1.2810902896081773, + "grad_norm": 0.46887534856796265, + "learning_rate": 0.0002, + "loss": 1.7616, + "step": 1880 + }, + { + "epoch": 1.2879045996592846, + "grad_norm": 0.3191998600959778, + "learning_rate": 0.0002, + "loss": 1.6071, + "step": 1890 + }, + { + "epoch": 1.294718909710392, + "grad_norm": 0.34032225608825684, + "learning_rate": 0.0002, + "loss": 1.7001, + "step": 1900 + }, + { + "epoch": 1.3015332197614993, + "grad_norm": 0.33453696966171265, + "learning_rate": 0.0002, + "loss": 1.8316, + "step": 1910 + }, + { + "epoch": 1.3083475298126066, + "grad_norm": 0.3451494872570038, + "learning_rate": 0.0002, + "loss": 1.6567, + "step": 1920 + }, + { + "epoch": 1.315161839863714, + "grad_norm": 0.36203092336654663, + "learning_rate": 0.0002, + "loss": 1.708, + "step": 1930 + }, + { + "epoch": 1.321976149914821, + "grad_norm": 0.43794456124305725, + "learning_rate": 0.0002, + "loss": 1.7095, + "step": 1940 + }, + { + "epoch": 1.3287904599659284, + "grad_norm": 0.3630591034889221, + "learning_rate": 0.0002, + "loss": 1.7264, + "step": 1950 + }, + { + "epoch": 1.3356047700170357, + "grad_norm": 0.36951911449432373, + "learning_rate": 0.0002, + "loss": 1.6529, + "step": 1960 + }, + { + "epoch": 1.342419080068143, + "grad_norm": 0.4001159965991974, + "learning_rate": 0.0002, + "loss": 1.651, + "step": 1970 + }, + { + "epoch": 1.3492333901192504, + "grad_norm": 0.3820836544036865, + "learning_rate": 0.0002, + "loss": 1.677, + "step": 1980 + }, + { + "epoch": 1.3560477001703577, + "grad_norm": 0.3705870807170868, + "learning_rate": 0.0002, + "loss": 1.583, + "step": 1990 + }, + { + "epoch": 1.362862010221465, + "grad_norm": 0.3557972013950348, + "learning_rate": 0.0002, + "loss": 1.6207, + "step": 2000 + }, + { + "epoch": 1.3696763202725724, + "grad_norm": 0.38546398282051086, + "learning_rate": 0.0002, + "loss": 1.7656, + "step": 2010 + }, + { + "epoch": 1.3764906303236797, + "grad_norm": 0.3908020853996277, + "learning_rate": 0.0002, + "loss": 1.6881, + "step": 2020 + }, + { + "epoch": 1.383304940374787, + "grad_norm": 0.3822040855884552, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 2030 + }, + { + "epoch": 1.3901192504258943, + "grad_norm": 0.425327867269516, + "learning_rate": 0.0002, + "loss": 1.7923, + "step": 2040 + }, + { + "epoch": 1.3969335604770017, + "grad_norm": 0.3436259329319, + "learning_rate": 0.0002, + "loss": 1.7032, + "step": 2050 + }, + { + "epoch": 1.403747870528109, + "grad_norm": 0.33124062418937683, + "learning_rate": 0.0002, + "loss": 1.6398, + "step": 2060 + }, + { + "epoch": 1.4105621805792163, + "grad_norm": 0.3662424683570862, + "learning_rate": 0.0002, + "loss": 1.6815, + "step": 2070 + }, + { + "epoch": 1.4173764906303237, + "grad_norm": 0.3720635175704956, + "learning_rate": 0.0002, + "loss": 1.7258, + "step": 2080 + }, + { + "epoch": 1.424190800681431, + "grad_norm": 0.3361680805683136, + "learning_rate": 0.0002, + "loss": 1.7186, + "step": 2090 + }, + { + "epoch": 1.4310051107325383, + "grad_norm": 0.32751724123954773, + "learning_rate": 0.0002, + "loss": 1.7606, + "step": 2100 + }, + { + "epoch": 1.4378194207836457, + "grad_norm": 0.34333378076553345, + "learning_rate": 0.0002, + "loss": 1.7051, + "step": 2110 + }, + { + "epoch": 1.444633730834753, + "grad_norm": 0.37777671217918396, + "learning_rate": 0.0002, + "loss": 1.6979, + "step": 2120 + }, + { + "epoch": 1.4514480408858603, + "grad_norm": 0.37126365303993225, + "learning_rate": 0.0002, + "loss": 1.7085, + "step": 2130 + }, + { + "epoch": 1.4582623509369677, + "grad_norm": 0.3602267801761627, + "learning_rate": 0.0002, + "loss": 1.721, + "step": 2140 + }, + { + "epoch": 1.465076660988075, + "grad_norm": 0.3287110924720764, + "learning_rate": 0.0002, + "loss": 1.8148, + "step": 2150 + }, + { + "epoch": 1.4718909710391823, + "grad_norm": 0.3562135100364685, + "learning_rate": 0.0002, + "loss": 1.6966, + "step": 2160 + }, + { + "epoch": 1.4787052810902896, + "grad_norm": 0.38292962312698364, + "learning_rate": 0.0002, + "loss": 1.713, + "step": 2170 + }, + { + "epoch": 1.485519591141397, + "grad_norm": 0.38220319151878357, + "learning_rate": 0.0002, + "loss": 1.7036, + "step": 2180 + }, + { + "epoch": 1.4923339011925043, + "grad_norm": 0.3570062220096588, + "learning_rate": 0.0002, + "loss": 1.7297, + "step": 2190 + }, + { + "epoch": 1.4991482112436116, + "grad_norm": 0.363146036863327, + "learning_rate": 0.0002, + "loss": 1.5652, + "step": 2200 + }, + { + "epoch": 1.5059625212947187, + "grad_norm": 0.37393274903297424, + "learning_rate": 0.0002, + "loss": 1.6716, + "step": 2210 + }, + { + "epoch": 1.512776831345826, + "grad_norm": 0.3628501892089844, + "learning_rate": 0.0002, + "loss": 1.6839, + "step": 2220 + }, + { + "epoch": 1.5195911413969334, + "grad_norm": 0.33430740237236023, + "learning_rate": 0.0002, + "loss": 1.6627, + "step": 2230 + }, + { + "epoch": 1.5264054514480407, + "grad_norm": 0.35363978147506714, + "learning_rate": 0.0002, + "loss": 1.7495, + "step": 2240 + }, + { + "epoch": 1.533219761499148, + "grad_norm": 0.37220337986946106, + "learning_rate": 0.0002, + "loss": 1.6364, + "step": 2250 + }, + { + "epoch": 1.5400340715502554, + "grad_norm": 0.35020262002944946, + "learning_rate": 0.0002, + "loss": 1.7773, + "step": 2260 + }, + { + "epoch": 1.5468483816013627, + "grad_norm": 0.35274937748908997, + "learning_rate": 0.0002, + "loss": 1.7453, + "step": 2270 + }, + { + "epoch": 1.55366269165247, + "grad_norm": 0.3797738552093506, + "learning_rate": 0.0002, + "loss": 1.7162, + "step": 2280 + }, + { + "epoch": 1.5604770017035774, + "grad_norm": 0.4160412847995758, + "learning_rate": 0.0002, + "loss": 1.6197, + "step": 2290 + }, + { + "epoch": 1.5672913117546847, + "grad_norm": 0.38779592514038086, + "learning_rate": 0.0002, + "loss": 1.7101, + "step": 2300 + }, + { + "epoch": 1.574105621805792, + "grad_norm": 0.39171287417411804, + "learning_rate": 0.0002, + "loss": 1.7254, + "step": 2310 + }, + { + "epoch": 1.5809199318568994, + "grad_norm": 0.3606826663017273, + "learning_rate": 0.0002, + "loss": 1.7087, + "step": 2320 + }, + { + "epoch": 1.5877342419080067, + "grad_norm": 0.3745017647743225, + "learning_rate": 0.0002, + "loss": 1.7269, + "step": 2330 + }, + { + "epoch": 1.594548551959114, + "grad_norm": 0.34933462738990784, + "learning_rate": 0.0002, + "loss": 1.7068, + "step": 2340 + }, + { + "epoch": 1.6013628620102214, + "grad_norm": 0.37268444895744324, + "learning_rate": 0.0002, + "loss": 1.7073, + "step": 2350 + }, + { + "epoch": 1.6081771720613287, + "grad_norm": 0.4603484869003296, + "learning_rate": 0.0002, + "loss": 1.728, + "step": 2360 + }, + { + "epoch": 1.614991482112436, + "grad_norm": 0.35689088702201843, + "learning_rate": 0.0002, + "loss": 1.7621, + "step": 2370 + }, + { + "epoch": 1.6218057921635434, + "grad_norm": 0.3392031490802765, + "learning_rate": 0.0002, + "loss": 1.6989, + "step": 2380 + }, + { + "epoch": 1.6286201022146507, + "grad_norm": 0.394653856754303, + "learning_rate": 0.0002, + "loss": 1.7077, + "step": 2390 + }, + { + "epoch": 1.635434412265758, + "grad_norm": 0.33972012996673584, + "learning_rate": 0.0002, + "loss": 1.7448, + "step": 2400 + }, + { + "epoch": 1.6422487223168654, + "grad_norm": 0.3854375183582306, + "learning_rate": 0.0002, + "loss": 1.7681, + "step": 2410 + }, + { + "epoch": 1.6490630323679727, + "grad_norm": 0.36143961548805237, + "learning_rate": 0.0002, + "loss": 1.7102, + "step": 2420 + }, + { + "epoch": 1.65587734241908, + "grad_norm": 0.35816189646720886, + "learning_rate": 0.0002, + "loss": 1.7432, + "step": 2430 + }, + { + "epoch": 1.6626916524701874, + "grad_norm": 0.36298736929893494, + "learning_rate": 0.0002, + "loss": 1.6407, + "step": 2440 + }, + { + "epoch": 1.6695059625212947, + "grad_norm": 0.36756929755210876, + "learning_rate": 0.0002, + "loss": 1.723, + "step": 2450 + }, + { + "epoch": 1.676320272572402, + "grad_norm": 0.35969603061676025, + "learning_rate": 0.0002, + "loss": 1.6824, + "step": 2460 + }, + { + "epoch": 1.6831345826235093, + "grad_norm": 0.38449376821517944, + "learning_rate": 0.0002, + "loss": 1.7014, + "step": 2470 + }, + { + "epoch": 1.6899488926746167, + "grad_norm": 0.44511452317237854, + "learning_rate": 0.0002, + "loss": 1.7261, + "step": 2480 + }, + { + "epoch": 1.696763202725724, + "grad_norm": 0.3831416368484497, + "learning_rate": 0.0002, + "loss": 1.7397, + "step": 2490 + }, + { + "epoch": 1.7035775127768313, + "grad_norm": 0.3795325756072998, + "learning_rate": 0.0002, + "loss": 1.7046, + "step": 2500 + }, + { + "epoch": 1.7103918228279387, + "grad_norm": 0.34978193044662476, + "learning_rate": 0.0002, + "loss": 1.6993, + "step": 2510 + }, + { + "epoch": 1.717206132879046, + "grad_norm": 0.35923877358436584, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 2520 + }, + { + "epoch": 1.7240204429301533, + "grad_norm": 0.352999746799469, + "learning_rate": 0.0002, + "loss": 1.6253, + "step": 2530 + }, + { + "epoch": 1.7308347529812607, + "grad_norm": 0.43673479557037354, + "learning_rate": 0.0002, + "loss": 1.6953, + "step": 2540 + }, + { + "epoch": 1.737649063032368, + "grad_norm": 0.4153687357902527, + "learning_rate": 0.0002, + "loss": 1.7079, + "step": 2550 + }, + { + "epoch": 1.7444633730834753, + "grad_norm": 0.35541167855262756, + "learning_rate": 0.0002, + "loss": 1.6714, + "step": 2560 + }, + { + "epoch": 1.7512776831345827, + "grad_norm": 0.3288775086402893, + "learning_rate": 0.0002, + "loss": 1.696, + "step": 2570 + }, + { + "epoch": 1.75809199318569, + "grad_norm": 0.3991123139858246, + "learning_rate": 0.0002, + "loss": 1.7486, + "step": 2580 + }, + { + "epoch": 1.7649063032367973, + "grad_norm": 0.39967241883277893, + "learning_rate": 0.0002, + "loss": 1.645, + "step": 2590 + }, + { + "epoch": 1.7717206132879046, + "grad_norm": 0.41104283928871155, + "learning_rate": 0.0002, + "loss": 1.6855, + "step": 2600 + }, + { + "epoch": 1.778534923339012, + "grad_norm": 0.44885286688804626, + "learning_rate": 0.0002, + "loss": 1.6993, + "step": 2610 + }, + { + "epoch": 1.7853492333901193, + "grad_norm": 0.38384467363357544, + "learning_rate": 0.0002, + "loss": 1.7224, + "step": 2620 + }, + { + "epoch": 1.7921635434412266, + "grad_norm": 0.35020917654037476, + "learning_rate": 0.0002, + "loss": 1.7213, + "step": 2630 + }, + { + "epoch": 1.798977853492334, + "grad_norm": 0.3360341489315033, + "learning_rate": 0.0002, + "loss": 1.6706, + "step": 2640 + }, + { + "epoch": 1.8057921635434413, + "grad_norm": 0.38875144720077515, + "learning_rate": 0.0002, + "loss": 1.7037, + "step": 2650 + }, + { + "epoch": 1.8126064735945486, + "grad_norm": 0.34876883029937744, + "learning_rate": 0.0002, + "loss": 1.693, + "step": 2660 + }, + { + "epoch": 1.819420783645656, + "grad_norm": 0.419979989528656, + "learning_rate": 0.0002, + "loss": 1.7743, + "step": 2670 + }, + { + "epoch": 1.8262350936967633, + "grad_norm": 0.3648919463157654, + "learning_rate": 0.0002, + "loss": 1.7089, + "step": 2680 + }, + { + "epoch": 1.8330494037478706, + "grad_norm": 0.3485383987426758, + "learning_rate": 0.0002, + "loss": 1.7828, + "step": 2690 + }, + { + "epoch": 1.839863713798978, + "grad_norm": 0.3647468686103821, + "learning_rate": 0.0002, + "loss": 1.705, + "step": 2700 + }, + { + "epoch": 1.8466780238500853, + "grad_norm": 0.37003210186958313, + "learning_rate": 0.0002, + "loss": 1.7318, + "step": 2710 + }, + { + "epoch": 1.8534923339011926, + "grad_norm": 0.37031617760658264, + "learning_rate": 0.0002, + "loss": 1.6647, + "step": 2720 + }, + { + "epoch": 1.8603066439523, + "grad_norm": 0.3438796103000641, + "learning_rate": 0.0002, + "loss": 1.69, + "step": 2730 + }, + { + "epoch": 1.8671209540034073, + "grad_norm": 0.41574627161026, + "learning_rate": 0.0002, + "loss": 1.5995, + "step": 2740 + }, + { + "epoch": 1.8739352640545146, + "grad_norm": 0.35049930214881897, + "learning_rate": 0.0002, + "loss": 1.6877, + "step": 2750 + }, + { + "epoch": 1.880749574105622, + "grad_norm": 0.3943989872932434, + "learning_rate": 0.0002, + "loss": 1.7048, + "step": 2760 + }, + { + "epoch": 1.8875638841567293, + "grad_norm": 0.3384978175163269, + "learning_rate": 0.0002, + "loss": 1.7047, + "step": 2770 + }, + { + "epoch": 1.8943781942078366, + "grad_norm": 0.3501328229904175, + "learning_rate": 0.0002, + "loss": 1.7848, + "step": 2780 + }, + { + "epoch": 1.901192504258944, + "grad_norm": 0.37484532594680786, + "learning_rate": 0.0002, + "loss": 1.6986, + "step": 2790 + }, + { + "epoch": 1.9080068143100513, + "grad_norm": 0.34497788548469543, + "learning_rate": 0.0002, + "loss": 1.6867, + "step": 2800 + }, + { + "epoch": 1.9148211243611586, + "grad_norm": 0.3530851900577545, + "learning_rate": 0.0002, + "loss": 1.6513, + "step": 2810 + }, + { + "epoch": 1.921635434412266, + "grad_norm": 0.3879254162311554, + "learning_rate": 0.0002, + "loss": 1.6369, + "step": 2820 + }, + { + "epoch": 1.9284497444633732, + "grad_norm": 0.3885590136051178, + "learning_rate": 0.0002, + "loss": 1.6786, + "step": 2830 + }, + { + "epoch": 1.9352640545144804, + "grad_norm": 0.3868715465068817, + "learning_rate": 0.0002, + "loss": 1.8049, + "step": 2840 + }, + { + "epoch": 1.9420783645655877, + "grad_norm": 0.4152422249317169, + "learning_rate": 0.0002, + "loss": 1.7099, + "step": 2850 + }, + { + "epoch": 1.948892674616695, + "grad_norm": 0.401714563369751, + "learning_rate": 0.0002, + "loss": 1.6696, + "step": 2860 + }, + { + "epoch": 1.9557069846678024, + "grad_norm": 0.34825265407562256, + "learning_rate": 0.0002, + "loss": 1.7182, + "step": 2870 + }, + { + "epoch": 1.9625212947189097, + "grad_norm": 0.3620675504207611, + "learning_rate": 0.0002, + "loss": 1.6612, + "step": 2880 + }, + { + "epoch": 1.969335604770017, + "grad_norm": 0.3977806866168976, + "learning_rate": 0.0002, + "loss": 1.7451, + "step": 2890 + }, + { + "epoch": 1.9761499148211243, + "grad_norm": 0.3687497079372406, + "learning_rate": 0.0002, + "loss": 1.6514, + "step": 2900 + }, + { + "epoch": 1.9829642248722317, + "grad_norm": 0.408640056848526, + "learning_rate": 0.0002, + "loss": 1.7712, + "step": 2910 + }, + { + "epoch": 1.989778534923339, + "grad_norm": 0.34510108828544617, + "learning_rate": 0.0002, + "loss": 1.695, + "step": 2920 + }, + { + "epoch": 1.9965928449744463, + "grad_norm": 0.3596334755420685, + "learning_rate": 0.0002, + "loss": 1.662, + "step": 2930 + }, + { + "epoch": 2.0, + "eval_loss": 1.8056600093841553, + "eval_runtime": 60.7049, + "eval_samples_per_second": 8.352, + "eval_steps_per_second": 1.054, + "step": 2935 + }, + { + "epoch": 2.0034071550255534, + "grad_norm": 0.3460802137851715, + "learning_rate": 0.0002, + "loss": 1.6918, + "step": 2940 + }, + { + "epoch": 2.0102214650766608, + "grad_norm": 0.4038620591163635, + "learning_rate": 0.0002, + "loss": 1.6036, + "step": 2950 + }, + { + "epoch": 2.017035775127768, + "grad_norm": 0.3950219750404358, + "learning_rate": 0.0002, + "loss": 1.5386, + "step": 2960 + }, + { + "epoch": 2.0238500851788754, + "grad_norm": 0.519116997718811, + "learning_rate": 0.0002, + "loss": 1.5649, + "step": 2970 + }, + { + "epoch": 2.0306643952299828, + "grad_norm": 0.4097684919834137, + "learning_rate": 0.0002, + "loss": 1.5397, + "step": 2980 + }, + { + "epoch": 2.03747870528109, + "grad_norm": 0.4153544306755066, + "learning_rate": 0.0002, + "loss": 1.6044, + "step": 2990 + }, + { + "epoch": 2.0442930153321974, + "grad_norm": 0.4351160526275635, + "learning_rate": 0.0002, + "loss": 1.6006, + "step": 3000 + }, + { + "epoch": 2.0511073253833048, + "grad_norm": 0.42036259174346924, + "learning_rate": 0.0002, + "loss": 1.5091, + "step": 3010 + }, + { + "epoch": 2.057921635434412, + "grad_norm": 0.4433218836784363, + "learning_rate": 0.0002, + "loss": 1.5686, + "step": 3020 + }, + { + "epoch": 2.0647359454855194, + "grad_norm": 0.46511581540107727, + "learning_rate": 0.0002, + "loss": 1.5478, + "step": 3030 + }, + { + "epoch": 2.0715502555366268, + "grad_norm": 0.4567560851573944, + "learning_rate": 0.0002, + "loss": 1.5554, + "step": 3040 + }, + { + "epoch": 2.078364565587734, + "grad_norm": 0.45671048760414124, + "learning_rate": 0.0002, + "loss": 1.5561, + "step": 3050 + }, + { + "epoch": 2.0851788756388414, + "grad_norm": 0.4598552882671356, + "learning_rate": 0.0002, + "loss": 1.606, + "step": 3060 + }, + { + "epoch": 2.0919931856899487, + "grad_norm": 0.4582861661911011, + "learning_rate": 0.0002, + "loss": 1.6357, + "step": 3070 + }, + { + "epoch": 2.098807495741056, + "grad_norm": 0.4366969168186188, + "learning_rate": 0.0002, + "loss": 1.5853, + "step": 3080 + }, + { + "epoch": 2.1056218057921634, + "grad_norm": 0.495917409658432, + "learning_rate": 0.0002, + "loss": 1.5574, + "step": 3090 + }, + { + "epoch": 2.1124361158432707, + "grad_norm": 1.6846044063568115, + "learning_rate": 0.0002, + "loss": 1.5798, + "step": 3100 + }, + { + "epoch": 2.119250425894378, + "grad_norm": 0.4765092134475708, + "learning_rate": 0.0002, + "loss": 1.5877, + "step": 3110 + }, + { + "epoch": 2.1260647359454854, + "grad_norm": 0.45029810070991516, + "learning_rate": 0.0002, + "loss": 1.6106, + "step": 3120 + }, + { + "epoch": 2.1328790459965927, + "grad_norm": 0.5706973075866699, + "learning_rate": 0.0002, + "loss": 1.5707, + "step": 3130 + }, + { + "epoch": 2.1396933560477, + "grad_norm": 0.4606274366378784, + "learning_rate": 0.0002, + "loss": 1.589, + "step": 3140 + }, + { + "epoch": 2.1465076660988074, + "grad_norm": 2.199115753173828, + "learning_rate": 0.0002, + "loss": 1.564, + "step": 3150 + }, + { + "epoch": 2.1533219761499147, + "grad_norm": 0.6231027245521545, + "learning_rate": 0.0002, + "loss": 1.5808, + "step": 3160 + }, + { + "epoch": 2.160136286201022, + "grad_norm": 0.46918219327926636, + "learning_rate": 0.0002, + "loss": 1.5397, + "step": 3170 + }, + { + "epoch": 2.1669505962521294, + "grad_norm": 0.5006393194198608, + "learning_rate": 0.0002, + "loss": 1.6082, + "step": 3180 + }, + { + "epoch": 2.1737649063032367, + "grad_norm": 0.4745093286037445, + "learning_rate": 0.0002, + "loss": 1.591, + "step": 3190 + }, + { + "epoch": 2.180579216354344, + "grad_norm": 0.511576771736145, + "learning_rate": 0.0002, + "loss": 1.6459, + "step": 3200 + }, + { + "epoch": 2.1873935264054514, + "grad_norm": 0.38622918725013733, + "learning_rate": 0.0002, + "loss": 1.6019, + "step": 3210 + }, + { + "epoch": 2.1942078364565587, + "grad_norm": 0.4425644278526306, + "learning_rate": 0.0002, + "loss": 1.6187, + "step": 3220 + }, + { + "epoch": 2.201022146507666, + "grad_norm": 0.45643091201782227, + "learning_rate": 0.0002, + "loss": 1.6114, + "step": 3230 + }, + { + "epoch": 2.2078364565587734, + "grad_norm": 0.4975406229496002, + "learning_rate": 0.0002, + "loss": 1.6224, + "step": 3240 + }, + { + "epoch": 2.2146507666098807, + "grad_norm": 0.4673331081867218, + "learning_rate": 0.0002, + "loss": 1.6654, + "step": 3250 + }, + { + "epoch": 2.221465076660988, + "grad_norm": 0.5081731081008911, + "learning_rate": 0.0002, + "loss": 1.6155, + "step": 3260 + }, + { + "epoch": 2.2282793867120954, + "grad_norm": 0.4790806770324707, + "learning_rate": 0.0002, + "loss": 1.53, + "step": 3270 + }, + { + "epoch": 2.2350936967632027, + "grad_norm": 0.5184140801429749, + "learning_rate": 0.0002, + "loss": 1.6362, + "step": 3280 + }, + { + "epoch": 2.24190800681431, + "grad_norm": 0.5159083604812622, + "learning_rate": 0.0002, + "loss": 1.5646, + "step": 3290 + }, + { + "epoch": 2.2487223168654173, + "grad_norm": 0.4876042604446411, + "learning_rate": 0.0002, + "loss": 1.6577, + "step": 3300 + }, + { + "epoch": 2.2555366269165247, + "grad_norm": 0.5454957485198975, + "learning_rate": 0.0002, + "loss": 1.5718, + "step": 3310 + }, + { + "epoch": 2.262350936967632, + "grad_norm": 0.49866822361946106, + "learning_rate": 0.0002, + "loss": 1.5885, + "step": 3320 + }, + { + "epoch": 2.2691652470187393, + "grad_norm": 0.42674365639686584, + "learning_rate": 0.0002, + "loss": 1.5923, + "step": 3330 + }, + { + "epoch": 2.2759795570698467, + "grad_norm": 0.5202316641807556, + "learning_rate": 0.0002, + "loss": 1.5588, + "step": 3340 + }, + { + "epoch": 2.282793867120954, + "grad_norm": 0.4849465489387512, + "learning_rate": 0.0002, + "loss": 1.6032, + "step": 3350 + }, + { + "epoch": 2.2896081771720613, + "grad_norm": 0.47202569246292114, + "learning_rate": 0.0002, + "loss": 1.6853, + "step": 3360 + }, + { + "epoch": 2.2964224872231687, + "grad_norm": 0.5311620235443115, + "learning_rate": 0.0002, + "loss": 1.6164, + "step": 3370 + }, + { + "epoch": 2.303236797274276, + "grad_norm": 0.49011409282684326, + "learning_rate": 0.0002, + "loss": 1.6004, + "step": 3380 + }, + { + "epoch": 2.3100511073253833, + "grad_norm": 0.4789247512817383, + "learning_rate": 0.0002, + "loss": 1.5484, + "step": 3390 + }, + { + "epoch": 2.3168654173764907, + "grad_norm": 0.46646103262901306, + "learning_rate": 0.0002, + "loss": 1.5866, + "step": 3400 + }, + { + "epoch": 2.323679727427598, + "grad_norm": 0.5552441477775574, + "learning_rate": 0.0002, + "loss": 1.5308, + "step": 3410 + }, + { + "epoch": 2.3304940374787053, + "grad_norm": 0.4530351758003235, + "learning_rate": 0.0002, + "loss": 1.5761, + "step": 3420 + }, + { + "epoch": 2.3373083475298126, + "grad_norm": 0.4806232750415802, + "learning_rate": 0.0002, + "loss": 1.5919, + "step": 3430 + }, + { + "epoch": 2.34412265758092, + "grad_norm": 0.5998363494873047, + "learning_rate": 0.0002, + "loss": 1.5569, + "step": 3440 + }, + { + "epoch": 2.3509369676320273, + "grad_norm": 0.4918554425239563, + "learning_rate": 0.0002, + "loss": 1.513, + "step": 3450 + }, + { + "epoch": 2.3577512776831346, + "grad_norm": 0.5359559655189514, + "learning_rate": 0.0002, + "loss": 1.6323, + "step": 3460 + }, + { + "epoch": 2.364565587734242, + "grad_norm": 0.5053277611732483, + "learning_rate": 0.0002, + "loss": 1.5973, + "step": 3470 + }, + { + "epoch": 2.3713798977853493, + "grad_norm": 0.5058915019035339, + "learning_rate": 0.0002, + "loss": 1.5673, + "step": 3480 + }, + { + "epoch": 2.3781942078364566, + "grad_norm": 0.5314899682998657, + "learning_rate": 0.0002, + "loss": 1.5434, + "step": 3490 + }, + { + "epoch": 2.385008517887564, + "grad_norm": 0.48035913705825806, + "learning_rate": 0.0002, + "loss": 1.5203, + "step": 3500 + }, + { + "epoch": 2.3918228279386713, + "grad_norm": 0.45864903926849365, + "learning_rate": 0.0002, + "loss": 1.5936, + "step": 3510 + }, + { + "epoch": 2.3986371379897786, + "grad_norm": 0.4553050398826599, + "learning_rate": 0.0002, + "loss": 1.6285, + "step": 3520 + }, + { + "epoch": 2.405451448040886, + "grad_norm": 0.4483442008495331, + "learning_rate": 0.0002, + "loss": 1.5444, + "step": 3530 + }, + { + "epoch": 2.4122657580919933, + "grad_norm": 0.5043742060661316, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 3540 + }, + { + "epoch": 2.4190800681431006, + "grad_norm": 0.44807168841362, + "learning_rate": 0.0002, + "loss": 1.5692, + "step": 3550 + }, + { + "epoch": 2.425894378194208, + "grad_norm": 0.5065137147903442, + "learning_rate": 0.0002, + "loss": 1.6306, + "step": 3560 + }, + { + "epoch": 2.4327086882453153, + "grad_norm": 0.5186443328857422, + "learning_rate": 0.0002, + "loss": 1.5842, + "step": 3570 + }, + { + "epoch": 2.4395229982964226, + "grad_norm": 0.49743232131004333, + "learning_rate": 0.0002, + "loss": 1.5956, + "step": 3580 + }, + { + "epoch": 2.44633730834753, + "grad_norm": 0.524450421333313, + "learning_rate": 0.0002, + "loss": 1.6021, + "step": 3590 + }, + { + "epoch": 2.4531516183986373, + "grad_norm": 0.5053797364234924, + "learning_rate": 0.0002, + "loss": 1.6283, + "step": 3600 + }, + { + "epoch": 2.4599659284497446, + "grad_norm": 0.5223091840744019, + "learning_rate": 0.0002, + "loss": 1.6335, + "step": 3610 + }, + { + "epoch": 2.466780238500852, + "grad_norm": 0.4763810932636261, + "learning_rate": 0.0002, + "loss": 1.6315, + "step": 3620 + }, + { + "epoch": 2.4735945485519593, + "grad_norm": 0.5097282528877258, + "learning_rate": 0.0002, + "loss": 1.5623, + "step": 3630 + }, + { + "epoch": 2.4804088586030666, + "grad_norm": 0.5831942558288574, + "learning_rate": 0.0002, + "loss": 1.605, + "step": 3640 + }, + { + "epoch": 2.487223168654174, + "grad_norm": 0.47573572397232056, + "learning_rate": 0.0002, + "loss": 1.6074, + "step": 3650 + }, + { + "epoch": 2.4940374787052813, + "grad_norm": 0.49602726101875305, + "learning_rate": 0.0002, + "loss": 1.6411, + "step": 3660 + }, + { + "epoch": 2.500851788756388, + "grad_norm": 0.5468524694442749, + "learning_rate": 0.0002, + "loss": 1.571, + "step": 3670 + }, + { + "epoch": 2.507666098807496, + "grad_norm": 0.45899373292922974, + "learning_rate": 0.0002, + "loss": 1.5782, + "step": 3680 + }, + { + "epoch": 2.514480408858603, + "grad_norm": 0.5031567215919495, + "learning_rate": 0.0002, + "loss": 1.5114, + "step": 3690 + }, + { + "epoch": 2.5212947189097106, + "grad_norm": 0.5224900841712952, + "learning_rate": 0.0002, + "loss": 1.538, + "step": 3700 + }, + { + "epoch": 2.5281090289608175, + "grad_norm": 0.504769504070282, + "learning_rate": 0.0002, + "loss": 1.5269, + "step": 3710 + }, + { + "epoch": 2.5349233390119252, + "grad_norm": 0.6120529770851135, + "learning_rate": 0.0002, + "loss": 1.5141, + "step": 3720 + }, + { + "epoch": 2.541737649063032, + "grad_norm": 0.47930678725242615, + "learning_rate": 0.0002, + "loss": 1.5666, + "step": 3730 + }, + { + "epoch": 2.54855195911414, + "grad_norm": 0.5039092302322388, + "learning_rate": 0.0002, + "loss": 1.6156, + "step": 3740 + }, + { + "epoch": 2.555366269165247, + "grad_norm": 0.49758994579315186, + "learning_rate": 0.0002, + "loss": 1.5585, + "step": 3750 + }, + { + "epoch": 2.5621805792163546, + "grad_norm": 0.44739171862602234, + "learning_rate": 0.0002, + "loss": 1.5351, + "step": 3760 + }, + { + "epoch": 2.5689948892674614, + "grad_norm": 0.47056373953819275, + "learning_rate": 0.0002, + "loss": 1.5099, + "step": 3770 + }, + { + "epoch": 2.575809199318569, + "grad_norm": 0.5077595114707947, + "learning_rate": 0.0002, + "loss": 1.5524, + "step": 3780 + }, + { + "epoch": 2.582623509369676, + "grad_norm": 0.4981902837753296, + "learning_rate": 0.0002, + "loss": 1.5524, + "step": 3790 + }, + { + "epoch": 2.589437819420784, + "grad_norm": 0.5736238360404968, + "learning_rate": 0.0002, + "loss": 1.5381, + "step": 3800 + }, + { + "epoch": 2.5962521294718908, + "grad_norm": 0.4898384213447571, + "learning_rate": 0.0002, + "loss": 1.67, + "step": 3810 + }, + { + "epoch": 2.6030664395229985, + "grad_norm": 0.4611325263977051, + "learning_rate": 0.0002, + "loss": 1.5411, + "step": 3820 + }, + { + "epoch": 2.6098807495741054, + "grad_norm": 0.5285341739654541, + "learning_rate": 0.0002, + "loss": 1.5662, + "step": 3830 + }, + { + "epoch": 2.616695059625213, + "grad_norm": 0.5679430961608887, + "learning_rate": 0.0002, + "loss": 1.5875, + "step": 3840 + }, + { + "epoch": 2.62350936967632, + "grad_norm": 0.48532548546791077, + "learning_rate": 0.0002, + "loss": 1.5544, + "step": 3850 + }, + { + "epoch": 2.630323679727428, + "grad_norm": 0.45506492257118225, + "learning_rate": 0.0002, + "loss": 1.579, + "step": 3860 + }, + { + "epoch": 2.6371379897785348, + "grad_norm": 0.6552556753158569, + "learning_rate": 0.0002, + "loss": 1.5775, + "step": 3870 + }, + { + "epoch": 2.643952299829642, + "grad_norm": 0.537874698638916, + "learning_rate": 0.0002, + "loss": 1.608, + "step": 3880 + }, + { + "epoch": 2.6507666098807494, + "grad_norm": 0.46102389693260193, + "learning_rate": 0.0002, + "loss": 1.5653, + "step": 3890 + }, + { + "epoch": 2.6575809199318567, + "grad_norm": 0.45531195402145386, + "learning_rate": 0.0002, + "loss": 1.5293, + "step": 3900 + }, + { + "epoch": 2.664395229982964, + "grad_norm": 0.5327293872833252, + "learning_rate": 0.0002, + "loss": 1.5492, + "step": 3910 + }, + { + "epoch": 2.6712095400340714, + "grad_norm": 0.4968956410884857, + "learning_rate": 0.0002, + "loss": 1.5921, + "step": 3920 + }, + { + "epoch": 2.6780238500851787, + "grad_norm": 0.4790082275867462, + "learning_rate": 0.0002, + "loss": 1.5823, + "step": 3930 + }, + { + "epoch": 2.684838160136286, + "grad_norm": 0.5392967462539673, + "learning_rate": 0.0002, + "loss": 1.615, + "step": 3940 + }, + { + "epoch": 2.6916524701873934, + "grad_norm": 0.5076649785041809, + "learning_rate": 0.0002, + "loss": 1.6218, + "step": 3950 + }, + { + "epoch": 2.6984667802385007, + "grad_norm": 0.5628064274787903, + "learning_rate": 0.0002, + "loss": 1.6478, + "step": 3960 + }, + { + "epoch": 2.705281090289608, + "grad_norm": 0.5012659430503845, + "learning_rate": 0.0002, + "loss": 1.5417, + "step": 3970 + }, + { + "epoch": 2.7120954003407154, + "grad_norm": 0.4947647452354431, + "learning_rate": 0.0002, + "loss": 1.5339, + "step": 3980 + }, + { + "epoch": 2.7189097103918227, + "grad_norm": 0.4890969693660736, + "learning_rate": 0.0002, + "loss": 1.5724, + "step": 3990 + }, + { + "epoch": 2.72572402044293, + "grad_norm": 0.4471694231033325, + "learning_rate": 0.0002, + "loss": 1.5746, + "step": 4000 + }, + { + "epoch": 2.7325383304940374, + "grad_norm": 0.5116439461708069, + "learning_rate": 0.0002, + "loss": 1.6669, + "step": 4010 + }, + { + "epoch": 2.7393526405451447, + "grad_norm": 0.5720411539077759, + "learning_rate": 0.0002, + "loss": 1.584, + "step": 4020 + }, + { + "epoch": 2.746166950596252, + "grad_norm": 0.5529406070709229, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 4030 + }, + { + "epoch": 2.7529812606473594, + "grad_norm": 0.5229396820068359, + "learning_rate": 0.0002, + "loss": 1.6296, + "step": 4040 + }, + { + "epoch": 2.7597955706984667, + "grad_norm": 0.5270276069641113, + "learning_rate": 0.0002, + "loss": 1.5363, + "step": 4050 + }, + { + "epoch": 2.766609880749574, + "grad_norm": 0.48413026332855225, + "learning_rate": 0.0002, + "loss": 1.6305, + "step": 4060 + }, + { + "epoch": 2.7734241908006814, + "grad_norm": 0.5145403742790222, + "learning_rate": 0.0002, + "loss": 1.5488, + "step": 4070 + }, + { + "epoch": 2.7802385008517887, + "grad_norm": 0.48626071214675903, + "learning_rate": 0.0002, + "loss": 1.6023, + "step": 4080 + }, + { + "epoch": 2.787052810902896, + "grad_norm": 0.5018984079360962, + "learning_rate": 0.0002, + "loss": 1.6082, + "step": 4090 + }, + { + "epoch": 2.7938671209540034, + "grad_norm": 0.4881938695907593, + "learning_rate": 0.0002, + "loss": 1.7166, + "step": 4100 + }, + { + "epoch": 2.8006814310051107, + "grad_norm": 0.5151546001434326, + "learning_rate": 0.0002, + "loss": 1.515, + "step": 4110 + }, + { + "epoch": 2.807495741056218, + "grad_norm": 0.5109850764274597, + "learning_rate": 0.0002, + "loss": 1.6069, + "step": 4120 + }, + { + "epoch": 2.8143100511073254, + "grad_norm": 0.5109251141548157, + "learning_rate": 0.0002, + "loss": 1.6153, + "step": 4130 + }, + { + "epoch": 2.8211243611584327, + "grad_norm": 0.5025496482849121, + "learning_rate": 0.0002, + "loss": 1.6365, + "step": 4140 + }, + { + "epoch": 2.82793867120954, + "grad_norm": 0.49027004837989807, + "learning_rate": 0.0002, + "loss": 1.6292, + "step": 4150 + }, + { + "epoch": 2.8347529812606473, + "grad_norm": 0.4957362413406372, + "learning_rate": 0.0002, + "loss": 1.5591, + "step": 4160 + }, + { + "epoch": 2.8415672913117547, + "grad_norm": 0.5159927606582642, + "learning_rate": 0.0002, + "loss": 1.6759, + "step": 4170 + }, + { + "epoch": 2.848381601362862, + "grad_norm": 0.6040670871734619, + "learning_rate": 0.0002, + "loss": 1.577, + "step": 4180 + }, + { + "epoch": 2.8551959114139693, + "grad_norm": 0.5489953756332397, + "learning_rate": 0.0002, + "loss": 1.5295, + "step": 4190 + }, + { + "epoch": 2.8620102214650767, + "grad_norm": 0.5416634678840637, + "learning_rate": 0.0002, + "loss": 1.5909, + "step": 4200 + }, + { + "epoch": 2.868824531516184, + "grad_norm": 0.5278245210647583, + "learning_rate": 0.0002, + "loss": 1.6014, + "step": 4210 + }, + { + "epoch": 2.8756388415672913, + "grad_norm": 0.43382319808006287, + "learning_rate": 0.0002, + "loss": 1.4848, + "step": 4220 + }, + { + "epoch": 2.8824531516183987, + "grad_norm": 0.4724387228488922, + "learning_rate": 0.0002, + "loss": 1.6092, + "step": 4230 + }, + { + "epoch": 2.889267461669506, + "grad_norm": 0.49824780225753784, + "learning_rate": 0.0002, + "loss": 1.5748, + "step": 4240 + }, + { + "epoch": 2.8960817717206133, + "grad_norm": 0.5360262989997864, + "learning_rate": 0.0002, + "loss": 1.6163, + "step": 4250 + }, + { + "epoch": 2.9028960817717206, + "grad_norm": 0.49090322852134705, + "learning_rate": 0.0002, + "loss": 1.5934, + "step": 4260 + }, + { + "epoch": 2.909710391822828, + "grad_norm": 0.5613328218460083, + "learning_rate": 0.0002, + "loss": 1.624, + "step": 4270 + }, + { + "epoch": 2.9165247018739353, + "grad_norm": 0.4611356258392334, + "learning_rate": 0.0002, + "loss": 1.5713, + "step": 4280 + }, + { + "epoch": 2.9233390119250426, + "grad_norm": 0.550897479057312, + "learning_rate": 0.0002, + "loss": 1.5457, + "step": 4290 + }, + { + "epoch": 2.93015332197615, + "grad_norm": 0.5089612603187561, + "learning_rate": 0.0002, + "loss": 1.6225, + "step": 4300 + }, + { + "epoch": 2.9369676320272573, + "grad_norm": 0.5210904479026794, + "learning_rate": 0.0002, + "loss": 1.5897, + "step": 4310 + }, + { + "epoch": 2.9437819420783646, + "grad_norm": 0.5506424903869629, + "learning_rate": 0.0002, + "loss": 1.6198, + "step": 4320 + }, + { + "epoch": 2.950596252129472, + "grad_norm": 0.5118561387062073, + "learning_rate": 0.0002, + "loss": 1.6395, + "step": 4330 + }, + { + "epoch": 2.9574105621805793, + "grad_norm": 0.5034464597702026, + "learning_rate": 0.0002, + "loss": 1.704, + "step": 4340 + }, + { + "epoch": 2.9642248722316866, + "grad_norm": 0.5019990801811218, + "learning_rate": 0.0002, + "loss": 1.6314, + "step": 4350 + }, + { + "epoch": 2.971039182282794, + "grad_norm": 0.5423325300216675, + "learning_rate": 0.0002, + "loss": 1.6161, + "step": 4360 + }, + { + "epoch": 2.9778534923339013, + "grad_norm": 0.5287469625473022, + "learning_rate": 0.0002, + "loss": 1.6144, + "step": 4370 + }, + { + "epoch": 2.9846678023850086, + "grad_norm": 0.5206913352012634, + "learning_rate": 0.0002, + "loss": 1.5227, + "step": 4380 + }, + { + "epoch": 2.991482112436116, + "grad_norm": 0.5407394170761108, + "learning_rate": 0.0002, + "loss": 1.6026, + "step": 4390 + }, + { + "epoch": 2.9982964224872233, + "grad_norm": 0.5244600176811218, + "learning_rate": 0.0002, + "loss": 1.5908, + "step": 4400 + }, + { + "epoch": 2.9996592844974446, + "eval_loss": 1.8412635326385498, + "eval_runtime": 65.5583, + "eval_samples_per_second": 7.734, + "eval_steps_per_second": 0.976, + "step": 4402 + }, + { + "epoch": 3.0051107325383306, + "grad_norm": 0.5172150731086731, + "learning_rate": 0.0002, + "loss": 1.5157, + "step": 4410 + }, + { + "epoch": 3.011925042589438, + "grad_norm": 0.6882525086402893, + "learning_rate": 0.0002, + "loss": 1.398, + "step": 4420 + }, + { + "epoch": 3.0187393526405453, + "grad_norm": 0.6435003280639648, + "learning_rate": 0.0002, + "loss": 1.3884, + "step": 4430 + }, + { + "epoch": 3.0255536626916526, + "grad_norm": 0.7126057147979736, + "learning_rate": 0.0002, + "loss": 1.4493, + "step": 4440 + }, + { + "epoch": 3.03236797274276, + "grad_norm": 0.6634385585784912, + "learning_rate": 0.0002, + "loss": 1.4397, + "step": 4450 + }, + { + "epoch": 3.0391822827938673, + "grad_norm": 0.6468435525894165, + "learning_rate": 0.0002, + "loss": 1.3674, + "step": 4460 + }, + { + "epoch": 3.0459965928449746, + "grad_norm": 0.5690478086471558, + "learning_rate": 0.0002, + "loss": 1.4045, + "step": 4470 + }, + { + "epoch": 3.052810902896082, + "grad_norm": 0.7323708534240723, + "learning_rate": 0.0002, + "loss": 1.3742, + "step": 4480 + }, + { + "epoch": 3.0596252129471893, + "grad_norm": 0.6989302039146423, + "learning_rate": 0.0002, + "loss": 1.3281, + "step": 4490 + }, + { + "epoch": 3.0664395229982966, + "grad_norm": 0.6704450845718384, + "learning_rate": 0.0002, + "loss": 1.379, + "step": 4500 + }, + { + "epoch": 3.073253833049404, + "grad_norm": 0.769137442111969, + "learning_rate": 0.0002, + "loss": 1.4028, + "step": 4510 + }, + { + "epoch": 3.0800681431005112, + "grad_norm": 0.6556448936462402, + "learning_rate": 0.0002, + "loss": 1.4295, + "step": 4520 + }, + { + "epoch": 3.0868824531516186, + "grad_norm": 0.7143950462341309, + "learning_rate": 0.0002, + "loss": 1.2763, + "step": 4530 + }, + { + "epoch": 3.093696763202726, + "grad_norm": 0.7060510516166687, + "learning_rate": 0.0002, + "loss": 1.4806, + "step": 4540 + }, + { + "epoch": 3.1005110732538332, + "grad_norm": 0.6637526750564575, + "learning_rate": 0.0002, + "loss": 1.4097, + "step": 4550 + }, + { + "epoch": 3.1073253833049406, + "grad_norm": 0.822989284992218, + "learning_rate": 0.0002, + "loss": 1.4752, + "step": 4560 + }, + { + "epoch": 3.114139693356048, + "grad_norm": 0.5542152523994446, + "learning_rate": 0.0002, + "loss": 1.4994, + "step": 4570 + }, + { + "epoch": 3.1209540034071552, + "grad_norm": 0.7780306935310364, + "learning_rate": 0.0002, + "loss": 1.4306, + "step": 4580 + }, + { + "epoch": 3.1277683134582626, + "grad_norm": 0.7372637987136841, + "learning_rate": 0.0002, + "loss": 1.3909, + "step": 4590 + }, + { + "epoch": 3.1345826235093694, + "grad_norm": 0.6730087995529175, + "learning_rate": 0.0002, + "loss": 1.3989, + "step": 4600 + }, + { + "epoch": 3.1413969335604772, + "grad_norm": 0.6687398552894592, + "learning_rate": 0.0002, + "loss": 1.3591, + "step": 4610 + }, + { + "epoch": 3.148211243611584, + "grad_norm": 0.7645083665847778, + "learning_rate": 0.0002, + "loss": 1.436, + "step": 4620 + }, + { + "epoch": 3.155025553662692, + "grad_norm": 0.6770380139350891, + "learning_rate": 0.0002, + "loss": 1.3681, + "step": 4630 + }, + { + "epoch": 3.1618398637137988, + "grad_norm": 0.7200576663017273, + "learning_rate": 0.0002, + "loss": 1.405, + "step": 4640 + }, + { + "epoch": 3.168654173764906, + "grad_norm": 0.6663638949394226, + "learning_rate": 0.0002, + "loss": 1.3752, + "step": 4650 + }, + { + "epoch": 3.1754684838160134, + "grad_norm": 0.6602960228919983, + "learning_rate": 0.0002, + "loss": 1.4099, + "step": 4660 + }, + { + "epoch": 3.1822827938671208, + "grad_norm": 0.7838228344917297, + "learning_rate": 0.0002, + "loss": 1.4003, + "step": 4670 + }, + { + "epoch": 3.189097103918228, + "grad_norm": 0.7559184432029724, + "learning_rate": 0.0002, + "loss": 1.3853, + "step": 4680 + }, + { + "epoch": 3.1959114139693354, + "grad_norm": 0.6609814167022705, + "learning_rate": 0.0002, + "loss": 1.4516, + "step": 4690 + }, + { + "epoch": 3.2027257240204428, + "grad_norm": 0.8470419645309448, + "learning_rate": 0.0002, + "loss": 1.4464, + "step": 4700 + }, + { + "epoch": 3.20954003407155, + "grad_norm": 0.7282822728157043, + "learning_rate": 0.0002, + "loss": 1.428, + "step": 4710 + }, + { + "epoch": 3.2163543441226574, + "grad_norm": 0.6722773313522339, + "learning_rate": 0.0002, + "loss": 1.5261, + "step": 4720 + }, + { + "epoch": 3.2231686541737647, + "grad_norm": 0.7630265355110168, + "learning_rate": 0.0002, + "loss": 1.3809, + "step": 4730 + }, + { + "epoch": 3.229982964224872, + "grad_norm": 0.7102773785591125, + "learning_rate": 0.0002, + "loss": 1.42, + "step": 4740 + }, + { + "epoch": 3.2367972742759794, + "grad_norm": 0.7778299450874329, + "learning_rate": 0.0002, + "loss": 1.3529, + "step": 4750 + }, + { + "epoch": 3.2436115843270867, + "grad_norm": 0.7189921736717224, + "learning_rate": 0.0002, + "loss": 1.4715, + "step": 4760 + }, + { + "epoch": 3.250425894378194, + "grad_norm": 0.7708092331886292, + "learning_rate": 0.0002, + "loss": 1.4328, + "step": 4770 + }, + { + "epoch": 3.2572402044293014, + "grad_norm": 0.7208452224731445, + "learning_rate": 0.0002, + "loss": 1.3855, + "step": 4780 + }, + { + "epoch": 3.2640545144804087, + "grad_norm": 0.7220432758331299, + "learning_rate": 0.0002, + "loss": 1.3206, + "step": 4790 + }, + { + "epoch": 3.270868824531516, + "grad_norm": 0.7064954042434692, + "learning_rate": 0.0002, + "loss": 1.463, + "step": 4800 + }, + { + "epoch": 3.2776831345826234, + "grad_norm": 0.6618382334709167, + "learning_rate": 0.0002, + "loss": 1.4236, + "step": 4810 + }, + { + "epoch": 3.2844974446337307, + "grad_norm": 0.6854256391525269, + "learning_rate": 0.0002, + "loss": 1.3878, + "step": 4820 + }, + { + "epoch": 3.291311754684838, + "grad_norm": 0.6036319136619568, + "learning_rate": 0.0002, + "loss": 1.4236, + "step": 4830 + }, + { + "epoch": 3.2981260647359454, + "grad_norm": 0.714678943157196, + "learning_rate": 0.0002, + "loss": 1.4796, + "step": 4840 + }, + { + "epoch": 3.3049403747870527, + "grad_norm": 0.7218600511550903, + "learning_rate": 0.0002, + "loss": 1.4273, + "step": 4850 + }, + { + "epoch": 3.31175468483816, + "grad_norm": 0.7243074774742126, + "learning_rate": 0.0002, + "loss": 1.3915, + "step": 4860 + }, + { + "epoch": 3.3185689948892674, + "grad_norm": 0.7058630585670471, + "learning_rate": 0.0002, + "loss": 1.4088, + "step": 4870 + }, + { + "epoch": 3.3253833049403747, + "grad_norm": 0.7091076970100403, + "learning_rate": 0.0002, + "loss": 1.3837, + "step": 4880 + }, + { + "epoch": 3.332197614991482, + "grad_norm": 0.7375147342681885, + "learning_rate": 0.0002, + "loss": 1.4745, + "step": 4890 + }, + { + "epoch": 3.3390119250425894, + "grad_norm": 0.9426755309104919, + "learning_rate": 0.0002, + "loss": 1.4826, + "step": 4900 + }, + { + "epoch": 3.3458262350936967, + "grad_norm": 0.6508213877677917, + "learning_rate": 0.0002, + "loss": 1.369, + "step": 4910 + }, + { + "epoch": 3.352640545144804, + "grad_norm": 0.6945043206214905, + "learning_rate": 0.0002, + "loss": 1.3839, + "step": 4920 + }, + { + "epoch": 3.3594548551959114, + "grad_norm": 0.6335888504981995, + "learning_rate": 0.0002, + "loss": 1.3571, + "step": 4930 + }, + { + "epoch": 3.3662691652470187, + "grad_norm": 0.6947107911109924, + "learning_rate": 0.0002, + "loss": 1.4391, + "step": 4940 + }, + { + "epoch": 3.373083475298126, + "grad_norm": 0.8204733729362488, + "learning_rate": 0.0002, + "loss": 1.3885, + "step": 4950 + }, + { + "epoch": 3.3798977853492334, + "grad_norm": 0.7212244868278503, + "learning_rate": 0.0002, + "loss": 1.4886, + "step": 4960 + }, + { + "epoch": 3.3867120954003407, + "grad_norm": 0.6053042411804199, + "learning_rate": 0.0002, + "loss": 1.4581, + "step": 4970 + }, + { + "epoch": 3.393526405451448, + "grad_norm": 0.7820029854774475, + "learning_rate": 0.0002, + "loss": 1.3863, + "step": 4980 + }, + { + "epoch": 3.4003407155025553, + "grad_norm": 0.6866770386695862, + "learning_rate": 0.0002, + "loss": 1.4326, + "step": 4990 + }, + { + "epoch": 3.4071550255536627, + "grad_norm": 0.6652463674545288, + "learning_rate": 0.0002, + "loss": 1.4287, + "step": 5000 + }, + { + "epoch": 3.41396933560477, + "grad_norm": 1.1209032535552979, + "learning_rate": 0.0002, + "loss": 1.3667, + "step": 5010 + }, + { + "epoch": 3.4207836456558773, + "grad_norm": 0.8390814661979675, + "learning_rate": 0.0002, + "loss": 1.4461, + "step": 5020 + }, + { + "epoch": 3.4275979557069847, + "grad_norm": 0.7541858553886414, + "learning_rate": 0.0002, + "loss": 1.4556, + "step": 5030 + }, + { + "epoch": 3.434412265758092, + "grad_norm": 0.6902772784233093, + "learning_rate": 0.0002, + "loss": 1.4245, + "step": 5040 + }, + { + "epoch": 3.4412265758091993, + "grad_norm": 0.7070329785346985, + "learning_rate": 0.0002, + "loss": 1.3953, + "step": 5050 + }, + { + "epoch": 3.4480408858603067, + "grad_norm": 0.8075643181800842, + "learning_rate": 0.0002, + "loss": 1.3903, + "step": 5060 + }, + { + "epoch": 3.454855195911414, + "grad_norm": 0.7133861780166626, + "learning_rate": 0.0002, + "loss": 1.3929, + "step": 5070 + }, + { + "epoch": 3.4616695059625213, + "grad_norm": 0.6631823182106018, + "learning_rate": 0.0002, + "loss": 1.4632, + "step": 5080 + }, + { + "epoch": 3.4684838160136287, + "grad_norm": 0.673870325088501, + "learning_rate": 0.0002, + "loss": 1.4162, + "step": 5090 + }, + { + "epoch": 3.475298126064736, + "grad_norm": 0.6438634395599365, + "learning_rate": 0.0002, + "loss": 1.4247, + "step": 5100 + }, + { + "epoch": 3.4821124361158433, + "grad_norm": 0.7560495734214783, + "learning_rate": 0.0002, + "loss": 1.4421, + "step": 5110 + }, + { + "epoch": 3.4889267461669506, + "grad_norm": 0.6877814531326294, + "learning_rate": 0.0002, + "loss": 1.4125, + "step": 5120 + }, + { + "epoch": 3.495741056218058, + "grad_norm": 0.7031328678131104, + "learning_rate": 0.0002, + "loss": 1.4308, + "step": 5130 + }, + { + "epoch": 3.5025553662691653, + "grad_norm": 0.6797195672988892, + "learning_rate": 0.0002, + "loss": 1.3705, + "step": 5140 + }, + { + "epoch": 3.5093696763202726, + "grad_norm": 0.6766413450241089, + "learning_rate": 0.0002, + "loss": 1.4687, + "step": 5150 + }, + { + "epoch": 3.51618398637138, + "grad_norm": 0.666656494140625, + "learning_rate": 0.0002, + "loss": 1.4194, + "step": 5160 + }, + { + "epoch": 3.5229982964224873, + "grad_norm": 0.74996417760849, + "learning_rate": 0.0002, + "loss": 1.469, + "step": 5170 + }, + { + "epoch": 3.5298126064735946, + "grad_norm": 0.7370911836624146, + "learning_rate": 0.0002, + "loss": 1.4848, + "step": 5180 + }, + { + "epoch": 3.536626916524702, + "grad_norm": 0.9063456654548645, + "learning_rate": 0.0002, + "loss": 1.4523, + "step": 5190 + }, + { + "epoch": 3.5434412265758093, + "grad_norm": 0.6861422657966614, + "learning_rate": 0.0002, + "loss": 1.4726, + "step": 5200 + }, + { + "epoch": 3.5502555366269166, + "grad_norm": 0.7104039788246155, + "learning_rate": 0.0002, + "loss": 1.4803, + "step": 5210 + }, + { + "epoch": 3.557069846678024, + "grad_norm": 0.6578653454780579, + "learning_rate": 0.0002, + "loss": 1.4313, + "step": 5220 + }, + { + "epoch": 3.5638841567291313, + "grad_norm": 0.7336562275886536, + "learning_rate": 0.0002, + "loss": 1.4596, + "step": 5230 + }, + { + "epoch": 3.5706984667802386, + "grad_norm": 0.7163010835647583, + "learning_rate": 0.0002, + "loss": 1.4591, + "step": 5240 + }, + { + "epoch": 3.577512776831346, + "grad_norm": 0.8112391233444214, + "learning_rate": 0.0002, + "loss": 1.3814, + "step": 5250 + }, + { + "epoch": 3.5843270868824533, + "grad_norm": 0.7260391116142273, + "learning_rate": 0.0002, + "loss": 1.4249, + "step": 5260 + }, + { + "epoch": 3.5911413969335606, + "grad_norm": 0.7038731575012207, + "learning_rate": 0.0002, + "loss": 1.4249, + "step": 5270 + }, + { + "epoch": 3.597955706984668, + "grad_norm": 0.7864376902580261, + "learning_rate": 0.0002, + "loss": 1.4172, + "step": 5280 + }, + { + "epoch": 3.6047700170357753, + "grad_norm": 0.6968383193016052, + "learning_rate": 0.0002, + "loss": 1.4637, + "step": 5290 + }, + { + "epoch": 3.6115843270868826, + "grad_norm": 0.6726206541061401, + "learning_rate": 0.0002, + "loss": 1.5269, + "step": 5300 + }, + { + "epoch": 3.61839863713799, + "grad_norm": 0.6716854572296143, + "learning_rate": 0.0002, + "loss": 1.4199, + "step": 5310 + }, + { + "epoch": 3.6252129471890973, + "grad_norm": 0.7229742407798767, + "learning_rate": 0.0002, + "loss": 1.4686, + "step": 5320 + }, + { + "epoch": 3.6320272572402046, + "grad_norm": 0.7338683009147644, + "learning_rate": 0.0002, + "loss": 1.4441, + "step": 5330 + }, + { + "epoch": 3.638841567291312, + "grad_norm": 0.771672785282135, + "learning_rate": 0.0002, + "loss": 1.4116, + "step": 5340 + }, + { + "epoch": 3.645655877342419, + "grad_norm": 0.7024078369140625, + "learning_rate": 0.0002, + "loss": 1.4007, + "step": 5350 + }, + { + "epoch": 3.6524701873935266, + "grad_norm": 0.6847538352012634, + "learning_rate": 0.0002, + "loss": 1.4996, + "step": 5360 + }, + { + "epoch": 3.6592844974446335, + "grad_norm": 0.71802818775177, + "learning_rate": 0.0002, + "loss": 1.4111, + "step": 5370 + }, + { + "epoch": 3.6660988074957412, + "grad_norm": 0.78530353307724, + "learning_rate": 0.0002, + "loss": 1.4224, + "step": 5380 + }, + { + "epoch": 3.672913117546848, + "grad_norm": 0.7262226939201355, + "learning_rate": 0.0002, + "loss": 1.4582, + "step": 5390 + }, + { + "epoch": 3.679727427597956, + "grad_norm": 0.7608316540718079, + "learning_rate": 0.0002, + "loss": 1.4704, + "step": 5400 + }, + { + "epoch": 3.686541737649063, + "grad_norm": 0.6994926333427429, + "learning_rate": 0.0002, + "loss": 1.3742, + "step": 5410 + }, + { + "epoch": 3.6933560477001706, + "grad_norm": 0.7888479828834534, + "learning_rate": 0.0002, + "loss": 1.4738, + "step": 5420 + }, + { + "epoch": 3.7001703577512775, + "grad_norm": 0.7053858041763306, + "learning_rate": 0.0002, + "loss": 1.4213, + "step": 5430 + }, + { + "epoch": 3.7069846678023852, + "grad_norm": 0.7063165903091431, + "learning_rate": 0.0002, + "loss": 1.4988, + "step": 5440 + }, + { + "epoch": 3.713798977853492, + "grad_norm": 0.6603744626045227, + "learning_rate": 0.0002, + "loss": 1.4386, + "step": 5450 + }, + { + "epoch": 3.7206132879046, + "grad_norm": 0.7043602466583252, + "learning_rate": 0.0002, + "loss": 1.4695, + "step": 5460 + }, + { + "epoch": 3.7274275979557068, + "grad_norm": 0.7026081681251526, + "learning_rate": 0.0002, + "loss": 1.5051, + "step": 5470 + }, + { + "epoch": 3.7342419080068145, + "grad_norm": 0.7200090289115906, + "learning_rate": 0.0002, + "loss": 1.5613, + "step": 5480 + }, + { + "epoch": 3.7410562180579214, + "grad_norm": 0.7170904278755188, + "learning_rate": 0.0002, + "loss": 1.4182, + "step": 5490 + }, + { + "epoch": 3.747870528109029, + "grad_norm": 0.7489104866981506, + "learning_rate": 0.0002, + "loss": 1.4344, + "step": 5500 + }, + { + "epoch": 3.754684838160136, + "grad_norm": 0.6540989875793457, + "learning_rate": 0.0002, + "loss": 1.4911, + "step": 5510 + }, + { + "epoch": 3.761499148211244, + "grad_norm": 0.6654048562049866, + "learning_rate": 0.0002, + "loss": 1.4955, + "step": 5520 + }, + { + "epoch": 3.7683134582623508, + "grad_norm": 0.6577395796775818, + "learning_rate": 0.0002, + "loss": 1.4487, + "step": 5530 + }, + { + "epoch": 3.7751277683134585, + "grad_norm": 0.7762192487716675, + "learning_rate": 0.0002, + "loss": 1.4283, + "step": 5540 + }, + { + "epoch": 3.7819420783645654, + "grad_norm": 0.6336314678192139, + "learning_rate": 0.0002, + "loss": 1.4727, + "step": 5550 + }, + { + "epoch": 3.7887563884156727, + "grad_norm": 0.7098057866096497, + "learning_rate": 0.0002, + "loss": 1.4588, + "step": 5560 + }, + { + "epoch": 3.79557069846678, + "grad_norm": 0.7379715442657471, + "learning_rate": 0.0002, + "loss": 1.4679, + "step": 5570 + }, + { + "epoch": 3.8023850085178874, + "grad_norm": 0.6726924777030945, + "learning_rate": 0.0002, + "loss": 1.4633, + "step": 5580 + }, + { + "epoch": 3.8091993185689947, + "grad_norm": 1.1212009191513062, + "learning_rate": 0.0002, + "loss": 1.4751, + "step": 5590 + }, + { + "epoch": 3.816013628620102, + "grad_norm": 0.6503795981407166, + "learning_rate": 0.0002, + "loss": 1.4503, + "step": 5600 + }, + { + "epoch": 3.8228279386712094, + "grad_norm": 0.7041325569152832, + "learning_rate": 0.0002, + "loss": 1.4754, + "step": 5610 + }, + { + "epoch": 3.8296422487223167, + "grad_norm": 0.7962933778762817, + "learning_rate": 0.0002, + "loss": 1.4199, + "step": 5620 + }, + { + "epoch": 3.836456558773424, + "grad_norm": 0.6613591909408569, + "learning_rate": 0.0002, + "loss": 1.4672, + "step": 5630 + }, + { + "epoch": 3.8432708688245314, + "grad_norm": 0.7293516397476196, + "learning_rate": 0.0002, + "loss": 1.5688, + "step": 5640 + }, + { + "epoch": 3.8500851788756387, + "grad_norm": 0.7388607859611511, + "learning_rate": 0.0002, + "loss": 1.4149, + "step": 5650 + }, + { + "epoch": 3.856899488926746, + "grad_norm": 0.6440677642822266, + "learning_rate": 0.0002, + "loss": 1.4743, + "step": 5660 + }, + { + "epoch": 3.8637137989778534, + "grad_norm": 0.7729013562202454, + "learning_rate": 0.0002, + "loss": 1.5082, + "step": 5670 + }, + { + "epoch": 3.8705281090289607, + "grad_norm": 0.6696794033050537, + "learning_rate": 0.0002, + "loss": 1.4608, + "step": 5680 + }, + { + "epoch": 3.877342419080068, + "grad_norm": 0.7151781320571899, + "learning_rate": 0.0002, + "loss": 1.472, + "step": 5690 + }, + { + "epoch": 3.8841567291311754, + "grad_norm": 0.6736966371536255, + "learning_rate": 0.0002, + "loss": 1.4923, + "step": 5700 + }, + { + "epoch": 3.8909710391822827, + "grad_norm": 0.7444243431091309, + "learning_rate": 0.0002, + "loss": 1.4453, + "step": 5710 + }, + { + "epoch": 3.89778534923339, + "grad_norm": 0.6701464653015137, + "learning_rate": 0.0002, + "loss": 1.4562, + "step": 5720 + }, + { + "epoch": 3.9045996592844974, + "grad_norm": 0.7231952548027039, + "learning_rate": 0.0002, + "loss": 1.4478, + "step": 5730 + }, + { + "epoch": 3.9114139693356047, + "grad_norm": 0.831954300403595, + "learning_rate": 0.0002, + "loss": 1.4539, + "step": 5740 + }, + { + "epoch": 3.918228279386712, + "grad_norm": 0.7697733640670776, + "learning_rate": 0.0002, + "loss": 1.5122, + "step": 5750 + }, + { + "epoch": 3.9250425894378194, + "grad_norm": 0.6964395046234131, + "learning_rate": 0.0002, + "loss": 1.4552, + "step": 5760 + }, + { + "epoch": 3.9318568994889267, + "grad_norm": 0.6942925453186035, + "learning_rate": 0.0002, + "loss": 1.4688, + "step": 5770 + }, + { + "epoch": 3.938671209540034, + "grad_norm": 0.6491202712059021, + "learning_rate": 0.0002, + "loss": 1.4668, + "step": 5780 + }, + { + "epoch": 3.9454855195911414, + "grad_norm": 0.7004382610321045, + "learning_rate": 0.0002, + "loss": 1.4404, + "step": 5790 + }, + { + "epoch": 3.9522998296422487, + "grad_norm": 0.7337747812271118, + "learning_rate": 0.0002, + "loss": 1.5022, + "step": 5800 + }, + { + "epoch": 3.959114139693356, + "grad_norm": 0.6923640966415405, + "learning_rate": 0.0002, + "loss": 1.5314, + "step": 5810 + }, + { + "epoch": 3.9659284497444633, + "grad_norm": 0.6815266609191895, + "learning_rate": 0.0002, + "loss": 1.4811, + "step": 5820 + }, + { + "epoch": 3.9727427597955707, + "grad_norm": 0.6755654811859131, + "learning_rate": 0.0002, + "loss": 1.437, + "step": 5830 + }, + { + "epoch": 3.979557069846678, + "grad_norm": 0.6912487149238586, + "learning_rate": 0.0002, + "loss": 1.4277, + "step": 5840 + }, + { + "epoch": 3.9863713798977853, + "grad_norm": 0.6948044896125793, + "learning_rate": 0.0002, + "loss": 1.4654, + "step": 5850 + }, + { + "epoch": 3.9931856899488927, + "grad_norm": 0.6735455989837646, + "learning_rate": 0.0002, + "loss": 1.4779, + "step": 5860 + }, + { + "epoch": 4.0, + "grad_norm": 0.7005048990249634, + "learning_rate": 0.0002, + "loss": 1.5102, + "step": 5870 + }, + { + "epoch": 4.0, + "eval_loss": 1.923058032989502, + "eval_runtime": 58.9903, + "eval_samples_per_second": 8.595, + "eval_steps_per_second": 1.085, + "step": 5870 + }, + { + "epoch": 4.006814310051107, + "grad_norm": 0.809018075466156, + "learning_rate": 0.0002, + "loss": 1.2417, + "step": 5880 + }, + { + "epoch": 4.013628620102215, + "grad_norm": 0.9499403238296509, + "learning_rate": 0.0002, + "loss": 1.2874, + "step": 5890 + }, + { + "epoch": 4.0204429301533215, + "grad_norm": 0.7944574356079102, + "learning_rate": 0.0002, + "loss": 1.2245, + "step": 5900 + }, + { + "epoch": 4.027257240204429, + "grad_norm": 0.9501046538352966, + "learning_rate": 0.0002, + "loss": 1.2751, + "step": 5910 + }, + { + "epoch": 4.034071550255536, + "grad_norm": 0.8247923254966736, + "learning_rate": 0.0002, + "loss": 1.2706, + "step": 5920 + }, + { + "epoch": 4.040885860306644, + "grad_norm": 0.9358038902282715, + "learning_rate": 0.0002, + "loss": 1.2762, + "step": 5930 + }, + { + "epoch": 4.047700170357751, + "grad_norm": 1.0102452039718628, + "learning_rate": 0.0002, + "loss": 1.2953, + "step": 5940 + }, + { + "epoch": 4.054514480408859, + "grad_norm": 1.0248252153396606, + "learning_rate": 0.0002, + "loss": 1.216, + "step": 5950 + }, + { + "epoch": 4.0613287904599655, + "grad_norm": 1.0438553094863892, + "learning_rate": 0.0002, + "loss": 1.2115, + "step": 5960 + }, + { + "epoch": 4.068143100511073, + "grad_norm": 0.7964957356452942, + "learning_rate": 0.0002, + "loss": 1.2516, + "step": 5970 + }, + { + "epoch": 4.07495741056218, + "grad_norm": 0.9757015109062195, + "learning_rate": 0.0002, + "loss": 1.1555, + "step": 5980 + }, + { + "epoch": 4.081771720613288, + "grad_norm": 0.9157161116600037, + "learning_rate": 0.0002, + "loss": 1.2243, + "step": 5990 + }, + { + "epoch": 4.088586030664395, + "grad_norm": 0.9372851848602295, + "learning_rate": 0.0002, + "loss": 1.2481, + "step": 6000 + }, + { + "epoch": 4.095400340715503, + "grad_norm": 1.240779995918274, + "learning_rate": 0.0002, + "loss": 1.2091, + "step": 6010 + }, + { + "epoch": 4.1022146507666095, + "grad_norm": 0.8394840359687805, + "learning_rate": 0.0002, + "loss": 1.1727, + "step": 6020 + }, + { + "epoch": 4.109028960817717, + "grad_norm": 1.1081455945968628, + "learning_rate": 0.0002, + "loss": 1.2926, + "step": 6030 + }, + { + "epoch": 4.115843270868824, + "grad_norm": 0.9227745532989502, + "learning_rate": 0.0002, + "loss": 1.2417, + "step": 6040 + }, + { + "epoch": 4.122657580919932, + "grad_norm": 0.8487664461135864, + "learning_rate": 0.0002, + "loss": 1.1994, + "step": 6050 + }, + { + "epoch": 4.129471890971039, + "grad_norm": 0.9643339514732361, + "learning_rate": 0.0002, + "loss": 1.2378, + "step": 6060 + }, + { + "epoch": 4.136286201022147, + "grad_norm": 1.0296099185943604, + "learning_rate": 0.0002, + "loss": 1.2254, + "step": 6070 + }, + { + "epoch": 4.1431005110732535, + "grad_norm": 0.9534215927124023, + "learning_rate": 0.0002, + "loss": 1.2419, + "step": 6080 + }, + { + "epoch": 4.149914821124361, + "grad_norm": 0.9647086262702942, + "learning_rate": 0.0002, + "loss": 1.1849, + "step": 6090 + }, + { + "epoch": 4.156729131175468, + "grad_norm": 1.084836721420288, + "learning_rate": 0.0002, + "loss": 1.2713, + "step": 6100 + }, + { + "epoch": 4.163543441226576, + "grad_norm": 0.9315235614776611, + "learning_rate": 0.0002, + "loss": 1.1788, + "step": 6110 + }, + { + "epoch": 4.170357751277683, + "grad_norm": 0.9541679620742798, + "learning_rate": 0.0002, + "loss": 1.17, + "step": 6120 + }, + { + "epoch": 4.177172061328791, + "grad_norm": 0.9792100191116333, + "learning_rate": 0.0002, + "loss": 1.1407, + "step": 6130 + }, + { + "epoch": 4.1839863713798975, + "grad_norm": 1.065783143043518, + "learning_rate": 0.0002, + "loss": 1.2069, + "step": 6140 + }, + { + "epoch": 4.190800681431005, + "grad_norm": 1.036161184310913, + "learning_rate": 0.0002, + "loss": 1.2512, + "step": 6150 + }, + { + "epoch": 4.197614991482112, + "grad_norm": 0.8979679942131042, + "learning_rate": 0.0002, + "loss": 1.2371, + "step": 6160 + }, + { + "epoch": 4.20442930153322, + "grad_norm": 0.7584333419799805, + "learning_rate": 0.0002, + "loss": 1.2212, + "step": 6170 + }, + { + "epoch": 4.211243611584327, + "grad_norm": 1.1970131397247314, + "learning_rate": 0.0002, + "loss": 1.2128, + "step": 6180 + }, + { + "epoch": 4.218057921635435, + "grad_norm": 2.6447298526763916, + "learning_rate": 0.0002, + "loss": 1.1982, + "step": 6190 + }, + { + "epoch": 4.2248722316865415, + "grad_norm": 0.9357487559318542, + "learning_rate": 0.0002, + "loss": 1.2465, + "step": 6200 + }, + { + "epoch": 4.231686541737649, + "grad_norm": 0.9141183495521545, + "learning_rate": 0.0002, + "loss": 1.2963, + "step": 6210 + }, + { + "epoch": 4.238500851788756, + "grad_norm": 1.0606296062469482, + "learning_rate": 0.0002, + "loss": 1.1959, + "step": 6220 + }, + { + "epoch": 4.245315161839864, + "grad_norm": 0.9999088048934937, + "learning_rate": 0.0002, + "loss": 1.2629, + "step": 6230 + }, + { + "epoch": 4.252129471890971, + "grad_norm": 0.9469764232635498, + "learning_rate": 0.0002, + "loss": 1.1471, + "step": 6240 + }, + { + "epoch": 4.258943781942079, + "grad_norm": 1.1508198976516724, + "learning_rate": 0.0002, + "loss": 1.223, + "step": 6250 + }, + { + "epoch": 4.2657580919931855, + "grad_norm": 1.2576130628585815, + "learning_rate": 0.0002, + "loss": 1.2677, + "step": 6260 + }, + { + "epoch": 4.272572402044293, + "grad_norm": 0.9435968399047852, + "learning_rate": 0.0002, + "loss": 1.2216, + "step": 6270 + }, + { + "epoch": 4.2793867120954, + "grad_norm": 0.9290348887443542, + "learning_rate": 0.0002, + "loss": 1.2788, + "step": 6280 + }, + { + "epoch": 4.286201022146508, + "grad_norm": 0.9973701238632202, + "learning_rate": 0.0002, + "loss": 1.2631, + "step": 6290 + }, + { + "epoch": 4.293015332197615, + "grad_norm": 1.012855887413025, + "learning_rate": 0.0002, + "loss": 1.2276, + "step": 6300 + }, + { + "epoch": 4.2998296422487225, + "grad_norm": 0.8371705412864685, + "learning_rate": 0.0002, + "loss": 1.2115, + "step": 6310 + }, + { + "epoch": 4.306643952299829, + "grad_norm": 1.0867925882339478, + "learning_rate": 0.0002, + "loss": 1.2423, + "step": 6320 + }, + { + "epoch": 4.313458262350937, + "grad_norm": 0.9763767123222351, + "learning_rate": 0.0002, + "loss": 1.2262, + "step": 6330 + }, + { + "epoch": 4.320272572402044, + "grad_norm": 1.1844252347946167, + "learning_rate": 0.0002, + "loss": 1.2557, + "step": 6340 + }, + { + "epoch": 4.327086882453152, + "grad_norm": 0.8292830586433411, + "learning_rate": 0.0002, + "loss": 1.2635, + "step": 6350 + }, + { + "epoch": 4.333901192504259, + "grad_norm": 0.9351436495780945, + "learning_rate": 0.0002, + "loss": 1.262, + "step": 6360 + }, + { + "epoch": 4.3407155025553665, + "grad_norm": 1.0425835847854614, + "learning_rate": 0.0002, + "loss": 1.2678, + "step": 6370 + }, + { + "epoch": 4.347529812606473, + "grad_norm": 0.8894261121749878, + "learning_rate": 0.0002, + "loss": 1.2476, + "step": 6380 + }, + { + "epoch": 4.354344122657581, + "grad_norm": 0.9663366079330444, + "learning_rate": 0.0002, + "loss": 1.2965, + "step": 6390 + }, + { + "epoch": 4.361158432708688, + "grad_norm": 0.8915578126907349, + "learning_rate": 0.0002, + "loss": 1.2529, + "step": 6400 + }, + { + "epoch": 4.367972742759796, + "grad_norm": 1.0393000841140747, + "learning_rate": 0.0002, + "loss": 1.2573, + "step": 6410 + }, + { + "epoch": 4.374787052810903, + "grad_norm": 0.917398989200592, + "learning_rate": 0.0002, + "loss": 1.2254, + "step": 6420 + }, + { + "epoch": 4.3816013628620105, + "grad_norm": 1.0496646165847778, + "learning_rate": 0.0002, + "loss": 1.3115, + "step": 6430 + }, + { + "epoch": 4.388415672913117, + "grad_norm": 0.9349859356880188, + "learning_rate": 0.0002, + "loss": 1.2607, + "step": 6440 + }, + { + "epoch": 4.395229982964225, + "grad_norm": 1.0981004238128662, + "learning_rate": 0.0002, + "loss": 1.3414, + "step": 6450 + }, + { + "epoch": 4.402044293015332, + "grad_norm": 0.9794871807098389, + "learning_rate": 0.0002, + "loss": 1.2391, + "step": 6460 + }, + { + "epoch": 4.40885860306644, + "grad_norm": 0.9321421384811401, + "learning_rate": 0.0002, + "loss": 1.208, + "step": 6470 + }, + { + "epoch": 4.415672913117547, + "grad_norm": 0.9158342480659485, + "learning_rate": 0.0002, + "loss": 1.3398, + "step": 6480 + }, + { + "epoch": 4.4224872231686545, + "grad_norm": 0.9462087750434875, + "learning_rate": 0.0002, + "loss": 1.1832, + "step": 6490 + }, + { + "epoch": 4.429301533219761, + "grad_norm": 0.9740175604820251, + "learning_rate": 0.0002, + "loss": 1.2366, + "step": 6500 + }, + { + "epoch": 4.436115843270869, + "grad_norm": 0.8477463126182556, + "learning_rate": 0.0002, + "loss": 1.3074, + "step": 6510 + }, + { + "epoch": 4.442930153321976, + "grad_norm": 1.0296647548675537, + "learning_rate": 0.0002, + "loss": 1.2719, + "step": 6520 + }, + { + "epoch": 4.449744463373084, + "grad_norm": 0.9437751173973083, + "learning_rate": 0.0002, + "loss": 1.2647, + "step": 6530 + }, + { + "epoch": 4.456558773424191, + "grad_norm": 1.011192798614502, + "learning_rate": 0.0002, + "loss": 1.2043, + "step": 6540 + }, + { + "epoch": 4.4633730834752985, + "grad_norm": 0.8836222290992737, + "learning_rate": 0.0002, + "loss": 1.3673, + "step": 6550 + }, + { + "epoch": 4.470187393526405, + "grad_norm": 1.2799941301345825, + "learning_rate": 0.0002, + "loss": 1.3028, + "step": 6560 + }, + { + "epoch": 4.477001703577513, + "grad_norm": 0.925910472869873, + "learning_rate": 0.0002, + "loss": 1.2789, + "step": 6570 + }, + { + "epoch": 4.48381601362862, + "grad_norm": 0.957401692867279, + "learning_rate": 0.0002, + "loss": 1.2723, + "step": 6580 + }, + { + "epoch": 4.490630323679728, + "grad_norm": 1.0789544582366943, + "learning_rate": 0.0002, + "loss": 1.242, + "step": 6590 + }, + { + "epoch": 4.497444633730835, + "grad_norm": 0.8874586820602417, + "learning_rate": 0.0002, + "loss": 1.2553, + "step": 6600 + }, + { + "epoch": 4.504258943781942, + "grad_norm": 0.9394784569740295, + "learning_rate": 0.0002, + "loss": 1.2779, + "step": 6610 + }, + { + "epoch": 4.511073253833049, + "grad_norm": 1.029640793800354, + "learning_rate": 0.0002, + "loss": 1.2744, + "step": 6620 + }, + { + "epoch": 4.517887563884157, + "grad_norm": 0.9510841965675354, + "learning_rate": 0.0002, + "loss": 1.2634, + "step": 6630 + }, + { + "epoch": 4.524701873935264, + "grad_norm": 0.9992963671684265, + "learning_rate": 0.0002, + "loss": 1.2562, + "step": 6640 + }, + { + "epoch": 4.531516183986371, + "grad_norm": 0.9312878847122192, + "learning_rate": 0.0002, + "loss": 1.2942, + "step": 6650 + }, + { + "epoch": 4.538330494037479, + "grad_norm": 0.9406482577323914, + "learning_rate": 0.0002, + "loss": 1.2572, + "step": 6660 + }, + { + "epoch": 4.5451448040885865, + "grad_norm": 1.1058286428451538, + "learning_rate": 0.0002, + "loss": 1.2283, + "step": 6670 + }, + { + "epoch": 4.551959114139693, + "grad_norm": 0.9389635920524597, + "learning_rate": 0.0002, + "loss": 1.2391, + "step": 6680 + }, + { + "epoch": 4.5587734241908, + "grad_norm": 1.0356028079986572, + "learning_rate": 0.0002, + "loss": 1.2696, + "step": 6690 + }, + { + "epoch": 4.565587734241908, + "grad_norm": 0.9370909929275513, + "learning_rate": 0.0002, + "loss": 1.2935, + "step": 6700 + }, + { + "epoch": 4.572402044293016, + "grad_norm": 0.9917567372322083, + "learning_rate": 0.0002, + "loss": 1.2914, + "step": 6710 + }, + { + "epoch": 4.579216354344123, + "grad_norm": 0.9065384864807129, + "learning_rate": 0.0002, + "loss": 1.3318, + "step": 6720 + }, + { + "epoch": 4.5860306643952296, + "grad_norm": 1.3347833156585693, + "learning_rate": 0.0002, + "loss": 1.2909, + "step": 6730 + }, + { + "epoch": 4.592844974446337, + "grad_norm": 0.910632312297821, + "learning_rate": 0.0002, + "loss": 1.3322, + "step": 6740 + }, + { + "epoch": 4.599659284497445, + "grad_norm": 0.8874805569648743, + "learning_rate": 0.0002, + "loss": 1.2584, + "step": 6750 + }, + { + "epoch": 4.606473594548552, + "grad_norm": 0.9355664253234863, + "learning_rate": 0.0002, + "loss": 1.3173, + "step": 6760 + }, + { + "epoch": 4.613287904599659, + "grad_norm": 0.9360204339027405, + "learning_rate": 0.0002, + "loss": 1.3515, + "step": 6770 + }, + { + "epoch": 4.620102214650767, + "grad_norm": 0.9931750893592834, + "learning_rate": 0.0002, + "loss": 1.2326, + "step": 6780 + }, + { + "epoch": 4.626916524701874, + "grad_norm": 0.9195131063461304, + "learning_rate": 0.0002, + "loss": 1.2677, + "step": 6790 + }, + { + "epoch": 4.633730834752981, + "grad_norm": 0.9448373913764954, + "learning_rate": 0.0002, + "loss": 1.3417, + "step": 6800 + }, + { + "epoch": 4.640545144804088, + "grad_norm": 1.162890911102295, + "learning_rate": 0.0002, + "loss": 1.2658, + "step": 6810 + }, + { + "epoch": 4.647359454855196, + "grad_norm": 0.9739466905593872, + "learning_rate": 0.0002, + "loss": 1.2841, + "step": 6820 + }, + { + "epoch": 4.654173764906303, + "grad_norm": 0.9462909698486328, + "learning_rate": 0.0002, + "loss": 1.3068, + "step": 6830 + }, + { + "epoch": 4.660988074957411, + "grad_norm": 1.042639970779419, + "learning_rate": 0.0002, + "loss": 1.284, + "step": 6840 + }, + { + "epoch": 4.6678023850085175, + "grad_norm": 0.8910539150238037, + "learning_rate": 0.0002, + "loss": 1.3337, + "step": 6850 + }, + { + "epoch": 4.674616695059625, + "grad_norm": 1.0806447267532349, + "learning_rate": 0.0002, + "loss": 1.3025, + "step": 6860 + }, + { + "epoch": 4.681431005110732, + "grad_norm": 1.0054864883422852, + "learning_rate": 0.0002, + "loss": 1.2258, + "step": 6870 + }, + { + "epoch": 4.68824531516184, + "grad_norm": 0.7774158120155334, + "learning_rate": 0.0002, + "loss": 1.3261, + "step": 6880 + }, + { + "epoch": 4.695059625212947, + "grad_norm": 0.9729512333869934, + "learning_rate": 0.0002, + "loss": 1.2545, + "step": 6890 + }, + { + "epoch": 4.701873935264055, + "grad_norm": 1.2025411128997803, + "learning_rate": 0.0002, + "loss": 1.3251, + "step": 6900 + }, + { + "epoch": 4.7086882453151615, + "grad_norm": 1.1654069423675537, + "learning_rate": 0.0002, + "loss": 1.3418, + "step": 6910 + }, + { + "epoch": 4.715502555366269, + "grad_norm": 1.1501442193984985, + "learning_rate": 0.0002, + "loss": 1.3091, + "step": 6920 + }, + { + "epoch": 4.722316865417376, + "grad_norm": 1.1083979606628418, + "learning_rate": 0.0002, + "loss": 1.2627, + "step": 6930 + }, + { + "epoch": 4.729131175468484, + "grad_norm": 0.9431378841400146, + "learning_rate": 0.0002, + "loss": 1.2836, + "step": 6940 + }, + { + "epoch": 4.735945485519591, + "grad_norm": 0.9722502827644348, + "learning_rate": 0.0002, + "loss": 1.3381, + "step": 6950 + }, + { + "epoch": 4.742759795570699, + "grad_norm": 0.9094559550285339, + "learning_rate": 0.0002, + "loss": 1.3228, + "step": 6960 + }, + { + "epoch": 4.7495741056218055, + "grad_norm": 0.9918473958969116, + "learning_rate": 0.0002, + "loss": 1.3474, + "step": 6970 + }, + { + "epoch": 4.756388415672913, + "grad_norm": 0.9999690651893616, + "learning_rate": 0.0002, + "loss": 1.3352, + "step": 6980 + }, + { + "epoch": 4.76320272572402, + "grad_norm": 1.0453810691833496, + "learning_rate": 0.0002, + "loss": 1.3579, + "step": 6990 + }, + { + "epoch": 4.770017035775128, + "grad_norm": 1.0167806148529053, + "learning_rate": 0.0002, + "loss": 1.294, + "step": 7000 + }, + { + "epoch": 4.776831345826235, + "grad_norm": 0.8133894801139832, + "learning_rate": 0.0002, + "loss": 1.3247, + "step": 7010 + }, + { + "epoch": 4.783645655877343, + "grad_norm": 0.8000897765159607, + "learning_rate": 0.0002, + "loss": 1.2577, + "step": 7020 + }, + { + "epoch": 4.7904599659284495, + "grad_norm": 0.992080569267273, + "learning_rate": 0.0002, + "loss": 1.2802, + "step": 7030 + }, + { + "epoch": 4.797274275979557, + "grad_norm": 0.9824522137641907, + "learning_rate": 0.0002, + "loss": 1.3269, + "step": 7040 + }, + { + "epoch": 4.804088586030664, + "grad_norm": 0.9808870553970337, + "learning_rate": 0.0002, + "loss": 1.279, + "step": 7050 + }, + { + "epoch": 4.810902896081772, + "grad_norm": 0.9679701924324036, + "learning_rate": 0.0002, + "loss": 1.3342, + "step": 7060 + }, + { + "epoch": 4.817717206132879, + "grad_norm": 0.9895215034484863, + "learning_rate": 0.0002, + "loss": 1.2711, + "step": 7070 + }, + { + "epoch": 4.824531516183987, + "grad_norm": 1.052246332168579, + "learning_rate": 0.0002, + "loss": 1.3008, + "step": 7080 + }, + { + "epoch": 4.8313458262350935, + "grad_norm": 0.9243564605712891, + "learning_rate": 0.0002, + "loss": 1.2874, + "step": 7090 + }, + { + "epoch": 4.838160136286201, + "grad_norm": 0.9545369744300842, + "learning_rate": 0.0002, + "loss": 1.2835, + "step": 7100 + }, + { + "epoch": 4.844974446337308, + "grad_norm": 0.9655884504318237, + "learning_rate": 0.0002, + "loss": 1.31, + "step": 7110 + }, + { + "epoch": 4.851788756388416, + "grad_norm": 0.9708049893379211, + "learning_rate": 0.0002, + "loss": 1.2862, + "step": 7120 + }, + { + "epoch": 4.858603066439523, + "grad_norm": 1.0064880847930908, + "learning_rate": 0.0002, + "loss": 1.3425, + "step": 7130 + }, + { + "epoch": 4.8654173764906306, + "grad_norm": 0.939943790435791, + "learning_rate": 0.0002, + "loss": 1.2899, + "step": 7140 + }, + { + "epoch": 4.872231686541737, + "grad_norm": 1.0750784873962402, + "learning_rate": 0.0002, + "loss": 1.2887, + "step": 7150 + }, + { + "epoch": 4.879045996592845, + "grad_norm": 0.9708989262580872, + "learning_rate": 0.0002, + "loss": 1.3367, + "step": 7160 + }, + { + "epoch": 4.885860306643952, + "grad_norm": 1.0228253602981567, + "learning_rate": 0.0002, + "loss": 1.2797, + "step": 7170 + }, + { + "epoch": 4.89267461669506, + "grad_norm": 0.8963132500648499, + "learning_rate": 0.0002, + "loss": 1.2695, + "step": 7180 + }, + { + "epoch": 4.899488926746167, + "grad_norm": 0.9198015928268433, + "learning_rate": 0.0002, + "loss": 1.3473, + "step": 7190 + }, + { + "epoch": 4.9063032367972745, + "grad_norm": 1.099906086921692, + "learning_rate": 0.0002, + "loss": 1.2541, + "step": 7200 + }, + { + "epoch": 4.913117546848381, + "grad_norm": 1.0624815225601196, + "learning_rate": 0.0002, + "loss": 1.3188, + "step": 7210 + }, + { + "epoch": 4.919931856899489, + "grad_norm": 0.9688444137573242, + "learning_rate": 0.0002, + "loss": 1.3026, + "step": 7220 + }, + { + "epoch": 4.926746166950596, + "grad_norm": 0.867011547088623, + "learning_rate": 0.0002, + "loss": 1.3379, + "step": 7230 + }, + { + "epoch": 4.933560477001704, + "grad_norm": 0.9600282311439514, + "learning_rate": 0.0002, + "loss": 1.289, + "step": 7240 + }, + { + "epoch": 4.940374787052811, + "grad_norm": 0.8979372978210449, + "learning_rate": 0.0002, + "loss": 1.2751, + "step": 7250 + }, + { + "epoch": 4.9471890971039185, + "grad_norm": 0.951474130153656, + "learning_rate": 0.0002, + "loss": 1.3426, + "step": 7260 + }, + { + "epoch": 4.954003407155025, + "grad_norm": 0.824851393699646, + "learning_rate": 0.0002, + "loss": 1.2726, + "step": 7270 + }, + { + "epoch": 4.960817717206133, + "grad_norm": 1.2926591634750366, + "learning_rate": 0.0002, + "loss": 1.2679, + "step": 7280 + }, + { + "epoch": 4.96763202725724, + "grad_norm": 1.1057835817337036, + "learning_rate": 0.0002, + "loss": 1.2974, + "step": 7290 + }, + { + "epoch": 4.974446337308348, + "grad_norm": 0.9814816117286682, + "learning_rate": 0.0002, + "loss": 1.2275, + "step": 7300 + }, + { + "epoch": 4.981260647359455, + "grad_norm": 1.0251333713531494, + "learning_rate": 0.0002, + "loss": 1.3001, + "step": 7310 + }, + { + "epoch": 4.9880749574105625, + "grad_norm": 0.9748668074607849, + "learning_rate": 0.0002, + "loss": 1.3113, + "step": 7320 + }, + { + "epoch": 4.994889267461669, + "grad_norm": 0.8552228808403015, + "learning_rate": 0.0002, + "loss": 1.3595, + "step": 7330 + }, + { + "epoch": 4.999659284497445, + "eval_loss": 2.03971004486084, + "eval_runtime": 67.4144, + "eval_samples_per_second": 7.521, + "eval_steps_per_second": 0.949, + "step": 7337 + }, + { + "epoch": 5.001703577512777, + "grad_norm": 0.8210785388946533, + "learning_rate": 0.0002, + "loss": 1.2464, + "step": 7340 + }, + { + "epoch": 5.008517887563884, + "grad_norm": 1.2577511072158813, + "learning_rate": 0.0002, + "loss": 1.0356, + "step": 7350 + }, + { + "epoch": 5.015332197614992, + "grad_norm": 1.280604362487793, + "learning_rate": 0.0002, + "loss": 0.9944, + "step": 7360 + }, + { + "epoch": 5.022146507666099, + "grad_norm": 1.3985474109649658, + "learning_rate": 0.0002, + "loss": 1.0858, + "step": 7370 + }, + { + "epoch": 5.0289608177172065, + "grad_norm": 1.1621310710906982, + "learning_rate": 0.0002, + "loss": 1.0122, + "step": 7380 + }, + { + "epoch": 5.035775127768313, + "grad_norm": 1.3278541564941406, + "learning_rate": 0.0002, + "loss": 1.05, + "step": 7390 + }, + { + "epoch": 5.042589437819421, + "grad_norm": 1.1166491508483887, + "learning_rate": 0.0002, + "loss": 1.0237, + "step": 7400 + }, + { + "epoch": 5.049403747870528, + "grad_norm": 1.8087667226791382, + "learning_rate": 0.0002, + "loss": 1.0397, + "step": 7410 + }, + { + "epoch": 5.056218057921636, + "grad_norm": 1.1517921686172485, + "learning_rate": 0.0002, + "loss": 1.0191, + "step": 7420 + }, + { + "epoch": 5.063032367972743, + "grad_norm": 1.2875889539718628, + "learning_rate": 0.0002, + "loss": 1.025, + "step": 7430 + }, + { + "epoch": 5.0698466780238505, + "grad_norm": 1.199702262878418, + "learning_rate": 0.0002, + "loss": 1.043, + "step": 7440 + }, + { + "epoch": 5.076660988074957, + "grad_norm": 1.2912452220916748, + "learning_rate": 0.0002, + "loss": 1.0176, + "step": 7450 + }, + { + "epoch": 5.083475298126065, + "grad_norm": 1.1446452140808105, + "learning_rate": 0.0002, + "loss": 1.0042, + "step": 7460 + }, + { + "epoch": 5.090289608177172, + "grad_norm": 1.3625746965408325, + "learning_rate": 0.0002, + "loss": 1.047, + "step": 7470 + }, + { + "epoch": 5.09710391822828, + "grad_norm": 1.2116546630859375, + "learning_rate": 0.0002, + "loss": 1.052, + "step": 7480 + }, + { + "epoch": 5.103918228279387, + "grad_norm": 1.3896098136901855, + "learning_rate": 0.0002, + "loss": 1.1041, + "step": 7490 + }, + { + "epoch": 5.1107325383304945, + "grad_norm": 1.6265277862548828, + "learning_rate": 0.0002, + "loss": 1.0668, + "step": 7500 + }, + { + "epoch": 5.117546848381601, + "grad_norm": 1.1468392610549927, + "learning_rate": 0.0002, + "loss": 1.028, + "step": 7510 + }, + { + "epoch": 5.124361158432709, + "grad_norm": 1.2649329900741577, + "learning_rate": 0.0002, + "loss": 0.9915, + "step": 7520 + }, + { + "epoch": 5.131175468483816, + "grad_norm": 1.1866015195846558, + "learning_rate": 0.0002, + "loss": 1.0251, + "step": 7530 + }, + { + "epoch": 5.137989778534923, + "grad_norm": 1.1517255306243896, + "learning_rate": 0.0002, + "loss": 1.0626, + "step": 7540 + }, + { + "epoch": 5.144804088586031, + "grad_norm": 1.3475146293640137, + "learning_rate": 0.0002, + "loss": 1.0303, + "step": 7550 + }, + { + "epoch": 5.151618398637138, + "grad_norm": 1.1167018413543701, + "learning_rate": 0.0002, + "loss": 1.0456, + "step": 7560 + }, + { + "epoch": 5.158432708688245, + "grad_norm": 1.209572434425354, + "learning_rate": 0.0002, + "loss": 1.04, + "step": 7570 + }, + { + "epoch": 5.165247018739352, + "grad_norm": 1.3578280210494995, + "learning_rate": 0.0002, + "loss": 1.0533, + "step": 7580 + }, + { + "epoch": 5.17206132879046, + "grad_norm": 1.2447012662887573, + "learning_rate": 0.0002, + "loss": 1.0958, + "step": 7590 + }, + { + "epoch": 5.178875638841567, + "grad_norm": 1.3715848922729492, + "learning_rate": 0.0002, + "loss": 1.0521, + "step": 7600 + }, + { + "epoch": 5.185689948892675, + "grad_norm": 1.435860276222229, + "learning_rate": 0.0002, + "loss": 1.0556, + "step": 7610 + }, + { + "epoch": 5.1925042589437815, + "grad_norm": 1.4093858003616333, + "learning_rate": 0.0002, + "loss": 1.0504, + "step": 7620 + }, + { + "epoch": 5.199318568994889, + "grad_norm": 1.1747535467147827, + "learning_rate": 0.0002, + "loss": 1.083, + "step": 7630 + }, + { + "epoch": 5.206132879045996, + "grad_norm": 1.4704833030700684, + "learning_rate": 0.0002, + "loss": 1.048, + "step": 7640 + }, + { + "epoch": 5.212947189097104, + "grad_norm": 1.2270972728729248, + "learning_rate": 0.0002, + "loss": 0.9991, + "step": 7650 + }, + { + "epoch": 5.219761499148211, + "grad_norm": 1.2215691804885864, + "learning_rate": 0.0002, + "loss": 1.0738, + "step": 7660 + }, + { + "epoch": 5.226575809199319, + "grad_norm": 1.3641486167907715, + "learning_rate": 0.0002, + "loss": 1.0628, + "step": 7670 + }, + { + "epoch": 5.2333901192504255, + "grad_norm": 1.3532041311264038, + "learning_rate": 0.0002, + "loss": 1.1066, + "step": 7680 + }, + { + "epoch": 5.240204429301533, + "grad_norm": 1.2243095636367798, + "learning_rate": 0.0002, + "loss": 1.0209, + "step": 7690 + }, + { + "epoch": 5.24701873935264, + "grad_norm": 1.3644746541976929, + "learning_rate": 0.0002, + "loss": 1.0503, + "step": 7700 + }, + { + "epoch": 5.253833049403748, + "grad_norm": 1.18478262424469, + "learning_rate": 0.0002, + "loss": 1.0406, + "step": 7710 + }, + { + "epoch": 5.260647359454855, + "grad_norm": 1.2146114110946655, + "learning_rate": 0.0002, + "loss": 1.1023, + "step": 7720 + }, + { + "epoch": 5.267461669505963, + "grad_norm": 1.233984112739563, + "learning_rate": 0.0002, + "loss": 1.1528, + "step": 7730 + }, + { + "epoch": 5.2742759795570695, + "grad_norm": 1.3709665536880493, + "learning_rate": 0.0002, + "loss": 1.0681, + "step": 7740 + }, + { + "epoch": 5.281090289608177, + "grad_norm": 1.36055326461792, + "learning_rate": 0.0002, + "loss": 1.0195, + "step": 7750 + }, + { + "epoch": 5.287904599659284, + "grad_norm": 1.6232351064682007, + "learning_rate": 0.0002, + "loss": 1.0447, + "step": 7760 + }, + { + "epoch": 5.294718909710392, + "grad_norm": 1.3359960317611694, + "learning_rate": 0.0002, + "loss": 1.0627, + "step": 7770 + }, + { + "epoch": 5.301533219761499, + "grad_norm": 1.3815656900405884, + "learning_rate": 0.0002, + "loss": 1.1082, + "step": 7780 + }, + { + "epoch": 5.308347529812607, + "grad_norm": 1.1392076015472412, + "learning_rate": 0.0002, + "loss": 1.0891, + "step": 7790 + }, + { + "epoch": 5.3151618398637135, + "grad_norm": 1.3006905317306519, + "learning_rate": 0.0002, + "loss": 1.0364, + "step": 7800 + }, + { + "epoch": 5.321976149914821, + "grad_norm": 1.503645896911621, + "learning_rate": 0.0002, + "loss": 1.1005, + "step": 7810 + }, + { + "epoch": 5.328790459965928, + "grad_norm": 1.141939640045166, + "learning_rate": 0.0002, + "loss": 1.0075, + "step": 7820 + }, + { + "epoch": 5.335604770017036, + "grad_norm": 1.4654004573822021, + "learning_rate": 0.0002, + "loss": 1.0284, + "step": 7830 + }, + { + "epoch": 5.342419080068143, + "grad_norm": 1.4195219278335571, + "learning_rate": 0.0002, + "loss": 1.1185, + "step": 7840 + }, + { + "epoch": 5.349233390119251, + "grad_norm": 1.2354168891906738, + "learning_rate": 0.0002, + "loss": 1.0535, + "step": 7850 + }, + { + "epoch": 5.3560477001703575, + "grad_norm": 1.529862880706787, + "learning_rate": 0.0002, + "loss": 1.0923, + "step": 7860 + }, + { + "epoch": 5.362862010221465, + "grad_norm": 1.364678978919983, + "learning_rate": 0.0002, + "loss": 1.1005, + "step": 7870 + }, + { + "epoch": 5.369676320272572, + "grad_norm": 1.1010444164276123, + "learning_rate": 0.0002, + "loss": 1.1084, + "step": 7880 + }, + { + "epoch": 5.37649063032368, + "grad_norm": 1.1949712038040161, + "learning_rate": 0.0002, + "loss": 1.1225, + "step": 7890 + }, + { + "epoch": 5.383304940374787, + "grad_norm": 1.485922932624817, + "learning_rate": 0.0002, + "loss": 1.058, + "step": 7900 + }, + { + "epoch": 5.390119250425895, + "grad_norm": 1.0844227075576782, + "learning_rate": 0.0002, + "loss": 0.9894, + "step": 7910 + }, + { + "epoch": 5.3969335604770015, + "grad_norm": 1.3784468173980713, + "learning_rate": 0.0002, + "loss": 1.0418, + "step": 7920 + }, + { + "epoch": 5.403747870528109, + "grad_norm": 1.4771490097045898, + "learning_rate": 0.0002, + "loss": 1.0542, + "step": 7930 + }, + { + "epoch": 5.410562180579216, + "grad_norm": 1.2460103034973145, + "learning_rate": 0.0002, + "loss": 1.1265, + "step": 7940 + }, + { + "epoch": 5.417376490630324, + "grad_norm": 1.3047645092010498, + "learning_rate": 0.0002, + "loss": 1.096, + "step": 7950 + }, + { + "epoch": 5.424190800681431, + "grad_norm": 1.1396620273590088, + "learning_rate": 0.0002, + "loss": 1.0956, + "step": 7960 + }, + { + "epoch": 5.4310051107325386, + "grad_norm": 1.4193450212478638, + "learning_rate": 0.0002, + "loss": 1.0685, + "step": 7970 + }, + { + "epoch": 5.437819420783645, + "grad_norm": 1.2085850238800049, + "learning_rate": 0.0002, + "loss": 1.1347, + "step": 7980 + }, + { + "epoch": 5.444633730834753, + "grad_norm": 1.2721607685089111, + "learning_rate": 0.0002, + "loss": 1.0277, + "step": 7990 + }, + { + "epoch": 5.45144804088586, + "grad_norm": 1.4134020805358887, + "learning_rate": 0.0002, + "loss": 1.1316, + "step": 8000 + }, + { + "epoch": 5.458262350936968, + "grad_norm": 1.4283325672149658, + "learning_rate": 0.0002, + "loss": 1.0576, + "step": 8010 + }, + { + "epoch": 5.465076660988075, + "grad_norm": 1.3127079010009766, + "learning_rate": 0.0002, + "loss": 1.0505, + "step": 8020 + }, + { + "epoch": 5.4718909710391825, + "grad_norm": 1.2924352884292603, + "learning_rate": 0.0002, + "loss": 1.0812, + "step": 8030 + }, + { + "epoch": 5.478705281090289, + "grad_norm": 1.8000653982162476, + "learning_rate": 0.0002, + "loss": 1.1178, + "step": 8040 + }, + { + "epoch": 5.485519591141397, + "grad_norm": 1.1538785696029663, + "learning_rate": 0.0002, + "loss": 1.1205, + "step": 8050 + }, + { + "epoch": 5.492333901192504, + "grad_norm": 1.1173290014266968, + "learning_rate": 0.0002, + "loss": 1.1015, + "step": 8060 + }, + { + "epoch": 5.499148211243612, + "grad_norm": 1.1501243114471436, + "learning_rate": 0.0002, + "loss": 1.1597, + "step": 8070 + }, + { + "epoch": 5.505962521294719, + "grad_norm": 1.1335760354995728, + "learning_rate": 0.0002, + "loss": 1.1465, + "step": 8080 + }, + { + "epoch": 5.5127768313458265, + "grad_norm": 1.565274953842163, + "learning_rate": 0.0002, + "loss": 1.1005, + "step": 8090 + }, + { + "epoch": 5.519591141396933, + "grad_norm": 1.3415014743804932, + "learning_rate": 0.0002, + "loss": 1.1085, + "step": 8100 + }, + { + "epoch": 5.526405451448041, + "grad_norm": 1.2377240657806396, + "learning_rate": 0.0002, + "loss": 1.1166, + "step": 8110 + }, + { + "epoch": 5.533219761499148, + "grad_norm": 1.3333637714385986, + "learning_rate": 0.0002, + "loss": 1.0766, + "step": 8120 + }, + { + "epoch": 5.540034071550256, + "grad_norm": 1.2620662450790405, + "learning_rate": 0.0002, + "loss": 1.1515, + "step": 8130 + }, + { + "epoch": 5.546848381601363, + "grad_norm": 1.2806652784347534, + "learning_rate": 0.0002, + "loss": 1.0839, + "step": 8140 + }, + { + "epoch": 5.5536626916524705, + "grad_norm": 1.2057335376739502, + "learning_rate": 0.0002, + "loss": 1.1221, + "step": 8150 + }, + { + "epoch": 5.560477001703577, + "grad_norm": 1.411726951599121, + "learning_rate": 0.0002, + "loss": 1.1292, + "step": 8160 + }, + { + "epoch": 5.567291311754685, + "grad_norm": 1.381104588508606, + "learning_rate": 0.0002, + "loss": 1.0887, + "step": 8170 + }, + { + "epoch": 5.574105621805792, + "grad_norm": 1.3449294567108154, + "learning_rate": 0.0002, + "loss": 1.1317, + "step": 8180 + }, + { + "epoch": 5.5809199318569, + "grad_norm": 1.2791016101837158, + "learning_rate": 0.0002, + "loss": 1.1392, + "step": 8190 + }, + { + "epoch": 5.587734241908007, + "grad_norm": 1.276891827583313, + "learning_rate": 0.0002, + "loss": 1.0972, + "step": 8200 + }, + { + "epoch": 5.5945485519591145, + "grad_norm": 1.3951541185379028, + "learning_rate": 0.0002, + "loss": 1.1001, + "step": 8210 + }, + { + "epoch": 5.601362862010221, + "grad_norm": 1.4167890548706055, + "learning_rate": 0.0002, + "loss": 1.0993, + "step": 8220 + }, + { + "epoch": 5.608177172061329, + "grad_norm": 1.4388375282287598, + "learning_rate": 0.0002, + "loss": 1.0826, + "step": 8230 + }, + { + "epoch": 5.614991482112436, + "grad_norm": 1.210157036781311, + "learning_rate": 0.0002, + "loss": 1.1941, + "step": 8240 + }, + { + "epoch": 5.621805792163544, + "grad_norm": 1.0557862520217896, + "learning_rate": 0.0002, + "loss": 1.0833, + "step": 8250 + }, + { + "epoch": 5.628620102214651, + "grad_norm": 1.2913990020751953, + "learning_rate": 0.0002, + "loss": 1.1197, + "step": 8260 + }, + { + "epoch": 5.6354344122657585, + "grad_norm": 1.2204737663269043, + "learning_rate": 0.0002, + "loss": 1.0346, + "step": 8270 + }, + { + "epoch": 5.642248722316865, + "grad_norm": 1.57016921043396, + "learning_rate": 0.0002, + "loss": 1.1429, + "step": 8280 + }, + { + "epoch": 5.649063032367973, + "grad_norm": 1.0117967128753662, + "learning_rate": 0.0002, + "loss": 1.0988, + "step": 8290 + }, + { + "epoch": 5.65587734241908, + "grad_norm": 1.3195525407791138, + "learning_rate": 0.0002, + "loss": 1.0786, + "step": 8300 + }, + { + "epoch": 5.662691652470187, + "grad_norm": 1.2566497325897217, + "learning_rate": 0.0002, + "loss": 1.0618, + "step": 8310 + }, + { + "epoch": 5.669505962521295, + "grad_norm": 1.1446818113327026, + "learning_rate": 0.0002, + "loss": 1.1635, + "step": 8320 + }, + { + "epoch": 5.6763202725724025, + "grad_norm": 1.2928680181503296, + "learning_rate": 0.0002, + "loss": 1.2201, + "step": 8330 + }, + { + "epoch": 5.683134582623509, + "grad_norm": 1.2823996543884277, + "learning_rate": 0.0002, + "loss": 1.1488, + "step": 8340 + }, + { + "epoch": 5.689948892674616, + "grad_norm": 1.1523874998092651, + "learning_rate": 0.0002, + "loss": 1.0686, + "step": 8350 + }, + { + "epoch": 5.696763202725724, + "grad_norm": 1.0819287300109863, + "learning_rate": 0.0002, + "loss": 1.0938, + "step": 8360 + }, + { + "epoch": 5.703577512776832, + "grad_norm": 1.2384417057037354, + "learning_rate": 0.0002, + "loss": 1.167, + "step": 8370 + }, + { + "epoch": 5.710391822827939, + "grad_norm": 1.1733224391937256, + "learning_rate": 0.0002, + "loss": 1.1136, + "step": 8380 + }, + { + "epoch": 5.7172061328790456, + "grad_norm": 1.3173418045043945, + "learning_rate": 0.0002, + "loss": 1.1041, + "step": 8390 + }, + { + "epoch": 5.724020442930153, + "grad_norm": 1.285880446434021, + "learning_rate": 0.0002, + "loss": 1.1014, + "step": 8400 + }, + { + "epoch": 5.730834752981261, + "grad_norm": 1.1404874324798584, + "learning_rate": 0.0002, + "loss": 1.1161, + "step": 8410 + }, + { + "epoch": 5.737649063032368, + "grad_norm": 1.2432540655136108, + "learning_rate": 0.0002, + "loss": 1.192, + "step": 8420 + }, + { + "epoch": 5.744463373083475, + "grad_norm": 1.2432233095169067, + "learning_rate": 0.0002, + "loss": 1.1702, + "step": 8430 + }, + { + "epoch": 5.751277683134583, + "grad_norm": 1.154496669769287, + "learning_rate": 0.0002, + "loss": 1.1357, + "step": 8440 + }, + { + "epoch": 5.75809199318569, + "grad_norm": 1.3301030397415161, + "learning_rate": 0.0002, + "loss": 1.1706, + "step": 8450 + }, + { + "epoch": 5.764906303236797, + "grad_norm": 1.243760347366333, + "learning_rate": 0.0002, + "loss": 1.2052, + "step": 8460 + }, + { + "epoch": 5.771720613287904, + "grad_norm": 1.4083361625671387, + "learning_rate": 0.0002, + "loss": 1.1035, + "step": 8470 + }, + { + "epoch": 5.778534923339012, + "grad_norm": 1.5662120580673218, + "learning_rate": 0.0002, + "loss": 1.1362, + "step": 8480 + }, + { + "epoch": 5.78534923339012, + "grad_norm": 1.2111139297485352, + "learning_rate": 0.0002, + "loss": 1.1578, + "step": 8490 + }, + { + "epoch": 5.792163543441227, + "grad_norm": 1.2776305675506592, + "learning_rate": 0.0002, + "loss": 1.1333, + "step": 8500 + }, + { + "epoch": 5.7989778534923335, + "grad_norm": 1.1777727603912354, + "learning_rate": 0.0002, + "loss": 1.1439, + "step": 8510 + }, + { + "epoch": 5.805792163543441, + "grad_norm": 1.1696112155914307, + "learning_rate": 0.0002, + "loss": 1.0859, + "step": 8520 + }, + { + "epoch": 5.812606473594548, + "grad_norm": 1.137397289276123, + "learning_rate": 0.0002, + "loss": 1.162, + "step": 8530 + }, + { + "epoch": 5.819420783645656, + "grad_norm": 1.3182098865509033, + "learning_rate": 0.0002, + "loss": 1.2099, + "step": 8540 + }, + { + "epoch": 5.826235093696763, + "grad_norm": 1.359756588935852, + "learning_rate": 0.0002, + "loss": 1.1427, + "step": 8550 + }, + { + "epoch": 5.833049403747871, + "grad_norm": 1.4118162393569946, + "learning_rate": 0.0002, + "loss": 1.1714, + "step": 8560 + }, + { + "epoch": 5.8398637137989775, + "grad_norm": 1.1899290084838867, + "learning_rate": 0.0002, + "loss": 1.1758, + "step": 8570 + }, + { + "epoch": 5.846678023850085, + "grad_norm": 1.1764532327651978, + "learning_rate": 0.0002, + "loss": 1.1511, + "step": 8580 + }, + { + "epoch": 5.853492333901192, + "grad_norm": 1.33274245262146, + "learning_rate": 0.0002, + "loss": 1.1633, + "step": 8590 + }, + { + "epoch": 5.8603066439523, + "grad_norm": 1.2571861743927002, + "learning_rate": 0.0002, + "loss": 1.1092, + "step": 8600 + }, + { + "epoch": 5.867120954003407, + "grad_norm": 1.3523616790771484, + "learning_rate": 0.0002, + "loss": 1.1137, + "step": 8610 + }, + { + "epoch": 5.873935264054515, + "grad_norm": 1.3556902408599854, + "learning_rate": 0.0002, + "loss": 1.2442, + "step": 8620 + }, + { + "epoch": 5.8807495741056215, + "grad_norm": 1.2864879369735718, + "learning_rate": 0.0002, + "loss": 1.0967, + "step": 8630 + }, + { + "epoch": 5.887563884156729, + "grad_norm": 1.2872768640518188, + "learning_rate": 0.0002, + "loss": 1.1491, + "step": 8640 + }, + { + "epoch": 5.894378194207836, + "grad_norm": 1.1446053981781006, + "learning_rate": 0.0002, + "loss": 1.1003, + "step": 8650 + }, + { + "epoch": 5.901192504258944, + "grad_norm": 1.292615532875061, + "learning_rate": 0.0002, + "loss": 1.1095, + "step": 8660 + }, + { + "epoch": 5.908006814310051, + "grad_norm": 1.190891981124878, + "learning_rate": 0.0002, + "loss": 1.2009, + "step": 8670 + }, + { + "epoch": 5.914821124361159, + "grad_norm": 1.330273985862732, + "learning_rate": 0.0002, + "loss": 1.1386, + "step": 8680 + }, + { + "epoch": 5.9216354344122655, + "grad_norm": 1.41121244430542, + "learning_rate": 0.0002, + "loss": 1.1874, + "step": 8690 + }, + { + "epoch": 5.928449744463373, + "grad_norm": 1.1360729932785034, + "learning_rate": 0.0002, + "loss": 1.1573, + "step": 8700 + }, + { + "epoch": 5.93526405451448, + "grad_norm": 1.2220772504806519, + "learning_rate": 0.0002, + "loss": 1.115, + "step": 8710 + }, + { + "epoch": 5.942078364565588, + "grad_norm": 1.1077110767364502, + "learning_rate": 0.0002, + "loss": 1.1696, + "step": 8720 + }, + { + "epoch": 5.948892674616695, + "grad_norm": 1.3632500171661377, + "learning_rate": 0.0002, + "loss": 1.1443, + "step": 8730 + }, + { + "epoch": 5.955706984667803, + "grad_norm": 1.4695830345153809, + "learning_rate": 0.0002, + "loss": 1.1474, + "step": 8740 + }, + { + "epoch": 5.9625212947189095, + "grad_norm": 1.217741847038269, + "learning_rate": 0.0002, + "loss": 1.1825, + "step": 8750 + }, + { + "epoch": 5.969335604770017, + "grad_norm": 1.0386874675750732, + "learning_rate": 0.0002, + "loss": 1.1495, + "step": 8760 + }, + { + "epoch": 5.976149914821124, + "grad_norm": 1.2067872285842896, + "learning_rate": 0.0002, + "loss": 1.1146, + "step": 8770 + }, + { + "epoch": 5.982964224872232, + "grad_norm": 1.3842018842697144, + "learning_rate": 0.0002, + "loss": 1.1987, + "step": 8780 + }, + { + "epoch": 5.989778534923339, + "grad_norm": 1.4584033489227295, + "learning_rate": 0.0002, + "loss": 1.2147, + "step": 8790 + }, + { + "epoch": 5.996592844974447, + "grad_norm": 1.1912888288497925, + "learning_rate": 0.0002, + "loss": 1.2078, + "step": 8800 + }, + { + "epoch": 6.0, + "eval_loss": 2.261807441711426, + "eval_runtime": 68.1125, + "eval_samples_per_second": 7.444, + "eval_steps_per_second": 0.94, + "step": 8805 + }, + { + "epoch": 6.003407155025553, + "grad_norm": 1.1715940237045288, + "learning_rate": 0.0002, + "loss": 1.004, + "step": 8810 + }, + { + "epoch": 6.010221465076661, + "grad_norm": 1.6573960781097412, + "learning_rate": 0.0002, + "loss": 0.8665, + "step": 8820 + }, + { + "epoch": 6.017035775127768, + "grad_norm": 1.2845953702926636, + "learning_rate": 0.0002, + "loss": 0.8866, + "step": 8830 + }, + { + "epoch": 6.023850085178876, + "grad_norm": 1.526754379272461, + "learning_rate": 0.0002, + "loss": 0.8528, + "step": 8840 + }, + { + "epoch": 6.030664395229983, + "grad_norm": 1.4536073207855225, + "learning_rate": 0.0002, + "loss": 0.8555, + "step": 8850 + }, + { + "epoch": 6.0374787052810905, + "grad_norm": 1.68099045753479, + "learning_rate": 0.0002, + "loss": 0.8839, + "step": 8860 + }, + { + "epoch": 6.044293015332197, + "grad_norm": 1.485777497291565, + "learning_rate": 0.0002, + "loss": 0.8538, + "step": 8870 + }, + { + "epoch": 6.051107325383305, + "grad_norm": 1.5084402561187744, + "learning_rate": 0.0002, + "loss": 0.8534, + "step": 8880 + }, + { + "epoch": 6.057921635434412, + "grad_norm": 1.3901145458221436, + "learning_rate": 0.0002, + "loss": 0.8587, + "step": 8890 + }, + { + "epoch": 6.06473594548552, + "grad_norm": 1.528954267501831, + "learning_rate": 0.0002, + "loss": 0.8625, + "step": 8900 + }, + { + "epoch": 6.071550255536627, + "grad_norm": 1.6869531869888306, + "learning_rate": 0.0002, + "loss": 0.9115, + "step": 8910 + }, + { + "epoch": 6.0783645655877345, + "grad_norm": 1.4149913787841797, + "learning_rate": 0.0002, + "loss": 0.8817, + "step": 8920 + }, + { + "epoch": 6.085178875638841, + "grad_norm": 1.6853618621826172, + "learning_rate": 0.0002, + "loss": 0.8734, + "step": 8930 + }, + { + "epoch": 6.091993185689949, + "grad_norm": 1.694443702697754, + "learning_rate": 0.0002, + "loss": 0.8836, + "step": 8940 + }, + { + "epoch": 6.098807495741056, + "grad_norm": 2.1037111282348633, + "learning_rate": 0.0002, + "loss": 0.9144, + "step": 8950 + }, + { + "epoch": 6.105621805792164, + "grad_norm": 2.1236703395843506, + "learning_rate": 0.0002, + "loss": 0.8296, + "step": 8960 + }, + { + "epoch": 6.112436115843271, + "grad_norm": 1.6621695756912231, + "learning_rate": 0.0002, + "loss": 0.8451, + "step": 8970 + }, + { + "epoch": 6.1192504258943785, + "grad_norm": 1.5390307903289795, + "learning_rate": 0.0002, + "loss": 0.8423, + "step": 8980 + }, + { + "epoch": 6.126064735945485, + "grad_norm": 1.7841306924819946, + "learning_rate": 0.0002, + "loss": 0.8829, + "step": 8990 + }, + { + "epoch": 6.132879045996593, + "grad_norm": 1.8420580625534058, + "learning_rate": 0.0002, + "loss": 0.8872, + "step": 9000 + }, + { + "epoch": 6.1396933560477, + "grad_norm": 1.8198356628417969, + "learning_rate": 0.0002, + "loss": 0.9411, + "step": 9010 + }, + { + "epoch": 6.146507666098808, + "grad_norm": 1.6955933570861816, + "learning_rate": 0.0002, + "loss": 0.8921, + "step": 9020 + }, + { + "epoch": 6.153321976149915, + "grad_norm": 1.5072602033615112, + "learning_rate": 0.0002, + "loss": 0.9241, + "step": 9030 + }, + { + "epoch": 6.1601362862010225, + "grad_norm": 1.63434898853302, + "learning_rate": 0.0002, + "loss": 0.8643, + "step": 9040 + }, + { + "epoch": 6.166950596252129, + "grad_norm": 1.3761866092681885, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 9050 + }, + { + "epoch": 6.173764906303237, + "grad_norm": 1.7027268409729004, + "learning_rate": 0.0002, + "loss": 0.8136, + "step": 9060 + }, + { + "epoch": 6.180579216354344, + "grad_norm": 1.3534049987792969, + "learning_rate": 0.0002, + "loss": 0.8333, + "step": 9070 + }, + { + "epoch": 6.187393526405452, + "grad_norm": 1.4437154531478882, + "learning_rate": 0.0002, + "loss": 0.847, + "step": 9080 + }, + { + "epoch": 6.194207836456559, + "grad_norm": 1.4449656009674072, + "learning_rate": 0.0002, + "loss": 0.9169, + "step": 9090 + }, + { + "epoch": 6.2010221465076665, + "grad_norm": 1.5854601860046387, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 9100 + }, + { + "epoch": 6.207836456558773, + "grad_norm": 1.5987509489059448, + "learning_rate": 0.0002, + "loss": 0.8801, + "step": 9110 + }, + { + "epoch": 6.214650766609881, + "grad_norm": 1.6309672594070435, + "learning_rate": 0.0002, + "loss": 0.9077, + "step": 9120 + }, + { + "epoch": 6.221465076660988, + "grad_norm": 1.526936411857605, + "learning_rate": 0.0002, + "loss": 0.8802, + "step": 9130 + }, + { + "epoch": 6.228279386712096, + "grad_norm": 1.4649606943130493, + "learning_rate": 0.0002, + "loss": 0.8858, + "step": 9140 + }, + { + "epoch": 6.235093696763203, + "grad_norm": 1.589350700378418, + "learning_rate": 0.0002, + "loss": 0.9414, + "step": 9150 + }, + { + "epoch": 6.2419080068143105, + "grad_norm": 1.655668020248413, + "learning_rate": 0.0002, + "loss": 0.9001, + "step": 9160 + }, + { + "epoch": 6.248722316865417, + "grad_norm": 1.5296401977539062, + "learning_rate": 0.0002, + "loss": 0.9879, + "step": 9170 + }, + { + "epoch": 6.255536626916525, + "grad_norm": 1.5857278108596802, + "learning_rate": 0.0002, + "loss": 0.8908, + "step": 9180 + }, + { + "epoch": 6.262350936967632, + "grad_norm": 1.7779686450958252, + "learning_rate": 0.0002, + "loss": 0.9329, + "step": 9190 + }, + { + "epoch": 6.269165247018739, + "grad_norm": 1.588886022567749, + "learning_rate": 0.0002, + "loss": 0.9683, + "step": 9200 + }, + { + "epoch": 6.275979557069847, + "grad_norm": 1.3818320035934448, + "learning_rate": 0.0002, + "loss": 0.9091, + "step": 9210 + }, + { + "epoch": 6.2827938671209544, + "grad_norm": 1.6675978899002075, + "learning_rate": 0.0002, + "loss": 0.9003, + "step": 9220 + }, + { + "epoch": 6.289608177172061, + "grad_norm": 1.5672610998153687, + "learning_rate": 0.0002, + "loss": 0.9125, + "step": 9230 + }, + { + "epoch": 6.296422487223168, + "grad_norm": 1.4558004140853882, + "learning_rate": 0.0002, + "loss": 0.9083, + "step": 9240 + }, + { + "epoch": 6.303236797274276, + "grad_norm": 1.5393446683883667, + "learning_rate": 0.0002, + "loss": 0.9362, + "step": 9250 + }, + { + "epoch": 6.310051107325384, + "grad_norm": 1.4367083311080933, + "learning_rate": 0.0002, + "loss": 0.8807, + "step": 9260 + }, + { + "epoch": 6.316865417376491, + "grad_norm": 1.5045381784439087, + "learning_rate": 0.0002, + "loss": 0.9203, + "step": 9270 + }, + { + "epoch": 6.3236797274275975, + "grad_norm": 1.8604016304016113, + "learning_rate": 0.0002, + "loss": 0.9239, + "step": 9280 + }, + { + "epoch": 6.330494037478705, + "grad_norm": 1.4863131046295166, + "learning_rate": 0.0002, + "loss": 0.9644, + "step": 9290 + }, + { + "epoch": 6.337308347529812, + "grad_norm": 1.511121392250061, + "learning_rate": 0.0002, + "loss": 0.9052, + "step": 9300 + }, + { + "epoch": 6.34412265758092, + "grad_norm": 1.6979162693023682, + "learning_rate": 0.0002, + "loss": 0.8609, + "step": 9310 + }, + { + "epoch": 6.350936967632027, + "grad_norm": 1.6060494184494019, + "learning_rate": 0.0002, + "loss": 0.953, + "step": 9320 + }, + { + "epoch": 6.357751277683135, + "grad_norm": 1.6572561264038086, + "learning_rate": 0.0002, + "loss": 0.9552, + "step": 9330 + }, + { + "epoch": 6.3645655877342415, + "grad_norm": 1.6706757545471191, + "learning_rate": 0.0002, + "loss": 0.9201, + "step": 9340 + }, + { + "epoch": 6.371379897785349, + "grad_norm": 1.620836615562439, + "learning_rate": 0.0002, + "loss": 0.8693, + "step": 9350 + }, + { + "epoch": 6.378194207836456, + "grad_norm": 1.482940673828125, + "learning_rate": 0.0002, + "loss": 0.9281, + "step": 9360 + }, + { + "epoch": 6.385008517887564, + "grad_norm": 1.3969961404800415, + "learning_rate": 0.0002, + "loss": 0.9026, + "step": 9370 + }, + { + "epoch": 6.391822827938671, + "grad_norm": 1.611212134361267, + "learning_rate": 0.0002, + "loss": 0.8909, + "step": 9380 + }, + { + "epoch": 6.398637137989779, + "grad_norm": 1.5586223602294922, + "learning_rate": 0.0002, + "loss": 0.9137, + "step": 9390 + }, + { + "epoch": 6.4054514480408855, + "grad_norm": 1.394761562347412, + "learning_rate": 0.0002, + "loss": 0.9254, + "step": 9400 + }, + { + "epoch": 6.412265758091993, + "grad_norm": 1.559618592262268, + "learning_rate": 0.0002, + "loss": 0.8935, + "step": 9410 + }, + { + "epoch": 6.4190800681431, + "grad_norm": 1.462173581123352, + "learning_rate": 0.0002, + "loss": 0.9585, + "step": 9420 + }, + { + "epoch": 6.425894378194208, + "grad_norm": 1.5655437707901, + "learning_rate": 0.0002, + "loss": 0.9492, + "step": 9430 + }, + { + "epoch": 6.432708688245315, + "grad_norm": 1.4344340562820435, + "learning_rate": 0.0002, + "loss": 0.9371, + "step": 9440 + }, + { + "epoch": 6.439522998296423, + "grad_norm": 1.5132373571395874, + "learning_rate": 0.0002, + "loss": 0.9396, + "step": 9450 + }, + { + "epoch": 6.4463373083475295, + "grad_norm": 1.68776535987854, + "learning_rate": 0.0002, + "loss": 0.9229, + "step": 9460 + }, + { + "epoch": 6.453151618398637, + "grad_norm": 1.556823968887329, + "learning_rate": 0.0002, + "loss": 0.9524, + "step": 9470 + }, + { + "epoch": 6.459965928449744, + "grad_norm": 1.4254260063171387, + "learning_rate": 0.0002, + "loss": 0.94, + "step": 9480 + }, + { + "epoch": 6.466780238500852, + "grad_norm": 1.7901203632354736, + "learning_rate": 0.0002, + "loss": 0.9689, + "step": 9490 + }, + { + "epoch": 6.473594548551959, + "grad_norm": 1.5098410844802856, + "learning_rate": 0.0002, + "loss": 0.9267, + "step": 9500 + }, + { + "epoch": 6.480408858603067, + "grad_norm": 1.6036792993545532, + "learning_rate": 0.0002, + "loss": 0.9159, + "step": 9510 + }, + { + "epoch": 6.4872231686541735, + "grad_norm": 1.5011411905288696, + "learning_rate": 0.0002, + "loss": 0.9253, + "step": 9520 + }, + { + "epoch": 6.494037478705281, + "grad_norm": 1.410780906677246, + "learning_rate": 0.0002, + "loss": 0.9527, + "step": 9530 + }, + { + "epoch": 6.500851788756388, + "grad_norm": 1.7451791763305664, + "learning_rate": 0.0002, + "loss": 0.8927, + "step": 9540 + }, + { + "epoch": 6.507666098807496, + "grad_norm": 1.5888725519180298, + "learning_rate": 0.0002, + "loss": 0.9566, + "step": 9550 + }, + { + "epoch": 6.514480408858603, + "grad_norm": 1.3016585111618042, + "learning_rate": 0.0002, + "loss": 0.9324, + "step": 9560 + }, + { + "epoch": 6.521294718909711, + "grad_norm": 1.629522442817688, + "learning_rate": 0.0002, + "loss": 0.9576, + "step": 9570 + }, + { + "epoch": 6.5281090289608175, + "grad_norm": 1.494436264038086, + "learning_rate": 0.0002, + "loss": 0.92, + "step": 9580 + }, + { + "epoch": 6.534923339011925, + "grad_norm": 1.323195219039917, + "learning_rate": 0.0002, + "loss": 0.9154, + "step": 9590 + }, + { + "epoch": 6.541737649063032, + "grad_norm": 1.4904460906982422, + "learning_rate": 0.0002, + "loss": 0.9891, + "step": 9600 + }, + { + "epoch": 6.54855195911414, + "grad_norm": 1.6079169511795044, + "learning_rate": 0.0002, + "loss": 0.9316, + "step": 9610 + }, + { + "epoch": 6.555366269165247, + "grad_norm": 1.5113396644592285, + "learning_rate": 0.0002, + "loss": 1.0105, + "step": 9620 + }, + { + "epoch": 6.562180579216355, + "grad_norm": 1.7113087177276611, + "learning_rate": 0.0002, + "loss": 0.9618, + "step": 9630 + }, + { + "epoch": 6.5689948892674614, + "grad_norm": 1.359394907951355, + "learning_rate": 0.0002, + "loss": 0.9699, + "step": 9640 + }, + { + "epoch": 6.575809199318569, + "grad_norm": 1.7701337337493896, + "learning_rate": 0.0002, + "loss": 1.0267, + "step": 9650 + }, + { + "epoch": 6.582623509369676, + "grad_norm": 1.6381222009658813, + "learning_rate": 0.0002, + "loss": 0.9639, + "step": 9660 + }, + { + "epoch": 6.589437819420784, + "grad_norm": 1.781891942024231, + "learning_rate": 0.0002, + "loss": 0.9292, + "step": 9670 + }, + { + "epoch": 6.596252129471891, + "grad_norm": 1.47724449634552, + "learning_rate": 0.0002, + "loss": 1.0078, + "step": 9680 + }, + { + "epoch": 6.6030664395229985, + "grad_norm": 1.5498195886611938, + "learning_rate": 0.0002, + "loss": 1.0268, + "step": 9690 + }, + { + "epoch": 6.609880749574105, + "grad_norm": 1.5682368278503418, + "learning_rate": 0.0002, + "loss": 0.9794, + "step": 9700 + }, + { + "epoch": 6.616695059625213, + "grad_norm": 1.6106981039047241, + "learning_rate": 0.0002, + "loss": 0.9298, + "step": 9710 + }, + { + "epoch": 6.62350936967632, + "grad_norm": 1.5388364791870117, + "learning_rate": 0.0002, + "loss": 0.9644, + "step": 9720 + }, + { + "epoch": 6.630323679727428, + "grad_norm": 1.5432790517807007, + "learning_rate": 0.0002, + "loss": 0.9385, + "step": 9730 + }, + { + "epoch": 6.637137989778535, + "grad_norm": 1.4929786920547485, + "learning_rate": 0.0002, + "loss": 0.9995, + "step": 9740 + }, + { + "epoch": 6.6439522998296425, + "grad_norm": 1.6959431171417236, + "learning_rate": 0.0002, + "loss": 0.932, + "step": 9750 + }, + { + "epoch": 6.650766609880749, + "grad_norm": 1.4990962743759155, + "learning_rate": 0.0002, + "loss": 0.9397, + "step": 9760 + }, + { + "epoch": 6.657580919931857, + "grad_norm": 1.5235223770141602, + "learning_rate": 0.0002, + "loss": 0.9808, + "step": 9770 + }, + { + "epoch": 6.664395229982964, + "grad_norm": 1.8264366388320923, + "learning_rate": 0.0002, + "loss": 0.9522, + "step": 9780 + }, + { + "epoch": 6.671209540034072, + "grad_norm": 1.4298417568206787, + "learning_rate": 0.0002, + "loss": 0.9751, + "step": 9790 + }, + { + "epoch": 6.678023850085179, + "grad_norm": 1.5926862955093384, + "learning_rate": 0.0002, + "loss": 0.9607, + "step": 9800 + }, + { + "epoch": 6.6848381601362865, + "grad_norm": 1.4592483043670654, + "learning_rate": 0.0002, + "loss": 0.9681, + "step": 9810 + }, + { + "epoch": 6.691652470187393, + "grad_norm": 1.375799536705017, + "learning_rate": 0.0002, + "loss": 0.9385, + "step": 9820 + }, + { + "epoch": 6.698466780238501, + "grad_norm": 1.5767531394958496, + "learning_rate": 0.0002, + "loss": 0.9684, + "step": 9830 + }, + { + "epoch": 6.705281090289608, + "grad_norm": 1.6452189683914185, + "learning_rate": 0.0002, + "loss": 0.9313, + "step": 9840 + }, + { + "epoch": 6.712095400340716, + "grad_norm": 1.3874469995498657, + "learning_rate": 0.0002, + "loss": 0.9781, + "step": 9850 + }, + { + "epoch": 6.718909710391823, + "grad_norm": 1.5470930337905884, + "learning_rate": 0.0002, + "loss": 0.9803, + "step": 9860 + }, + { + "epoch": 6.7257240204429305, + "grad_norm": 1.499840259552002, + "learning_rate": 0.0002, + "loss": 0.9335, + "step": 9870 + }, + { + "epoch": 6.732538330494037, + "grad_norm": 1.4733195304870605, + "learning_rate": 0.0002, + "loss": 0.9209, + "step": 9880 + }, + { + "epoch": 6.739352640545145, + "grad_norm": 1.921722173690796, + "learning_rate": 0.0002, + "loss": 0.9124, + "step": 9890 + }, + { + "epoch": 6.746166950596252, + "grad_norm": 1.848003625869751, + "learning_rate": 0.0002, + "loss": 0.9311, + "step": 9900 + }, + { + "epoch": 6.75298126064736, + "grad_norm": 1.6050934791564941, + "learning_rate": 0.0002, + "loss": 0.9601, + "step": 9910 + }, + { + "epoch": 6.759795570698467, + "grad_norm": 1.716424822807312, + "learning_rate": 0.0002, + "loss": 0.941, + "step": 9920 + }, + { + "epoch": 6.7666098807495745, + "grad_norm": 1.5647642612457275, + "learning_rate": 0.0002, + "loss": 0.9592, + "step": 9930 + }, + { + "epoch": 6.773424190800681, + "grad_norm": 1.5500049591064453, + "learning_rate": 0.0002, + "loss": 0.927, + "step": 9940 + }, + { + "epoch": 6.780238500851789, + "grad_norm": 1.5384467840194702, + "learning_rate": 0.0002, + "loss": 0.9921, + "step": 9950 + }, + { + "epoch": 6.787052810902896, + "grad_norm": 1.8312339782714844, + "learning_rate": 0.0002, + "loss": 0.9673, + "step": 9960 + }, + { + "epoch": 6.793867120954003, + "grad_norm": 1.3505569696426392, + "learning_rate": 0.0002, + "loss": 0.9647, + "step": 9970 + }, + { + "epoch": 6.800681431005111, + "grad_norm": 1.6717044115066528, + "learning_rate": 0.0002, + "loss": 0.9553, + "step": 9980 + }, + { + "epoch": 6.8074957410562185, + "grad_norm": 1.7072664499282837, + "learning_rate": 0.0002, + "loss": 0.9688, + "step": 9990 + }, + { + "epoch": 6.814310051107325, + "grad_norm": 1.3609364032745361, + "learning_rate": 0.0002, + "loss": 0.951, + "step": 10000 + }, + { + "epoch": 6.821124361158432, + "grad_norm": 1.4862881898880005, + "learning_rate": 0.0002, + "loss": 0.9638, + "step": 10010 + }, + { + "epoch": 6.82793867120954, + "grad_norm": 1.4808303117752075, + "learning_rate": 0.0002, + "loss": 1.016, + "step": 10020 + }, + { + "epoch": 6.834752981260648, + "grad_norm": 1.6531925201416016, + "learning_rate": 0.0002, + "loss": 0.9233, + "step": 10030 + }, + { + "epoch": 6.841567291311755, + "grad_norm": 1.5090917348861694, + "learning_rate": 0.0002, + "loss": 0.9435, + "step": 10040 + }, + { + "epoch": 6.848381601362862, + "grad_norm": 1.5361953973770142, + "learning_rate": 0.0002, + "loss": 0.9395, + "step": 10050 + }, + { + "epoch": 6.855195911413969, + "grad_norm": 1.7302757501602173, + "learning_rate": 0.0002, + "loss": 1.0095, + "step": 10060 + }, + { + "epoch": 6.862010221465077, + "grad_norm": 1.5626600980758667, + "learning_rate": 0.0002, + "loss": 0.9796, + "step": 10070 + }, + { + "epoch": 6.868824531516184, + "grad_norm": 1.4168927669525146, + "learning_rate": 0.0002, + "loss": 1.0244, + "step": 10080 + }, + { + "epoch": 6.875638841567291, + "grad_norm": 1.3921427726745605, + "learning_rate": 0.0002, + "loss": 0.9253, + "step": 10090 + }, + { + "epoch": 6.882453151618399, + "grad_norm": 1.6304726600646973, + "learning_rate": 0.0002, + "loss": 1.0037, + "step": 10100 + }, + { + "epoch": 6.889267461669506, + "grad_norm": 1.5463745594024658, + "learning_rate": 0.0002, + "loss": 1.0088, + "step": 10110 + }, + { + "epoch": 6.896081771720613, + "grad_norm": 1.4989547729492188, + "learning_rate": 0.0002, + "loss": 1.0276, + "step": 10120 + }, + { + "epoch": 6.90289608177172, + "grad_norm": 1.7281252145767212, + "learning_rate": 0.0002, + "loss": 1.0352, + "step": 10130 + }, + { + "epoch": 6.909710391822828, + "grad_norm": 1.469348669052124, + "learning_rate": 0.0002, + "loss": 1.031, + "step": 10140 + }, + { + "epoch": 6.916524701873936, + "grad_norm": 1.3762892484664917, + "learning_rate": 0.0002, + "loss": 1.0301, + "step": 10150 + }, + { + "epoch": 6.923339011925043, + "grad_norm": 1.489425539970398, + "learning_rate": 0.0002, + "loss": 1.0032, + "step": 10160 + }, + { + "epoch": 6.9301533219761495, + "grad_norm": 1.4514580965042114, + "learning_rate": 0.0002, + "loss": 0.9487, + "step": 10170 + }, + { + "epoch": 6.936967632027257, + "grad_norm": 1.6008871793746948, + "learning_rate": 0.0002, + "loss": 0.9898, + "step": 10180 + }, + { + "epoch": 6.943781942078364, + "grad_norm": 1.6893450021743774, + "learning_rate": 0.0002, + "loss": 1.0577, + "step": 10190 + }, + { + "epoch": 6.950596252129472, + "grad_norm": 1.66379976272583, + "learning_rate": 0.0002, + "loss": 0.9699, + "step": 10200 + }, + { + "epoch": 6.957410562180579, + "grad_norm": 1.501943588256836, + "learning_rate": 0.0002, + "loss": 1.0159, + "step": 10210 + }, + { + "epoch": 6.964224872231687, + "grad_norm": 1.6803759336471558, + "learning_rate": 0.0002, + "loss": 1.0414, + "step": 10220 + }, + { + "epoch": 6.9710391822827935, + "grad_norm": 1.4512689113616943, + "learning_rate": 0.0002, + "loss": 1.0413, + "step": 10230 + }, + { + "epoch": 6.977853492333901, + "grad_norm": 1.6071290969848633, + "learning_rate": 0.0002, + "loss": 0.9791, + "step": 10240 + }, + { + "epoch": 6.984667802385008, + "grad_norm": 1.598915696144104, + "learning_rate": 0.0002, + "loss": 1.0574, + "step": 10250 + }, + { + "epoch": 6.991482112436116, + "grad_norm": 1.7178512811660767, + "learning_rate": 0.0002, + "loss": 1.0379, + "step": 10260 + }, + { + "epoch": 6.998296422487223, + "grad_norm": 1.4407050609588623, + "learning_rate": 0.0002, + "loss": 1.0082, + "step": 10270 + }, + { + "epoch": 6.999659284497445, + "eval_loss": 2.4567856788635254, + "eval_runtime": 69.5742, + "eval_samples_per_second": 7.287, + "eval_steps_per_second": 0.92, + "step": 10272 + } + ], + "logging_steps": 10, + "max_steps": 11736, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.276340769652736e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-10272/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-10272/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2f734531f14829705714a5a2703d90c81f36eeb0 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-10272/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2af5d486d0b370637d5a3999ab7668f94f5d53b6497f018967564aef5e8ad133 +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-11736/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-11736/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-11736/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-11736/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-11736/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-11736/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-11736/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-11736/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f74ada4442c138045492c30aba09ea94d8338b42 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-11736/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7259ceae0e31a349a522bf82e1147dfbefd856ae294a2573c46de43644b25a7f +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-11736/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-11736/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e96a913d348023850bc506b607eee3c30713a0e6 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-11736/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01eece72777f2fc0092de932ac375548cf1b8cf87ce7d74e314070735975ec00 +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-11736/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-11736/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..e4e28cd6977f6fd3949eec8689cb19c30811a43f --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-11736/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f1465c6d8e27cd7eeb47bb8f955eb057820bc127a249b64440095713acfc544 +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-11736/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-11736/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..478f22f83a541b1c2b83385dea2b33a3e12ceba3 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-11736/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68092a24f4c9ae5ed5f140257fc54e8eb6bdbed6d8641161c053ff91ac1291e5 +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-11736/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-11736/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-11736/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-11736/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-11736/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-11736/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-11736/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-11736/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-11736/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-11736/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-11736/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-11736/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-11736/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-11736/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8827c2cf18f3e4591c1573e3b6636507597feb6a --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-11736/trainer_state.json @@ -0,0 +1,8308 @@ +{ + "best_metric": 1.8034634590148926, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467", + "epoch": 7.997274275979557, + "eval_steps": 10, + "global_step": 11736, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0068143100511073255, + "grad_norm": 0.635574460029602, + "learning_rate": 0.0002, + "loss": 3.0988, + "step": 10 + }, + { + "epoch": 0.013628620102214651, + "grad_norm": 1.0401769876480103, + "learning_rate": 0.0002, + "loss": 2.5594, + "step": 20 + }, + { + "epoch": 0.020442930153321975, + "grad_norm": 0.4514131247997284, + "learning_rate": 0.0002, + "loss": 2.3587, + "step": 30 + }, + { + "epoch": 0.027257240204429302, + "grad_norm": 0.6303355693817139, + "learning_rate": 0.0002, + "loss": 2.121, + "step": 40 + }, + { + "epoch": 0.034071550255536626, + "grad_norm": 0.4648270606994629, + "learning_rate": 0.0002, + "loss": 1.9833, + "step": 50 + }, + { + "epoch": 0.04088586030664395, + "grad_norm": 0.42953479290008545, + "learning_rate": 0.0002, + "loss": 1.9384, + "step": 60 + }, + { + "epoch": 0.04770017035775128, + "grad_norm": 0.433614581823349, + "learning_rate": 0.0002, + "loss": 1.9202, + "step": 70 + }, + { + "epoch": 0.054514480408858604, + "grad_norm": 0.45318254828453064, + "learning_rate": 0.0002, + "loss": 1.911, + "step": 80 + }, + { + "epoch": 0.06132879045996593, + "grad_norm": 0.4023568630218506, + "learning_rate": 0.0002, + "loss": 1.8895, + "step": 90 + }, + { + "epoch": 0.06814310051107325, + "grad_norm": 0.43260207772254944, + "learning_rate": 0.0002, + "loss": 1.9257, + "step": 100 + }, + { + "epoch": 0.07495741056218058, + "grad_norm": 0.43389809131622314, + "learning_rate": 0.0002, + "loss": 1.9178, + "step": 110 + }, + { + "epoch": 0.0817717206132879, + "grad_norm": 0.39307987689971924, + "learning_rate": 0.0002, + "loss": 1.9071, + "step": 120 + }, + { + "epoch": 0.08858603066439523, + "grad_norm": 0.7703037261962891, + "learning_rate": 0.0002, + "loss": 1.9482, + "step": 130 + }, + { + "epoch": 0.09540034071550256, + "grad_norm": 0.38547563552856445, + "learning_rate": 0.0002, + "loss": 1.8394, + "step": 140 + }, + { + "epoch": 0.10221465076660988, + "grad_norm": 0.37948688864707947, + "learning_rate": 0.0002, + "loss": 1.9037, + "step": 150 + }, + { + "epoch": 0.10902896081771721, + "grad_norm": 0.33022379875183105, + "learning_rate": 0.0002, + "loss": 1.8664, + "step": 160 + }, + { + "epoch": 0.11584327086882454, + "grad_norm": 0.33703792095184326, + "learning_rate": 0.0002, + "loss": 1.8659, + "step": 170 + }, + { + "epoch": 0.12265758091993186, + "grad_norm": 0.3259912431240082, + "learning_rate": 0.0002, + "loss": 1.8389, + "step": 180 + }, + { + "epoch": 0.12947189097103917, + "grad_norm": 0.3593858778476715, + "learning_rate": 0.0002, + "loss": 1.8424, + "step": 190 + }, + { + "epoch": 0.1362862010221465, + "grad_norm": 0.3401614725589752, + "learning_rate": 0.0002, + "loss": 1.8457, + "step": 200 + }, + { + "epoch": 0.14310051107325383, + "grad_norm": 0.3892078697681427, + "learning_rate": 0.0002, + "loss": 1.7721, + "step": 210 + }, + { + "epoch": 0.14991482112436116, + "grad_norm": 0.315374493598938, + "learning_rate": 0.0002, + "loss": 1.8351, + "step": 220 + }, + { + "epoch": 0.1567291311754685, + "grad_norm": 0.3679497539997101, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 230 + }, + { + "epoch": 0.1635434412265758, + "grad_norm": 0.336730033159256, + "learning_rate": 0.0002, + "loss": 1.816, + "step": 240 + }, + { + "epoch": 0.17035775127768313, + "grad_norm": 0.36472755670547485, + "learning_rate": 0.0002, + "loss": 1.7849, + "step": 250 + }, + { + "epoch": 0.17717206132879046, + "grad_norm": 0.34864023327827454, + "learning_rate": 0.0002, + "loss": 1.7766, + "step": 260 + }, + { + "epoch": 0.1839863713798978, + "grad_norm": 0.3350819945335388, + "learning_rate": 0.0002, + "loss": 1.896, + "step": 270 + }, + { + "epoch": 0.19080068143100512, + "grad_norm": 0.3844246566295624, + "learning_rate": 0.0002, + "loss": 1.8742, + "step": 280 + }, + { + "epoch": 0.19761499148211242, + "grad_norm": 0.38413065671920776, + "learning_rate": 0.0002, + "loss": 1.8531, + "step": 290 + }, + { + "epoch": 0.20442930153321975, + "grad_norm": 0.4080047607421875, + "learning_rate": 0.0002, + "loss": 1.8415, + "step": 300 + }, + { + "epoch": 0.21124361158432708, + "grad_norm": 0.3546800911426544, + "learning_rate": 0.0002, + "loss": 1.9009, + "step": 310 + }, + { + "epoch": 0.21805792163543442, + "grad_norm": 0.38577890396118164, + "learning_rate": 0.0002, + "loss": 1.8092, + "step": 320 + }, + { + "epoch": 0.22487223168654175, + "grad_norm": 0.38979098200798035, + "learning_rate": 0.0002, + "loss": 1.7113, + "step": 330 + }, + { + "epoch": 0.23168654173764908, + "grad_norm": 0.35105520486831665, + "learning_rate": 0.0002, + "loss": 1.8162, + "step": 340 + }, + { + "epoch": 0.23850085178875638, + "grad_norm": 0.31671223044395447, + "learning_rate": 0.0002, + "loss": 1.8408, + "step": 350 + }, + { + "epoch": 0.2453151618398637, + "grad_norm": 0.33034196496009827, + "learning_rate": 0.0002, + "loss": 1.8014, + "step": 360 + }, + { + "epoch": 0.252129471890971, + "grad_norm": 0.2990533709526062, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 370 + }, + { + "epoch": 0.25894378194207834, + "grad_norm": 0.362208753824234, + "learning_rate": 0.0002, + "loss": 1.8612, + "step": 380 + }, + { + "epoch": 0.2657580919931857, + "grad_norm": 0.3269096612930298, + "learning_rate": 0.0002, + "loss": 1.8001, + "step": 390 + }, + { + "epoch": 0.272572402044293, + "grad_norm": 0.30555954575538635, + "learning_rate": 0.0002, + "loss": 1.786, + "step": 400 + }, + { + "epoch": 0.27938671209540034, + "grad_norm": 0.332933247089386, + "learning_rate": 0.0002, + "loss": 1.8018, + "step": 410 + }, + { + "epoch": 0.28620102214650767, + "grad_norm": 0.39454060792922974, + "learning_rate": 0.0002, + "loss": 1.8157, + "step": 420 + }, + { + "epoch": 0.293015332197615, + "grad_norm": 0.34589633345603943, + "learning_rate": 0.0002, + "loss": 1.7862, + "step": 430 + }, + { + "epoch": 0.29982964224872233, + "grad_norm": 0.3747332990169525, + "learning_rate": 0.0002, + "loss": 1.7612, + "step": 440 + }, + { + "epoch": 0.30664395229982966, + "grad_norm": 0.34825369715690613, + "learning_rate": 0.0002, + "loss": 1.8476, + "step": 450 + }, + { + "epoch": 0.313458262350937, + "grad_norm": 0.32906976342201233, + "learning_rate": 0.0002, + "loss": 1.775, + "step": 460 + }, + { + "epoch": 0.3202725724020443, + "grad_norm": 0.33108609914779663, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 470 + }, + { + "epoch": 0.3270868824531516, + "grad_norm": 0.3170463442802429, + "learning_rate": 0.0002, + "loss": 1.8012, + "step": 480 + }, + { + "epoch": 0.3339011925042589, + "grad_norm": 0.30792081356048584, + "learning_rate": 0.0002, + "loss": 1.8264, + "step": 490 + }, + { + "epoch": 0.34071550255536626, + "grad_norm": 0.31772997975349426, + "learning_rate": 0.0002, + "loss": 1.7361, + "step": 500 + }, + { + "epoch": 0.3475298126064736, + "grad_norm": 0.32714012265205383, + "learning_rate": 0.0002, + "loss": 1.8147, + "step": 510 + }, + { + "epoch": 0.3543441226575809, + "grad_norm": 0.3021100163459778, + "learning_rate": 0.0002, + "loss": 1.8332, + "step": 520 + }, + { + "epoch": 0.36115843270868825, + "grad_norm": 0.6045835018157959, + "learning_rate": 0.0002, + "loss": 1.7337, + "step": 530 + }, + { + "epoch": 0.3679727427597956, + "grad_norm": 0.3003896474838257, + "learning_rate": 0.0002, + "loss": 1.8661, + "step": 540 + }, + { + "epoch": 0.3747870528109029, + "grad_norm": 0.3678470551967621, + "learning_rate": 0.0002, + "loss": 1.8359, + "step": 550 + }, + { + "epoch": 0.38160136286201024, + "grad_norm": 0.35787615180015564, + "learning_rate": 0.0002, + "loss": 1.7512, + "step": 560 + }, + { + "epoch": 0.38841567291311757, + "grad_norm": 0.31882143020629883, + "learning_rate": 0.0002, + "loss": 1.8048, + "step": 570 + }, + { + "epoch": 0.39522998296422485, + "grad_norm": 0.3186313509941101, + "learning_rate": 0.0002, + "loss": 1.8108, + "step": 580 + }, + { + "epoch": 0.4020442930153322, + "grad_norm": 0.41443702578544617, + "learning_rate": 0.0002, + "loss": 1.8012, + "step": 590 + }, + { + "epoch": 0.4088586030664395, + "grad_norm": 0.28773069381713867, + "learning_rate": 0.0002, + "loss": 1.7794, + "step": 600 + }, + { + "epoch": 0.41567291311754684, + "grad_norm": 0.35743263363838196, + "learning_rate": 0.0002, + "loss": 1.8231, + "step": 610 + }, + { + "epoch": 0.42248722316865417, + "grad_norm": 0.3360286355018616, + "learning_rate": 0.0002, + "loss": 1.7531, + "step": 620 + }, + { + "epoch": 0.4293015332197615, + "grad_norm": 0.32838866114616394, + "learning_rate": 0.0002, + "loss": 1.7654, + "step": 630 + }, + { + "epoch": 0.43611584327086883, + "grad_norm": 0.2994388937950134, + "learning_rate": 0.0002, + "loss": 1.8176, + "step": 640 + }, + { + "epoch": 0.44293015332197616, + "grad_norm": 0.3306307792663574, + "learning_rate": 0.0002, + "loss": 1.7241, + "step": 650 + }, + { + "epoch": 0.4497444633730835, + "grad_norm": 0.3129560351371765, + "learning_rate": 0.0002, + "loss": 1.8201, + "step": 660 + }, + { + "epoch": 0.4565587734241908, + "grad_norm": 0.3244289457798004, + "learning_rate": 0.0002, + "loss": 1.803, + "step": 670 + }, + { + "epoch": 0.46337308347529815, + "grad_norm": 0.3196892738342285, + "learning_rate": 0.0002, + "loss": 1.8399, + "step": 680 + }, + { + "epoch": 0.47018739352640543, + "grad_norm": 0.3135230243206024, + "learning_rate": 0.0002, + "loss": 1.8291, + "step": 690 + }, + { + "epoch": 0.47700170357751276, + "grad_norm": 0.28677991032600403, + "learning_rate": 0.0002, + "loss": 1.7423, + "step": 700 + }, + { + "epoch": 0.4838160136286201, + "grad_norm": 0.3074065148830414, + "learning_rate": 0.0002, + "loss": 1.6982, + "step": 710 + }, + { + "epoch": 0.4906303236797274, + "grad_norm": 0.3354290723800659, + "learning_rate": 0.0002, + "loss": 1.8078, + "step": 720 + }, + { + "epoch": 0.49744463373083475, + "grad_norm": 0.324370801448822, + "learning_rate": 0.0002, + "loss": 1.7333, + "step": 730 + }, + { + "epoch": 0.504258943781942, + "grad_norm": 0.29496142268180847, + "learning_rate": 0.0002, + "loss": 1.8506, + "step": 740 + }, + { + "epoch": 0.5110732538330494, + "grad_norm": 0.30694130063056946, + "learning_rate": 0.0002, + "loss": 1.7761, + "step": 750 + }, + { + "epoch": 0.5178875638841567, + "grad_norm": 0.36168408393859863, + "learning_rate": 0.0002, + "loss": 1.7854, + "step": 760 + }, + { + "epoch": 0.524701873935264, + "grad_norm": 0.2930343449115753, + "learning_rate": 0.0002, + "loss": 1.7149, + "step": 770 + }, + { + "epoch": 0.5315161839863713, + "grad_norm": 0.3023432493209839, + "learning_rate": 0.0002, + "loss": 1.7924, + "step": 780 + }, + { + "epoch": 0.5383304940374787, + "grad_norm": 0.3272720277309418, + "learning_rate": 0.0002, + "loss": 1.8467, + "step": 790 + }, + { + "epoch": 0.545144804088586, + "grad_norm": 0.2907974421977997, + "learning_rate": 0.0002, + "loss": 1.8639, + "step": 800 + }, + { + "epoch": 0.5519591141396933, + "grad_norm": 0.32267168164253235, + "learning_rate": 0.0002, + "loss": 1.9018, + "step": 810 + }, + { + "epoch": 0.5587734241908007, + "grad_norm": 0.32059940695762634, + "learning_rate": 0.0002, + "loss": 1.8311, + "step": 820 + }, + { + "epoch": 0.565587734241908, + "grad_norm": 0.30951258540153503, + "learning_rate": 0.0002, + "loss": 1.7234, + "step": 830 + }, + { + "epoch": 0.5724020442930153, + "grad_norm": 0.33976122736930847, + "learning_rate": 0.0002, + "loss": 1.8063, + "step": 840 + }, + { + "epoch": 0.5792163543441227, + "grad_norm": 0.3195820450782776, + "learning_rate": 0.0002, + "loss": 1.7021, + "step": 850 + }, + { + "epoch": 0.58603066439523, + "grad_norm": 0.2828562557697296, + "learning_rate": 0.0002, + "loss": 1.8073, + "step": 860 + }, + { + "epoch": 0.5928449744463373, + "grad_norm": 0.29591670632362366, + "learning_rate": 0.0002, + "loss": 1.695, + "step": 870 + }, + { + "epoch": 0.5996592844974447, + "grad_norm": 0.3086104393005371, + "learning_rate": 0.0002, + "loss": 1.8109, + "step": 880 + }, + { + "epoch": 0.606473594548552, + "grad_norm": 0.3592929542064667, + "learning_rate": 0.0002, + "loss": 1.8207, + "step": 890 + }, + { + "epoch": 0.6132879045996593, + "grad_norm": 0.2830186188220978, + "learning_rate": 0.0002, + "loss": 1.7662, + "step": 900 + }, + { + "epoch": 0.6201022146507666, + "grad_norm": 0.3128598630428314, + "learning_rate": 0.0002, + "loss": 1.8344, + "step": 910 + }, + { + "epoch": 0.626916524701874, + "grad_norm": 0.31957563757896423, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 920 + }, + { + "epoch": 0.6337308347529813, + "grad_norm": 0.30994319915771484, + "learning_rate": 0.0002, + "loss": 1.7578, + "step": 930 + }, + { + "epoch": 0.6405451448040886, + "grad_norm": 0.3352845013141632, + "learning_rate": 0.0002, + "loss": 1.8494, + "step": 940 + }, + { + "epoch": 0.6473594548551959, + "grad_norm": 0.2960077226161957, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 950 + }, + { + "epoch": 0.6541737649063032, + "grad_norm": 0.32675081491470337, + "learning_rate": 0.0002, + "loss": 1.7209, + "step": 960 + }, + { + "epoch": 0.6609880749574105, + "grad_norm": 0.30042028427124023, + "learning_rate": 0.0002, + "loss": 1.7776, + "step": 970 + }, + { + "epoch": 0.6678023850085179, + "grad_norm": 0.3288673758506775, + "learning_rate": 0.0002, + "loss": 1.7597, + "step": 980 + }, + { + "epoch": 0.6746166950596252, + "grad_norm": 0.323215126991272, + "learning_rate": 0.0002, + "loss": 1.7962, + "step": 990 + }, + { + "epoch": 0.6814310051107325, + "grad_norm": 0.30041399598121643, + "learning_rate": 0.0002, + "loss": 1.821, + "step": 1000 + }, + { + "epoch": 0.6882453151618398, + "grad_norm": 0.3076179623603821, + "learning_rate": 0.0002, + "loss": 1.7716, + "step": 1010 + }, + { + "epoch": 0.6950596252129472, + "grad_norm": 0.2941909730434418, + "learning_rate": 0.0002, + "loss": 1.7827, + "step": 1020 + }, + { + "epoch": 0.7018739352640545, + "grad_norm": 0.32220420241355896, + "learning_rate": 0.0002, + "loss": 1.7964, + "step": 1030 + }, + { + "epoch": 0.7086882453151618, + "grad_norm": 0.2989702820777893, + "learning_rate": 0.0002, + "loss": 1.7101, + "step": 1040 + }, + { + "epoch": 0.7155025553662692, + "grad_norm": 0.3049640357494354, + "learning_rate": 0.0002, + "loss": 1.8749, + "step": 1050 + }, + { + "epoch": 0.7223168654173765, + "grad_norm": 0.3183284103870392, + "learning_rate": 0.0002, + "loss": 1.782, + "step": 1060 + }, + { + "epoch": 0.7291311754684838, + "grad_norm": 0.3070095181465149, + "learning_rate": 0.0002, + "loss": 1.785, + "step": 1070 + }, + { + "epoch": 0.7359454855195912, + "grad_norm": 0.33263063430786133, + "learning_rate": 0.0002, + "loss": 1.7832, + "step": 1080 + }, + { + "epoch": 0.7427597955706985, + "grad_norm": 0.28774312138557434, + "learning_rate": 0.0002, + "loss": 1.7627, + "step": 1090 + }, + { + "epoch": 0.7495741056218058, + "grad_norm": 0.29304224252700806, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1100 + }, + { + "epoch": 0.7563884156729132, + "grad_norm": 0.27529507875442505, + "learning_rate": 0.0002, + "loss": 1.8082, + "step": 1110 + }, + { + "epoch": 0.7632027257240205, + "grad_norm": 0.32319945096969604, + "learning_rate": 0.0002, + "loss": 1.8071, + "step": 1120 + }, + { + "epoch": 0.7700170357751278, + "grad_norm": 0.33777597546577454, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 1130 + }, + { + "epoch": 0.7768313458262351, + "grad_norm": 0.29847201704978943, + "learning_rate": 0.0002, + "loss": 1.8488, + "step": 1140 + }, + { + "epoch": 0.7836456558773425, + "grad_norm": 0.31370633840560913, + "learning_rate": 0.0002, + "loss": 1.7014, + "step": 1150 + }, + { + "epoch": 0.7904599659284497, + "grad_norm": 0.31633856892585754, + "learning_rate": 0.0002, + "loss": 1.8175, + "step": 1160 + }, + { + "epoch": 0.797274275979557, + "grad_norm": 0.33849263191223145, + "learning_rate": 0.0002, + "loss": 1.7637, + "step": 1170 + }, + { + "epoch": 0.8040885860306644, + "grad_norm": 0.3306333124637604, + "learning_rate": 0.0002, + "loss": 1.8264, + "step": 1180 + }, + { + "epoch": 0.8109028960817717, + "grad_norm": 0.3249678313732147, + "learning_rate": 0.0002, + "loss": 1.777, + "step": 1190 + }, + { + "epoch": 0.817717206132879, + "grad_norm": 0.3252817690372467, + "learning_rate": 0.0002, + "loss": 1.7631, + "step": 1200 + }, + { + "epoch": 0.8245315161839863, + "grad_norm": 0.31772559881210327, + "learning_rate": 0.0002, + "loss": 1.7877, + "step": 1210 + }, + { + "epoch": 0.8313458262350937, + "grad_norm": 0.2803371846675873, + "learning_rate": 0.0002, + "loss": 1.8068, + "step": 1220 + }, + { + "epoch": 0.838160136286201, + "grad_norm": 0.26700571179389954, + "learning_rate": 0.0002, + "loss": 1.738, + "step": 1230 + }, + { + "epoch": 0.8449744463373083, + "grad_norm": 0.3060092031955719, + "learning_rate": 0.0002, + "loss": 1.8301, + "step": 1240 + }, + { + "epoch": 0.8517887563884157, + "grad_norm": 0.28831684589385986, + "learning_rate": 0.0002, + "loss": 1.7993, + "step": 1250 + }, + { + "epoch": 0.858603066439523, + "grad_norm": 0.30708742141723633, + "learning_rate": 0.0002, + "loss": 1.6909, + "step": 1260 + }, + { + "epoch": 0.8654173764906303, + "grad_norm": 0.2915987968444824, + "learning_rate": 0.0002, + "loss": 1.8506, + "step": 1270 + }, + { + "epoch": 0.8722316865417377, + "grad_norm": 0.2893589735031128, + "learning_rate": 0.0002, + "loss": 1.7536, + "step": 1280 + }, + { + "epoch": 0.879045996592845, + "grad_norm": 0.29545632004737854, + "learning_rate": 0.0002, + "loss": 1.7437, + "step": 1290 + }, + { + "epoch": 0.8858603066439523, + "grad_norm": 0.3354771137237549, + "learning_rate": 0.0002, + "loss": 1.859, + "step": 1300 + }, + { + "epoch": 0.8926746166950597, + "grad_norm": 0.37715399265289307, + "learning_rate": 0.0002, + "loss": 1.7644, + "step": 1310 + }, + { + "epoch": 0.899488926746167, + "grad_norm": 0.28847193717956543, + "learning_rate": 0.0002, + "loss": 1.7731, + "step": 1320 + }, + { + "epoch": 0.9063032367972743, + "grad_norm": 0.2780889868736267, + "learning_rate": 0.0002, + "loss": 1.9077, + "step": 1330 + }, + { + "epoch": 0.9131175468483816, + "grad_norm": 0.2714342176914215, + "learning_rate": 0.0002, + "loss": 1.7861, + "step": 1340 + }, + { + "epoch": 0.919931856899489, + "grad_norm": 0.2950133979320526, + "learning_rate": 0.0002, + "loss": 1.8, + "step": 1350 + }, + { + "epoch": 0.9267461669505963, + "grad_norm": 0.29097145795822144, + "learning_rate": 0.0002, + "loss": 1.7368, + "step": 1360 + }, + { + "epoch": 0.9335604770017035, + "grad_norm": 0.32540133595466614, + "learning_rate": 0.0002, + "loss": 1.7864, + "step": 1370 + }, + { + "epoch": 0.9403747870528109, + "grad_norm": 0.3076636493206024, + "learning_rate": 0.0002, + "loss": 1.7571, + "step": 1380 + }, + { + "epoch": 0.9471890971039182, + "grad_norm": 0.2962130308151245, + "learning_rate": 0.0002, + "loss": 1.8312, + "step": 1390 + }, + { + "epoch": 0.9540034071550255, + "grad_norm": 0.30086860060691833, + "learning_rate": 0.0002, + "loss": 1.7858, + "step": 1400 + }, + { + "epoch": 0.9608177172061328, + "grad_norm": 0.28634947538375854, + "learning_rate": 0.0002, + "loss": 1.7991, + "step": 1410 + }, + { + "epoch": 0.9676320272572402, + "grad_norm": 0.35314416885375977, + "learning_rate": 0.0002, + "loss": 1.8385, + "step": 1420 + }, + { + "epoch": 0.9744463373083475, + "grad_norm": 0.2939317524433136, + "learning_rate": 0.0002, + "loss": 1.8054, + "step": 1430 + }, + { + "epoch": 0.9812606473594548, + "grad_norm": 0.3010196089744568, + "learning_rate": 0.0002, + "loss": 1.7582, + "step": 1440 + }, + { + "epoch": 0.9880749574105622, + "grad_norm": 0.30816152691841125, + "learning_rate": 0.0002, + "loss": 1.758, + "step": 1450 + }, + { + "epoch": 0.9948892674616695, + "grad_norm": 0.28152793645858765, + "learning_rate": 0.0002, + "loss": 1.8159, + "step": 1460 + }, + { + "epoch": 0.9996592844974447, + "eval_loss": 1.8034634590148926, + "eval_runtime": 53.6017, + "eval_samples_per_second": 9.459, + "eval_steps_per_second": 1.194, + "step": 1467 + }, + { + "epoch": 1.0017035775127767, + "grad_norm": 0.29246416687965393, + "learning_rate": 0.0002, + "loss": 1.7072, + "step": 1470 + }, + { + "epoch": 1.008517887563884, + "grad_norm": 0.2668602168560028, + "learning_rate": 0.0002, + "loss": 1.8159, + "step": 1480 + }, + { + "epoch": 1.0153321976149914, + "grad_norm": 0.2998567819595337, + "learning_rate": 0.0002, + "loss": 1.6868, + "step": 1490 + }, + { + "epoch": 1.0221465076660987, + "grad_norm": 0.3284934461116791, + "learning_rate": 0.0002, + "loss": 1.7331, + "step": 1500 + }, + { + "epoch": 1.028960817717206, + "grad_norm": 0.3275827169418335, + "learning_rate": 0.0002, + "loss": 1.689, + "step": 1510 + }, + { + "epoch": 1.0357751277683134, + "grad_norm": 0.3382718563079834, + "learning_rate": 0.0002, + "loss": 1.7092, + "step": 1520 + }, + { + "epoch": 1.0425894378194207, + "grad_norm": 0.36737215518951416, + "learning_rate": 0.0002, + "loss": 1.7215, + "step": 1530 + }, + { + "epoch": 1.049403747870528, + "grad_norm": 0.3442603647708893, + "learning_rate": 0.0002, + "loss": 1.6993, + "step": 1540 + }, + { + "epoch": 1.0562180579216354, + "grad_norm": 0.3323381245136261, + "learning_rate": 0.0002, + "loss": 1.6996, + "step": 1550 + }, + { + "epoch": 1.0630323679727427, + "grad_norm": 0.341227650642395, + "learning_rate": 0.0002, + "loss": 1.7444, + "step": 1560 + }, + { + "epoch": 1.06984667802385, + "grad_norm": 0.3361579477787018, + "learning_rate": 0.0002, + "loss": 1.7419, + "step": 1570 + }, + { + "epoch": 1.0766609880749574, + "grad_norm": 0.3556230962276459, + "learning_rate": 0.0002, + "loss": 1.7337, + "step": 1580 + }, + { + "epoch": 1.0834752981260647, + "grad_norm": 0.27130424976348877, + "learning_rate": 0.0002, + "loss": 1.6978, + "step": 1590 + }, + { + "epoch": 1.090289608177172, + "grad_norm": 0.29366323351860046, + "learning_rate": 0.0002, + "loss": 1.6087, + "step": 1600 + }, + { + "epoch": 1.0971039182282794, + "grad_norm": 0.3581245541572571, + "learning_rate": 0.0002, + "loss": 1.6721, + "step": 1610 + }, + { + "epoch": 1.1039182282793867, + "grad_norm": 0.3021670877933502, + "learning_rate": 0.0002, + "loss": 1.7639, + "step": 1620 + }, + { + "epoch": 1.110732538330494, + "grad_norm": 0.3145572543144226, + "learning_rate": 0.0002, + "loss": 1.7314, + "step": 1630 + }, + { + "epoch": 1.1175468483816013, + "grad_norm": 0.35362154245376587, + "learning_rate": 0.0002, + "loss": 1.7408, + "step": 1640 + }, + { + "epoch": 1.1243611584327087, + "grad_norm": 0.5413113236427307, + "learning_rate": 0.0002, + "loss": 1.7071, + "step": 1650 + }, + { + "epoch": 1.131175468483816, + "grad_norm": 0.6858654022216797, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 1660 + }, + { + "epoch": 1.1379897785349233, + "grad_norm": 0.3781903386116028, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 1670 + }, + { + "epoch": 1.1448040885860307, + "grad_norm": 0.404864102602005, + "learning_rate": 0.0002, + "loss": 1.659, + "step": 1680 + }, + { + "epoch": 1.151618398637138, + "grad_norm": 0.3595100939273834, + "learning_rate": 0.0002, + "loss": 1.7212, + "step": 1690 + }, + { + "epoch": 1.1584327086882453, + "grad_norm": 0.33682283759117126, + "learning_rate": 0.0002, + "loss": 1.7023, + "step": 1700 + }, + { + "epoch": 1.1652470187393527, + "grad_norm": 0.3877373933792114, + "learning_rate": 0.0002, + "loss": 1.7336, + "step": 1710 + }, + { + "epoch": 1.17206132879046, + "grad_norm": 0.34606897830963135, + "learning_rate": 0.0002, + "loss": 1.7676, + "step": 1720 + }, + { + "epoch": 1.1788756388415673, + "grad_norm": 0.3122918903827667, + "learning_rate": 0.0002, + "loss": 1.6889, + "step": 1730 + }, + { + "epoch": 1.1856899488926746, + "grad_norm": 0.34081900119781494, + "learning_rate": 0.0002, + "loss": 1.6585, + "step": 1740 + }, + { + "epoch": 1.192504258943782, + "grad_norm": 0.3418807089328766, + "learning_rate": 0.0002, + "loss": 1.7794, + "step": 1750 + }, + { + "epoch": 1.1993185689948893, + "grad_norm": 0.3495500981807709, + "learning_rate": 0.0002, + "loss": 1.7773, + "step": 1760 + }, + { + "epoch": 1.2061328790459966, + "grad_norm": 0.557288408279419, + "learning_rate": 0.0002, + "loss": 1.6578, + "step": 1770 + }, + { + "epoch": 1.212947189097104, + "grad_norm": 0.3193778693675995, + "learning_rate": 0.0002, + "loss": 1.7488, + "step": 1780 + }, + { + "epoch": 1.2197614991482113, + "grad_norm": 0.3306216895580292, + "learning_rate": 0.0002, + "loss": 1.6815, + "step": 1790 + }, + { + "epoch": 1.2265758091993186, + "grad_norm": 0.37998732924461365, + "learning_rate": 0.0002, + "loss": 1.6772, + "step": 1800 + }, + { + "epoch": 1.233390119250426, + "grad_norm": 0.3255669176578522, + "learning_rate": 0.0002, + "loss": 1.7094, + "step": 1810 + }, + { + "epoch": 1.2402044293015333, + "grad_norm": 0.3741287291049957, + "learning_rate": 0.0002, + "loss": 1.7096, + "step": 1820 + }, + { + "epoch": 1.2470187393526406, + "grad_norm": 0.36727291345596313, + "learning_rate": 0.0002, + "loss": 1.7317, + "step": 1830 + }, + { + "epoch": 1.253833049403748, + "grad_norm": 0.3479527235031128, + "learning_rate": 0.0002, + "loss": 1.7418, + "step": 1840 + }, + { + "epoch": 1.2606473594548553, + "grad_norm": 0.3472636640071869, + "learning_rate": 0.0002, + "loss": 1.7062, + "step": 1850 + }, + { + "epoch": 1.2674616695059626, + "grad_norm": 0.3702869415283203, + "learning_rate": 0.0002, + "loss": 1.6263, + "step": 1860 + }, + { + "epoch": 1.27427597955707, + "grad_norm": 0.3934040069580078, + "learning_rate": 0.0002, + "loss": 1.6222, + "step": 1870 + }, + { + "epoch": 1.2810902896081773, + "grad_norm": 0.46887534856796265, + "learning_rate": 0.0002, + "loss": 1.7616, + "step": 1880 + }, + { + "epoch": 1.2879045996592846, + "grad_norm": 0.3191998600959778, + "learning_rate": 0.0002, + "loss": 1.6071, + "step": 1890 + }, + { + "epoch": 1.294718909710392, + "grad_norm": 0.34032225608825684, + "learning_rate": 0.0002, + "loss": 1.7001, + "step": 1900 + }, + { + "epoch": 1.3015332197614993, + "grad_norm": 0.33453696966171265, + "learning_rate": 0.0002, + "loss": 1.8316, + "step": 1910 + }, + { + "epoch": 1.3083475298126066, + "grad_norm": 0.3451494872570038, + "learning_rate": 0.0002, + "loss": 1.6567, + "step": 1920 + }, + { + "epoch": 1.315161839863714, + "grad_norm": 0.36203092336654663, + "learning_rate": 0.0002, + "loss": 1.708, + "step": 1930 + }, + { + "epoch": 1.321976149914821, + "grad_norm": 0.43794456124305725, + "learning_rate": 0.0002, + "loss": 1.7095, + "step": 1940 + }, + { + "epoch": 1.3287904599659284, + "grad_norm": 0.3630591034889221, + "learning_rate": 0.0002, + "loss": 1.7264, + "step": 1950 + }, + { + "epoch": 1.3356047700170357, + "grad_norm": 0.36951911449432373, + "learning_rate": 0.0002, + "loss": 1.6529, + "step": 1960 + }, + { + "epoch": 1.342419080068143, + "grad_norm": 0.4001159965991974, + "learning_rate": 0.0002, + "loss": 1.651, + "step": 1970 + }, + { + "epoch": 1.3492333901192504, + "grad_norm": 0.3820836544036865, + "learning_rate": 0.0002, + "loss": 1.677, + "step": 1980 + }, + { + "epoch": 1.3560477001703577, + "grad_norm": 0.3705870807170868, + "learning_rate": 0.0002, + "loss": 1.583, + "step": 1990 + }, + { + "epoch": 1.362862010221465, + "grad_norm": 0.3557972013950348, + "learning_rate": 0.0002, + "loss": 1.6207, + "step": 2000 + }, + { + "epoch": 1.3696763202725724, + "grad_norm": 0.38546398282051086, + "learning_rate": 0.0002, + "loss": 1.7656, + "step": 2010 + }, + { + "epoch": 1.3764906303236797, + "grad_norm": 0.3908020853996277, + "learning_rate": 0.0002, + "loss": 1.6881, + "step": 2020 + }, + { + "epoch": 1.383304940374787, + "grad_norm": 0.3822040855884552, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 2030 + }, + { + "epoch": 1.3901192504258943, + "grad_norm": 0.425327867269516, + "learning_rate": 0.0002, + "loss": 1.7923, + "step": 2040 + }, + { + "epoch": 1.3969335604770017, + "grad_norm": 0.3436259329319, + "learning_rate": 0.0002, + "loss": 1.7032, + "step": 2050 + }, + { + "epoch": 1.403747870528109, + "grad_norm": 0.33124062418937683, + "learning_rate": 0.0002, + "loss": 1.6398, + "step": 2060 + }, + { + "epoch": 1.4105621805792163, + "grad_norm": 0.3662424683570862, + "learning_rate": 0.0002, + "loss": 1.6815, + "step": 2070 + }, + { + "epoch": 1.4173764906303237, + "grad_norm": 0.3720635175704956, + "learning_rate": 0.0002, + "loss": 1.7258, + "step": 2080 + }, + { + "epoch": 1.424190800681431, + "grad_norm": 0.3361680805683136, + "learning_rate": 0.0002, + "loss": 1.7186, + "step": 2090 + }, + { + "epoch": 1.4310051107325383, + "grad_norm": 0.32751724123954773, + "learning_rate": 0.0002, + "loss": 1.7606, + "step": 2100 + }, + { + "epoch": 1.4378194207836457, + "grad_norm": 0.34333378076553345, + "learning_rate": 0.0002, + "loss": 1.7051, + "step": 2110 + }, + { + "epoch": 1.444633730834753, + "grad_norm": 0.37777671217918396, + "learning_rate": 0.0002, + "loss": 1.6979, + "step": 2120 + }, + { + "epoch": 1.4514480408858603, + "grad_norm": 0.37126365303993225, + "learning_rate": 0.0002, + "loss": 1.7085, + "step": 2130 + }, + { + "epoch": 1.4582623509369677, + "grad_norm": 0.3602267801761627, + "learning_rate": 0.0002, + "loss": 1.721, + "step": 2140 + }, + { + "epoch": 1.465076660988075, + "grad_norm": 0.3287110924720764, + "learning_rate": 0.0002, + "loss": 1.8148, + "step": 2150 + }, + { + "epoch": 1.4718909710391823, + "grad_norm": 0.3562135100364685, + "learning_rate": 0.0002, + "loss": 1.6966, + "step": 2160 + }, + { + "epoch": 1.4787052810902896, + "grad_norm": 0.38292962312698364, + "learning_rate": 0.0002, + "loss": 1.713, + "step": 2170 + }, + { + "epoch": 1.485519591141397, + "grad_norm": 0.38220319151878357, + "learning_rate": 0.0002, + "loss": 1.7036, + "step": 2180 + }, + { + "epoch": 1.4923339011925043, + "grad_norm": 0.3570062220096588, + "learning_rate": 0.0002, + "loss": 1.7297, + "step": 2190 + }, + { + "epoch": 1.4991482112436116, + "grad_norm": 0.363146036863327, + "learning_rate": 0.0002, + "loss": 1.5652, + "step": 2200 + }, + { + "epoch": 1.5059625212947187, + "grad_norm": 0.37393274903297424, + "learning_rate": 0.0002, + "loss": 1.6716, + "step": 2210 + }, + { + "epoch": 1.512776831345826, + "grad_norm": 0.3628501892089844, + "learning_rate": 0.0002, + "loss": 1.6839, + "step": 2220 + }, + { + "epoch": 1.5195911413969334, + "grad_norm": 0.33430740237236023, + "learning_rate": 0.0002, + "loss": 1.6627, + "step": 2230 + }, + { + "epoch": 1.5264054514480407, + "grad_norm": 0.35363978147506714, + "learning_rate": 0.0002, + "loss": 1.7495, + "step": 2240 + }, + { + "epoch": 1.533219761499148, + "grad_norm": 0.37220337986946106, + "learning_rate": 0.0002, + "loss": 1.6364, + "step": 2250 + }, + { + "epoch": 1.5400340715502554, + "grad_norm": 0.35020262002944946, + "learning_rate": 0.0002, + "loss": 1.7773, + "step": 2260 + }, + { + "epoch": 1.5468483816013627, + "grad_norm": 0.35274937748908997, + "learning_rate": 0.0002, + "loss": 1.7453, + "step": 2270 + }, + { + "epoch": 1.55366269165247, + "grad_norm": 0.3797738552093506, + "learning_rate": 0.0002, + "loss": 1.7162, + "step": 2280 + }, + { + "epoch": 1.5604770017035774, + "grad_norm": 0.4160412847995758, + "learning_rate": 0.0002, + "loss": 1.6197, + "step": 2290 + }, + { + "epoch": 1.5672913117546847, + "grad_norm": 0.38779592514038086, + "learning_rate": 0.0002, + "loss": 1.7101, + "step": 2300 + }, + { + "epoch": 1.574105621805792, + "grad_norm": 0.39171287417411804, + "learning_rate": 0.0002, + "loss": 1.7254, + "step": 2310 + }, + { + "epoch": 1.5809199318568994, + "grad_norm": 0.3606826663017273, + "learning_rate": 0.0002, + "loss": 1.7087, + "step": 2320 + }, + { + "epoch": 1.5877342419080067, + "grad_norm": 0.3745017647743225, + "learning_rate": 0.0002, + "loss": 1.7269, + "step": 2330 + }, + { + "epoch": 1.594548551959114, + "grad_norm": 0.34933462738990784, + "learning_rate": 0.0002, + "loss": 1.7068, + "step": 2340 + }, + { + "epoch": 1.6013628620102214, + "grad_norm": 0.37268444895744324, + "learning_rate": 0.0002, + "loss": 1.7073, + "step": 2350 + }, + { + "epoch": 1.6081771720613287, + "grad_norm": 0.4603484869003296, + "learning_rate": 0.0002, + "loss": 1.728, + "step": 2360 + }, + { + "epoch": 1.614991482112436, + "grad_norm": 0.35689088702201843, + "learning_rate": 0.0002, + "loss": 1.7621, + "step": 2370 + }, + { + "epoch": 1.6218057921635434, + "grad_norm": 0.3392031490802765, + "learning_rate": 0.0002, + "loss": 1.6989, + "step": 2380 + }, + { + "epoch": 1.6286201022146507, + "grad_norm": 0.394653856754303, + "learning_rate": 0.0002, + "loss": 1.7077, + "step": 2390 + }, + { + "epoch": 1.635434412265758, + "grad_norm": 0.33972012996673584, + "learning_rate": 0.0002, + "loss": 1.7448, + "step": 2400 + }, + { + "epoch": 1.6422487223168654, + "grad_norm": 0.3854375183582306, + "learning_rate": 0.0002, + "loss": 1.7681, + "step": 2410 + }, + { + "epoch": 1.6490630323679727, + "grad_norm": 0.36143961548805237, + "learning_rate": 0.0002, + "loss": 1.7102, + "step": 2420 + }, + { + "epoch": 1.65587734241908, + "grad_norm": 0.35816189646720886, + "learning_rate": 0.0002, + "loss": 1.7432, + "step": 2430 + }, + { + "epoch": 1.6626916524701874, + "grad_norm": 0.36298736929893494, + "learning_rate": 0.0002, + "loss": 1.6407, + "step": 2440 + }, + { + "epoch": 1.6695059625212947, + "grad_norm": 0.36756929755210876, + "learning_rate": 0.0002, + "loss": 1.723, + "step": 2450 + }, + { + "epoch": 1.676320272572402, + "grad_norm": 0.35969603061676025, + "learning_rate": 0.0002, + "loss": 1.6824, + "step": 2460 + }, + { + "epoch": 1.6831345826235093, + "grad_norm": 0.38449376821517944, + "learning_rate": 0.0002, + "loss": 1.7014, + "step": 2470 + }, + { + "epoch": 1.6899488926746167, + "grad_norm": 0.44511452317237854, + "learning_rate": 0.0002, + "loss": 1.7261, + "step": 2480 + }, + { + "epoch": 1.696763202725724, + "grad_norm": 0.3831416368484497, + "learning_rate": 0.0002, + "loss": 1.7397, + "step": 2490 + }, + { + "epoch": 1.7035775127768313, + "grad_norm": 0.3795325756072998, + "learning_rate": 0.0002, + "loss": 1.7046, + "step": 2500 + }, + { + "epoch": 1.7103918228279387, + "grad_norm": 0.34978193044662476, + "learning_rate": 0.0002, + "loss": 1.6993, + "step": 2510 + }, + { + "epoch": 1.717206132879046, + "grad_norm": 0.35923877358436584, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 2520 + }, + { + "epoch": 1.7240204429301533, + "grad_norm": 0.352999746799469, + "learning_rate": 0.0002, + "loss": 1.6253, + "step": 2530 + }, + { + "epoch": 1.7308347529812607, + "grad_norm": 0.43673479557037354, + "learning_rate": 0.0002, + "loss": 1.6953, + "step": 2540 + }, + { + "epoch": 1.737649063032368, + "grad_norm": 0.4153687357902527, + "learning_rate": 0.0002, + "loss": 1.7079, + "step": 2550 + }, + { + "epoch": 1.7444633730834753, + "grad_norm": 0.35541167855262756, + "learning_rate": 0.0002, + "loss": 1.6714, + "step": 2560 + }, + { + "epoch": 1.7512776831345827, + "grad_norm": 0.3288775086402893, + "learning_rate": 0.0002, + "loss": 1.696, + "step": 2570 + }, + { + "epoch": 1.75809199318569, + "grad_norm": 0.3991123139858246, + "learning_rate": 0.0002, + "loss": 1.7486, + "step": 2580 + }, + { + "epoch": 1.7649063032367973, + "grad_norm": 0.39967241883277893, + "learning_rate": 0.0002, + "loss": 1.645, + "step": 2590 + }, + { + "epoch": 1.7717206132879046, + "grad_norm": 0.41104283928871155, + "learning_rate": 0.0002, + "loss": 1.6855, + "step": 2600 + }, + { + "epoch": 1.778534923339012, + "grad_norm": 0.44885286688804626, + "learning_rate": 0.0002, + "loss": 1.6993, + "step": 2610 + }, + { + "epoch": 1.7853492333901193, + "grad_norm": 0.38384467363357544, + "learning_rate": 0.0002, + "loss": 1.7224, + "step": 2620 + }, + { + "epoch": 1.7921635434412266, + "grad_norm": 0.35020917654037476, + "learning_rate": 0.0002, + "loss": 1.7213, + "step": 2630 + }, + { + "epoch": 1.798977853492334, + "grad_norm": 0.3360341489315033, + "learning_rate": 0.0002, + "loss": 1.6706, + "step": 2640 + }, + { + "epoch": 1.8057921635434413, + "grad_norm": 0.38875144720077515, + "learning_rate": 0.0002, + "loss": 1.7037, + "step": 2650 + }, + { + "epoch": 1.8126064735945486, + "grad_norm": 0.34876883029937744, + "learning_rate": 0.0002, + "loss": 1.693, + "step": 2660 + }, + { + "epoch": 1.819420783645656, + "grad_norm": 0.419979989528656, + "learning_rate": 0.0002, + "loss": 1.7743, + "step": 2670 + }, + { + "epoch": 1.8262350936967633, + "grad_norm": 0.3648919463157654, + "learning_rate": 0.0002, + "loss": 1.7089, + "step": 2680 + }, + { + "epoch": 1.8330494037478706, + "grad_norm": 0.3485383987426758, + "learning_rate": 0.0002, + "loss": 1.7828, + "step": 2690 + }, + { + "epoch": 1.839863713798978, + "grad_norm": 0.3647468686103821, + "learning_rate": 0.0002, + "loss": 1.705, + "step": 2700 + }, + { + "epoch": 1.8466780238500853, + "grad_norm": 0.37003210186958313, + "learning_rate": 0.0002, + "loss": 1.7318, + "step": 2710 + }, + { + "epoch": 1.8534923339011926, + "grad_norm": 0.37031617760658264, + "learning_rate": 0.0002, + "loss": 1.6647, + "step": 2720 + }, + { + "epoch": 1.8603066439523, + "grad_norm": 0.3438796103000641, + "learning_rate": 0.0002, + "loss": 1.69, + "step": 2730 + }, + { + "epoch": 1.8671209540034073, + "grad_norm": 0.41574627161026, + "learning_rate": 0.0002, + "loss": 1.5995, + "step": 2740 + }, + { + "epoch": 1.8739352640545146, + "grad_norm": 0.35049930214881897, + "learning_rate": 0.0002, + "loss": 1.6877, + "step": 2750 + }, + { + "epoch": 1.880749574105622, + "grad_norm": 0.3943989872932434, + "learning_rate": 0.0002, + "loss": 1.7048, + "step": 2760 + }, + { + "epoch": 1.8875638841567293, + "grad_norm": 0.3384978175163269, + "learning_rate": 0.0002, + "loss": 1.7047, + "step": 2770 + }, + { + "epoch": 1.8943781942078366, + "grad_norm": 0.3501328229904175, + "learning_rate": 0.0002, + "loss": 1.7848, + "step": 2780 + }, + { + "epoch": 1.901192504258944, + "grad_norm": 0.37484532594680786, + "learning_rate": 0.0002, + "loss": 1.6986, + "step": 2790 + }, + { + "epoch": 1.9080068143100513, + "grad_norm": 0.34497788548469543, + "learning_rate": 0.0002, + "loss": 1.6867, + "step": 2800 + }, + { + "epoch": 1.9148211243611586, + "grad_norm": 0.3530851900577545, + "learning_rate": 0.0002, + "loss": 1.6513, + "step": 2810 + }, + { + "epoch": 1.921635434412266, + "grad_norm": 0.3879254162311554, + "learning_rate": 0.0002, + "loss": 1.6369, + "step": 2820 + }, + { + "epoch": 1.9284497444633732, + "grad_norm": 0.3885590136051178, + "learning_rate": 0.0002, + "loss": 1.6786, + "step": 2830 + }, + { + "epoch": 1.9352640545144804, + "grad_norm": 0.3868715465068817, + "learning_rate": 0.0002, + "loss": 1.8049, + "step": 2840 + }, + { + "epoch": 1.9420783645655877, + "grad_norm": 0.4152422249317169, + "learning_rate": 0.0002, + "loss": 1.7099, + "step": 2850 + }, + { + "epoch": 1.948892674616695, + "grad_norm": 0.401714563369751, + "learning_rate": 0.0002, + "loss": 1.6696, + "step": 2860 + }, + { + "epoch": 1.9557069846678024, + "grad_norm": 0.34825265407562256, + "learning_rate": 0.0002, + "loss": 1.7182, + "step": 2870 + }, + { + "epoch": 1.9625212947189097, + "grad_norm": 0.3620675504207611, + "learning_rate": 0.0002, + "loss": 1.6612, + "step": 2880 + }, + { + "epoch": 1.969335604770017, + "grad_norm": 0.3977806866168976, + "learning_rate": 0.0002, + "loss": 1.7451, + "step": 2890 + }, + { + "epoch": 1.9761499148211243, + "grad_norm": 0.3687497079372406, + "learning_rate": 0.0002, + "loss": 1.6514, + "step": 2900 + }, + { + "epoch": 1.9829642248722317, + "grad_norm": 0.408640056848526, + "learning_rate": 0.0002, + "loss": 1.7712, + "step": 2910 + }, + { + "epoch": 1.989778534923339, + "grad_norm": 0.34510108828544617, + "learning_rate": 0.0002, + "loss": 1.695, + "step": 2920 + }, + { + "epoch": 1.9965928449744463, + "grad_norm": 0.3596334755420685, + "learning_rate": 0.0002, + "loss": 1.662, + "step": 2930 + }, + { + "epoch": 2.0, + "eval_loss": 1.8056600093841553, + "eval_runtime": 60.7049, + "eval_samples_per_second": 8.352, + "eval_steps_per_second": 1.054, + "step": 2935 + }, + { + "epoch": 2.0034071550255534, + "grad_norm": 0.3460802137851715, + "learning_rate": 0.0002, + "loss": 1.6918, + "step": 2940 + }, + { + "epoch": 2.0102214650766608, + "grad_norm": 0.4038620591163635, + "learning_rate": 0.0002, + "loss": 1.6036, + "step": 2950 + }, + { + "epoch": 2.017035775127768, + "grad_norm": 0.3950219750404358, + "learning_rate": 0.0002, + "loss": 1.5386, + "step": 2960 + }, + { + "epoch": 2.0238500851788754, + "grad_norm": 0.519116997718811, + "learning_rate": 0.0002, + "loss": 1.5649, + "step": 2970 + }, + { + "epoch": 2.0306643952299828, + "grad_norm": 0.4097684919834137, + "learning_rate": 0.0002, + "loss": 1.5397, + "step": 2980 + }, + { + "epoch": 2.03747870528109, + "grad_norm": 0.4153544306755066, + "learning_rate": 0.0002, + "loss": 1.6044, + "step": 2990 + }, + { + "epoch": 2.0442930153321974, + "grad_norm": 0.4351160526275635, + "learning_rate": 0.0002, + "loss": 1.6006, + "step": 3000 + }, + { + "epoch": 2.0511073253833048, + "grad_norm": 0.42036259174346924, + "learning_rate": 0.0002, + "loss": 1.5091, + "step": 3010 + }, + { + "epoch": 2.057921635434412, + "grad_norm": 0.4433218836784363, + "learning_rate": 0.0002, + "loss": 1.5686, + "step": 3020 + }, + { + "epoch": 2.0647359454855194, + "grad_norm": 0.46511581540107727, + "learning_rate": 0.0002, + "loss": 1.5478, + "step": 3030 + }, + { + "epoch": 2.0715502555366268, + "grad_norm": 0.4567560851573944, + "learning_rate": 0.0002, + "loss": 1.5554, + "step": 3040 + }, + { + "epoch": 2.078364565587734, + "grad_norm": 0.45671048760414124, + "learning_rate": 0.0002, + "loss": 1.5561, + "step": 3050 + }, + { + "epoch": 2.0851788756388414, + "grad_norm": 0.4598552882671356, + "learning_rate": 0.0002, + "loss": 1.606, + "step": 3060 + }, + { + "epoch": 2.0919931856899487, + "grad_norm": 0.4582861661911011, + "learning_rate": 0.0002, + "loss": 1.6357, + "step": 3070 + }, + { + "epoch": 2.098807495741056, + "grad_norm": 0.4366969168186188, + "learning_rate": 0.0002, + "loss": 1.5853, + "step": 3080 + }, + { + "epoch": 2.1056218057921634, + "grad_norm": 0.495917409658432, + "learning_rate": 0.0002, + "loss": 1.5574, + "step": 3090 + }, + { + "epoch": 2.1124361158432707, + "grad_norm": 1.6846044063568115, + "learning_rate": 0.0002, + "loss": 1.5798, + "step": 3100 + }, + { + "epoch": 2.119250425894378, + "grad_norm": 0.4765092134475708, + "learning_rate": 0.0002, + "loss": 1.5877, + "step": 3110 + }, + { + "epoch": 2.1260647359454854, + "grad_norm": 0.45029810070991516, + "learning_rate": 0.0002, + "loss": 1.6106, + "step": 3120 + }, + { + "epoch": 2.1328790459965927, + "grad_norm": 0.5706973075866699, + "learning_rate": 0.0002, + "loss": 1.5707, + "step": 3130 + }, + { + "epoch": 2.1396933560477, + "grad_norm": 0.4606274366378784, + "learning_rate": 0.0002, + "loss": 1.589, + "step": 3140 + }, + { + "epoch": 2.1465076660988074, + "grad_norm": 2.199115753173828, + "learning_rate": 0.0002, + "loss": 1.564, + "step": 3150 + }, + { + "epoch": 2.1533219761499147, + "grad_norm": 0.6231027245521545, + "learning_rate": 0.0002, + "loss": 1.5808, + "step": 3160 + }, + { + "epoch": 2.160136286201022, + "grad_norm": 0.46918219327926636, + "learning_rate": 0.0002, + "loss": 1.5397, + "step": 3170 + }, + { + "epoch": 2.1669505962521294, + "grad_norm": 0.5006393194198608, + "learning_rate": 0.0002, + "loss": 1.6082, + "step": 3180 + }, + { + "epoch": 2.1737649063032367, + "grad_norm": 0.4745093286037445, + "learning_rate": 0.0002, + "loss": 1.591, + "step": 3190 + }, + { + "epoch": 2.180579216354344, + "grad_norm": 0.511576771736145, + "learning_rate": 0.0002, + "loss": 1.6459, + "step": 3200 + }, + { + "epoch": 2.1873935264054514, + "grad_norm": 0.38622918725013733, + "learning_rate": 0.0002, + "loss": 1.6019, + "step": 3210 + }, + { + "epoch": 2.1942078364565587, + "grad_norm": 0.4425644278526306, + "learning_rate": 0.0002, + "loss": 1.6187, + "step": 3220 + }, + { + "epoch": 2.201022146507666, + "grad_norm": 0.45643091201782227, + "learning_rate": 0.0002, + "loss": 1.6114, + "step": 3230 + }, + { + "epoch": 2.2078364565587734, + "grad_norm": 0.4975406229496002, + "learning_rate": 0.0002, + "loss": 1.6224, + "step": 3240 + }, + { + "epoch": 2.2146507666098807, + "grad_norm": 0.4673331081867218, + "learning_rate": 0.0002, + "loss": 1.6654, + "step": 3250 + }, + { + "epoch": 2.221465076660988, + "grad_norm": 0.5081731081008911, + "learning_rate": 0.0002, + "loss": 1.6155, + "step": 3260 + }, + { + "epoch": 2.2282793867120954, + "grad_norm": 0.4790806770324707, + "learning_rate": 0.0002, + "loss": 1.53, + "step": 3270 + }, + { + "epoch": 2.2350936967632027, + "grad_norm": 0.5184140801429749, + "learning_rate": 0.0002, + "loss": 1.6362, + "step": 3280 + }, + { + "epoch": 2.24190800681431, + "grad_norm": 0.5159083604812622, + "learning_rate": 0.0002, + "loss": 1.5646, + "step": 3290 + }, + { + "epoch": 2.2487223168654173, + "grad_norm": 0.4876042604446411, + "learning_rate": 0.0002, + "loss": 1.6577, + "step": 3300 + }, + { + "epoch": 2.2555366269165247, + "grad_norm": 0.5454957485198975, + "learning_rate": 0.0002, + "loss": 1.5718, + "step": 3310 + }, + { + "epoch": 2.262350936967632, + "grad_norm": 0.49866822361946106, + "learning_rate": 0.0002, + "loss": 1.5885, + "step": 3320 + }, + { + "epoch": 2.2691652470187393, + "grad_norm": 0.42674365639686584, + "learning_rate": 0.0002, + "loss": 1.5923, + "step": 3330 + }, + { + "epoch": 2.2759795570698467, + "grad_norm": 0.5202316641807556, + "learning_rate": 0.0002, + "loss": 1.5588, + "step": 3340 + }, + { + "epoch": 2.282793867120954, + "grad_norm": 0.4849465489387512, + "learning_rate": 0.0002, + "loss": 1.6032, + "step": 3350 + }, + { + "epoch": 2.2896081771720613, + "grad_norm": 0.47202569246292114, + "learning_rate": 0.0002, + "loss": 1.6853, + "step": 3360 + }, + { + "epoch": 2.2964224872231687, + "grad_norm": 0.5311620235443115, + "learning_rate": 0.0002, + "loss": 1.6164, + "step": 3370 + }, + { + "epoch": 2.303236797274276, + "grad_norm": 0.49011409282684326, + "learning_rate": 0.0002, + "loss": 1.6004, + "step": 3380 + }, + { + "epoch": 2.3100511073253833, + "grad_norm": 0.4789247512817383, + "learning_rate": 0.0002, + "loss": 1.5484, + "step": 3390 + }, + { + "epoch": 2.3168654173764907, + "grad_norm": 0.46646103262901306, + "learning_rate": 0.0002, + "loss": 1.5866, + "step": 3400 + }, + { + "epoch": 2.323679727427598, + "grad_norm": 0.5552441477775574, + "learning_rate": 0.0002, + "loss": 1.5308, + "step": 3410 + }, + { + "epoch": 2.3304940374787053, + "grad_norm": 0.4530351758003235, + "learning_rate": 0.0002, + "loss": 1.5761, + "step": 3420 + }, + { + "epoch": 2.3373083475298126, + "grad_norm": 0.4806232750415802, + "learning_rate": 0.0002, + "loss": 1.5919, + "step": 3430 + }, + { + "epoch": 2.34412265758092, + "grad_norm": 0.5998363494873047, + "learning_rate": 0.0002, + "loss": 1.5569, + "step": 3440 + }, + { + "epoch": 2.3509369676320273, + "grad_norm": 0.4918554425239563, + "learning_rate": 0.0002, + "loss": 1.513, + "step": 3450 + }, + { + "epoch": 2.3577512776831346, + "grad_norm": 0.5359559655189514, + "learning_rate": 0.0002, + "loss": 1.6323, + "step": 3460 + }, + { + "epoch": 2.364565587734242, + "grad_norm": 0.5053277611732483, + "learning_rate": 0.0002, + "loss": 1.5973, + "step": 3470 + }, + { + "epoch": 2.3713798977853493, + "grad_norm": 0.5058915019035339, + "learning_rate": 0.0002, + "loss": 1.5673, + "step": 3480 + }, + { + "epoch": 2.3781942078364566, + "grad_norm": 0.5314899682998657, + "learning_rate": 0.0002, + "loss": 1.5434, + "step": 3490 + }, + { + "epoch": 2.385008517887564, + "grad_norm": 0.48035913705825806, + "learning_rate": 0.0002, + "loss": 1.5203, + "step": 3500 + }, + { + "epoch": 2.3918228279386713, + "grad_norm": 0.45864903926849365, + "learning_rate": 0.0002, + "loss": 1.5936, + "step": 3510 + }, + { + "epoch": 2.3986371379897786, + "grad_norm": 0.4553050398826599, + "learning_rate": 0.0002, + "loss": 1.6285, + "step": 3520 + }, + { + "epoch": 2.405451448040886, + "grad_norm": 0.4483442008495331, + "learning_rate": 0.0002, + "loss": 1.5444, + "step": 3530 + }, + { + "epoch": 2.4122657580919933, + "grad_norm": 0.5043742060661316, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 3540 + }, + { + "epoch": 2.4190800681431006, + "grad_norm": 0.44807168841362, + "learning_rate": 0.0002, + "loss": 1.5692, + "step": 3550 + }, + { + "epoch": 2.425894378194208, + "grad_norm": 0.5065137147903442, + "learning_rate": 0.0002, + "loss": 1.6306, + "step": 3560 + }, + { + "epoch": 2.4327086882453153, + "grad_norm": 0.5186443328857422, + "learning_rate": 0.0002, + "loss": 1.5842, + "step": 3570 + }, + { + "epoch": 2.4395229982964226, + "grad_norm": 0.49743232131004333, + "learning_rate": 0.0002, + "loss": 1.5956, + "step": 3580 + }, + { + "epoch": 2.44633730834753, + "grad_norm": 0.524450421333313, + "learning_rate": 0.0002, + "loss": 1.6021, + "step": 3590 + }, + { + "epoch": 2.4531516183986373, + "grad_norm": 0.5053797364234924, + "learning_rate": 0.0002, + "loss": 1.6283, + "step": 3600 + }, + { + "epoch": 2.4599659284497446, + "grad_norm": 0.5223091840744019, + "learning_rate": 0.0002, + "loss": 1.6335, + "step": 3610 + }, + { + "epoch": 2.466780238500852, + "grad_norm": 0.4763810932636261, + "learning_rate": 0.0002, + "loss": 1.6315, + "step": 3620 + }, + { + "epoch": 2.4735945485519593, + "grad_norm": 0.5097282528877258, + "learning_rate": 0.0002, + "loss": 1.5623, + "step": 3630 + }, + { + "epoch": 2.4804088586030666, + "grad_norm": 0.5831942558288574, + "learning_rate": 0.0002, + "loss": 1.605, + "step": 3640 + }, + { + "epoch": 2.487223168654174, + "grad_norm": 0.47573572397232056, + "learning_rate": 0.0002, + "loss": 1.6074, + "step": 3650 + }, + { + "epoch": 2.4940374787052813, + "grad_norm": 0.49602726101875305, + "learning_rate": 0.0002, + "loss": 1.6411, + "step": 3660 + }, + { + "epoch": 2.500851788756388, + "grad_norm": 0.5468524694442749, + "learning_rate": 0.0002, + "loss": 1.571, + "step": 3670 + }, + { + "epoch": 2.507666098807496, + "grad_norm": 0.45899373292922974, + "learning_rate": 0.0002, + "loss": 1.5782, + "step": 3680 + }, + { + "epoch": 2.514480408858603, + "grad_norm": 0.5031567215919495, + "learning_rate": 0.0002, + "loss": 1.5114, + "step": 3690 + }, + { + "epoch": 2.5212947189097106, + "grad_norm": 0.5224900841712952, + "learning_rate": 0.0002, + "loss": 1.538, + "step": 3700 + }, + { + "epoch": 2.5281090289608175, + "grad_norm": 0.504769504070282, + "learning_rate": 0.0002, + "loss": 1.5269, + "step": 3710 + }, + { + "epoch": 2.5349233390119252, + "grad_norm": 0.6120529770851135, + "learning_rate": 0.0002, + "loss": 1.5141, + "step": 3720 + }, + { + "epoch": 2.541737649063032, + "grad_norm": 0.47930678725242615, + "learning_rate": 0.0002, + "loss": 1.5666, + "step": 3730 + }, + { + "epoch": 2.54855195911414, + "grad_norm": 0.5039092302322388, + "learning_rate": 0.0002, + "loss": 1.6156, + "step": 3740 + }, + { + "epoch": 2.555366269165247, + "grad_norm": 0.49758994579315186, + "learning_rate": 0.0002, + "loss": 1.5585, + "step": 3750 + }, + { + "epoch": 2.5621805792163546, + "grad_norm": 0.44739171862602234, + "learning_rate": 0.0002, + "loss": 1.5351, + "step": 3760 + }, + { + "epoch": 2.5689948892674614, + "grad_norm": 0.47056373953819275, + "learning_rate": 0.0002, + "loss": 1.5099, + "step": 3770 + }, + { + "epoch": 2.575809199318569, + "grad_norm": 0.5077595114707947, + "learning_rate": 0.0002, + "loss": 1.5524, + "step": 3780 + }, + { + "epoch": 2.582623509369676, + "grad_norm": 0.4981902837753296, + "learning_rate": 0.0002, + "loss": 1.5524, + "step": 3790 + }, + { + "epoch": 2.589437819420784, + "grad_norm": 0.5736238360404968, + "learning_rate": 0.0002, + "loss": 1.5381, + "step": 3800 + }, + { + "epoch": 2.5962521294718908, + "grad_norm": 0.4898384213447571, + "learning_rate": 0.0002, + "loss": 1.67, + "step": 3810 + }, + { + "epoch": 2.6030664395229985, + "grad_norm": 0.4611325263977051, + "learning_rate": 0.0002, + "loss": 1.5411, + "step": 3820 + }, + { + "epoch": 2.6098807495741054, + "grad_norm": 0.5285341739654541, + "learning_rate": 0.0002, + "loss": 1.5662, + "step": 3830 + }, + { + "epoch": 2.616695059625213, + "grad_norm": 0.5679430961608887, + "learning_rate": 0.0002, + "loss": 1.5875, + "step": 3840 + }, + { + "epoch": 2.62350936967632, + "grad_norm": 0.48532548546791077, + "learning_rate": 0.0002, + "loss": 1.5544, + "step": 3850 + }, + { + "epoch": 2.630323679727428, + "grad_norm": 0.45506492257118225, + "learning_rate": 0.0002, + "loss": 1.579, + "step": 3860 + }, + { + "epoch": 2.6371379897785348, + "grad_norm": 0.6552556753158569, + "learning_rate": 0.0002, + "loss": 1.5775, + "step": 3870 + }, + { + "epoch": 2.643952299829642, + "grad_norm": 0.537874698638916, + "learning_rate": 0.0002, + "loss": 1.608, + "step": 3880 + }, + { + "epoch": 2.6507666098807494, + "grad_norm": 0.46102389693260193, + "learning_rate": 0.0002, + "loss": 1.5653, + "step": 3890 + }, + { + "epoch": 2.6575809199318567, + "grad_norm": 0.45531195402145386, + "learning_rate": 0.0002, + "loss": 1.5293, + "step": 3900 + }, + { + "epoch": 2.664395229982964, + "grad_norm": 0.5327293872833252, + "learning_rate": 0.0002, + "loss": 1.5492, + "step": 3910 + }, + { + "epoch": 2.6712095400340714, + "grad_norm": 0.4968956410884857, + "learning_rate": 0.0002, + "loss": 1.5921, + "step": 3920 + }, + { + "epoch": 2.6780238500851787, + "grad_norm": 0.4790082275867462, + "learning_rate": 0.0002, + "loss": 1.5823, + "step": 3930 + }, + { + "epoch": 2.684838160136286, + "grad_norm": 0.5392967462539673, + "learning_rate": 0.0002, + "loss": 1.615, + "step": 3940 + }, + { + "epoch": 2.6916524701873934, + "grad_norm": 0.5076649785041809, + "learning_rate": 0.0002, + "loss": 1.6218, + "step": 3950 + }, + { + "epoch": 2.6984667802385007, + "grad_norm": 0.5628064274787903, + "learning_rate": 0.0002, + "loss": 1.6478, + "step": 3960 + }, + { + "epoch": 2.705281090289608, + "grad_norm": 0.5012659430503845, + "learning_rate": 0.0002, + "loss": 1.5417, + "step": 3970 + }, + { + "epoch": 2.7120954003407154, + "grad_norm": 0.4947647452354431, + "learning_rate": 0.0002, + "loss": 1.5339, + "step": 3980 + }, + { + "epoch": 2.7189097103918227, + "grad_norm": 0.4890969693660736, + "learning_rate": 0.0002, + "loss": 1.5724, + "step": 3990 + }, + { + "epoch": 2.72572402044293, + "grad_norm": 0.4471694231033325, + "learning_rate": 0.0002, + "loss": 1.5746, + "step": 4000 + }, + { + "epoch": 2.7325383304940374, + "grad_norm": 0.5116439461708069, + "learning_rate": 0.0002, + "loss": 1.6669, + "step": 4010 + }, + { + "epoch": 2.7393526405451447, + "grad_norm": 0.5720411539077759, + "learning_rate": 0.0002, + "loss": 1.584, + "step": 4020 + }, + { + "epoch": 2.746166950596252, + "grad_norm": 0.5529406070709229, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 4030 + }, + { + "epoch": 2.7529812606473594, + "grad_norm": 0.5229396820068359, + "learning_rate": 0.0002, + "loss": 1.6296, + "step": 4040 + }, + { + "epoch": 2.7597955706984667, + "grad_norm": 0.5270276069641113, + "learning_rate": 0.0002, + "loss": 1.5363, + "step": 4050 + }, + { + "epoch": 2.766609880749574, + "grad_norm": 0.48413026332855225, + "learning_rate": 0.0002, + "loss": 1.6305, + "step": 4060 + }, + { + "epoch": 2.7734241908006814, + "grad_norm": 0.5145403742790222, + "learning_rate": 0.0002, + "loss": 1.5488, + "step": 4070 + }, + { + "epoch": 2.7802385008517887, + "grad_norm": 0.48626071214675903, + "learning_rate": 0.0002, + "loss": 1.6023, + "step": 4080 + }, + { + "epoch": 2.787052810902896, + "grad_norm": 0.5018984079360962, + "learning_rate": 0.0002, + "loss": 1.6082, + "step": 4090 + }, + { + "epoch": 2.7938671209540034, + "grad_norm": 0.4881938695907593, + "learning_rate": 0.0002, + "loss": 1.7166, + "step": 4100 + }, + { + "epoch": 2.8006814310051107, + "grad_norm": 0.5151546001434326, + "learning_rate": 0.0002, + "loss": 1.515, + "step": 4110 + }, + { + "epoch": 2.807495741056218, + "grad_norm": 0.5109850764274597, + "learning_rate": 0.0002, + "loss": 1.6069, + "step": 4120 + }, + { + "epoch": 2.8143100511073254, + "grad_norm": 0.5109251141548157, + "learning_rate": 0.0002, + "loss": 1.6153, + "step": 4130 + }, + { + "epoch": 2.8211243611584327, + "grad_norm": 0.5025496482849121, + "learning_rate": 0.0002, + "loss": 1.6365, + "step": 4140 + }, + { + "epoch": 2.82793867120954, + "grad_norm": 0.49027004837989807, + "learning_rate": 0.0002, + "loss": 1.6292, + "step": 4150 + }, + { + "epoch": 2.8347529812606473, + "grad_norm": 0.4957362413406372, + "learning_rate": 0.0002, + "loss": 1.5591, + "step": 4160 + }, + { + "epoch": 2.8415672913117547, + "grad_norm": 0.5159927606582642, + "learning_rate": 0.0002, + "loss": 1.6759, + "step": 4170 + }, + { + "epoch": 2.848381601362862, + "grad_norm": 0.6040670871734619, + "learning_rate": 0.0002, + "loss": 1.577, + "step": 4180 + }, + { + "epoch": 2.8551959114139693, + "grad_norm": 0.5489953756332397, + "learning_rate": 0.0002, + "loss": 1.5295, + "step": 4190 + }, + { + "epoch": 2.8620102214650767, + "grad_norm": 0.5416634678840637, + "learning_rate": 0.0002, + "loss": 1.5909, + "step": 4200 + }, + { + "epoch": 2.868824531516184, + "grad_norm": 0.5278245210647583, + "learning_rate": 0.0002, + "loss": 1.6014, + "step": 4210 + }, + { + "epoch": 2.8756388415672913, + "grad_norm": 0.43382319808006287, + "learning_rate": 0.0002, + "loss": 1.4848, + "step": 4220 + }, + { + "epoch": 2.8824531516183987, + "grad_norm": 0.4724387228488922, + "learning_rate": 0.0002, + "loss": 1.6092, + "step": 4230 + }, + { + "epoch": 2.889267461669506, + "grad_norm": 0.49824780225753784, + "learning_rate": 0.0002, + "loss": 1.5748, + "step": 4240 + }, + { + "epoch": 2.8960817717206133, + "grad_norm": 0.5360262989997864, + "learning_rate": 0.0002, + "loss": 1.6163, + "step": 4250 + }, + { + "epoch": 2.9028960817717206, + "grad_norm": 0.49090322852134705, + "learning_rate": 0.0002, + "loss": 1.5934, + "step": 4260 + }, + { + "epoch": 2.909710391822828, + "grad_norm": 0.5613328218460083, + "learning_rate": 0.0002, + "loss": 1.624, + "step": 4270 + }, + { + "epoch": 2.9165247018739353, + "grad_norm": 0.4611356258392334, + "learning_rate": 0.0002, + "loss": 1.5713, + "step": 4280 + }, + { + "epoch": 2.9233390119250426, + "grad_norm": 0.550897479057312, + "learning_rate": 0.0002, + "loss": 1.5457, + "step": 4290 + }, + { + "epoch": 2.93015332197615, + "grad_norm": 0.5089612603187561, + "learning_rate": 0.0002, + "loss": 1.6225, + "step": 4300 + }, + { + "epoch": 2.9369676320272573, + "grad_norm": 0.5210904479026794, + "learning_rate": 0.0002, + "loss": 1.5897, + "step": 4310 + }, + { + "epoch": 2.9437819420783646, + "grad_norm": 0.5506424903869629, + "learning_rate": 0.0002, + "loss": 1.6198, + "step": 4320 + }, + { + "epoch": 2.950596252129472, + "grad_norm": 0.5118561387062073, + "learning_rate": 0.0002, + "loss": 1.6395, + "step": 4330 + }, + { + "epoch": 2.9574105621805793, + "grad_norm": 0.5034464597702026, + "learning_rate": 0.0002, + "loss": 1.704, + "step": 4340 + }, + { + "epoch": 2.9642248722316866, + "grad_norm": 0.5019990801811218, + "learning_rate": 0.0002, + "loss": 1.6314, + "step": 4350 + }, + { + "epoch": 2.971039182282794, + "grad_norm": 0.5423325300216675, + "learning_rate": 0.0002, + "loss": 1.6161, + "step": 4360 + }, + { + "epoch": 2.9778534923339013, + "grad_norm": 0.5287469625473022, + "learning_rate": 0.0002, + "loss": 1.6144, + "step": 4370 + }, + { + "epoch": 2.9846678023850086, + "grad_norm": 0.5206913352012634, + "learning_rate": 0.0002, + "loss": 1.5227, + "step": 4380 + }, + { + "epoch": 2.991482112436116, + "grad_norm": 0.5407394170761108, + "learning_rate": 0.0002, + "loss": 1.6026, + "step": 4390 + }, + { + "epoch": 2.9982964224872233, + "grad_norm": 0.5244600176811218, + "learning_rate": 0.0002, + "loss": 1.5908, + "step": 4400 + }, + { + "epoch": 2.9996592844974446, + "eval_loss": 1.8412635326385498, + "eval_runtime": 65.5583, + "eval_samples_per_second": 7.734, + "eval_steps_per_second": 0.976, + "step": 4402 + }, + { + "epoch": 3.0051107325383306, + "grad_norm": 0.5172150731086731, + "learning_rate": 0.0002, + "loss": 1.5157, + "step": 4410 + }, + { + "epoch": 3.011925042589438, + "grad_norm": 0.6882525086402893, + "learning_rate": 0.0002, + "loss": 1.398, + "step": 4420 + }, + { + "epoch": 3.0187393526405453, + "grad_norm": 0.6435003280639648, + "learning_rate": 0.0002, + "loss": 1.3884, + "step": 4430 + }, + { + "epoch": 3.0255536626916526, + "grad_norm": 0.7126057147979736, + "learning_rate": 0.0002, + "loss": 1.4493, + "step": 4440 + }, + { + "epoch": 3.03236797274276, + "grad_norm": 0.6634385585784912, + "learning_rate": 0.0002, + "loss": 1.4397, + "step": 4450 + }, + { + "epoch": 3.0391822827938673, + "grad_norm": 0.6468435525894165, + "learning_rate": 0.0002, + "loss": 1.3674, + "step": 4460 + }, + { + "epoch": 3.0459965928449746, + "grad_norm": 0.5690478086471558, + "learning_rate": 0.0002, + "loss": 1.4045, + "step": 4470 + }, + { + "epoch": 3.052810902896082, + "grad_norm": 0.7323708534240723, + "learning_rate": 0.0002, + "loss": 1.3742, + "step": 4480 + }, + { + "epoch": 3.0596252129471893, + "grad_norm": 0.6989302039146423, + "learning_rate": 0.0002, + "loss": 1.3281, + "step": 4490 + }, + { + "epoch": 3.0664395229982966, + "grad_norm": 0.6704450845718384, + "learning_rate": 0.0002, + "loss": 1.379, + "step": 4500 + }, + { + "epoch": 3.073253833049404, + "grad_norm": 0.769137442111969, + "learning_rate": 0.0002, + "loss": 1.4028, + "step": 4510 + }, + { + "epoch": 3.0800681431005112, + "grad_norm": 0.6556448936462402, + "learning_rate": 0.0002, + "loss": 1.4295, + "step": 4520 + }, + { + "epoch": 3.0868824531516186, + "grad_norm": 0.7143950462341309, + "learning_rate": 0.0002, + "loss": 1.2763, + "step": 4530 + }, + { + "epoch": 3.093696763202726, + "grad_norm": 0.7060510516166687, + "learning_rate": 0.0002, + "loss": 1.4806, + "step": 4540 + }, + { + "epoch": 3.1005110732538332, + "grad_norm": 0.6637526750564575, + "learning_rate": 0.0002, + "loss": 1.4097, + "step": 4550 + }, + { + "epoch": 3.1073253833049406, + "grad_norm": 0.822989284992218, + "learning_rate": 0.0002, + "loss": 1.4752, + "step": 4560 + }, + { + "epoch": 3.114139693356048, + "grad_norm": 0.5542152523994446, + "learning_rate": 0.0002, + "loss": 1.4994, + "step": 4570 + }, + { + "epoch": 3.1209540034071552, + "grad_norm": 0.7780306935310364, + "learning_rate": 0.0002, + "loss": 1.4306, + "step": 4580 + }, + { + "epoch": 3.1277683134582626, + "grad_norm": 0.7372637987136841, + "learning_rate": 0.0002, + "loss": 1.3909, + "step": 4590 + }, + { + "epoch": 3.1345826235093694, + "grad_norm": 0.6730087995529175, + "learning_rate": 0.0002, + "loss": 1.3989, + "step": 4600 + }, + { + "epoch": 3.1413969335604772, + "grad_norm": 0.6687398552894592, + "learning_rate": 0.0002, + "loss": 1.3591, + "step": 4610 + }, + { + "epoch": 3.148211243611584, + "grad_norm": 0.7645083665847778, + "learning_rate": 0.0002, + "loss": 1.436, + "step": 4620 + }, + { + "epoch": 3.155025553662692, + "grad_norm": 0.6770380139350891, + "learning_rate": 0.0002, + "loss": 1.3681, + "step": 4630 + }, + { + "epoch": 3.1618398637137988, + "grad_norm": 0.7200576663017273, + "learning_rate": 0.0002, + "loss": 1.405, + "step": 4640 + }, + { + "epoch": 3.168654173764906, + "grad_norm": 0.6663638949394226, + "learning_rate": 0.0002, + "loss": 1.3752, + "step": 4650 + }, + { + "epoch": 3.1754684838160134, + "grad_norm": 0.6602960228919983, + "learning_rate": 0.0002, + "loss": 1.4099, + "step": 4660 + }, + { + "epoch": 3.1822827938671208, + "grad_norm": 0.7838228344917297, + "learning_rate": 0.0002, + "loss": 1.4003, + "step": 4670 + }, + { + "epoch": 3.189097103918228, + "grad_norm": 0.7559184432029724, + "learning_rate": 0.0002, + "loss": 1.3853, + "step": 4680 + }, + { + "epoch": 3.1959114139693354, + "grad_norm": 0.6609814167022705, + "learning_rate": 0.0002, + "loss": 1.4516, + "step": 4690 + }, + { + "epoch": 3.2027257240204428, + "grad_norm": 0.8470419645309448, + "learning_rate": 0.0002, + "loss": 1.4464, + "step": 4700 + }, + { + "epoch": 3.20954003407155, + "grad_norm": 0.7282822728157043, + "learning_rate": 0.0002, + "loss": 1.428, + "step": 4710 + }, + { + "epoch": 3.2163543441226574, + "grad_norm": 0.6722773313522339, + "learning_rate": 0.0002, + "loss": 1.5261, + "step": 4720 + }, + { + "epoch": 3.2231686541737647, + "grad_norm": 0.7630265355110168, + "learning_rate": 0.0002, + "loss": 1.3809, + "step": 4730 + }, + { + "epoch": 3.229982964224872, + "grad_norm": 0.7102773785591125, + "learning_rate": 0.0002, + "loss": 1.42, + "step": 4740 + }, + { + "epoch": 3.2367972742759794, + "grad_norm": 0.7778299450874329, + "learning_rate": 0.0002, + "loss": 1.3529, + "step": 4750 + }, + { + "epoch": 3.2436115843270867, + "grad_norm": 0.7189921736717224, + "learning_rate": 0.0002, + "loss": 1.4715, + "step": 4760 + }, + { + "epoch": 3.250425894378194, + "grad_norm": 0.7708092331886292, + "learning_rate": 0.0002, + "loss": 1.4328, + "step": 4770 + }, + { + "epoch": 3.2572402044293014, + "grad_norm": 0.7208452224731445, + "learning_rate": 0.0002, + "loss": 1.3855, + "step": 4780 + }, + { + "epoch": 3.2640545144804087, + "grad_norm": 0.7220432758331299, + "learning_rate": 0.0002, + "loss": 1.3206, + "step": 4790 + }, + { + "epoch": 3.270868824531516, + "grad_norm": 0.7064954042434692, + "learning_rate": 0.0002, + "loss": 1.463, + "step": 4800 + }, + { + "epoch": 3.2776831345826234, + "grad_norm": 0.6618382334709167, + "learning_rate": 0.0002, + "loss": 1.4236, + "step": 4810 + }, + { + "epoch": 3.2844974446337307, + "grad_norm": 0.6854256391525269, + "learning_rate": 0.0002, + "loss": 1.3878, + "step": 4820 + }, + { + "epoch": 3.291311754684838, + "grad_norm": 0.6036319136619568, + "learning_rate": 0.0002, + "loss": 1.4236, + "step": 4830 + }, + { + "epoch": 3.2981260647359454, + "grad_norm": 0.714678943157196, + "learning_rate": 0.0002, + "loss": 1.4796, + "step": 4840 + }, + { + "epoch": 3.3049403747870527, + "grad_norm": 0.7218600511550903, + "learning_rate": 0.0002, + "loss": 1.4273, + "step": 4850 + }, + { + "epoch": 3.31175468483816, + "grad_norm": 0.7243074774742126, + "learning_rate": 0.0002, + "loss": 1.3915, + "step": 4860 + }, + { + "epoch": 3.3185689948892674, + "grad_norm": 0.7058630585670471, + "learning_rate": 0.0002, + "loss": 1.4088, + "step": 4870 + }, + { + "epoch": 3.3253833049403747, + "grad_norm": 0.7091076970100403, + "learning_rate": 0.0002, + "loss": 1.3837, + "step": 4880 + }, + { + "epoch": 3.332197614991482, + "grad_norm": 0.7375147342681885, + "learning_rate": 0.0002, + "loss": 1.4745, + "step": 4890 + }, + { + "epoch": 3.3390119250425894, + "grad_norm": 0.9426755309104919, + "learning_rate": 0.0002, + "loss": 1.4826, + "step": 4900 + }, + { + "epoch": 3.3458262350936967, + "grad_norm": 0.6508213877677917, + "learning_rate": 0.0002, + "loss": 1.369, + "step": 4910 + }, + { + "epoch": 3.352640545144804, + "grad_norm": 0.6945043206214905, + "learning_rate": 0.0002, + "loss": 1.3839, + "step": 4920 + }, + { + "epoch": 3.3594548551959114, + "grad_norm": 0.6335888504981995, + "learning_rate": 0.0002, + "loss": 1.3571, + "step": 4930 + }, + { + "epoch": 3.3662691652470187, + "grad_norm": 0.6947107911109924, + "learning_rate": 0.0002, + "loss": 1.4391, + "step": 4940 + }, + { + "epoch": 3.373083475298126, + "grad_norm": 0.8204733729362488, + "learning_rate": 0.0002, + "loss": 1.3885, + "step": 4950 + }, + { + "epoch": 3.3798977853492334, + "grad_norm": 0.7212244868278503, + "learning_rate": 0.0002, + "loss": 1.4886, + "step": 4960 + }, + { + "epoch": 3.3867120954003407, + "grad_norm": 0.6053042411804199, + "learning_rate": 0.0002, + "loss": 1.4581, + "step": 4970 + }, + { + "epoch": 3.393526405451448, + "grad_norm": 0.7820029854774475, + "learning_rate": 0.0002, + "loss": 1.3863, + "step": 4980 + }, + { + "epoch": 3.4003407155025553, + "grad_norm": 0.6866770386695862, + "learning_rate": 0.0002, + "loss": 1.4326, + "step": 4990 + }, + { + "epoch": 3.4071550255536627, + "grad_norm": 0.6652463674545288, + "learning_rate": 0.0002, + "loss": 1.4287, + "step": 5000 + }, + { + "epoch": 3.41396933560477, + "grad_norm": 1.1209032535552979, + "learning_rate": 0.0002, + "loss": 1.3667, + "step": 5010 + }, + { + "epoch": 3.4207836456558773, + "grad_norm": 0.8390814661979675, + "learning_rate": 0.0002, + "loss": 1.4461, + "step": 5020 + }, + { + "epoch": 3.4275979557069847, + "grad_norm": 0.7541858553886414, + "learning_rate": 0.0002, + "loss": 1.4556, + "step": 5030 + }, + { + "epoch": 3.434412265758092, + "grad_norm": 0.6902772784233093, + "learning_rate": 0.0002, + "loss": 1.4245, + "step": 5040 + }, + { + "epoch": 3.4412265758091993, + "grad_norm": 0.7070329785346985, + "learning_rate": 0.0002, + "loss": 1.3953, + "step": 5050 + }, + { + "epoch": 3.4480408858603067, + "grad_norm": 0.8075643181800842, + "learning_rate": 0.0002, + "loss": 1.3903, + "step": 5060 + }, + { + "epoch": 3.454855195911414, + "grad_norm": 0.7133861780166626, + "learning_rate": 0.0002, + "loss": 1.3929, + "step": 5070 + }, + { + "epoch": 3.4616695059625213, + "grad_norm": 0.6631823182106018, + "learning_rate": 0.0002, + "loss": 1.4632, + "step": 5080 + }, + { + "epoch": 3.4684838160136287, + "grad_norm": 0.673870325088501, + "learning_rate": 0.0002, + "loss": 1.4162, + "step": 5090 + }, + { + "epoch": 3.475298126064736, + "grad_norm": 0.6438634395599365, + "learning_rate": 0.0002, + "loss": 1.4247, + "step": 5100 + }, + { + "epoch": 3.4821124361158433, + "grad_norm": 0.7560495734214783, + "learning_rate": 0.0002, + "loss": 1.4421, + "step": 5110 + }, + { + "epoch": 3.4889267461669506, + "grad_norm": 0.6877814531326294, + "learning_rate": 0.0002, + "loss": 1.4125, + "step": 5120 + }, + { + "epoch": 3.495741056218058, + "grad_norm": 0.7031328678131104, + "learning_rate": 0.0002, + "loss": 1.4308, + "step": 5130 + }, + { + "epoch": 3.5025553662691653, + "grad_norm": 0.6797195672988892, + "learning_rate": 0.0002, + "loss": 1.3705, + "step": 5140 + }, + { + "epoch": 3.5093696763202726, + "grad_norm": 0.6766413450241089, + "learning_rate": 0.0002, + "loss": 1.4687, + "step": 5150 + }, + { + "epoch": 3.51618398637138, + "grad_norm": 0.666656494140625, + "learning_rate": 0.0002, + "loss": 1.4194, + "step": 5160 + }, + { + "epoch": 3.5229982964224873, + "grad_norm": 0.74996417760849, + "learning_rate": 0.0002, + "loss": 1.469, + "step": 5170 + }, + { + "epoch": 3.5298126064735946, + "grad_norm": 0.7370911836624146, + "learning_rate": 0.0002, + "loss": 1.4848, + "step": 5180 + }, + { + "epoch": 3.536626916524702, + "grad_norm": 0.9063456654548645, + "learning_rate": 0.0002, + "loss": 1.4523, + "step": 5190 + }, + { + "epoch": 3.5434412265758093, + "grad_norm": 0.6861422657966614, + "learning_rate": 0.0002, + "loss": 1.4726, + "step": 5200 + }, + { + "epoch": 3.5502555366269166, + "grad_norm": 0.7104039788246155, + "learning_rate": 0.0002, + "loss": 1.4803, + "step": 5210 + }, + { + "epoch": 3.557069846678024, + "grad_norm": 0.6578653454780579, + "learning_rate": 0.0002, + "loss": 1.4313, + "step": 5220 + }, + { + "epoch": 3.5638841567291313, + "grad_norm": 0.7336562275886536, + "learning_rate": 0.0002, + "loss": 1.4596, + "step": 5230 + }, + { + "epoch": 3.5706984667802386, + "grad_norm": 0.7163010835647583, + "learning_rate": 0.0002, + "loss": 1.4591, + "step": 5240 + }, + { + "epoch": 3.577512776831346, + "grad_norm": 0.8112391233444214, + "learning_rate": 0.0002, + "loss": 1.3814, + "step": 5250 + }, + { + "epoch": 3.5843270868824533, + "grad_norm": 0.7260391116142273, + "learning_rate": 0.0002, + "loss": 1.4249, + "step": 5260 + }, + { + "epoch": 3.5911413969335606, + "grad_norm": 0.7038731575012207, + "learning_rate": 0.0002, + "loss": 1.4249, + "step": 5270 + }, + { + "epoch": 3.597955706984668, + "grad_norm": 0.7864376902580261, + "learning_rate": 0.0002, + "loss": 1.4172, + "step": 5280 + }, + { + "epoch": 3.6047700170357753, + "grad_norm": 0.6968383193016052, + "learning_rate": 0.0002, + "loss": 1.4637, + "step": 5290 + }, + { + "epoch": 3.6115843270868826, + "grad_norm": 0.6726206541061401, + "learning_rate": 0.0002, + "loss": 1.5269, + "step": 5300 + }, + { + "epoch": 3.61839863713799, + "grad_norm": 0.6716854572296143, + "learning_rate": 0.0002, + "loss": 1.4199, + "step": 5310 + }, + { + "epoch": 3.6252129471890973, + "grad_norm": 0.7229742407798767, + "learning_rate": 0.0002, + "loss": 1.4686, + "step": 5320 + }, + { + "epoch": 3.6320272572402046, + "grad_norm": 0.7338683009147644, + "learning_rate": 0.0002, + "loss": 1.4441, + "step": 5330 + }, + { + "epoch": 3.638841567291312, + "grad_norm": 0.771672785282135, + "learning_rate": 0.0002, + "loss": 1.4116, + "step": 5340 + }, + { + "epoch": 3.645655877342419, + "grad_norm": 0.7024078369140625, + "learning_rate": 0.0002, + "loss": 1.4007, + "step": 5350 + }, + { + "epoch": 3.6524701873935266, + "grad_norm": 0.6847538352012634, + "learning_rate": 0.0002, + "loss": 1.4996, + "step": 5360 + }, + { + "epoch": 3.6592844974446335, + "grad_norm": 0.71802818775177, + "learning_rate": 0.0002, + "loss": 1.4111, + "step": 5370 + }, + { + "epoch": 3.6660988074957412, + "grad_norm": 0.78530353307724, + "learning_rate": 0.0002, + "loss": 1.4224, + "step": 5380 + }, + { + "epoch": 3.672913117546848, + "grad_norm": 0.7262226939201355, + "learning_rate": 0.0002, + "loss": 1.4582, + "step": 5390 + }, + { + "epoch": 3.679727427597956, + "grad_norm": 0.7608316540718079, + "learning_rate": 0.0002, + "loss": 1.4704, + "step": 5400 + }, + { + "epoch": 3.686541737649063, + "grad_norm": 0.6994926333427429, + "learning_rate": 0.0002, + "loss": 1.3742, + "step": 5410 + }, + { + "epoch": 3.6933560477001706, + "grad_norm": 0.7888479828834534, + "learning_rate": 0.0002, + "loss": 1.4738, + "step": 5420 + }, + { + "epoch": 3.7001703577512775, + "grad_norm": 0.7053858041763306, + "learning_rate": 0.0002, + "loss": 1.4213, + "step": 5430 + }, + { + "epoch": 3.7069846678023852, + "grad_norm": 0.7063165903091431, + "learning_rate": 0.0002, + "loss": 1.4988, + "step": 5440 + }, + { + "epoch": 3.713798977853492, + "grad_norm": 0.6603744626045227, + "learning_rate": 0.0002, + "loss": 1.4386, + "step": 5450 + }, + { + "epoch": 3.7206132879046, + "grad_norm": 0.7043602466583252, + "learning_rate": 0.0002, + "loss": 1.4695, + "step": 5460 + }, + { + "epoch": 3.7274275979557068, + "grad_norm": 0.7026081681251526, + "learning_rate": 0.0002, + "loss": 1.5051, + "step": 5470 + }, + { + "epoch": 3.7342419080068145, + "grad_norm": 0.7200090289115906, + "learning_rate": 0.0002, + "loss": 1.5613, + "step": 5480 + }, + { + "epoch": 3.7410562180579214, + "grad_norm": 0.7170904278755188, + "learning_rate": 0.0002, + "loss": 1.4182, + "step": 5490 + }, + { + "epoch": 3.747870528109029, + "grad_norm": 0.7489104866981506, + "learning_rate": 0.0002, + "loss": 1.4344, + "step": 5500 + }, + { + "epoch": 3.754684838160136, + "grad_norm": 0.6540989875793457, + "learning_rate": 0.0002, + "loss": 1.4911, + "step": 5510 + }, + { + "epoch": 3.761499148211244, + "grad_norm": 0.6654048562049866, + "learning_rate": 0.0002, + "loss": 1.4955, + "step": 5520 + }, + { + "epoch": 3.7683134582623508, + "grad_norm": 0.6577395796775818, + "learning_rate": 0.0002, + "loss": 1.4487, + "step": 5530 + }, + { + "epoch": 3.7751277683134585, + "grad_norm": 0.7762192487716675, + "learning_rate": 0.0002, + "loss": 1.4283, + "step": 5540 + }, + { + "epoch": 3.7819420783645654, + "grad_norm": 0.6336314678192139, + "learning_rate": 0.0002, + "loss": 1.4727, + "step": 5550 + }, + { + "epoch": 3.7887563884156727, + "grad_norm": 0.7098057866096497, + "learning_rate": 0.0002, + "loss": 1.4588, + "step": 5560 + }, + { + "epoch": 3.79557069846678, + "grad_norm": 0.7379715442657471, + "learning_rate": 0.0002, + "loss": 1.4679, + "step": 5570 + }, + { + "epoch": 3.8023850085178874, + "grad_norm": 0.6726924777030945, + "learning_rate": 0.0002, + "loss": 1.4633, + "step": 5580 + }, + { + "epoch": 3.8091993185689947, + "grad_norm": 1.1212009191513062, + "learning_rate": 0.0002, + "loss": 1.4751, + "step": 5590 + }, + { + "epoch": 3.816013628620102, + "grad_norm": 0.6503795981407166, + "learning_rate": 0.0002, + "loss": 1.4503, + "step": 5600 + }, + { + "epoch": 3.8228279386712094, + "grad_norm": 0.7041325569152832, + "learning_rate": 0.0002, + "loss": 1.4754, + "step": 5610 + }, + { + "epoch": 3.8296422487223167, + "grad_norm": 0.7962933778762817, + "learning_rate": 0.0002, + "loss": 1.4199, + "step": 5620 + }, + { + "epoch": 3.836456558773424, + "grad_norm": 0.6613591909408569, + "learning_rate": 0.0002, + "loss": 1.4672, + "step": 5630 + }, + { + "epoch": 3.8432708688245314, + "grad_norm": 0.7293516397476196, + "learning_rate": 0.0002, + "loss": 1.5688, + "step": 5640 + }, + { + "epoch": 3.8500851788756387, + "grad_norm": 0.7388607859611511, + "learning_rate": 0.0002, + "loss": 1.4149, + "step": 5650 + }, + { + "epoch": 3.856899488926746, + "grad_norm": 0.6440677642822266, + "learning_rate": 0.0002, + "loss": 1.4743, + "step": 5660 + }, + { + "epoch": 3.8637137989778534, + "grad_norm": 0.7729013562202454, + "learning_rate": 0.0002, + "loss": 1.5082, + "step": 5670 + }, + { + "epoch": 3.8705281090289607, + "grad_norm": 0.6696794033050537, + "learning_rate": 0.0002, + "loss": 1.4608, + "step": 5680 + }, + { + "epoch": 3.877342419080068, + "grad_norm": 0.7151781320571899, + "learning_rate": 0.0002, + "loss": 1.472, + "step": 5690 + }, + { + "epoch": 3.8841567291311754, + "grad_norm": 0.6736966371536255, + "learning_rate": 0.0002, + "loss": 1.4923, + "step": 5700 + }, + { + "epoch": 3.8909710391822827, + "grad_norm": 0.7444243431091309, + "learning_rate": 0.0002, + "loss": 1.4453, + "step": 5710 + }, + { + "epoch": 3.89778534923339, + "grad_norm": 0.6701464653015137, + "learning_rate": 0.0002, + "loss": 1.4562, + "step": 5720 + }, + { + "epoch": 3.9045996592844974, + "grad_norm": 0.7231952548027039, + "learning_rate": 0.0002, + "loss": 1.4478, + "step": 5730 + }, + { + "epoch": 3.9114139693356047, + "grad_norm": 0.831954300403595, + "learning_rate": 0.0002, + "loss": 1.4539, + "step": 5740 + }, + { + "epoch": 3.918228279386712, + "grad_norm": 0.7697733640670776, + "learning_rate": 0.0002, + "loss": 1.5122, + "step": 5750 + }, + { + "epoch": 3.9250425894378194, + "grad_norm": 0.6964395046234131, + "learning_rate": 0.0002, + "loss": 1.4552, + "step": 5760 + }, + { + "epoch": 3.9318568994889267, + "grad_norm": 0.6942925453186035, + "learning_rate": 0.0002, + "loss": 1.4688, + "step": 5770 + }, + { + "epoch": 3.938671209540034, + "grad_norm": 0.6491202712059021, + "learning_rate": 0.0002, + "loss": 1.4668, + "step": 5780 + }, + { + "epoch": 3.9454855195911414, + "grad_norm": 0.7004382610321045, + "learning_rate": 0.0002, + "loss": 1.4404, + "step": 5790 + }, + { + "epoch": 3.9522998296422487, + "grad_norm": 0.7337747812271118, + "learning_rate": 0.0002, + "loss": 1.5022, + "step": 5800 + }, + { + "epoch": 3.959114139693356, + "grad_norm": 0.6923640966415405, + "learning_rate": 0.0002, + "loss": 1.5314, + "step": 5810 + }, + { + "epoch": 3.9659284497444633, + "grad_norm": 0.6815266609191895, + "learning_rate": 0.0002, + "loss": 1.4811, + "step": 5820 + }, + { + "epoch": 3.9727427597955707, + "grad_norm": 0.6755654811859131, + "learning_rate": 0.0002, + "loss": 1.437, + "step": 5830 + }, + { + "epoch": 3.979557069846678, + "grad_norm": 0.6912487149238586, + "learning_rate": 0.0002, + "loss": 1.4277, + "step": 5840 + }, + { + "epoch": 3.9863713798977853, + "grad_norm": 0.6948044896125793, + "learning_rate": 0.0002, + "loss": 1.4654, + "step": 5850 + }, + { + "epoch": 3.9931856899488927, + "grad_norm": 0.6735455989837646, + "learning_rate": 0.0002, + "loss": 1.4779, + "step": 5860 + }, + { + "epoch": 4.0, + "grad_norm": 0.7005048990249634, + "learning_rate": 0.0002, + "loss": 1.5102, + "step": 5870 + }, + { + "epoch": 4.0, + "eval_loss": 1.923058032989502, + "eval_runtime": 58.9903, + "eval_samples_per_second": 8.595, + "eval_steps_per_second": 1.085, + "step": 5870 + }, + { + "epoch": 4.006814310051107, + "grad_norm": 0.809018075466156, + "learning_rate": 0.0002, + "loss": 1.2417, + "step": 5880 + }, + { + "epoch": 4.013628620102215, + "grad_norm": 0.9499403238296509, + "learning_rate": 0.0002, + "loss": 1.2874, + "step": 5890 + }, + { + "epoch": 4.0204429301533215, + "grad_norm": 0.7944574356079102, + "learning_rate": 0.0002, + "loss": 1.2245, + "step": 5900 + }, + { + "epoch": 4.027257240204429, + "grad_norm": 0.9501046538352966, + "learning_rate": 0.0002, + "loss": 1.2751, + "step": 5910 + }, + { + "epoch": 4.034071550255536, + "grad_norm": 0.8247923254966736, + "learning_rate": 0.0002, + "loss": 1.2706, + "step": 5920 + }, + { + "epoch": 4.040885860306644, + "grad_norm": 0.9358038902282715, + "learning_rate": 0.0002, + "loss": 1.2762, + "step": 5930 + }, + { + "epoch": 4.047700170357751, + "grad_norm": 1.0102452039718628, + "learning_rate": 0.0002, + "loss": 1.2953, + "step": 5940 + }, + { + "epoch": 4.054514480408859, + "grad_norm": 1.0248252153396606, + "learning_rate": 0.0002, + "loss": 1.216, + "step": 5950 + }, + { + "epoch": 4.0613287904599655, + "grad_norm": 1.0438553094863892, + "learning_rate": 0.0002, + "loss": 1.2115, + "step": 5960 + }, + { + "epoch": 4.068143100511073, + "grad_norm": 0.7964957356452942, + "learning_rate": 0.0002, + "loss": 1.2516, + "step": 5970 + }, + { + "epoch": 4.07495741056218, + "grad_norm": 0.9757015109062195, + "learning_rate": 0.0002, + "loss": 1.1555, + "step": 5980 + }, + { + "epoch": 4.081771720613288, + "grad_norm": 0.9157161116600037, + "learning_rate": 0.0002, + "loss": 1.2243, + "step": 5990 + }, + { + "epoch": 4.088586030664395, + "grad_norm": 0.9372851848602295, + "learning_rate": 0.0002, + "loss": 1.2481, + "step": 6000 + }, + { + "epoch": 4.095400340715503, + "grad_norm": 1.240779995918274, + "learning_rate": 0.0002, + "loss": 1.2091, + "step": 6010 + }, + { + "epoch": 4.1022146507666095, + "grad_norm": 0.8394840359687805, + "learning_rate": 0.0002, + "loss": 1.1727, + "step": 6020 + }, + { + "epoch": 4.109028960817717, + "grad_norm": 1.1081455945968628, + "learning_rate": 0.0002, + "loss": 1.2926, + "step": 6030 + }, + { + "epoch": 4.115843270868824, + "grad_norm": 0.9227745532989502, + "learning_rate": 0.0002, + "loss": 1.2417, + "step": 6040 + }, + { + "epoch": 4.122657580919932, + "grad_norm": 0.8487664461135864, + "learning_rate": 0.0002, + "loss": 1.1994, + "step": 6050 + }, + { + "epoch": 4.129471890971039, + "grad_norm": 0.9643339514732361, + "learning_rate": 0.0002, + "loss": 1.2378, + "step": 6060 + }, + { + "epoch": 4.136286201022147, + "grad_norm": 1.0296099185943604, + "learning_rate": 0.0002, + "loss": 1.2254, + "step": 6070 + }, + { + "epoch": 4.1431005110732535, + "grad_norm": 0.9534215927124023, + "learning_rate": 0.0002, + "loss": 1.2419, + "step": 6080 + }, + { + "epoch": 4.149914821124361, + "grad_norm": 0.9647086262702942, + "learning_rate": 0.0002, + "loss": 1.1849, + "step": 6090 + }, + { + "epoch": 4.156729131175468, + "grad_norm": 1.084836721420288, + "learning_rate": 0.0002, + "loss": 1.2713, + "step": 6100 + }, + { + "epoch": 4.163543441226576, + "grad_norm": 0.9315235614776611, + "learning_rate": 0.0002, + "loss": 1.1788, + "step": 6110 + }, + { + "epoch": 4.170357751277683, + "grad_norm": 0.9541679620742798, + "learning_rate": 0.0002, + "loss": 1.17, + "step": 6120 + }, + { + "epoch": 4.177172061328791, + "grad_norm": 0.9792100191116333, + "learning_rate": 0.0002, + "loss": 1.1407, + "step": 6130 + }, + { + "epoch": 4.1839863713798975, + "grad_norm": 1.065783143043518, + "learning_rate": 0.0002, + "loss": 1.2069, + "step": 6140 + }, + { + "epoch": 4.190800681431005, + "grad_norm": 1.036161184310913, + "learning_rate": 0.0002, + "loss": 1.2512, + "step": 6150 + }, + { + "epoch": 4.197614991482112, + "grad_norm": 0.8979679942131042, + "learning_rate": 0.0002, + "loss": 1.2371, + "step": 6160 + }, + { + "epoch": 4.20442930153322, + "grad_norm": 0.7584333419799805, + "learning_rate": 0.0002, + "loss": 1.2212, + "step": 6170 + }, + { + "epoch": 4.211243611584327, + "grad_norm": 1.1970131397247314, + "learning_rate": 0.0002, + "loss": 1.2128, + "step": 6180 + }, + { + "epoch": 4.218057921635435, + "grad_norm": 2.6447298526763916, + "learning_rate": 0.0002, + "loss": 1.1982, + "step": 6190 + }, + { + "epoch": 4.2248722316865415, + "grad_norm": 0.9357487559318542, + "learning_rate": 0.0002, + "loss": 1.2465, + "step": 6200 + }, + { + "epoch": 4.231686541737649, + "grad_norm": 0.9141183495521545, + "learning_rate": 0.0002, + "loss": 1.2963, + "step": 6210 + }, + { + "epoch": 4.238500851788756, + "grad_norm": 1.0606296062469482, + "learning_rate": 0.0002, + "loss": 1.1959, + "step": 6220 + }, + { + "epoch": 4.245315161839864, + "grad_norm": 0.9999088048934937, + "learning_rate": 0.0002, + "loss": 1.2629, + "step": 6230 + }, + { + "epoch": 4.252129471890971, + "grad_norm": 0.9469764232635498, + "learning_rate": 0.0002, + "loss": 1.1471, + "step": 6240 + }, + { + "epoch": 4.258943781942079, + "grad_norm": 1.1508198976516724, + "learning_rate": 0.0002, + "loss": 1.223, + "step": 6250 + }, + { + "epoch": 4.2657580919931855, + "grad_norm": 1.2576130628585815, + "learning_rate": 0.0002, + "loss": 1.2677, + "step": 6260 + }, + { + "epoch": 4.272572402044293, + "grad_norm": 0.9435968399047852, + "learning_rate": 0.0002, + "loss": 1.2216, + "step": 6270 + }, + { + "epoch": 4.2793867120954, + "grad_norm": 0.9290348887443542, + "learning_rate": 0.0002, + "loss": 1.2788, + "step": 6280 + }, + { + "epoch": 4.286201022146508, + "grad_norm": 0.9973701238632202, + "learning_rate": 0.0002, + "loss": 1.2631, + "step": 6290 + }, + { + "epoch": 4.293015332197615, + "grad_norm": 1.012855887413025, + "learning_rate": 0.0002, + "loss": 1.2276, + "step": 6300 + }, + { + "epoch": 4.2998296422487225, + "grad_norm": 0.8371705412864685, + "learning_rate": 0.0002, + "loss": 1.2115, + "step": 6310 + }, + { + "epoch": 4.306643952299829, + "grad_norm": 1.0867925882339478, + "learning_rate": 0.0002, + "loss": 1.2423, + "step": 6320 + }, + { + "epoch": 4.313458262350937, + "grad_norm": 0.9763767123222351, + "learning_rate": 0.0002, + "loss": 1.2262, + "step": 6330 + }, + { + "epoch": 4.320272572402044, + "grad_norm": 1.1844252347946167, + "learning_rate": 0.0002, + "loss": 1.2557, + "step": 6340 + }, + { + "epoch": 4.327086882453152, + "grad_norm": 0.8292830586433411, + "learning_rate": 0.0002, + "loss": 1.2635, + "step": 6350 + }, + { + "epoch": 4.333901192504259, + "grad_norm": 0.9351436495780945, + "learning_rate": 0.0002, + "loss": 1.262, + "step": 6360 + }, + { + "epoch": 4.3407155025553665, + "grad_norm": 1.0425835847854614, + "learning_rate": 0.0002, + "loss": 1.2678, + "step": 6370 + }, + { + "epoch": 4.347529812606473, + "grad_norm": 0.8894261121749878, + "learning_rate": 0.0002, + "loss": 1.2476, + "step": 6380 + }, + { + "epoch": 4.354344122657581, + "grad_norm": 0.9663366079330444, + "learning_rate": 0.0002, + "loss": 1.2965, + "step": 6390 + }, + { + "epoch": 4.361158432708688, + "grad_norm": 0.8915578126907349, + "learning_rate": 0.0002, + "loss": 1.2529, + "step": 6400 + }, + { + "epoch": 4.367972742759796, + "grad_norm": 1.0393000841140747, + "learning_rate": 0.0002, + "loss": 1.2573, + "step": 6410 + }, + { + "epoch": 4.374787052810903, + "grad_norm": 0.917398989200592, + "learning_rate": 0.0002, + "loss": 1.2254, + "step": 6420 + }, + { + "epoch": 4.3816013628620105, + "grad_norm": 1.0496646165847778, + "learning_rate": 0.0002, + "loss": 1.3115, + "step": 6430 + }, + { + "epoch": 4.388415672913117, + "grad_norm": 0.9349859356880188, + "learning_rate": 0.0002, + "loss": 1.2607, + "step": 6440 + }, + { + "epoch": 4.395229982964225, + "grad_norm": 1.0981004238128662, + "learning_rate": 0.0002, + "loss": 1.3414, + "step": 6450 + }, + { + "epoch": 4.402044293015332, + "grad_norm": 0.9794871807098389, + "learning_rate": 0.0002, + "loss": 1.2391, + "step": 6460 + }, + { + "epoch": 4.40885860306644, + "grad_norm": 0.9321421384811401, + "learning_rate": 0.0002, + "loss": 1.208, + "step": 6470 + }, + { + "epoch": 4.415672913117547, + "grad_norm": 0.9158342480659485, + "learning_rate": 0.0002, + "loss": 1.3398, + "step": 6480 + }, + { + "epoch": 4.4224872231686545, + "grad_norm": 0.9462087750434875, + "learning_rate": 0.0002, + "loss": 1.1832, + "step": 6490 + }, + { + "epoch": 4.429301533219761, + "grad_norm": 0.9740175604820251, + "learning_rate": 0.0002, + "loss": 1.2366, + "step": 6500 + }, + { + "epoch": 4.436115843270869, + "grad_norm": 0.8477463126182556, + "learning_rate": 0.0002, + "loss": 1.3074, + "step": 6510 + }, + { + "epoch": 4.442930153321976, + "grad_norm": 1.0296647548675537, + "learning_rate": 0.0002, + "loss": 1.2719, + "step": 6520 + }, + { + "epoch": 4.449744463373084, + "grad_norm": 0.9437751173973083, + "learning_rate": 0.0002, + "loss": 1.2647, + "step": 6530 + }, + { + "epoch": 4.456558773424191, + "grad_norm": 1.011192798614502, + "learning_rate": 0.0002, + "loss": 1.2043, + "step": 6540 + }, + { + "epoch": 4.4633730834752985, + "grad_norm": 0.8836222290992737, + "learning_rate": 0.0002, + "loss": 1.3673, + "step": 6550 + }, + { + "epoch": 4.470187393526405, + "grad_norm": 1.2799941301345825, + "learning_rate": 0.0002, + "loss": 1.3028, + "step": 6560 + }, + { + "epoch": 4.477001703577513, + "grad_norm": 0.925910472869873, + "learning_rate": 0.0002, + "loss": 1.2789, + "step": 6570 + }, + { + "epoch": 4.48381601362862, + "grad_norm": 0.957401692867279, + "learning_rate": 0.0002, + "loss": 1.2723, + "step": 6580 + }, + { + "epoch": 4.490630323679728, + "grad_norm": 1.0789544582366943, + "learning_rate": 0.0002, + "loss": 1.242, + "step": 6590 + }, + { + "epoch": 4.497444633730835, + "grad_norm": 0.8874586820602417, + "learning_rate": 0.0002, + "loss": 1.2553, + "step": 6600 + }, + { + "epoch": 4.504258943781942, + "grad_norm": 0.9394784569740295, + "learning_rate": 0.0002, + "loss": 1.2779, + "step": 6610 + }, + { + "epoch": 4.511073253833049, + "grad_norm": 1.029640793800354, + "learning_rate": 0.0002, + "loss": 1.2744, + "step": 6620 + }, + { + "epoch": 4.517887563884157, + "grad_norm": 0.9510841965675354, + "learning_rate": 0.0002, + "loss": 1.2634, + "step": 6630 + }, + { + "epoch": 4.524701873935264, + "grad_norm": 0.9992963671684265, + "learning_rate": 0.0002, + "loss": 1.2562, + "step": 6640 + }, + { + "epoch": 4.531516183986371, + "grad_norm": 0.9312878847122192, + "learning_rate": 0.0002, + "loss": 1.2942, + "step": 6650 + }, + { + "epoch": 4.538330494037479, + "grad_norm": 0.9406482577323914, + "learning_rate": 0.0002, + "loss": 1.2572, + "step": 6660 + }, + { + "epoch": 4.5451448040885865, + "grad_norm": 1.1058286428451538, + "learning_rate": 0.0002, + "loss": 1.2283, + "step": 6670 + }, + { + "epoch": 4.551959114139693, + "grad_norm": 0.9389635920524597, + "learning_rate": 0.0002, + "loss": 1.2391, + "step": 6680 + }, + { + "epoch": 4.5587734241908, + "grad_norm": 1.0356028079986572, + "learning_rate": 0.0002, + "loss": 1.2696, + "step": 6690 + }, + { + "epoch": 4.565587734241908, + "grad_norm": 0.9370909929275513, + "learning_rate": 0.0002, + "loss": 1.2935, + "step": 6700 + }, + { + "epoch": 4.572402044293016, + "grad_norm": 0.9917567372322083, + "learning_rate": 0.0002, + "loss": 1.2914, + "step": 6710 + }, + { + "epoch": 4.579216354344123, + "grad_norm": 0.9065384864807129, + "learning_rate": 0.0002, + "loss": 1.3318, + "step": 6720 + }, + { + "epoch": 4.5860306643952296, + "grad_norm": 1.3347833156585693, + "learning_rate": 0.0002, + "loss": 1.2909, + "step": 6730 + }, + { + "epoch": 4.592844974446337, + "grad_norm": 0.910632312297821, + "learning_rate": 0.0002, + "loss": 1.3322, + "step": 6740 + }, + { + "epoch": 4.599659284497445, + "grad_norm": 0.8874805569648743, + "learning_rate": 0.0002, + "loss": 1.2584, + "step": 6750 + }, + { + "epoch": 4.606473594548552, + "grad_norm": 0.9355664253234863, + "learning_rate": 0.0002, + "loss": 1.3173, + "step": 6760 + }, + { + "epoch": 4.613287904599659, + "grad_norm": 0.9360204339027405, + "learning_rate": 0.0002, + "loss": 1.3515, + "step": 6770 + }, + { + "epoch": 4.620102214650767, + "grad_norm": 0.9931750893592834, + "learning_rate": 0.0002, + "loss": 1.2326, + "step": 6780 + }, + { + "epoch": 4.626916524701874, + "grad_norm": 0.9195131063461304, + "learning_rate": 0.0002, + "loss": 1.2677, + "step": 6790 + }, + { + "epoch": 4.633730834752981, + "grad_norm": 0.9448373913764954, + "learning_rate": 0.0002, + "loss": 1.3417, + "step": 6800 + }, + { + "epoch": 4.640545144804088, + "grad_norm": 1.162890911102295, + "learning_rate": 0.0002, + "loss": 1.2658, + "step": 6810 + }, + { + "epoch": 4.647359454855196, + "grad_norm": 0.9739466905593872, + "learning_rate": 0.0002, + "loss": 1.2841, + "step": 6820 + }, + { + "epoch": 4.654173764906303, + "grad_norm": 0.9462909698486328, + "learning_rate": 0.0002, + "loss": 1.3068, + "step": 6830 + }, + { + "epoch": 4.660988074957411, + "grad_norm": 1.042639970779419, + "learning_rate": 0.0002, + "loss": 1.284, + "step": 6840 + }, + { + "epoch": 4.6678023850085175, + "grad_norm": 0.8910539150238037, + "learning_rate": 0.0002, + "loss": 1.3337, + "step": 6850 + }, + { + "epoch": 4.674616695059625, + "grad_norm": 1.0806447267532349, + "learning_rate": 0.0002, + "loss": 1.3025, + "step": 6860 + }, + { + "epoch": 4.681431005110732, + "grad_norm": 1.0054864883422852, + "learning_rate": 0.0002, + "loss": 1.2258, + "step": 6870 + }, + { + "epoch": 4.68824531516184, + "grad_norm": 0.7774158120155334, + "learning_rate": 0.0002, + "loss": 1.3261, + "step": 6880 + }, + { + "epoch": 4.695059625212947, + "grad_norm": 0.9729512333869934, + "learning_rate": 0.0002, + "loss": 1.2545, + "step": 6890 + }, + { + "epoch": 4.701873935264055, + "grad_norm": 1.2025411128997803, + "learning_rate": 0.0002, + "loss": 1.3251, + "step": 6900 + }, + { + "epoch": 4.7086882453151615, + "grad_norm": 1.1654069423675537, + "learning_rate": 0.0002, + "loss": 1.3418, + "step": 6910 + }, + { + "epoch": 4.715502555366269, + "grad_norm": 1.1501442193984985, + "learning_rate": 0.0002, + "loss": 1.3091, + "step": 6920 + }, + { + "epoch": 4.722316865417376, + "grad_norm": 1.1083979606628418, + "learning_rate": 0.0002, + "loss": 1.2627, + "step": 6930 + }, + { + "epoch": 4.729131175468484, + "grad_norm": 0.9431378841400146, + "learning_rate": 0.0002, + "loss": 1.2836, + "step": 6940 + }, + { + "epoch": 4.735945485519591, + "grad_norm": 0.9722502827644348, + "learning_rate": 0.0002, + "loss": 1.3381, + "step": 6950 + }, + { + "epoch": 4.742759795570699, + "grad_norm": 0.9094559550285339, + "learning_rate": 0.0002, + "loss": 1.3228, + "step": 6960 + }, + { + "epoch": 4.7495741056218055, + "grad_norm": 0.9918473958969116, + "learning_rate": 0.0002, + "loss": 1.3474, + "step": 6970 + }, + { + "epoch": 4.756388415672913, + "grad_norm": 0.9999690651893616, + "learning_rate": 0.0002, + "loss": 1.3352, + "step": 6980 + }, + { + "epoch": 4.76320272572402, + "grad_norm": 1.0453810691833496, + "learning_rate": 0.0002, + "loss": 1.3579, + "step": 6990 + }, + { + "epoch": 4.770017035775128, + "grad_norm": 1.0167806148529053, + "learning_rate": 0.0002, + "loss": 1.294, + "step": 7000 + }, + { + "epoch": 4.776831345826235, + "grad_norm": 0.8133894801139832, + "learning_rate": 0.0002, + "loss": 1.3247, + "step": 7010 + }, + { + "epoch": 4.783645655877343, + "grad_norm": 0.8000897765159607, + "learning_rate": 0.0002, + "loss": 1.2577, + "step": 7020 + }, + { + "epoch": 4.7904599659284495, + "grad_norm": 0.992080569267273, + "learning_rate": 0.0002, + "loss": 1.2802, + "step": 7030 + }, + { + "epoch": 4.797274275979557, + "grad_norm": 0.9824522137641907, + "learning_rate": 0.0002, + "loss": 1.3269, + "step": 7040 + }, + { + "epoch": 4.804088586030664, + "grad_norm": 0.9808870553970337, + "learning_rate": 0.0002, + "loss": 1.279, + "step": 7050 + }, + { + "epoch": 4.810902896081772, + "grad_norm": 0.9679701924324036, + "learning_rate": 0.0002, + "loss": 1.3342, + "step": 7060 + }, + { + "epoch": 4.817717206132879, + "grad_norm": 0.9895215034484863, + "learning_rate": 0.0002, + "loss": 1.2711, + "step": 7070 + }, + { + "epoch": 4.824531516183987, + "grad_norm": 1.052246332168579, + "learning_rate": 0.0002, + "loss": 1.3008, + "step": 7080 + }, + { + "epoch": 4.8313458262350935, + "grad_norm": 0.9243564605712891, + "learning_rate": 0.0002, + "loss": 1.2874, + "step": 7090 + }, + { + "epoch": 4.838160136286201, + "grad_norm": 0.9545369744300842, + "learning_rate": 0.0002, + "loss": 1.2835, + "step": 7100 + }, + { + "epoch": 4.844974446337308, + "grad_norm": 0.9655884504318237, + "learning_rate": 0.0002, + "loss": 1.31, + "step": 7110 + }, + { + "epoch": 4.851788756388416, + "grad_norm": 0.9708049893379211, + "learning_rate": 0.0002, + "loss": 1.2862, + "step": 7120 + }, + { + "epoch": 4.858603066439523, + "grad_norm": 1.0064880847930908, + "learning_rate": 0.0002, + "loss": 1.3425, + "step": 7130 + }, + { + "epoch": 4.8654173764906306, + "grad_norm": 0.939943790435791, + "learning_rate": 0.0002, + "loss": 1.2899, + "step": 7140 + }, + { + "epoch": 4.872231686541737, + "grad_norm": 1.0750784873962402, + "learning_rate": 0.0002, + "loss": 1.2887, + "step": 7150 + }, + { + "epoch": 4.879045996592845, + "grad_norm": 0.9708989262580872, + "learning_rate": 0.0002, + "loss": 1.3367, + "step": 7160 + }, + { + "epoch": 4.885860306643952, + "grad_norm": 1.0228253602981567, + "learning_rate": 0.0002, + "loss": 1.2797, + "step": 7170 + }, + { + "epoch": 4.89267461669506, + "grad_norm": 0.8963132500648499, + "learning_rate": 0.0002, + "loss": 1.2695, + "step": 7180 + }, + { + "epoch": 4.899488926746167, + "grad_norm": 0.9198015928268433, + "learning_rate": 0.0002, + "loss": 1.3473, + "step": 7190 + }, + { + "epoch": 4.9063032367972745, + "grad_norm": 1.099906086921692, + "learning_rate": 0.0002, + "loss": 1.2541, + "step": 7200 + }, + { + "epoch": 4.913117546848381, + "grad_norm": 1.0624815225601196, + "learning_rate": 0.0002, + "loss": 1.3188, + "step": 7210 + }, + { + "epoch": 4.919931856899489, + "grad_norm": 0.9688444137573242, + "learning_rate": 0.0002, + "loss": 1.3026, + "step": 7220 + }, + { + "epoch": 4.926746166950596, + "grad_norm": 0.867011547088623, + "learning_rate": 0.0002, + "loss": 1.3379, + "step": 7230 + }, + { + "epoch": 4.933560477001704, + "grad_norm": 0.9600282311439514, + "learning_rate": 0.0002, + "loss": 1.289, + "step": 7240 + }, + { + "epoch": 4.940374787052811, + "grad_norm": 0.8979372978210449, + "learning_rate": 0.0002, + "loss": 1.2751, + "step": 7250 + }, + { + "epoch": 4.9471890971039185, + "grad_norm": 0.951474130153656, + "learning_rate": 0.0002, + "loss": 1.3426, + "step": 7260 + }, + { + "epoch": 4.954003407155025, + "grad_norm": 0.824851393699646, + "learning_rate": 0.0002, + "loss": 1.2726, + "step": 7270 + }, + { + "epoch": 4.960817717206133, + "grad_norm": 1.2926591634750366, + "learning_rate": 0.0002, + "loss": 1.2679, + "step": 7280 + }, + { + "epoch": 4.96763202725724, + "grad_norm": 1.1057835817337036, + "learning_rate": 0.0002, + "loss": 1.2974, + "step": 7290 + }, + { + "epoch": 4.974446337308348, + "grad_norm": 0.9814816117286682, + "learning_rate": 0.0002, + "loss": 1.2275, + "step": 7300 + }, + { + "epoch": 4.981260647359455, + "grad_norm": 1.0251333713531494, + "learning_rate": 0.0002, + "loss": 1.3001, + "step": 7310 + }, + { + "epoch": 4.9880749574105625, + "grad_norm": 0.9748668074607849, + "learning_rate": 0.0002, + "loss": 1.3113, + "step": 7320 + }, + { + "epoch": 4.994889267461669, + "grad_norm": 0.8552228808403015, + "learning_rate": 0.0002, + "loss": 1.3595, + "step": 7330 + }, + { + "epoch": 4.999659284497445, + "eval_loss": 2.03971004486084, + "eval_runtime": 67.4144, + "eval_samples_per_second": 7.521, + "eval_steps_per_second": 0.949, + "step": 7337 + }, + { + "epoch": 5.001703577512777, + "grad_norm": 0.8210785388946533, + "learning_rate": 0.0002, + "loss": 1.2464, + "step": 7340 + }, + { + "epoch": 5.008517887563884, + "grad_norm": 1.2577511072158813, + "learning_rate": 0.0002, + "loss": 1.0356, + "step": 7350 + }, + { + "epoch": 5.015332197614992, + "grad_norm": 1.280604362487793, + "learning_rate": 0.0002, + "loss": 0.9944, + "step": 7360 + }, + { + "epoch": 5.022146507666099, + "grad_norm": 1.3985474109649658, + "learning_rate": 0.0002, + "loss": 1.0858, + "step": 7370 + }, + { + "epoch": 5.0289608177172065, + "grad_norm": 1.1621310710906982, + "learning_rate": 0.0002, + "loss": 1.0122, + "step": 7380 + }, + { + "epoch": 5.035775127768313, + "grad_norm": 1.3278541564941406, + "learning_rate": 0.0002, + "loss": 1.05, + "step": 7390 + }, + { + "epoch": 5.042589437819421, + "grad_norm": 1.1166491508483887, + "learning_rate": 0.0002, + "loss": 1.0237, + "step": 7400 + }, + { + "epoch": 5.049403747870528, + "grad_norm": 1.8087667226791382, + "learning_rate": 0.0002, + "loss": 1.0397, + "step": 7410 + }, + { + "epoch": 5.056218057921636, + "grad_norm": 1.1517921686172485, + "learning_rate": 0.0002, + "loss": 1.0191, + "step": 7420 + }, + { + "epoch": 5.063032367972743, + "grad_norm": 1.2875889539718628, + "learning_rate": 0.0002, + "loss": 1.025, + "step": 7430 + }, + { + "epoch": 5.0698466780238505, + "grad_norm": 1.199702262878418, + "learning_rate": 0.0002, + "loss": 1.043, + "step": 7440 + }, + { + "epoch": 5.076660988074957, + "grad_norm": 1.2912452220916748, + "learning_rate": 0.0002, + "loss": 1.0176, + "step": 7450 + }, + { + "epoch": 5.083475298126065, + "grad_norm": 1.1446452140808105, + "learning_rate": 0.0002, + "loss": 1.0042, + "step": 7460 + }, + { + "epoch": 5.090289608177172, + "grad_norm": 1.3625746965408325, + "learning_rate": 0.0002, + "loss": 1.047, + "step": 7470 + }, + { + "epoch": 5.09710391822828, + "grad_norm": 1.2116546630859375, + "learning_rate": 0.0002, + "loss": 1.052, + "step": 7480 + }, + { + "epoch": 5.103918228279387, + "grad_norm": 1.3896098136901855, + "learning_rate": 0.0002, + "loss": 1.1041, + "step": 7490 + }, + { + "epoch": 5.1107325383304945, + "grad_norm": 1.6265277862548828, + "learning_rate": 0.0002, + "loss": 1.0668, + "step": 7500 + }, + { + "epoch": 5.117546848381601, + "grad_norm": 1.1468392610549927, + "learning_rate": 0.0002, + "loss": 1.028, + "step": 7510 + }, + { + "epoch": 5.124361158432709, + "grad_norm": 1.2649329900741577, + "learning_rate": 0.0002, + "loss": 0.9915, + "step": 7520 + }, + { + "epoch": 5.131175468483816, + "grad_norm": 1.1866015195846558, + "learning_rate": 0.0002, + "loss": 1.0251, + "step": 7530 + }, + { + "epoch": 5.137989778534923, + "grad_norm": 1.1517255306243896, + "learning_rate": 0.0002, + "loss": 1.0626, + "step": 7540 + }, + { + "epoch": 5.144804088586031, + "grad_norm": 1.3475146293640137, + "learning_rate": 0.0002, + "loss": 1.0303, + "step": 7550 + }, + { + "epoch": 5.151618398637138, + "grad_norm": 1.1167018413543701, + "learning_rate": 0.0002, + "loss": 1.0456, + "step": 7560 + }, + { + "epoch": 5.158432708688245, + "grad_norm": 1.209572434425354, + "learning_rate": 0.0002, + "loss": 1.04, + "step": 7570 + }, + { + "epoch": 5.165247018739352, + "grad_norm": 1.3578280210494995, + "learning_rate": 0.0002, + "loss": 1.0533, + "step": 7580 + }, + { + "epoch": 5.17206132879046, + "grad_norm": 1.2447012662887573, + "learning_rate": 0.0002, + "loss": 1.0958, + "step": 7590 + }, + { + "epoch": 5.178875638841567, + "grad_norm": 1.3715848922729492, + "learning_rate": 0.0002, + "loss": 1.0521, + "step": 7600 + }, + { + "epoch": 5.185689948892675, + "grad_norm": 1.435860276222229, + "learning_rate": 0.0002, + "loss": 1.0556, + "step": 7610 + }, + { + "epoch": 5.1925042589437815, + "grad_norm": 1.4093858003616333, + "learning_rate": 0.0002, + "loss": 1.0504, + "step": 7620 + }, + { + "epoch": 5.199318568994889, + "grad_norm": 1.1747535467147827, + "learning_rate": 0.0002, + "loss": 1.083, + "step": 7630 + }, + { + "epoch": 5.206132879045996, + "grad_norm": 1.4704833030700684, + "learning_rate": 0.0002, + "loss": 1.048, + "step": 7640 + }, + { + "epoch": 5.212947189097104, + "grad_norm": 1.2270972728729248, + "learning_rate": 0.0002, + "loss": 0.9991, + "step": 7650 + }, + { + "epoch": 5.219761499148211, + "grad_norm": 1.2215691804885864, + "learning_rate": 0.0002, + "loss": 1.0738, + "step": 7660 + }, + { + "epoch": 5.226575809199319, + "grad_norm": 1.3641486167907715, + "learning_rate": 0.0002, + "loss": 1.0628, + "step": 7670 + }, + { + "epoch": 5.2333901192504255, + "grad_norm": 1.3532041311264038, + "learning_rate": 0.0002, + "loss": 1.1066, + "step": 7680 + }, + { + "epoch": 5.240204429301533, + "grad_norm": 1.2243095636367798, + "learning_rate": 0.0002, + "loss": 1.0209, + "step": 7690 + }, + { + "epoch": 5.24701873935264, + "grad_norm": 1.3644746541976929, + "learning_rate": 0.0002, + "loss": 1.0503, + "step": 7700 + }, + { + "epoch": 5.253833049403748, + "grad_norm": 1.18478262424469, + "learning_rate": 0.0002, + "loss": 1.0406, + "step": 7710 + }, + { + "epoch": 5.260647359454855, + "grad_norm": 1.2146114110946655, + "learning_rate": 0.0002, + "loss": 1.1023, + "step": 7720 + }, + { + "epoch": 5.267461669505963, + "grad_norm": 1.233984112739563, + "learning_rate": 0.0002, + "loss": 1.1528, + "step": 7730 + }, + { + "epoch": 5.2742759795570695, + "grad_norm": 1.3709665536880493, + "learning_rate": 0.0002, + "loss": 1.0681, + "step": 7740 + }, + { + "epoch": 5.281090289608177, + "grad_norm": 1.36055326461792, + "learning_rate": 0.0002, + "loss": 1.0195, + "step": 7750 + }, + { + "epoch": 5.287904599659284, + "grad_norm": 1.6232351064682007, + "learning_rate": 0.0002, + "loss": 1.0447, + "step": 7760 + }, + { + "epoch": 5.294718909710392, + "grad_norm": 1.3359960317611694, + "learning_rate": 0.0002, + "loss": 1.0627, + "step": 7770 + }, + { + "epoch": 5.301533219761499, + "grad_norm": 1.3815656900405884, + "learning_rate": 0.0002, + "loss": 1.1082, + "step": 7780 + }, + { + "epoch": 5.308347529812607, + "grad_norm": 1.1392076015472412, + "learning_rate": 0.0002, + "loss": 1.0891, + "step": 7790 + }, + { + "epoch": 5.3151618398637135, + "grad_norm": 1.3006905317306519, + "learning_rate": 0.0002, + "loss": 1.0364, + "step": 7800 + }, + { + "epoch": 5.321976149914821, + "grad_norm": 1.503645896911621, + "learning_rate": 0.0002, + "loss": 1.1005, + "step": 7810 + }, + { + "epoch": 5.328790459965928, + "grad_norm": 1.141939640045166, + "learning_rate": 0.0002, + "loss": 1.0075, + "step": 7820 + }, + { + "epoch": 5.335604770017036, + "grad_norm": 1.4654004573822021, + "learning_rate": 0.0002, + "loss": 1.0284, + "step": 7830 + }, + { + "epoch": 5.342419080068143, + "grad_norm": 1.4195219278335571, + "learning_rate": 0.0002, + "loss": 1.1185, + "step": 7840 + }, + { + "epoch": 5.349233390119251, + "grad_norm": 1.2354168891906738, + "learning_rate": 0.0002, + "loss": 1.0535, + "step": 7850 + }, + { + "epoch": 5.3560477001703575, + "grad_norm": 1.529862880706787, + "learning_rate": 0.0002, + "loss": 1.0923, + "step": 7860 + }, + { + "epoch": 5.362862010221465, + "grad_norm": 1.364678978919983, + "learning_rate": 0.0002, + "loss": 1.1005, + "step": 7870 + }, + { + "epoch": 5.369676320272572, + "grad_norm": 1.1010444164276123, + "learning_rate": 0.0002, + "loss": 1.1084, + "step": 7880 + }, + { + "epoch": 5.37649063032368, + "grad_norm": 1.1949712038040161, + "learning_rate": 0.0002, + "loss": 1.1225, + "step": 7890 + }, + { + "epoch": 5.383304940374787, + "grad_norm": 1.485922932624817, + "learning_rate": 0.0002, + "loss": 1.058, + "step": 7900 + }, + { + "epoch": 5.390119250425895, + "grad_norm": 1.0844227075576782, + "learning_rate": 0.0002, + "loss": 0.9894, + "step": 7910 + }, + { + "epoch": 5.3969335604770015, + "grad_norm": 1.3784468173980713, + "learning_rate": 0.0002, + "loss": 1.0418, + "step": 7920 + }, + { + "epoch": 5.403747870528109, + "grad_norm": 1.4771490097045898, + "learning_rate": 0.0002, + "loss": 1.0542, + "step": 7930 + }, + { + "epoch": 5.410562180579216, + "grad_norm": 1.2460103034973145, + "learning_rate": 0.0002, + "loss": 1.1265, + "step": 7940 + }, + { + "epoch": 5.417376490630324, + "grad_norm": 1.3047645092010498, + "learning_rate": 0.0002, + "loss": 1.096, + "step": 7950 + }, + { + "epoch": 5.424190800681431, + "grad_norm": 1.1396620273590088, + "learning_rate": 0.0002, + "loss": 1.0956, + "step": 7960 + }, + { + "epoch": 5.4310051107325386, + "grad_norm": 1.4193450212478638, + "learning_rate": 0.0002, + "loss": 1.0685, + "step": 7970 + }, + { + "epoch": 5.437819420783645, + "grad_norm": 1.2085850238800049, + "learning_rate": 0.0002, + "loss": 1.1347, + "step": 7980 + }, + { + "epoch": 5.444633730834753, + "grad_norm": 1.2721607685089111, + "learning_rate": 0.0002, + "loss": 1.0277, + "step": 7990 + }, + { + "epoch": 5.45144804088586, + "grad_norm": 1.4134020805358887, + "learning_rate": 0.0002, + "loss": 1.1316, + "step": 8000 + }, + { + "epoch": 5.458262350936968, + "grad_norm": 1.4283325672149658, + "learning_rate": 0.0002, + "loss": 1.0576, + "step": 8010 + }, + { + "epoch": 5.465076660988075, + "grad_norm": 1.3127079010009766, + "learning_rate": 0.0002, + "loss": 1.0505, + "step": 8020 + }, + { + "epoch": 5.4718909710391825, + "grad_norm": 1.2924352884292603, + "learning_rate": 0.0002, + "loss": 1.0812, + "step": 8030 + }, + { + "epoch": 5.478705281090289, + "grad_norm": 1.8000653982162476, + "learning_rate": 0.0002, + "loss": 1.1178, + "step": 8040 + }, + { + "epoch": 5.485519591141397, + "grad_norm": 1.1538785696029663, + "learning_rate": 0.0002, + "loss": 1.1205, + "step": 8050 + }, + { + "epoch": 5.492333901192504, + "grad_norm": 1.1173290014266968, + "learning_rate": 0.0002, + "loss": 1.1015, + "step": 8060 + }, + { + "epoch": 5.499148211243612, + "grad_norm": 1.1501243114471436, + "learning_rate": 0.0002, + "loss": 1.1597, + "step": 8070 + }, + { + "epoch": 5.505962521294719, + "grad_norm": 1.1335760354995728, + "learning_rate": 0.0002, + "loss": 1.1465, + "step": 8080 + }, + { + "epoch": 5.5127768313458265, + "grad_norm": 1.565274953842163, + "learning_rate": 0.0002, + "loss": 1.1005, + "step": 8090 + }, + { + "epoch": 5.519591141396933, + "grad_norm": 1.3415014743804932, + "learning_rate": 0.0002, + "loss": 1.1085, + "step": 8100 + }, + { + "epoch": 5.526405451448041, + "grad_norm": 1.2377240657806396, + "learning_rate": 0.0002, + "loss": 1.1166, + "step": 8110 + }, + { + "epoch": 5.533219761499148, + "grad_norm": 1.3333637714385986, + "learning_rate": 0.0002, + "loss": 1.0766, + "step": 8120 + }, + { + "epoch": 5.540034071550256, + "grad_norm": 1.2620662450790405, + "learning_rate": 0.0002, + "loss": 1.1515, + "step": 8130 + }, + { + "epoch": 5.546848381601363, + "grad_norm": 1.2806652784347534, + "learning_rate": 0.0002, + "loss": 1.0839, + "step": 8140 + }, + { + "epoch": 5.5536626916524705, + "grad_norm": 1.2057335376739502, + "learning_rate": 0.0002, + "loss": 1.1221, + "step": 8150 + }, + { + "epoch": 5.560477001703577, + "grad_norm": 1.411726951599121, + "learning_rate": 0.0002, + "loss": 1.1292, + "step": 8160 + }, + { + "epoch": 5.567291311754685, + "grad_norm": 1.381104588508606, + "learning_rate": 0.0002, + "loss": 1.0887, + "step": 8170 + }, + { + "epoch": 5.574105621805792, + "grad_norm": 1.3449294567108154, + "learning_rate": 0.0002, + "loss": 1.1317, + "step": 8180 + }, + { + "epoch": 5.5809199318569, + "grad_norm": 1.2791016101837158, + "learning_rate": 0.0002, + "loss": 1.1392, + "step": 8190 + }, + { + "epoch": 5.587734241908007, + "grad_norm": 1.276891827583313, + "learning_rate": 0.0002, + "loss": 1.0972, + "step": 8200 + }, + { + "epoch": 5.5945485519591145, + "grad_norm": 1.3951541185379028, + "learning_rate": 0.0002, + "loss": 1.1001, + "step": 8210 + }, + { + "epoch": 5.601362862010221, + "grad_norm": 1.4167890548706055, + "learning_rate": 0.0002, + "loss": 1.0993, + "step": 8220 + }, + { + "epoch": 5.608177172061329, + "grad_norm": 1.4388375282287598, + "learning_rate": 0.0002, + "loss": 1.0826, + "step": 8230 + }, + { + "epoch": 5.614991482112436, + "grad_norm": 1.210157036781311, + "learning_rate": 0.0002, + "loss": 1.1941, + "step": 8240 + }, + { + "epoch": 5.621805792163544, + "grad_norm": 1.0557862520217896, + "learning_rate": 0.0002, + "loss": 1.0833, + "step": 8250 + }, + { + "epoch": 5.628620102214651, + "grad_norm": 1.2913990020751953, + "learning_rate": 0.0002, + "loss": 1.1197, + "step": 8260 + }, + { + "epoch": 5.6354344122657585, + "grad_norm": 1.2204737663269043, + "learning_rate": 0.0002, + "loss": 1.0346, + "step": 8270 + }, + { + "epoch": 5.642248722316865, + "grad_norm": 1.57016921043396, + "learning_rate": 0.0002, + "loss": 1.1429, + "step": 8280 + }, + { + "epoch": 5.649063032367973, + "grad_norm": 1.0117967128753662, + "learning_rate": 0.0002, + "loss": 1.0988, + "step": 8290 + }, + { + "epoch": 5.65587734241908, + "grad_norm": 1.3195525407791138, + "learning_rate": 0.0002, + "loss": 1.0786, + "step": 8300 + }, + { + "epoch": 5.662691652470187, + "grad_norm": 1.2566497325897217, + "learning_rate": 0.0002, + "loss": 1.0618, + "step": 8310 + }, + { + "epoch": 5.669505962521295, + "grad_norm": 1.1446818113327026, + "learning_rate": 0.0002, + "loss": 1.1635, + "step": 8320 + }, + { + "epoch": 5.6763202725724025, + "grad_norm": 1.2928680181503296, + "learning_rate": 0.0002, + "loss": 1.2201, + "step": 8330 + }, + { + "epoch": 5.683134582623509, + "grad_norm": 1.2823996543884277, + "learning_rate": 0.0002, + "loss": 1.1488, + "step": 8340 + }, + { + "epoch": 5.689948892674616, + "grad_norm": 1.1523874998092651, + "learning_rate": 0.0002, + "loss": 1.0686, + "step": 8350 + }, + { + "epoch": 5.696763202725724, + "grad_norm": 1.0819287300109863, + "learning_rate": 0.0002, + "loss": 1.0938, + "step": 8360 + }, + { + "epoch": 5.703577512776832, + "grad_norm": 1.2384417057037354, + "learning_rate": 0.0002, + "loss": 1.167, + "step": 8370 + }, + { + "epoch": 5.710391822827939, + "grad_norm": 1.1733224391937256, + "learning_rate": 0.0002, + "loss": 1.1136, + "step": 8380 + }, + { + "epoch": 5.7172061328790456, + "grad_norm": 1.3173418045043945, + "learning_rate": 0.0002, + "loss": 1.1041, + "step": 8390 + }, + { + "epoch": 5.724020442930153, + "grad_norm": 1.285880446434021, + "learning_rate": 0.0002, + "loss": 1.1014, + "step": 8400 + }, + { + "epoch": 5.730834752981261, + "grad_norm": 1.1404874324798584, + "learning_rate": 0.0002, + "loss": 1.1161, + "step": 8410 + }, + { + "epoch": 5.737649063032368, + "grad_norm": 1.2432540655136108, + "learning_rate": 0.0002, + "loss": 1.192, + "step": 8420 + }, + { + "epoch": 5.744463373083475, + "grad_norm": 1.2432233095169067, + "learning_rate": 0.0002, + "loss": 1.1702, + "step": 8430 + }, + { + "epoch": 5.751277683134583, + "grad_norm": 1.154496669769287, + "learning_rate": 0.0002, + "loss": 1.1357, + "step": 8440 + }, + { + "epoch": 5.75809199318569, + "grad_norm": 1.3301030397415161, + "learning_rate": 0.0002, + "loss": 1.1706, + "step": 8450 + }, + { + "epoch": 5.764906303236797, + "grad_norm": 1.243760347366333, + "learning_rate": 0.0002, + "loss": 1.2052, + "step": 8460 + }, + { + "epoch": 5.771720613287904, + "grad_norm": 1.4083361625671387, + "learning_rate": 0.0002, + "loss": 1.1035, + "step": 8470 + }, + { + "epoch": 5.778534923339012, + "grad_norm": 1.5662120580673218, + "learning_rate": 0.0002, + "loss": 1.1362, + "step": 8480 + }, + { + "epoch": 5.78534923339012, + "grad_norm": 1.2111139297485352, + "learning_rate": 0.0002, + "loss": 1.1578, + "step": 8490 + }, + { + "epoch": 5.792163543441227, + "grad_norm": 1.2776305675506592, + "learning_rate": 0.0002, + "loss": 1.1333, + "step": 8500 + }, + { + "epoch": 5.7989778534923335, + "grad_norm": 1.1777727603912354, + "learning_rate": 0.0002, + "loss": 1.1439, + "step": 8510 + }, + { + "epoch": 5.805792163543441, + "grad_norm": 1.1696112155914307, + "learning_rate": 0.0002, + "loss": 1.0859, + "step": 8520 + }, + { + "epoch": 5.812606473594548, + "grad_norm": 1.137397289276123, + "learning_rate": 0.0002, + "loss": 1.162, + "step": 8530 + }, + { + "epoch": 5.819420783645656, + "grad_norm": 1.3182098865509033, + "learning_rate": 0.0002, + "loss": 1.2099, + "step": 8540 + }, + { + "epoch": 5.826235093696763, + "grad_norm": 1.359756588935852, + "learning_rate": 0.0002, + "loss": 1.1427, + "step": 8550 + }, + { + "epoch": 5.833049403747871, + "grad_norm": 1.4118162393569946, + "learning_rate": 0.0002, + "loss": 1.1714, + "step": 8560 + }, + { + "epoch": 5.8398637137989775, + "grad_norm": 1.1899290084838867, + "learning_rate": 0.0002, + "loss": 1.1758, + "step": 8570 + }, + { + "epoch": 5.846678023850085, + "grad_norm": 1.1764532327651978, + "learning_rate": 0.0002, + "loss": 1.1511, + "step": 8580 + }, + { + "epoch": 5.853492333901192, + "grad_norm": 1.33274245262146, + "learning_rate": 0.0002, + "loss": 1.1633, + "step": 8590 + }, + { + "epoch": 5.8603066439523, + "grad_norm": 1.2571861743927002, + "learning_rate": 0.0002, + "loss": 1.1092, + "step": 8600 + }, + { + "epoch": 5.867120954003407, + "grad_norm": 1.3523616790771484, + "learning_rate": 0.0002, + "loss": 1.1137, + "step": 8610 + }, + { + "epoch": 5.873935264054515, + "grad_norm": 1.3556902408599854, + "learning_rate": 0.0002, + "loss": 1.2442, + "step": 8620 + }, + { + "epoch": 5.8807495741056215, + "grad_norm": 1.2864879369735718, + "learning_rate": 0.0002, + "loss": 1.0967, + "step": 8630 + }, + { + "epoch": 5.887563884156729, + "grad_norm": 1.2872768640518188, + "learning_rate": 0.0002, + "loss": 1.1491, + "step": 8640 + }, + { + "epoch": 5.894378194207836, + "grad_norm": 1.1446053981781006, + "learning_rate": 0.0002, + "loss": 1.1003, + "step": 8650 + }, + { + "epoch": 5.901192504258944, + "grad_norm": 1.292615532875061, + "learning_rate": 0.0002, + "loss": 1.1095, + "step": 8660 + }, + { + "epoch": 5.908006814310051, + "grad_norm": 1.190891981124878, + "learning_rate": 0.0002, + "loss": 1.2009, + "step": 8670 + }, + { + "epoch": 5.914821124361159, + "grad_norm": 1.330273985862732, + "learning_rate": 0.0002, + "loss": 1.1386, + "step": 8680 + }, + { + "epoch": 5.9216354344122655, + "grad_norm": 1.41121244430542, + "learning_rate": 0.0002, + "loss": 1.1874, + "step": 8690 + }, + { + "epoch": 5.928449744463373, + "grad_norm": 1.1360729932785034, + "learning_rate": 0.0002, + "loss": 1.1573, + "step": 8700 + }, + { + "epoch": 5.93526405451448, + "grad_norm": 1.2220772504806519, + "learning_rate": 0.0002, + "loss": 1.115, + "step": 8710 + }, + { + "epoch": 5.942078364565588, + "grad_norm": 1.1077110767364502, + "learning_rate": 0.0002, + "loss": 1.1696, + "step": 8720 + }, + { + "epoch": 5.948892674616695, + "grad_norm": 1.3632500171661377, + "learning_rate": 0.0002, + "loss": 1.1443, + "step": 8730 + }, + { + "epoch": 5.955706984667803, + "grad_norm": 1.4695830345153809, + "learning_rate": 0.0002, + "loss": 1.1474, + "step": 8740 + }, + { + "epoch": 5.9625212947189095, + "grad_norm": 1.217741847038269, + "learning_rate": 0.0002, + "loss": 1.1825, + "step": 8750 + }, + { + "epoch": 5.969335604770017, + "grad_norm": 1.0386874675750732, + "learning_rate": 0.0002, + "loss": 1.1495, + "step": 8760 + }, + { + "epoch": 5.976149914821124, + "grad_norm": 1.2067872285842896, + "learning_rate": 0.0002, + "loss": 1.1146, + "step": 8770 + }, + { + "epoch": 5.982964224872232, + "grad_norm": 1.3842018842697144, + "learning_rate": 0.0002, + "loss": 1.1987, + "step": 8780 + }, + { + "epoch": 5.989778534923339, + "grad_norm": 1.4584033489227295, + "learning_rate": 0.0002, + "loss": 1.2147, + "step": 8790 + }, + { + "epoch": 5.996592844974447, + "grad_norm": 1.1912888288497925, + "learning_rate": 0.0002, + "loss": 1.2078, + "step": 8800 + }, + { + "epoch": 6.0, + "eval_loss": 2.261807441711426, + "eval_runtime": 68.1125, + "eval_samples_per_second": 7.444, + "eval_steps_per_second": 0.94, + "step": 8805 + }, + { + "epoch": 6.003407155025553, + "grad_norm": 1.1715940237045288, + "learning_rate": 0.0002, + "loss": 1.004, + "step": 8810 + }, + { + "epoch": 6.010221465076661, + "grad_norm": 1.6573960781097412, + "learning_rate": 0.0002, + "loss": 0.8665, + "step": 8820 + }, + { + "epoch": 6.017035775127768, + "grad_norm": 1.2845953702926636, + "learning_rate": 0.0002, + "loss": 0.8866, + "step": 8830 + }, + { + "epoch": 6.023850085178876, + "grad_norm": 1.526754379272461, + "learning_rate": 0.0002, + "loss": 0.8528, + "step": 8840 + }, + { + "epoch": 6.030664395229983, + "grad_norm": 1.4536073207855225, + "learning_rate": 0.0002, + "loss": 0.8555, + "step": 8850 + }, + { + "epoch": 6.0374787052810905, + "grad_norm": 1.68099045753479, + "learning_rate": 0.0002, + "loss": 0.8839, + "step": 8860 + }, + { + "epoch": 6.044293015332197, + "grad_norm": 1.485777497291565, + "learning_rate": 0.0002, + "loss": 0.8538, + "step": 8870 + }, + { + "epoch": 6.051107325383305, + "grad_norm": 1.5084402561187744, + "learning_rate": 0.0002, + "loss": 0.8534, + "step": 8880 + }, + { + "epoch": 6.057921635434412, + "grad_norm": 1.3901145458221436, + "learning_rate": 0.0002, + "loss": 0.8587, + "step": 8890 + }, + { + "epoch": 6.06473594548552, + "grad_norm": 1.528954267501831, + "learning_rate": 0.0002, + "loss": 0.8625, + "step": 8900 + }, + { + "epoch": 6.071550255536627, + "grad_norm": 1.6869531869888306, + "learning_rate": 0.0002, + "loss": 0.9115, + "step": 8910 + }, + { + "epoch": 6.0783645655877345, + "grad_norm": 1.4149913787841797, + "learning_rate": 0.0002, + "loss": 0.8817, + "step": 8920 + }, + { + "epoch": 6.085178875638841, + "grad_norm": 1.6853618621826172, + "learning_rate": 0.0002, + "loss": 0.8734, + "step": 8930 + }, + { + "epoch": 6.091993185689949, + "grad_norm": 1.694443702697754, + "learning_rate": 0.0002, + "loss": 0.8836, + "step": 8940 + }, + { + "epoch": 6.098807495741056, + "grad_norm": 2.1037111282348633, + "learning_rate": 0.0002, + "loss": 0.9144, + "step": 8950 + }, + { + "epoch": 6.105621805792164, + "grad_norm": 2.1236703395843506, + "learning_rate": 0.0002, + "loss": 0.8296, + "step": 8960 + }, + { + "epoch": 6.112436115843271, + "grad_norm": 1.6621695756912231, + "learning_rate": 0.0002, + "loss": 0.8451, + "step": 8970 + }, + { + "epoch": 6.1192504258943785, + "grad_norm": 1.5390307903289795, + "learning_rate": 0.0002, + "loss": 0.8423, + "step": 8980 + }, + { + "epoch": 6.126064735945485, + "grad_norm": 1.7841306924819946, + "learning_rate": 0.0002, + "loss": 0.8829, + "step": 8990 + }, + { + "epoch": 6.132879045996593, + "grad_norm": 1.8420580625534058, + "learning_rate": 0.0002, + "loss": 0.8872, + "step": 9000 + }, + { + "epoch": 6.1396933560477, + "grad_norm": 1.8198356628417969, + "learning_rate": 0.0002, + "loss": 0.9411, + "step": 9010 + }, + { + "epoch": 6.146507666098808, + "grad_norm": 1.6955933570861816, + "learning_rate": 0.0002, + "loss": 0.8921, + "step": 9020 + }, + { + "epoch": 6.153321976149915, + "grad_norm": 1.5072602033615112, + "learning_rate": 0.0002, + "loss": 0.9241, + "step": 9030 + }, + { + "epoch": 6.1601362862010225, + "grad_norm": 1.63434898853302, + "learning_rate": 0.0002, + "loss": 0.8643, + "step": 9040 + }, + { + "epoch": 6.166950596252129, + "grad_norm": 1.3761866092681885, + "learning_rate": 0.0002, + "loss": 0.8317, + "step": 9050 + }, + { + "epoch": 6.173764906303237, + "grad_norm": 1.7027268409729004, + "learning_rate": 0.0002, + "loss": 0.8136, + "step": 9060 + }, + { + "epoch": 6.180579216354344, + "grad_norm": 1.3534049987792969, + "learning_rate": 0.0002, + "loss": 0.8333, + "step": 9070 + }, + { + "epoch": 6.187393526405452, + "grad_norm": 1.4437154531478882, + "learning_rate": 0.0002, + "loss": 0.847, + "step": 9080 + }, + { + "epoch": 6.194207836456559, + "grad_norm": 1.4449656009674072, + "learning_rate": 0.0002, + "loss": 0.9169, + "step": 9090 + }, + { + "epoch": 6.2010221465076665, + "grad_norm": 1.5854601860046387, + "learning_rate": 0.0002, + "loss": 0.846, + "step": 9100 + }, + { + "epoch": 6.207836456558773, + "grad_norm": 1.5987509489059448, + "learning_rate": 0.0002, + "loss": 0.8801, + "step": 9110 + }, + { + "epoch": 6.214650766609881, + "grad_norm": 1.6309672594070435, + "learning_rate": 0.0002, + "loss": 0.9077, + "step": 9120 + }, + { + "epoch": 6.221465076660988, + "grad_norm": 1.526936411857605, + "learning_rate": 0.0002, + "loss": 0.8802, + "step": 9130 + }, + { + "epoch": 6.228279386712096, + "grad_norm": 1.4649606943130493, + "learning_rate": 0.0002, + "loss": 0.8858, + "step": 9140 + }, + { + "epoch": 6.235093696763203, + "grad_norm": 1.589350700378418, + "learning_rate": 0.0002, + "loss": 0.9414, + "step": 9150 + }, + { + "epoch": 6.2419080068143105, + "grad_norm": 1.655668020248413, + "learning_rate": 0.0002, + "loss": 0.9001, + "step": 9160 + }, + { + "epoch": 6.248722316865417, + "grad_norm": 1.5296401977539062, + "learning_rate": 0.0002, + "loss": 0.9879, + "step": 9170 + }, + { + "epoch": 6.255536626916525, + "grad_norm": 1.5857278108596802, + "learning_rate": 0.0002, + "loss": 0.8908, + "step": 9180 + }, + { + "epoch": 6.262350936967632, + "grad_norm": 1.7779686450958252, + "learning_rate": 0.0002, + "loss": 0.9329, + "step": 9190 + }, + { + "epoch": 6.269165247018739, + "grad_norm": 1.588886022567749, + "learning_rate": 0.0002, + "loss": 0.9683, + "step": 9200 + }, + { + "epoch": 6.275979557069847, + "grad_norm": 1.3818320035934448, + "learning_rate": 0.0002, + "loss": 0.9091, + "step": 9210 + }, + { + "epoch": 6.2827938671209544, + "grad_norm": 1.6675978899002075, + "learning_rate": 0.0002, + "loss": 0.9003, + "step": 9220 + }, + { + "epoch": 6.289608177172061, + "grad_norm": 1.5672610998153687, + "learning_rate": 0.0002, + "loss": 0.9125, + "step": 9230 + }, + { + "epoch": 6.296422487223168, + "grad_norm": 1.4558004140853882, + "learning_rate": 0.0002, + "loss": 0.9083, + "step": 9240 + }, + { + "epoch": 6.303236797274276, + "grad_norm": 1.5393446683883667, + "learning_rate": 0.0002, + "loss": 0.9362, + "step": 9250 + }, + { + "epoch": 6.310051107325384, + "grad_norm": 1.4367083311080933, + "learning_rate": 0.0002, + "loss": 0.8807, + "step": 9260 + }, + { + "epoch": 6.316865417376491, + "grad_norm": 1.5045381784439087, + "learning_rate": 0.0002, + "loss": 0.9203, + "step": 9270 + }, + { + "epoch": 6.3236797274275975, + "grad_norm": 1.8604016304016113, + "learning_rate": 0.0002, + "loss": 0.9239, + "step": 9280 + }, + { + "epoch": 6.330494037478705, + "grad_norm": 1.4863131046295166, + "learning_rate": 0.0002, + "loss": 0.9644, + "step": 9290 + }, + { + "epoch": 6.337308347529812, + "grad_norm": 1.511121392250061, + "learning_rate": 0.0002, + "loss": 0.9052, + "step": 9300 + }, + { + "epoch": 6.34412265758092, + "grad_norm": 1.6979162693023682, + "learning_rate": 0.0002, + "loss": 0.8609, + "step": 9310 + }, + { + "epoch": 6.350936967632027, + "grad_norm": 1.6060494184494019, + "learning_rate": 0.0002, + "loss": 0.953, + "step": 9320 + }, + { + "epoch": 6.357751277683135, + "grad_norm": 1.6572561264038086, + "learning_rate": 0.0002, + "loss": 0.9552, + "step": 9330 + }, + { + "epoch": 6.3645655877342415, + "grad_norm": 1.6706757545471191, + "learning_rate": 0.0002, + "loss": 0.9201, + "step": 9340 + }, + { + "epoch": 6.371379897785349, + "grad_norm": 1.620836615562439, + "learning_rate": 0.0002, + "loss": 0.8693, + "step": 9350 + }, + { + "epoch": 6.378194207836456, + "grad_norm": 1.482940673828125, + "learning_rate": 0.0002, + "loss": 0.9281, + "step": 9360 + }, + { + "epoch": 6.385008517887564, + "grad_norm": 1.3969961404800415, + "learning_rate": 0.0002, + "loss": 0.9026, + "step": 9370 + }, + { + "epoch": 6.391822827938671, + "grad_norm": 1.611212134361267, + "learning_rate": 0.0002, + "loss": 0.8909, + "step": 9380 + }, + { + "epoch": 6.398637137989779, + "grad_norm": 1.5586223602294922, + "learning_rate": 0.0002, + "loss": 0.9137, + "step": 9390 + }, + { + "epoch": 6.4054514480408855, + "grad_norm": 1.394761562347412, + "learning_rate": 0.0002, + "loss": 0.9254, + "step": 9400 + }, + { + "epoch": 6.412265758091993, + "grad_norm": 1.559618592262268, + "learning_rate": 0.0002, + "loss": 0.8935, + "step": 9410 + }, + { + "epoch": 6.4190800681431, + "grad_norm": 1.462173581123352, + "learning_rate": 0.0002, + "loss": 0.9585, + "step": 9420 + }, + { + "epoch": 6.425894378194208, + "grad_norm": 1.5655437707901, + "learning_rate": 0.0002, + "loss": 0.9492, + "step": 9430 + }, + { + "epoch": 6.432708688245315, + "grad_norm": 1.4344340562820435, + "learning_rate": 0.0002, + "loss": 0.9371, + "step": 9440 + }, + { + "epoch": 6.439522998296423, + "grad_norm": 1.5132373571395874, + "learning_rate": 0.0002, + "loss": 0.9396, + "step": 9450 + }, + { + "epoch": 6.4463373083475295, + "grad_norm": 1.68776535987854, + "learning_rate": 0.0002, + "loss": 0.9229, + "step": 9460 + }, + { + "epoch": 6.453151618398637, + "grad_norm": 1.556823968887329, + "learning_rate": 0.0002, + "loss": 0.9524, + "step": 9470 + }, + { + "epoch": 6.459965928449744, + "grad_norm": 1.4254260063171387, + "learning_rate": 0.0002, + "loss": 0.94, + "step": 9480 + }, + { + "epoch": 6.466780238500852, + "grad_norm": 1.7901203632354736, + "learning_rate": 0.0002, + "loss": 0.9689, + "step": 9490 + }, + { + "epoch": 6.473594548551959, + "grad_norm": 1.5098410844802856, + "learning_rate": 0.0002, + "loss": 0.9267, + "step": 9500 + }, + { + "epoch": 6.480408858603067, + "grad_norm": 1.6036792993545532, + "learning_rate": 0.0002, + "loss": 0.9159, + "step": 9510 + }, + { + "epoch": 6.4872231686541735, + "grad_norm": 1.5011411905288696, + "learning_rate": 0.0002, + "loss": 0.9253, + "step": 9520 + }, + { + "epoch": 6.494037478705281, + "grad_norm": 1.410780906677246, + "learning_rate": 0.0002, + "loss": 0.9527, + "step": 9530 + }, + { + "epoch": 6.500851788756388, + "grad_norm": 1.7451791763305664, + "learning_rate": 0.0002, + "loss": 0.8927, + "step": 9540 + }, + { + "epoch": 6.507666098807496, + "grad_norm": 1.5888725519180298, + "learning_rate": 0.0002, + "loss": 0.9566, + "step": 9550 + }, + { + "epoch": 6.514480408858603, + "grad_norm": 1.3016585111618042, + "learning_rate": 0.0002, + "loss": 0.9324, + "step": 9560 + }, + { + "epoch": 6.521294718909711, + "grad_norm": 1.629522442817688, + "learning_rate": 0.0002, + "loss": 0.9576, + "step": 9570 + }, + { + "epoch": 6.5281090289608175, + "grad_norm": 1.494436264038086, + "learning_rate": 0.0002, + "loss": 0.92, + "step": 9580 + }, + { + "epoch": 6.534923339011925, + "grad_norm": 1.323195219039917, + "learning_rate": 0.0002, + "loss": 0.9154, + "step": 9590 + }, + { + "epoch": 6.541737649063032, + "grad_norm": 1.4904460906982422, + "learning_rate": 0.0002, + "loss": 0.9891, + "step": 9600 + }, + { + "epoch": 6.54855195911414, + "grad_norm": 1.6079169511795044, + "learning_rate": 0.0002, + "loss": 0.9316, + "step": 9610 + }, + { + "epoch": 6.555366269165247, + "grad_norm": 1.5113396644592285, + "learning_rate": 0.0002, + "loss": 1.0105, + "step": 9620 + }, + { + "epoch": 6.562180579216355, + "grad_norm": 1.7113087177276611, + "learning_rate": 0.0002, + "loss": 0.9618, + "step": 9630 + }, + { + "epoch": 6.5689948892674614, + "grad_norm": 1.359394907951355, + "learning_rate": 0.0002, + "loss": 0.9699, + "step": 9640 + }, + { + "epoch": 6.575809199318569, + "grad_norm": 1.7701337337493896, + "learning_rate": 0.0002, + "loss": 1.0267, + "step": 9650 + }, + { + "epoch": 6.582623509369676, + "grad_norm": 1.6381222009658813, + "learning_rate": 0.0002, + "loss": 0.9639, + "step": 9660 + }, + { + "epoch": 6.589437819420784, + "grad_norm": 1.781891942024231, + "learning_rate": 0.0002, + "loss": 0.9292, + "step": 9670 + }, + { + "epoch": 6.596252129471891, + "grad_norm": 1.47724449634552, + "learning_rate": 0.0002, + "loss": 1.0078, + "step": 9680 + }, + { + "epoch": 6.6030664395229985, + "grad_norm": 1.5498195886611938, + "learning_rate": 0.0002, + "loss": 1.0268, + "step": 9690 + }, + { + "epoch": 6.609880749574105, + "grad_norm": 1.5682368278503418, + "learning_rate": 0.0002, + "loss": 0.9794, + "step": 9700 + }, + { + "epoch": 6.616695059625213, + "grad_norm": 1.6106981039047241, + "learning_rate": 0.0002, + "loss": 0.9298, + "step": 9710 + }, + { + "epoch": 6.62350936967632, + "grad_norm": 1.5388364791870117, + "learning_rate": 0.0002, + "loss": 0.9644, + "step": 9720 + }, + { + "epoch": 6.630323679727428, + "grad_norm": 1.5432790517807007, + "learning_rate": 0.0002, + "loss": 0.9385, + "step": 9730 + }, + { + "epoch": 6.637137989778535, + "grad_norm": 1.4929786920547485, + "learning_rate": 0.0002, + "loss": 0.9995, + "step": 9740 + }, + { + "epoch": 6.6439522998296425, + "grad_norm": 1.6959431171417236, + "learning_rate": 0.0002, + "loss": 0.932, + "step": 9750 + }, + { + "epoch": 6.650766609880749, + "grad_norm": 1.4990962743759155, + "learning_rate": 0.0002, + "loss": 0.9397, + "step": 9760 + }, + { + "epoch": 6.657580919931857, + "grad_norm": 1.5235223770141602, + "learning_rate": 0.0002, + "loss": 0.9808, + "step": 9770 + }, + { + "epoch": 6.664395229982964, + "grad_norm": 1.8264366388320923, + "learning_rate": 0.0002, + "loss": 0.9522, + "step": 9780 + }, + { + "epoch": 6.671209540034072, + "grad_norm": 1.4298417568206787, + "learning_rate": 0.0002, + "loss": 0.9751, + "step": 9790 + }, + { + "epoch": 6.678023850085179, + "grad_norm": 1.5926862955093384, + "learning_rate": 0.0002, + "loss": 0.9607, + "step": 9800 + }, + { + "epoch": 6.6848381601362865, + "grad_norm": 1.4592483043670654, + "learning_rate": 0.0002, + "loss": 0.9681, + "step": 9810 + }, + { + "epoch": 6.691652470187393, + "grad_norm": 1.375799536705017, + "learning_rate": 0.0002, + "loss": 0.9385, + "step": 9820 + }, + { + "epoch": 6.698466780238501, + "grad_norm": 1.5767531394958496, + "learning_rate": 0.0002, + "loss": 0.9684, + "step": 9830 + }, + { + "epoch": 6.705281090289608, + "grad_norm": 1.6452189683914185, + "learning_rate": 0.0002, + "loss": 0.9313, + "step": 9840 + }, + { + "epoch": 6.712095400340716, + "grad_norm": 1.3874469995498657, + "learning_rate": 0.0002, + "loss": 0.9781, + "step": 9850 + }, + { + "epoch": 6.718909710391823, + "grad_norm": 1.5470930337905884, + "learning_rate": 0.0002, + "loss": 0.9803, + "step": 9860 + }, + { + "epoch": 6.7257240204429305, + "grad_norm": 1.499840259552002, + "learning_rate": 0.0002, + "loss": 0.9335, + "step": 9870 + }, + { + "epoch": 6.732538330494037, + "grad_norm": 1.4733195304870605, + "learning_rate": 0.0002, + "loss": 0.9209, + "step": 9880 + }, + { + "epoch": 6.739352640545145, + "grad_norm": 1.921722173690796, + "learning_rate": 0.0002, + "loss": 0.9124, + "step": 9890 + }, + { + "epoch": 6.746166950596252, + "grad_norm": 1.848003625869751, + "learning_rate": 0.0002, + "loss": 0.9311, + "step": 9900 + }, + { + "epoch": 6.75298126064736, + "grad_norm": 1.6050934791564941, + "learning_rate": 0.0002, + "loss": 0.9601, + "step": 9910 + }, + { + "epoch": 6.759795570698467, + "grad_norm": 1.716424822807312, + "learning_rate": 0.0002, + "loss": 0.941, + "step": 9920 + }, + { + "epoch": 6.7666098807495745, + "grad_norm": 1.5647642612457275, + "learning_rate": 0.0002, + "loss": 0.9592, + "step": 9930 + }, + { + "epoch": 6.773424190800681, + "grad_norm": 1.5500049591064453, + "learning_rate": 0.0002, + "loss": 0.927, + "step": 9940 + }, + { + "epoch": 6.780238500851789, + "grad_norm": 1.5384467840194702, + "learning_rate": 0.0002, + "loss": 0.9921, + "step": 9950 + }, + { + "epoch": 6.787052810902896, + "grad_norm": 1.8312339782714844, + "learning_rate": 0.0002, + "loss": 0.9673, + "step": 9960 + }, + { + "epoch": 6.793867120954003, + "grad_norm": 1.3505569696426392, + "learning_rate": 0.0002, + "loss": 0.9647, + "step": 9970 + }, + { + "epoch": 6.800681431005111, + "grad_norm": 1.6717044115066528, + "learning_rate": 0.0002, + "loss": 0.9553, + "step": 9980 + }, + { + "epoch": 6.8074957410562185, + "grad_norm": 1.7072664499282837, + "learning_rate": 0.0002, + "loss": 0.9688, + "step": 9990 + }, + { + "epoch": 6.814310051107325, + "grad_norm": 1.3609364032745361, + "learning_rate": 0.0002, + "loss": 0.951, + "step": 10000 + }, + { + "epoch": 6.821124361158432, + "grad_norm": 1.4862881898880005, + "learning_rate": 0.0002, + "loss": 0.9638, + "step": 10010 + }, + { + "epoch": 6.82793867120954, + "grad_norm": 1.4808303117752075, + "learning_rate": 0.0002, + "loss": 1.016, + "step": 10020 + }, + { + "epoch": 6.834752981260648, + "grad_norm": 1.6531925201416016, + "learning_rate": 0.0002, + "loss": 0.9233, + "step": 10030 + }, + { + "epoch": 6.841567291311755, + "grad_norm": 1.5090917348861694, + "learning_rate": 0.0002, + "loss": 0.9435, + "step": 10040 + }, + { + "epoch": 6.848381601362862, + "grad_norm": 1.5361953973770142, + "learning_rate": 0.0002, + "loss": 0.9395, + "step": 10050 + }, + { + "epoch": 6.855195911413969, + "grad_norm": 1.7302757501602173, + "learning_rate": 0.0002, + "loss": 1.0095, + "step": 10060 + }, + { + "epoch": 6.862010221465077, + "grad_norm": 1.5626600980758667, + "learning_rate": 0.0002, + "loss": 0.9796, + "step": 10070 + }, + { + "epoch": 6.868824531516184, + "grad_norm": 1.4168927669525146, + "learning_rate": 0.0002, + "loss": 1.0244, + "step": 10080 + }, + { + "epoch": 6.875638841567291, + "grad_norm": 1.3921427726745605, + "learning_rate": 0.0002, + "loss": 0.9253, + "step": 10090 + }, + { + "epoch": 6.882453151618399, + "grad_norm": 1.6304726600646973, + "learning_rate": 0.0002, + "loss": 1.0037, + "step": 10100 + }, + { + "epoch": 6.889267461669506, + "grad_norm": 1.5463745594024658, + "learning_rate": 0.0002, + "loss": 1.0088, + "step": 10110 + }, + { + "epoch": 6.896081771720613, + "grad_norm": 1.4989547729492188, + "learning_rate": 0.0002, + "loss": 1.0276, + "step": 10120 + }, + { + "epoch": 6.90289608177172, + "grad_norm": 1.7281252145767212, + "learning_rate": 0.0002, + "loss": 1.0352, + "step": 10130 + }, + { + "epoch": 6.909710391822828, + "grad_norm": 1.469348669052124, + "learning_rate": 0.0002, + "loss": 1.031, + "step": 10140 + }, + { + "epoch": 6.916524701873936, + "grad_norm": 1.3762892484664917, + "learning_rate": 0.0002, + "loss": 1.0301, + "step": 10150 + }, + { + "epoch": 6.923339011925043, + "grad_norm": 1.489425539970398, + "learning_rate": 0.0002, + "loss": 1.0032, + "step": 10160 + }, + { + "epoch": 6.9301533219761495, + "grad_norm": 1.4514580965042114, + "learning_rate": 0.0002, + "loss": 0.9487, + "step": 10170 + }, + { + "epoch": 6.936967632027257, + "grad_norm": 1.6008871793746948, + "learning_rate": 0.0002, + "loss": 0.9898, + "step": 10180 + }, + { + "epoch": 6.943781942078364, + "grad_norm": 1.6893450021743774, + "learning_rate": 0.0002, + "loss": 1.0577, + "step": 10190 + }, + { + "epoch": 6.950596252129472, + "grad_norm": 1.66379976272583, + "learning_rate": 0.0002, + "loss": 0.9699, + "step": 10200 + }, + { + "epoch": 6.957410562180579, + "grad_norm": 1.501943588256836, + "learning_rate": 0.0002, + "loss": 1.0159, + "step": 10210 + }, + { + "epoch": 6.964224872231687, + "grad_norm": 1.6803759336471558, + "learning_rate": 0.0002, + "loss": 1.0414, + "step": 10220 + }, + { + "epoch": 6.9710391822827935, + "grad_norm": 1.4512689113616943, + "learning_rate": 0.0002, + "loss": 1.0413, + "step": 10230 + }, + { + "epoch": 6.977853492333901, + "grad_norm": 1.6071290969848633, + "learning_rate": 0.0002, + "loss": 0.9791, + "step": 10240 + }, + { + "epoch": 6.984667802385008, + "grad_norm": 1.598915696144104, + "learning_rate": 0.0002, + "loss": 1.0574, + "step": 10250 + }, + { + "epoch": 6.991482112436116, + "grad_norm": 1.7178512811660767, + "learning_rate": 0.0002, + "loss": 1.0379, + "step": 10260 + }, + { + "epoch": 6.998296422487223, + "grad_norm": 1.4407050609588623, + "learning_rate": 0.0002, + "loss": 1.0082, + "step": 10270 + }, + { + "epoch": 6.999659284497445, + "eval_loss": 2.4567856788635254, + "eval_runtime": 69.5742, + "eval_samples_per_second": 7.287, + "eval_steps_per_second": 0.92, + "step": 10272 + }, + { + "epoch": 7.005110732538331, + "grad_norm": 1.6635409593582153, + "learning_rate": 0.0002, + "loss": 0.7802, + "step": 10280 + }, + { + "epoch": 7.0119250425894375, + "grad_norm": 1.8180204629898071, + "learning_rate": 0.0002, + "loss": 0.6558, + "step": 10290 + }, + { + "epoch": 7.018739352640545, + "grad_norm": 1.7982863187789917, + "learning_rate": 0.0002, + "loss": 0.7228, + "step": 10300 + }, + { + "epoch": 7.025553662691652, + "grad_norm": 2.1364097595214844, + "learning_rate": 0.0002, + "loss": 0.7028, + "step": 10310 + }, + { + "epoch": 7.03236797274276, + "grad_norm": 1.9538214206695557, + "learning_rate": 0.0002, + "loss": 0.7317, + "step": 10320 + }, + { + "epoch": 7.039182282793867, + "grad_norm": 1.7746129035949707, + "learning_rate": 0.0002, + "loss": 0.6774, + "step": 10330 + }, + { + "epoch": 7.045996592844975, + "grad_norm": 1.5186023712158203, + "learning_rate": 0.0002, + "loss": 0.7045, + "step": 10340 + }, + { + "epoch": 7.0528109028960815, + "grad_norm": 1.9523893594741821, + "learning_rate": 0.0002, + "loss": 0.7208, + "step": 10350 + }, + { + "epoch": 7.059625212947189, + "grad_norm": 1.9791967868804932, + "learning_rate": 0.0002, + "loss": 0.6842, + "step": 10360 + }, + { + "epoch": 7.066439522998296, + "grad_norm": 1.4577405452728271, + "learning_rate": 0.0002, + "loss": 0.67, + "step": 10370 + }, + { + "epoch": 7.073253833049404, + "grad_norm": 1.7670400142669678, + "learning_rate": 0.0002, + "loss": 0.7209, + "step": 10380 + }, + { + "epoch": 7.080068143100511, + "grad_norm": 1.9858429431915283, + "learning_rate": 0.0002, + "loss": 0.7416, + "step": 10390 + }, + { + "epoch": 7.086882453151619, + "grad_norm": 1.4968500137329102, + "learning_rate": 0.0002, + "loss": 0.6793, + "step": 10400 + }, + { + "epoch": 7.0936967632027255, + "grad_norm": 2.2092909812927246, + "learning_rate": 0.0002, + "loss": 0.6852, + "step": 10410 + }, + { + "epoch": 7.100511073253833, + "grad_norm": 1.944272756576538, + "learning_rate": 0.0002, + "loss": 0.7256, + "step": 10420 + }, + { + "epoch": 7.10732538330494, + "grad_norm": 1.7232941389083862, + "learning_rate": 0.0002, + "loss": 0.6841, + "step": 10430 + }, + { + "epoch": 7.114139693356048, + "grad_norm": 2.098334312438965, + "learning_rate": 0.0002, + "loss": 0.7197, + "step": 10440 + }, + { + "epoch": 7.120954003407155, + "grad_norm": 1.7802670001983643, + "learning_rate": 0.0002, + "loss": 0.672, + "step": 10450 + }, + { + "epoch": 7.127768313458263, + "grad_norm": 1.7171560525894165, + "learning_rate": 0.0002, + "loss": 0.7054, + "step": 10460 + }, + { + "epoch": 7.1345826235093694, + "grad_norm": 1.7227827310562134, + "learning_rate": 0.0002, + "loss": 0.7414, + "step": 10470 + }, + { + "epoch": 7.141396933560477, + "grad_norm": 2.0002410411834717, + "learning_rate": 0.0002, + "loss": 0.7164, + "step": 10480 + }, + { + "epoch": 7.148211243611584, + "grad_norm": 2.0559451580047607, + "learning_rate": 0.0002, + "loss": 0.7246, + "step": 10490 + }, + { + "epoch": 7.155025553662692, + "grad_norm": 1.6929457187652588, + "learning_rate": 0.0002, + "loss": 0.7178, + "step": 10500 + }, + { + "epoch": 7.161839863713799, + "grad_norm": 1.8747141361236572, + "learning_rate": 0.0002, + "loss": 0.6839, + "step": 10510 + }, + { + "epoch": 7.1686541737649065, + "grad_norm": 2.1793057918548584, + "learning_rate": 0.0002, + "loss": 0.7672, + "step": 10520 + }, + { + "epoch": 7.175468483816013, + "grad_norm": 1.8422093391418457, + "learning_rate": 0.0002, + "loss": 0.7485, + "step": 10530 + }, + { + "epoch": 7.182282793867121, + "grad_norm": 1.4060566425323486, + "learning_rate": 0.0002, + "loss": 0.7678, + "step": 10540 + }, + { + "epoch": 7.189097103918228, + "grad_norm": 1.8884180784225464, + "learning_rate": 0.0002, + "loss": 0.6677, + "step": 10550 + }, + { + "epoch": 7.195911413969336, + "grad_norm": 1.523154854774475, + "learning_rate": 0.0002, + "loss": 0.7458, + "step": 10560 + }, + { + "epoch": 7.202725724020443, + "grad_norm": 1.8293776512145996, + "learning_rate": 0.0002, + "loss": 0.7462, + "step": 10570 + }, + { + "epoch": 7.2095400340715505, + "grad_norm": 1.8931537866592407, + "learning_rate": 0.0002, + "loss": 0.7172, + "step": 10580 + }, + { + "epoch": 7.216354344122657, + "grad_norm": 1.7758889198303223, + "learning_rate": 0.0002, + "loss": 0.7183, + "step": 10590 + }, + { + "epoch": 7.223168654173765, + "grad_norm": 1.9986528158187866, + "learning_rate": 0.0002, + "loss": 0.6953, + "step": 10600 + }, + { + "epoch": 7.229982964224872, + "grad_norm": 3.0123329162597656, + "learning_rate": 0.0002, + "loss": 0.8125, + "step": 10610 + }, + { + "epoch": 7.23679727427598, + "grad_norm": 2.203801155090332, + "learning_rate": 0.0002, + "loss": 0.7435, + "step": 10620 + }, + { + "epoch": 7.243611584327087, + "grad_norm": 1.756627082824707, + "learning_rate": 0.0002, + "loss": 0.7492, + "step": 10630 + }, + { + "epoch": 7.2504258943781945, + "grad_norm": 1.6657848358154297, + "learning_rate": 0.0002, + "loss": 0.7664, + "step": 10640 + }, + { + "epoch": 7.257240204429301, + "grad_norm": 1.8871530294418335, + "learning_rate": 0.0002, + "loss": 0.7611, + "step": 10650 + }, + { + "epoch": 7.264054514480409, + "grad_norm": 1.8031877279281616, + "learning_rate": 0.0002, + "loss": 0.7579, + "step": 10660 + }, + { + "epoch": 7.270868824531516, + "grad_norm": 1.8694801330566406, + "learning_rate": 0.0002, + "loss": 0.7928, + "step": 10670 + }, + { + "epoch": 7.277683134582624, + "grad_norm": 1.6305289268493652, + "learning_rate": 0.0002, + "loss": 0.7609, + "step": 10680 + }, + { + "epoch": 7.284497444633731, + "grad_norm": 1.8838950395584106, + "learning_rate": 0.0002, + "loss": 0.7178, + "step": 10690 + }, + { + "epoch": 7.2913117546848385, + "grad_norm": 1.6298766136169434, + "learning_rate": 0.0002, + "loss": 0.7397, + "step": 10700 + }, + { + "epoch": 7.298126064735945, + "grad_norm": 1.6832125186920166, + "learning_rate": 0.0002, + "loss": 0.805, + "step": 10710 + }, + { + "epoch": 7.304940374787053, + "grad_norm": 1.9299124479293823, + "learning_rate": 0.0002, + "loss": 0.7388, + "step": 10720 + }, + { + "epoch": 7.31175468483816, + "grad_norm": 1.6476620435714722, + "learning_rate": 0.0002, + "loss": 0.7219, + "step": 10730 + }, + { + "epoch": 7.318568994889268, + "grad_norm": 2.046297788619995, + "learning_rate": 0.0002, + "loss": 0.7623, + "step": 10740 + }, + { + "epoch": 7.325383304940375, + "grad_norm": 1.9311174154281616, + "learning_rate": 0.0002, + "loss": 0.7824, + "step": 10750 + }, + { + "epoch": 7.3321976149914825, + "grad_norm": 1.8964996337890625, + "learning_rate": 0.0002, + "loss": 0.7469, + "step": 10760 + }, + { + "epoch": 7.339011925042589, + "grad_norm": 1.8085095882415771, + "learning_rate": 0.0002, + "loss": 0.7361, + "step": 10770 + }, + { + "epoch": 7.345826235093697, + "grad_norm": 1.6951984167099, + "learning_rate": 0.0002, + "loss": 0.7753, + "step": 10780 + }, + { + "epoch": 7.352640545144804, + "grad_norm": 1.6665486097335815, + "learning_rate": 0.0002, + "loss": 0.7856, + "step": 10790 + }, + { + "epoch": 7.359454855195912, + "grad_norm": 1.4161039590835571, + "learning_rate": 0.0002, + "loss": 0.7632, + "step": 10800 + }, + { + "epoch": 7.366269165247019, + "grad_norm": 1.8640085458755493, + "learning_rate": 0.0002, + "loss": 0.7377, + "step": 10810 + }, + { + "epoch": 7.3730834752981265, + "grad_norm": 1.8302277326583862, + "learning_rate": 0.0002, + "loss": 0.7791, + "step": 10820 + }, + { + "epoch": 7.379897785349233, + "grad_norm": 1.6959542036056519, + "learning_rate": 0.0002, + "loss": 0.8338, + "step": 10830 + }, + { + "epoch": 7.386712095400341, + "grad_norm": 2.171138286590576, + "learning_rate": 0.0002, + "loss": 0.7657, + "step": 10840 + }, + { + "epoch": 7.393526405451448, + "grad_norm": 1.9314014911651611, + "learning_rate": 0.0002, + "loss": 0.7897, + "step": 10850 + }, + { + "epoch": 7.400340715502555, + "grad_norm": 1.8977826833724976, + "learning_rate": 0.0002, + "loss": 0.7314, + "step": 10860 + }, + { + "epoch": 7.407155025553663, + "grad_norm": 2.024486541748047, + "learning_rate": 0.0002, + "loss": 0.7459, + "step": 10870 + }, + { + "epoch": 7.4139693356047704, + "grad_norm": 1.8545196056365967, + "learning_rate": 0.0002, + "loss": 0.771, + "step": 10880 + }, + { + "epoch": 7.420783645655877, + "grad_norm": 1.9366614818572998, + "learning_rate": 0.0002, + "loss": 0.7558, + "step": 10890 + }, + { + "epoch": 7.427597955706984, + "grad_norm": 2.051706075668335, + "learning_rate": 0.0002, + "loss": 0.7121, + "step": 10900 + }, + { + "epoch": 7.434412265758092, + "grad_norm": 1.624997615814209, + "learning_rate": 0.0002, + "loss": 0.7618, + "step": 10910 + }, + { + "epoch": 7.4412265758092, + "grad_norm": 1.8717564344406128, + "learning_rate": 0.0002, + "loss": 0.7886, + "step": 10920 + }, + { + "epoch": 7.448040885860307, + "grad_norm": 2.0878796577453613, + "learning_rate": 0.0002, + "loss": 0.7614, + "step": 10930 + }, + { + "epoch": 7.4548551959114135, + "grad_norm": 1.7073718309402466, + "learning_rate": 0.0002, + "loss": 0.8, + "step": 10940 + }, + { + "epoch": 7.461669505962521, + "grad_norm": 1.6618555784225464, + "learning_rate": 0.0002, + "loss": 0.7897, + "step": 10950 + }, + { + "epoch": 7.468483816013628, + "grad_norm": 1.8428804874420166, + "learning_rate": 0.0002, + "loss": 0.8101, + "step": 10960 + }, + { + "epoch": 7.475298126064736, + "grad_norm": 1.8749566078186035, + "learning_rate": 0.0002, + "loss": 0.777, + "step": 10970 + }, + { + "epoch": 7.482112436115843, + "grad_norm": 1.846954107284546, + "learning_rate": 0.0002, + "loss": 0.8125, + "step": 10980 + }, + { + "epoch": 7.488926746166951, + "grad_norm": 1.878496527671814, + "learning_rate": 0.0002, + "loss": 0.8334, + "step": 10990 + }, + { + "epoch": 7.4957410562180575, + "grad_norm": 2.039119005203247, + "learning_rate": 0.0002, + "loss": 0.7796, + "step": 11000 + }, + { + "epoch": 7.502555366269165, + "grad_norm": 1.677701473236084, + "learning_rate": 0.0002, + "loss": 0.7788, + "step": 11010 + }, + { + "epoch": 7.509369676320272, + "grad_norm": 1.7645316123962402, + "learning_rate": 0.0002, + "loss": 0.7649, + "step": 11020 + }, + { + "epoch": 7.51618398637138, + "grad_norm": 1.7873706817626953, + "learning_rate": 0.0002, + "loss": 0.8325, + "step": 11030 + }, + { + "epoch": 7.522998296422487, + "grad_norm": 1.880903959274292, + "learning_rate": 0.0002, + "loss": 0.823, + "step": 11040 + }, + { + "epoch": 7.529812606473595, + "grad_norm": 1.4965842962265015, + "learning_rate": 0.0002, + "loss": 0.8542, + "step": 11050 + }, + { + "epoch": 7.5366269165247015, + "grad_norm": 1.9609076976776123, + "learning_rate": 0.0002, + "loss": 0.814, + "step": 11060 + }, + { + "epoch": 7.543441226575809, + "grad_norm": 1.8582744598388672, + "learning_rate": 0.0002, + "loss": 0.8063, + "step": 11070 + }, + { + "epoch": 7.550255536626916, + "grad_norm": 1.7395402193069458, + "learning_rate": 0.0002, + "loss": 0.7882, + "step": 11080 + }, + { + "epoch": 7.557069846678024, + "grad_norm": 1.8297388553619385, + "learning_rate": 0.0002, + "loss": 0.8347, + "step": 11090 + }, + { + "epoch": 7.563884156729131, + "grad_norm": 1.9110262393951416, + "learning_rate": 0.0002, + "loss": 0.7666, + "step": 11100 + }, + { + "epoch": 7.570698466780239, + "grad_norm": 1.873039722442627, + "learning_rate": 0.0002, + "loss": 0.8264, + "step": 11110 + }, + { + "epoch": 7.5775127768313455, + "grad_norm": 1.8473812341690063, + "learning_rate": 0.0002, + "loss": 0.8212, + "step": 11120 + }, + { + "epoch": 7.584327086882453, + "grad_norm": 1.9834227561950684, + "learning_rate": 0.0002, + "loss": 0.8532, + "step": 11130 + }, + { + "epoch": 7.59114139693356, + "grad_norm": 1.7381705045700073, + "learning_rate": 0.0002, + "loss": 0.8256, + "step": 11140 + }, + { + "epoch": 7.597955706984668, + "grad_norm": 1.619881272315979, + "learning_rate": 0.0002, + "loss": 0.815, + "step": 11150 + }, + { + "epoch": 7.604770017035775, + "grad_norm": 1.773484706878662, + "learning_rate": 0.0002, + "loss": 0.8182, + "step": 11160 + }, + { + "epoch": 7.611584327086883, + "grad_norm": 1.8400499820709229, + "learning_rate": 0.0002, + "loss": 0.848, + "step": 11170 + }, + { + "epoch": 7.6183986371379895, + "grad_norm": 1.936593770980835, + "learning_rate": 0.0002, + "loss": 0.8356, + "step": 11180 + }, + { + "epoch": 7.625212947189097, + "grad_norm": 2.037844181060791, + "learning_rate": 0.0002, + "loss": 0.8384, + "step": 11190 + }, + { + "epoch": 7.632027257240204, + "grad_norm": 1.6165574789047241, + "learning_rate": 0.0002, + "loss": 0.807, + "step": 11200 + }, + { + "epoch": 7.638841567291312, + "grad_norm": 1.886804461479187, + "learning_rate": 0.0002, + "loss": 0.7791, + "step": 11210 + }, + { + "epoch": 7.645655877342419, + "grad_norm": 1.8130316734313965, + "learning_rate": 0.0002, + "loss": 0.7953, + "step": 11220 + }, + { + "epoch": 7.652470187393527, + "grad_norm": 1.7955272197723389, + "learning_rate": 0.0002, + "loss": 0.7991, + "step": 11230 + }, + { + "epoch": 7.6592844974446335, + "grad_norm": 1.6500684022903442, + "learning_rate": 0.0002, + "loss": 0.8104, + "step": 11240 + }, + { + "epoch": 7.666098807495741, + "grad_norm": 1.782709002494812, + "learning_rate": 0.0002, + "loss": 0.8156, + "step": 11250 + }, + { + "epoch": 7.672913117546848, + "grad_norm": 1.8072985410690308, + "learning_rate": 0.0002, + "loss": 0.831, + "step": 11260 + }, + { + "epoch": 7.679727427597956, + "grad_norm": 1.8962644338607788, + "learning_rate": 0.0002, + "loss": 0.8852, + "step": 11270 + }, + { + "epoch": 7.686541737649063, + "grad_norm": 1.794803261756897, + "learning_rate": 0.0002, + "loss": 0.8586, + "step": 11280 + }, + { + "epoch": 7.693356047700171, + "grad_norm": 1.8621071577072144, + "learning_rate": 0.0002, + "loss": 0.8727, + "step": 11290 + }, + { + "epoch": 7.7001703577512775, + "grad_norm": 2.1268274784088135, + "learning_rate": 0.0002, + "loss": 0.8411, + "step": 11300 + }, + { + "epoch": 7.706984667802385, + "grad_norm": 1.776221513748169, + "learning_rate": 0.0002, + "loss": 0.8529, + "step": 11310 + }, + { + "epoch": 7.713798977853492, + "grad_norm": 2.5115597248077393, + "learning_rate": 0.0002, + "loss": 0.8108, + "step": 11320 + }, + { + "epoch": 7.7206132879046, + "grad_norm": 1.9946764707565308, + "learning_rate": 0.0002, + "loss": 0.8334, + "step": 11330 + }, + { + "epoch": 7.727427597955707, + "grad_norm": 1.7262247800827026, + "learning_rate": 0.0002, + "loss": 0.7893, + "step": 11340 + }, + { + "epoch": 7.7342419080068145, + "grad_norm": 1.971244215965271, + "learning_rate": 0.0002, + "loss": 0.8513, + "step": 11350 + }, + { + "epoch": 7.741056218057921, + "grad_norm": 1.8255480527877808, + "learning_rate": 0.0002, + "loss": 0.8064, + "step": 11360 + }, + { + "epoch": 7.747870528109029, + "grad_norm": 1.6721539497375488, + "learning_rate": 0.0002, + "loss": 0.7787, + "step": 11370 + }, + { + "epoch": 7.754684838160136, + "grad_norm": 1.9740724563598633, + "learning_rate": 0.0002, + "loss": 0.8256, + "step": 11380 + }, + { + "epoch": 7.761499148211244, + "grad_norm": 1.9174233675003052, + "learning_rate": 0.0002, + "loss": 0.8778, + "step": 11390 + }, + { + "epoch": 7.768313458262351, + "grad_norm": 1.927493691444397, + "learning_rate": 0.0002, + "loss": 0.8024, + "step": 11400 + }, + { + "epoch": 7.7751277683134585, + "grad_norm": 1.6313871145248413, + "learning_rate": 0.0002, + "loss": 0.8443, + "step": 11410 + }, + { + "epoch": 7.781942078364565, + "grad_norm": 2.0635557174682617, + "learning_rate": 0.0002, + "loss": 0.8771, + "step": 11420 + }, + { + "epoch": 7.788756388415673, + "grad_norm": 1.597979187965393, + "learning_rate": 0.0002, + "loss": 0.7838, + "step": 11430 + }, + { + "epoch": 7.79557069846678, + "grad_norm": 1.8125237226486206, + "learning_rate": 0.0002, + "loss": 0.8391, + "step": 11440 + }, + { + "epoch": 7.802385008517888, + "grad_norm": 1.6833277940750122, + "learning_rate": 0.0002, + "loss": 0.8462, + "step": 11450 + }, + { + "epoch": 7.809199318568995, + "grad_norm": 1.9060336351394653, + "learning_rate": 0.0002, + "loss": 0.9158, + "step": 11460 + }, + { + "epoch": 7.8160136286201025, + "grad_norm": 1.6847437620162964, + "learning_rate": 0.0002, + "loss": 0.8473, + "step": 11470 + }, + { + "epoch": 7.822827938671209, + "grad_norm": 1.8693677186965942, + "learning_rate": 0.0002, + "loss": 0.851, + "step": 11480 + }, + { + "epoch": 7.829642248722317, + "grad_norm": 1.7141996622085571, + "learning_rate": 0.0002, + "loss": 0.7793, + "step": 11490 + }, + { + "epoch": 7.836456558773424, + "grad_norm": 1.7096906900405884, + "learning_rate": 0.0002, + "loss": 0.8254, + "step": 11500 + }, + { + "epoch": 7.843270868824532, + "grad_norm": 1.7270509004592896, + "learning_rate": 0.0002, + "loss": 0.8372, + "step": 11510 + }, + { + "epoch": 7.850085178875639, + "grad_norm": 1.6399152278900146, + "learning_rate": 0.0002, + "loss": 0.8513, + "step": 11520 + }, + { + "epoch": 7.8568994889267465, + "grad_norm": 1.7190455198287964, + "learning_rate": 0.0002, + "loss": 0.7867, + "step": 11530 + }, + { + "epoch": 7.863713798977853, + "grad_norm": 1.7967315912246704, + "learning_rate": 0.0002, + "loss": 0.803, + "step": 11540 + }, + { + "epoch": 7.870528109028961, + "grad_norm": 1.904163122177124, + "learning_rate": 0.0002, + "loss": 0.7835, + "step": 11550 + }, + { + "epoch": 7.877342419080068, + "grad_norm": 1.898577094078064, + "learning_rate": 0.0002, + "loss": 0.8699, + "step": 11560 + }, + { + "epoch": 7.884156729131176, + "grad_norm": 1.9581187963485718, + "learning_rate": 0.0002, + "loss": 0.8596, + "step": 11570 + }, + { + "epoch": 7.890971039182283, + "grad_norm": 1.756208062171936, + "learning_rate": 0.0002, + "loss": 0.8564, + "step": 11580 + }, + { + "epoch": 7.8977853492333905, + "grad_norm": 2.020146608352661, + "learning_rate": 0.0002, + "loss": 0.9012, + "step": 11590 + }, + { + "epoch": 7.904599659284497, + "grad_norm": 1.647647738456726, + "learning_rate": 0.0002, + "loss": 0.8174, + "step": 11600 + }, + { + "epoch": 7.911413969335605, + "grad_norm": 1.8647202253341675, + "learning_rate": 0.0002, + "loss": 0.848, + "step": 11610 + }, + { + "epoch": 7.918228279386712, + "grad_norm": 1.72721266746521, + "learning_rate": 0.0002, + "loss": 0.8489, + "step": 11620 + }, + { + "epoch": 7.92504258943782, + "grad_norm": 1.9360839128494263, + "learning_rate": 0.0002, + "loss": 0.8407, + "step": 11630 + }, + { + "epoch": 7.931856899488927, + "grad_norm": 1.7773231267929077, + "learning_rate": 0.0002, + "loss": 0.8777, + "step": 11640 + }, + { + "epoch": 7.9386712095400345, + "grad_norm": 1.762197494506836, + "learning_rate": 0.0002, + "loss": 0.8201, + "step": 11650 + }, + { + "epoch": 7.945485519591141, + "grad_norm": 1.8185408115386963, + "learning_rate": 0.0002, + "loss": 0.8083, + "step": 11660 + }, + { + "epoch": 7.952299829642248, + "grad_norm": 1.9808121919631958, + "learning_rate": 0.0002, + "loss": 0.8979, + "step": 11670 + }, + { + "epoch": 7.959114139693356, + "grad_norm": 1.888456106185913, + "learning_rate": 0.0002, + "loss": 0.8176, + "step": 11680 + }, + { + "epoch": 7.965928449744464, + "grad_norm": 1.860640525817871, + "learning_rate": 0.0002, + "loss": 0.8445, + "step": 11690 + }, + { + "epoch": 7.972742759795571, + "grad_norm": 1.7443981170654297, + "learning_rate": 0.0002, + "loss": 0.8665, + "step": 11700 + }, + { + "epoch": 7.979557069846678, + "grad_norm": 1.6821815967559814, + "learning_rate": 0.0002, + "loss": 0.8394, + "step": 11710 + }, + { + "epoch": 7.986371379897785, + "grad_norm": 1.6265391111373901, + "learning_rate": 0.0002, + "loss": 0.8809, + "step": 11720 + }, + { + "epoch": 7.993185689948893, + "grad_norm": 1.8354634046554565, + "learning_rate": 0.0002, + "loss": 0.9274, + "step": 11730 + }, + { + "epoch": 7.997274275979557, + "eval_loss": 2.6678338050842285, + "eval_runtime": 67.4602, + "eval_samples_per_second": 7.516, + "eval_steps_per_second": 0.949, + "step": 11736 + } + ], + "logging_steps": 10, + "max_steps": 11736, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6.028049186920858e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-11736/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-11736/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2f734531f14829705714a5a2703d90c81f36eeb0 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-11736/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2af5d486d0b370637d5a3999ab7668f94f5d53b6497f018967564aef5e8ad133 +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..71d27bad73e9e55e6e2b87696e1b3b4dbe8f7f9d --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b346eaa9e073c3cd345a6d9f82f197fbea2e95dbd041c1698f40d6d882de0d94 +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d01078199419a4f1ce67fcfe8055b89cb1efcd3f --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d68a7e809da5386090196cc27b14f4ba0c78272626624e25bdbbe40c2bd7e4f +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..57a3579d0850f8a29220c6799abfd48c8dd3f858 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:518ef7371f6c934e0630bd1edb35965f0c213f980bc1d4e61fe89cf00cc1d928 +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..94b80282fc90c2e41c89e7cbf693a65d100a3a26 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:331d722674b800e8c81447e4ed79323e1d696462cdb1e33b3951468ac1e10380 +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5540be427e733232b3df20a1e3cd2278aaf4c6b8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467/trainer_state.json @@ -0,0 +1,1063 @@ +{ + "best_metric": 1.8034634590148926, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467", + "epoch": 0.9996592844974447, + "eval_steps": 10, + "global_step": 1467, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0068143100511073255, + "grad_norm": 0.635574460029602, + "learning_rate": 0.0002, + "loss": 3.0988, + "step": 10 + }, + { + "epoch": 0.013628620102214651, + "grad_norm": 1.0401769876480103, + "learning_rate": 0.0002, + "loss": 2.5594, + "step": 20 + }, + { + "epoch": 0.020442930153321975, + "grad_norm": 0.4514131247997284, + "learning_rate": 0.0002, + "loss": 2.3587, + "step": 30 + }, + { + "epoch": 0.027257240204429302, + "grad_norm": 0.6303355693817139, + "learning_rate": 0.0002, + "loss": 2.121, + "step": 40 + }, + { + "epoch": 0.034071550255536626, + "grad_norm": 0.4648270606994629, + "learning_rate": 0.0002, + "loss": 1.9833, + "step": 50 + }, + { + "epoch": 0.04088586030664395, + "grad_norm": 0.42953479290008545, + "learning_rate": 0.0002, + "loss": 1.9384, + "step": 60 + }, + { + "epoch": 0.04770017035775128, + "grad_norm": 0.433614581823349, + "learning_rate": 0.0002, + "loss": 1.9202, + "step": 70 + }, + { + "epoch": 0.054514480408858604, + "grad_norm": 0.45318254828453064, + "learning_rate": 0.0002, + "loss": 1.911, + "step": 80 + }, + { + "epoch": 0.06132879045996593, + "grad_norm": 0.4023568630218506, + "learning_rate": 0.0002, + "loss": 1.8895, + "step": 90 + }, + { + "epoch": 0.06814310051107325, + "grad_norm": 0.43260207772254944, + "learning_rate": 0.0002, + "loss": 1.9257, + "step": 100 + }, + { + "epoch": 0.07495741056218058, + "grad_norm": 0.43389809131622314, + "learning_rate": 0.0002, + "loss": 1.9178, + "step": 110 + }, + { + "epoch": 0.0817717206132879, + "grad_norm": 0.39307987689971924, + "learning_rate": 0.0002, + "loss": 1.9071, + "step": 120 + }, + { + "epoch": 0.08858603066439523, + "grad_norm": 0.7703037261962891, + "learning_rate": 0.0002, + "loss": 1.9482, + "step": 130 + }, + { + "epoch": 0.09540034071550256, + "grad_norm": 0.38547563552856445, + "learning_rate": 0.0002, + "loss": 1.8394, + "step": 140 + }, + { + "epoch": 0.10221465076660988, + "grad_norm": 0.37948688864707947, + "learning_rate": 0.0002, + "loss": 1.9037, + "step": 150 + }, + { + "epoch": 0.10902896081771721, + "grad_norm": 0.33022379875183105, + "learning_rate": 0.0002, + "loss": 1.8664, + "step": 160 + }, + { + "epoch": 0.11584327086882454, + "grad_norm": 0.33703792095184326, + "learning_rate": 0.0002, + "loss": 1.8659, + "step": 170 + }, + { + "epoch": 0.12265758091993186, + "grad_norm": 0.3259912431240082, + "learning_rate": 0.0002, + "loss": 1.8389, + "step": 180 + }, + { + "epoch": 0.12947189097103917, + "grad_norm": 0.3593858778476715, + "learning_rate": 0.0002, + "loss": 1.8424, + "step": 190 + }, + { + "epoch": 0.1362862010221465, + "grad_norm": 0.3401614725589752, + "learning_rate": 0.0002, + "loss": 1.8457, + "step": 200 + }, + { + "epoch": 0.14310051107325383, + "grad_norm": 0.3892078697681427, + "learning_rate": 0.0002, + "loss": 1.7721, + "step": 210 + }, + { + "epoch": 0.14991482112436116, + "grad_norm": 0.315374493598938, + "learning_rate": 0.0002, + "loss": 1.8351, + "step": 220 + }, + { + "epoch": 0.1567291311754685, + "grad_norm": 0.3679497539997101, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 230 + }, + { + "epoch": 0.1635434412265758, + "grad_norm": 0.336730033159256, + "learning_rate": 0.0002, + "loss": 1.816, + "step": 240 + }, + { + "epoch": 0.17035775127768313, + "grad_norm": 0.36472755670547485, + "learning_rate": 0.0002, + "loss": 1.7849, + "step": 250 + }, + { + "epoch": 0.17717206132879046, + "grad_norm": 0.34864023327827454, + "learning_rate": 0.0002, + "loss": 1.7766, + "step": 260 + }, + { + "epoch": 0.1839863713798978, + "grad_norm": 0.3350819945335388, + "learning_rate": 0.0002, + "loss": 1.896, + "step": 270 + }, + { + "epoch": 0.19080068143100512, + "grad_norm": 0.3844246566295624, + "learning_rate": 0.0002, + "loss": 1.8742, + "step": 280 + }, + { + "epoch": 0.19761499148211242, + "grad_norm": 0.38413065671920776, + "learning_rate": 0.0002, + "loss": 1.8531, + "step": 290 + }, + { + "epoch": 0.20442930153321975, + "grad_norm": 0.4080047607421875, + "learning_rate": 0.0002, + "loss": 1.8415, + "step": 300 + }, + { + "epoch": 0.21124361158432708, + "grad_norm": 0.3546800911426544, + "learning_rate": 0.0002, + "loss": 1.9009, + "step": 310 + }, + { + "epoch": 0.21805792163543442, + "grad_norm": 0.38577890396118164, + "learning_rate": 0.0002, + "loss": 1.8092, + "step": 320 + }, + { + "epoch": 0.22487223168654175, + "grad_norm": 0.38979098200798035, + "learning_rate": 0.0002, + "loss": 1.7113, + "step": 330 + }, + { + "epoch": 0.23168654173764908, + "grad_norm": 0.35105520486831665, + "learning_rate": 0.0002, + "loss": 1.8162, + "step": 340 + }, + { + "epoch": 0.23850085178875638, + "grad_norm": 0.31671223044395447, + "learning_rate": 0.0002, + "loss": 1.8408, + "step": 350 + }, + { + "epoch": 0.2453151618398637, + "grad_norm": 0.33034196496009827, + "learning_rate": 0.0002, + "loss": 1.8014, + "step": 360 + }, + { + "epoch": 0.252129471890971, + "grad_norm": 0.2990533709526062, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 370 + }, + { + "epoch": 0.25894378194207834, + "grad_norm": 0.362208753824234, + "learning_rate": 0.0002, + "loss": 1.8612, + "step": 380 + }, + { + "epoch": 0.2657580919931857, + "grad_norm": 0.3269096612930298, + "learning_rate": 0.0002, + "loss": 1.8001, + "step": 390 + }, + { + "epoch": 0.272572402044293, + "grad_norm": 0.30555954575538635, + "learning_rate": 0.0002, + "loss": 1.786, + "step": 400 + }, + { + "epoch": 0.27938671209540034, + "grad_norm": 0.332933247089386, + "learning_rate": 0.0002, + "loss": 1.8018, + "step": 410 + }, + { + "epoch": 0.28620102214650767, + "grad_norm": 0.39454060792922974, + "learning_rate": 0.0002, + "loss": 1.8157, + "step": 420 + }, + { + "epoch": 0.293015332197615, + "grad_norm": 0.34589633345603943, + "learning_rate": 0.0002, + "loss": 1.7862, + "step": 430 + }, + { + "epoch": 0.29982964224872233, + "grad_norm": 0.3747332990169525, + "learning_rate": 0.0002, + "loss": 1.7612, + "step": 440 + }, + { + "epoch": 0.30664395229982966, + "grad_norm": 0.34825369715690613, + "learning_rate": 0.0002, + "loss": 1.8476, + "step": 450 + }, + { + "epoch": 0.313458262350937, + "grad_norm": 0.32906976342201233, + "learning_rate": 0.0002, + "loss": 1.775, + "step": 460 + }, + { + "epoch": 0.3202725724020443, + "grad_norm": 0.33108609914779663, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 470 + }, + { + "epoch": 0.3270868824531516, + "grad_norm": 0.3170463442802429, + "learning_rate": 0.0002, + "loss": 1.8012, + "step": 480 + }, + { + "epoch": 0.3339011925042589, + "grad_norm": 0.30792081356048584, + "learning_rate": 0.0002, + "loss": 1.8264, + "step": 490 + }, + { + "epoch": 0.34071550255536626, + "grad_norm": 0.31772997975349426, + "learning_rate": 0.0002, + "loss": 1.7361, + "step": 500 + }, + { + "epoch": 0.3475298126064736, + "grad_norm": 0.32714012265205383, + "learning_rate": 0.0002, + "loss": 1.8147, + "step": 510 + }, + { + "epoch": 0.3543441226575809, + "grad_norm": 0.3021100163459778, + "learning_rate": 0.0002, + "loss": 1.8332, + "step": 520 + }, + { + "epoch": 0.36115843270868825, + "grad_norm": 0.6045835018157959, + "learning_rate": 0.0002, + "loss": 1.7337, + "step": 530 + }, + { + "epoch": 0.3679727427597956, + "grad_norm": 0.3003896474838257, + "learning_rate": 0.0002, + "loss": 1.8661, + "step": 540 + }, + { + "epoch": 0.3747870528109029, + "grad_norm": 0.3678470551967621, + "learning_rate": 0.0002, + "loss": 1.8359, + "step": 550 + }, + { + "epoch": 0.38160136286201024, + "grad_norm": 0.35787615180015564, + "learning_rate": 0.0002, + "loss": 1.7512, + "step": 560 + }, + { + "epoch": 0.38841567291311757, + "grad_norm": 0.31882143020629883, + "learning_rate": 0.0002, + "loss": 1.8048, + "step": 570 + }, + { + "epoch": 0.39522998296422485, + "grad_norm": 0.3186313509941101, + "learning_rate": 0.0002, + "loss": 1.8108, + "step": 580 + }, + { + "epoch": 0.4020442930153322, + "grad_norm": 0.41443702578544617, + "learning_rate": 0.0002, + "loss": 1.8012, + "step": 590 + }, + { + "epoch": 0.4088586030664395, + "grad_norm": 0.28773069381713867, + "learning_rate": 0.0002, + "loss": 1.7794, + "step": 600 + }, + { + "epoch": 0.41567291311754684, + "grad_norm": 0.35743263363838196, + "learning_rate": 0.0002, + "loss": 1.8231, + "step": 610 + }, + { + "epoch": 0.42248722316865417, + "grad_norm": 0.3360286355018616, + "learning_rate": 0.0002, + "loss": 1.7531, + "step": 620 + }, + { + "epoch": 0.4293015332197615, + "grad_norm": 0.32838866114616394, + "learning_rate": 0.0002, + "loss": 1.7654, + "step": 630 + }, + { + "epoch": 0.43611584327086883, + "grad_norm": 0.2994388937950134, + "learning_rate": 0.0002, + "loss": 1.8176, + "step": 640 + }, + { + "epoch": 0.44293015332197616, + "grad_norm": 0.3306307792663574, + "learning_rate": 0.0002, + "loss": 1.7241, + "step": 650 + }, + { + "epoch": 0.4497444633730835, + "grad_norm": 0.3129560351371765, + "learning_rate": 0.0002, + "loss": 1.8201, + "step": 660 + }, + { + "epoch": 0.4565587734241908, + "grad_norm": 0.3244289457798004, + "learning_rate": 0.0002, + "loss": 1.803, + "step": 670 + }, + { + "epoch": 0.46337308347529815, + "grad_norm": 0.3196892738342285, + "learning_rate": 0.0002, + "loss": 1.8399, + "step": 680 + }, + { + "epoch": 0.47018739352640543, + "grad_norm": 0.3135230243206024, + "learning_rate": 0.0002, + "loss": 1.8291, + "step": 690 + }, + { + "epoch": 0.47700170357751276, + "grad_norm": 0.28677991032600403, + "learning_rate": 0.0002, + "loss": 1.7423, + "step": 700 + }, + { + "epoch": 0.4838160136286201, + "grad_norm": 0.3074065148830414, + "learning_rate": 0.0002, + "loss": 1.6982, + "step": 710 + }, + { + "epoch": 0.4906303236797274, + "grad_norm": 0.3354290723800659, + "learning_rate": 0.0002, + "loss": 1.8078, + "step": 720 + }, + { + "epoch": 0.49744463373083475, + "grad_norm": 0.324370801448822, + "learning_rate": 0.0002, + "loss": 1.7333, + "step": 730 + }, + { + "epoch": 0.504258943781942, + "grad_norm": 0.29496142268180847, + "learning_rate": 0.0002, + "loss": 1.8506, + "step": 740 + }, + { + "epoch": 0.5110732538330494, + "grad_norm": 0.30694130063056946, + "learning_rate": 0.0002, + "loss": 1.7761, + "step": 750 + }, + { + "epoch": 0.5178875638841567, + "grad_norm": 0.36168408393859863, + "learning_rate": 0.0002, + "loss": 1.7854, + "step": 760 + }, + { + "epoch": 0.524701873935264, + "grad_norm": 0.2930343449115753, + "learning_rate": 0.0002, + "loss": 1.7149, + "step": 770 + }, + { + "epoch": 0.5315161839863713, + "grad_norm": 0.3023432493209839, + "learning_rate": 0.0002, + "loss": 1.7924, + "step": 780 + }, + { + "epoch": 0.5383304940374787, + "grad_norm": 0.3272720277309418, + "learning_rate": 0.0002, + "loss": 1.8467, + "step": 790 + }, + { + "epoch": 0.545144804088586, + "grad_norm": 0.2907974421977997, + "learning_rate": 0.0002, + "loss": 1.8639, + "step": 800 + }, + { + "epoch": 0.5519591141396933, + "grad_norm": 0.32267168164253235, + "learning_rate": 0.0002, + "loss": 1.9018, + "step": 810 + }, + { + "epoch": 0.5587734241908007, + "grad_norm": 0.32059940695762634, + "learning_rate": 0.0002, + "loss": 1.8311, + "step": 820 + }, + { + "epoch": 0.565587734241908, + "grad_norm": 0.30951258540153503, + "learning_rate": 0.0002, + "loss": 1.7234, + "step": 830 + }, + { + "epoch": 0.5724020442930153, + "grad_norm": 0.33976122736930847, + "learning_rate": 0.0002, + "loss": 1.8063, + "step": 840 + }, + { + "epoch": 0.5792163543441227, + "grad_norm": 0.3195820450782776, + "learning_rate": 0.0002, + "loss": 1.7021, + "step": 850 + }, + { + "epoch": 0.58603066439523, + "grad_norm": 0.2828562557697296, + "learning_rate": 0.0002, + "loss": 1.8073, + "step": 860 + }, + { + "epoch": 0.5928449744463373, + "grad_norm": 0.29591670632362366, + "learning_rate": 0.0002, + "loss": 1.695, + "step": 870 + }, + { + "epoch": 0.5996592844974447, + "grad_norm": 0.3086104393005371, + "learning_rate": 0.0002, + "loss": 1.8109, + "step": 880 + }, + { + "epoch": 0.606473594548552, + "grad_norm": 0.3592929542064667, + "learning_rate": 0.0002, + "loss": 1.8207, + "step": 890 + }, + { + "epoch": 0.6132879045996593, + "grad_norm": 0.2830186188220978, + "learning_rate": 0.0002, + "loss": 1.7662, + "step": 900 + }, + { + "epoch": 0.6201022146507666, + "grad_norm": 0.3128598630428314, + "learning_rate": 0.0002, + "loss": 1.8344, + "step": 910 + }, + { + "epoch": 0.626916524701874, + "grad_norm": 0.31957563757896423, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 920 + }, + { + "epoch": 0.6337308347529813, + "grad_norm": 0.30994319915771484, + "learning_rate": 0.0002, + "loss": 1.7578, + "step": 930 + }, + { + "epoch": 0.6405451448040886, + "grad_norm": 0.3352845013141632, + "learning_rate": 0.0002, + "loss": 1.8494, + "step": 940 + }, + { + "epoch": 0.6473594548551959, + "grad_norm": 0.2960077226161957, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 950 + }, + { + "epoch": 0.6541737649063032, + "grad_norm": 0.32675081491470337, + "learning_rate": 0.0002, + "loss": 1.7209, + "step": 960 + }, + { + "epoch": 0.6609880749574105, + "grad_norm": 0.30042028427124023, + "learning_rate": 0.0002, + "loss": 1.7776, + "step": 970 + }, + { + "epoch": 0.6678023850085179, + "grad_norm": 0.3288673758506775, + "learning_rate": 0.0002, + "loss": 1.7597, + "step": 980 + }, + { + "epoch": 0.6746166950596252, + "grad_norm": 0.323215126991272, + "learning_rate": 0.0002, + "loss": 1.7962, + "step": 990 + }, + { + "epoch": 0.6814310051107325, + "grad_norm": 0.30041399598121643, + "learning_rate": 0.0002, + "loss": 1.821, + "step": 1000 + }, + { + "epoch": 0.6882453151618398, + "grad_norm": 0.3076179623603821, + "learning_rate": 0.0002, + "loss": 1.7716, + "step": 1010 + }, + { + "epoch": 0.6950596252129472, + "grad_norm": 0.2941909730434418, + "learning_rate": 0.0002, + "loss": 1.7827, + "step": 1020 + }, + { + "epoch": 0.7018739352640545, + "grad_norm": 0.32220420241355896, + "learning_rate": 0.0002, + "loss": 1.7964, + "step": 1030 + }, + { + "epoch": 0.7086882453151618, + "grad_norm": 0.2989702820777893, + "learning_rate": 0.0002, + "loss": 1.7101, + "step": 1040 + }, + { + "epoch": 0.7155025553662692, + "grad_norm": 0.3049640357494354, + "learning_rate": 0.0002, + "loss": 1.8749, + "step": 1050 + }, + { + "epoch": 0.7223168654173765, + "grad_norm": 0.3183284103870392, + "learning_rate": 0.0002, + "loss": 1.782, + "step": 1060 + }, + { + "epoch": 0.7291311754684838, + "grad_norm": 0.3070095181465149, + "learning_rate": 0.0002, + "loss": 1.785, + "step": 1070 + }, + { + "epoch": 0.7359454855195912, + "grad_norm": 0.33263063430786133, + "learning_rate": 0.0002, + "loss": 1.7832, + "step": 1080 + }, + { + "epoch": 0.7427597955706985, + "grad_norm": 0.28774312138557434, + "learning_rate": 0.0002, + "loss": 1.7627, + "step": 1090 + }, + { + "epoch": 0.7495741056218058, + "grad_norm": 0.29304224252700806, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1100 + }, + { + "epoch": 0.7563884156729132, + "grad_norm": 0.27529507875442505, + "learning_rate": 0.0002, + "loss": 1.8082, + "step": 1110 + }, + { + "epoch": 0.7632027257240205, + "grad_norm": 0.32319945096969604, + "learning_rate": 0.0002, + "loss": 1.8071, + "step": 1120 + }, + { + "epoch": 0.7700170357751278, + "grad_norm": 0.33777597546577454, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 1130 + }, + { + "epoch": 0.7768313458262351, + "grad_norm": 0.29847201704978943, + "learning_rate": 0.0002, + "loss": 1.8488, + "step": 1140 + }, + { + "epoch": 0.7836456558773425, + "grad_norm": 0.31370633840560913, + "learning_rate": 0.0002, + "loss": 1.7014, + "step": 1150 + }, + { + "epoch": 0.7904599659284497, + "grad_norm": 0.31633856892585754, + "learning_rate": 0.0002, + "loss": 1.8175, + "step": 1160 + }, + { + "epoch": 0.797274275979557, + "grad_norm": 0.33849263191223145, + "learning_rate": 0.0002, + "loss": 1.7637, + "step": 1170 + }, + { + "epoch": 0.8040885860306644, + "grad_norm": 0.3306333124637604, + "learning_rate": 0.0002, + "loss": 1.8264, + "step": 1180 + }, + { + "epoch": 0.8109028960817717, + "grad_norm": 0.3249678313732147, + "learning_rate": 0.0002, + "loss": 1.777, + "step": 1190 + }, + { + "epoch": 0.817717206132879, + "grad_norm": 0.3252817690372467, + "learning_rate": 0.0002, + "loss": 1.7631, + "step": 1200 + }, + { + "epoch": 0.8245315161839863, + "grad_norm": 0.31772559881210327, + "learning_rate": 0.0002, + "loss": 1.7877, + "step": 1210 + }, + { + "epoch": 0.8313458262350937, + "grad_norm": 0.2803371846675873, + "learning_rate": 0.0002, + "loss": 1.8068, + "step": 1220 + }, + { + "epoch": 0.838160136286201, + "grad_norm": 0.26700571179389954, + "learning_rate": 0.0002, + "loss": 1.738, + "step": 1230 + }, + { + "epoch": 0.8449744463373083, + "grad_norm": 0.3060092031955719, + "learning_rate": 0.0002, + "loss": 1.8301, + "step": 1240 + }, + { + "epoch": 0.8517887563884157, + "grad_norm": 0.28831684589385986, + "learning_rate": 0.0002, + "loss": 1.7993, + "step": 1250 + }, + { + "epoch": 0.858603066439523, + "grad_norm": 0.30708742141723633, + "learning_rate": 0.0002, + "loss": 1.6909, + "step": 1260 + }, + { + "epoch": 0.8654173764906303, + "grad_norm": 0.2915987968444824, + "learning_rate": 0.0002, + "loss": 1.8506, + "step": 1270 + }, + { + "epoch": 0.8722316865417377, + "grad_norm": 0.2893589735031128, + "learning_rate": 0.0002, + "loss": 1.7536, + "step": 1280 + }, + { + "epoch": 0.879045996592845, + "grad_norm": 0.29545632004737854, + "learning_rate": 0.0002, + "loss": 1.7437, + "step": 1290 + }, + { + "epoch": 0.8858603066439523, + "grad_norm": 0.3354771137237549, + "learning_rate": 0.0002, + "loss": 1.859, + "step": 1300 + }, + { + "epoch": 0.8926746166950597, + "grad_norm": 0.37715399265289307, + "learning_rate": 0.0002, + "loss": 1.7644, + "step": 1310 + }, + { + "epoch": 0.899488926746167, + "grad_norm": 0.28847193717956543, + "learning_rate": 0.0002, + "loss": 1.7731, + "step": 1320 + }, + { + "epoch": 0.9063032367972743, + "grad_norm": 0.2780889868736267, + "learning_rate": 0.0002, + "loss": 1.9077, + "step": 1330 + }, + { + "epoch": 0.9131175468483816, + "grad_norm": 0.2714342176914215, + "learning_rate": 0.0002, + "loss": 1.7861, + "step": 1340 + }, + { + "epoch": 0.919931856899489, + "grad_norm": 0.2950133979320526, + "learning_rate": 0.0002, + "loss": 1.8, + "step": 1350 + }, + { + "epoch": 0.9267461669505963, + "grad_norm": 0.29097145795822144, + "learning_rate": 0.0002, + "loss": 1.7368, + "step": 1360 + }, + { + "epoch": 0.9335604770017035, + "grad_norm": 0.32540133595466614, + "learning_rate": 0.0002, + "loss": 1.7864, + "step": 1370 + }, + { + "epoch": 0.9403747870528109, + "grad_norm": 0.3076636493206024, + "learning_rate": 0.0002, + "loss": 1.7571, + "step": 1380 + }, + { + "epoch": 0.9471890971039182, + "grad_norm": 0.2962130308151245, + "learning_rate": 0.0002, + "loss": 1.8312, + "step": 1390 + }, + { + "epoch": 0.9540034071550255, + "grad_norm": 0.30086860060691833, + "learning_rate": 0.0002, + "loss": 1.7858, + "step": 1400 + }, + { + "epoch": 0.9608177172061328, + "grad_norm": 0.28634947538375854, + "learning_rate": 0.0002, + "loss": 1.7991, + "step": 1410 + }, + { + "epoch": 0.9676320272572402, + "grad_norm": 0.35314416885375977, + "learning_rate": 0.0002, + "loss": 1.8385, + "step": 1420 + }, + { + "epoch": 0.9744463373083475, + "grad_norm": 0.2939317524433136, + "learning_rate": 0.0002, + "loss": 1.8054, + "step": 1430 + }, + { + "epoch": 0.9812606473594548, + "grad_norm": 0.3010196089744568, + "learning_rate": 0.0002, + "loss": 1.7582, + "step": 1440 + }, + { + "epoch": 0.9880749574105622, + "grad_norm": 0.30816152691841125, + "learning_rate": 0.0002, + "loss": 1.758, + "step": 1450 + }, + { + "epoch": 0.9948892674616695, + "grad_norm": 0.28152793645858765, + "learning_rate": 0.0002, + "loss": 1.8159, + "step": 1460 + }, + { + "epoch": 0.9996592844974447, + "eval_loss": 1.8034634590148926, + "eval_runtime": 53.6017, + "eval_samples_per_second": 9.459, + "eval_steps_per_second": 1.194, + "step": 1467 + } + ], + "logging_steps": 10, + "max_steps": 11736, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.53762967093248e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2f734531f14829705714a5a2703d90c81f36eeb0 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2af5d486d0b370637d5a3999ab7668f94f5d53b6497f018967564aef5e8ad133 +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-2935/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-2935/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-2935/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-2935/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-2935/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-2935/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-2935/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-2935/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f5cd891c77ef4929dd2579d4979e0ce11e6b4d43 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-2935/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4002025d9819b79fafc141e1f79e8271951a1380f1ebbe316b20e549bfddeab +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-2935/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-2935/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..71395588050c1c41719fb770e00233bce63fb9de --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-2935/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:226608c72e679dbbbfe91143a20a6b8f879a440121e912ed60f52738083a57f4 +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-2935/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-2935/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..57b7f803e59eb2150d8f69d32609012dbad146d2 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-2935/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c40525bbf82d41f93f6b771ab2a1dd632b97bf7bd07fc3d065d38513ede01b5b +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-2935/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-2935/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..7a0f10bf84b2bd756ada08f6f2ab0ef3c9c831f8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-2935/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db1f83854ebaf959f41da4dd6f5de5ea276fd94830e03c1fc2b280600c84f67d +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-2935/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-2935/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-2935/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-2935/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-2935/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-2935/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-2935/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-2935/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-2935/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-2935/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-2935/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-2935/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-2935/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-2935/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f7618656aacc09ec3e1237c72ee383fbf324fb29 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-2935/trainer_state.json @@ -0,0 +1,2100 @@ +{ + "best_metric": 1.8034634590148926, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467", + "epoch": 2.0, + "eval_steps": 10, + "global_step": 2935, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0068143100511073255, + "grad_norm": 0.635574460029602, + "learning_rate": 0.0002, + "loss": 3.0988, + "step": 10 + }, + { + "epoch": 0.013628620102214651, + "grad_norm": 1.0401769876480103, + "learning_rate": 0.0002, + "loss": 2.5594, + "step": 20 + }, + { + "epoch": 0.020442930153321975, + "grad_norm": 0.4514131247997284, + "learning_rate": 0.0002, + "loss": 2.3587, + "step": 30 + }, + { + "epoch": 0.027257240204429302, + "grad_norm": 0.6303355693817139, + "learning_rate": 0.0002, + "loss": 2.121, + "step": 40 + }, + { + "epoch": 0.034071550255536626, + "grad_norm": 0.4648270606994629, + "learning_rate": 0.0002, + "loss": 1.9833, + "step": 50 + }, + { + "epoch": 0.04088586030664395, + "grad_norm": 0.42953479290008545, + "learning_rate": 0.0002, + "loss": 1.9384, + "step": 60 + }, + { + "epoch": 0.04770017035775128, + "grad_norm": 0.433614581823349, + "learning_rate": 0.0002, + "loss": 1.9202, + "step": 70 + }, + { + "epoch": 0.054514480408858604, + "grad_norm": 0.45318254828453064, + "learning_rate": 0.0002, + "loss": 1.911, + "step": 80 + }, + { + "epoch": 0.06132879045996593, + "grad_norm": 0.4023568630218506, + "learning_rate": 0.0002, + "loss": 1.8895, + "step": 90 + }, + { + "epoch": 0.06814310051107325, + "grad_norm": 0.43260207772254944, + "learning_rate": 0.0002, + "loss": 1.9257, + "step": 100 + }, + { + "epoch": 0.07495741056218058, + "grad_norm": 0.43389809131622314, + "learning_rate": 0.0002, + "loss": 1.9178, + "step": 110 + }, + { + "epoch": 0.0817717206132879, + "grad_norm": 0.39307987689971924, + "learning_rate": 0.0002, + "loss": 1.9071, + "step": 120 + }, + { + "epoch": 0.08858603066439523, + "grad_norm": 0.7703037261962891, + "learning_rate": 0.0002, + "loss": 1.9482, + "step": 130 + }, + { + "epoch": 0.09540034071550256, + "grad_norm": 0.38547563552856445, + "learning_rate": 0.0002, + "loss": 1.8394, + "step": 140 + }, + { + "epoch": 0.10221465076660988, + "grad_norm": 0.37948688864707947, + "learning_rate": 0.0002, + "loss": 1.9037, + "step": 150 + }, + { + "epoch": 0.10902896081771721, + "grad_norm": 0.33022379875183105, + "learning_rate": 0.0002, + "loss": 1.8664, + "step": 160 + }, + { + "epoch": 0.11584327086882454, + "grad_norm": 0.33703792095184326, + "learning_rate": 0.0002, + "loss": 1.8659, + "step": 170 + }, + { + "epoch": 0.12265758091993186, + "grad_norm": 0.3259912431240082, + "learning_rate": 0.0002, + "loss": 1.8389, + "step": 180 + }, + { + "epoch": 0.12947189097103917, + "grad_norm": 0.3593858778476715, + "learning_rate": 0.0002, + "loss": 1.8424, + "step": 190 + }, + { + "epoch": 0.1362862010221465, + "grad_norm": 0.3401614725589752, + "learning_rate": 0.0002, + "loss": 1.8457, + "step": 200 + }, + { + "epoch": 0.14310051107325383, + "grad_norm": 0.3892078697681427, + "learning_rate": 0.0002, + "loss": 1.7721, + "step": 210 + }, + { + "epoch": 0.14991482112436116, + "grad_norm": 0.315374493598938, + "learning_rate": 0.0002, + "loss": 1.8351, + "step": 220 + }, + { + "epoch": 0.1567291311754685, + "grad_norm": 0.3679497539997101, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 230 + }, + { + "epoch": 0.1635434412265758, + "grad_norm": 0.336730033159256, + "learning_rate": 0.0002, + "loss": 1.816, + "step": 240 + }, + { + "epoch": 0.17035775127768313, + "grad_norm": 0.36472755670547485, + "learning_rate": 0.0002, + "loss": 1.7849, + "step": 250 + }, + { + "epoch": 0.17717206132879046, + "grad_norm": 0.34864023327827454, + "learning_rate": 0.0002, + "loss": 1.7766, + "step": 260 + }, + { + "epoch": 0.1839863713798978, + "grad_norm": 0.3350819945335388, + "learning_rate": 0.0002, + "loss": 1.896, + "step": 270 + }, + { + "epoch": 0.19080068143100512, + "grad_norm": 0.3844246566295624, + "learning_rate": 0.0002, + "loss": 1.8742, + "step": 280 + }, + { + "epoch": 0.19761499148211242, + "grad_norm": 0.38413065671920776, + "learning_rate": 0.0002, + "loss": 1.8531, + "step": 290 + }, + { + "epoch": 0.20442930153321975, + "grad_norm": 0.4080047607421875, + "learning_rate": 0.0002, + "loss": 1.8415, + "step": 300 + }, + { + "epoch": 0.21124361158432708, + "grad_norm": 0.3546800911426544, + "learning_rate": 0.0002, + "loss": 1.9009, + "step": 310 + }, + { + "epoch": 0.21805792163543442, + "grad_norm": 0.38577890396118164, + "learning_rate": 0.0002, + "loss": 1.8092, + "step": 320 + }, + { + "epoch": 0.22487223168654175, + "grad_norm": 0.38979098200798035, + "learning_rate": 0.0002, + "loss": 1.7113, + "step": 330 + }, + { + "epoch": 0.23168654173764908, + "grad_norm": 0.35105520486831665, + "learning_rate": 0.0002, + "loss": 1.8162, + "step": 340 + }, + { + "epoch": 0.23850085178875638, + "grad_norm": 0.31671223044395447, + "learning_rate": 0.0002, + "loss": 1.8408, + "step": 350 + }, + { + "epoch": 0.2453151618398637, + "grad_norm": 0.33034196496009827, + "learning_rate": 0.0002, + "loss": 1.8014, + "step": 360 + }, + { + "epoch": 0.252129471890971, + "grad_norm": 0.2990533709526062, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 370 + }, + { + "epoch": 0.25894378194207834, + "grad_norm": 0.362208753824234, + "learning_rate": 0.0002, + "loss": 1.8612, + "step": 380 + }, + { + "epoch": 0.2657580919931857, + "grad_norm": 0.3269096612930298, + "learning_rate": 0.0002, + "loss": 1.8001, + "step": 390 + }, + { + "epoch": 0.272572402044293, + "grad_norm": 0.30555954575538635, + "learning_rate": 0.0002, + "loss": 1.786, + "step": 400 + }, + { + "epoch": 0.27938671209540034, + "grad_norm": 0.332933247089386, + "learning_rate": 0.0002, + "loss": 1.8018, + "step": 410 + }, + { + "epoch": 0.28620102214650767, + "grad_norm": 0.39454060792922974, + "learning_rate": 0.0002, + "loss": 1.8157, + "step": 420 + }, + { + "epoch": 0.293015332197615, + "grad_norm": 0.34589633345603943, + "learning_rate": 0.0002, + "loss": 1.7862, + "step": 430 + }, + { + "epoch": 0.29982964224872233, + "grad_norm": 0.3747332990169525, + "learning_rate": 0.0002, + "loss": 1.7612, + "step": 440 + }, + { + "epoch": 0.30664395229982966, + "grad_norm": 0.34825369715690613, + "learning_rate": 0.0002, + "loss": 1.8476, + "step": 450 + }, + { + "epoch": 0.313458262350937, + "grad_norm": 0.32906976342201233, + "learning_rate": 0.0002, + "loss": 1.775, + "step": 460 + }, + { + "epoch": 0.3202725724020443, + "grad_norm": 0.33108609914779663, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 470 + }, + { + "epoch": 0.3270868824531516, + "grad_norm": 0.3170463442802429, + "learning_rate": 0.0002, + "loss": 1.8012, + "step": 480 + }, + { + "epoch": 0.3339011925042589, + "grad_norm": 0.30792081356048584, + "learning_rate": 0.0002, + "loss": 1.8264, + "step": 490 + }, + { + "epoch": 0.34071550255536626, + "grad_norm": 0.31772997975349426, + "learning_rate": 0.0002, + "loss": 1.7361, + "step": 500 + }, + { + "epoch": 0.3475298126064736, + "grad_norm": 0.32714012265205383, + "learning_rate": 0.0002, + "loss": 1.8147, + "step": 510 + }, + { + "epoch": 0.3543441226575809, + "grad_norm": 0.3021100163459778, + "learning_rate": 0.0002, + "loss": 1.8332, + "step": 520 + }, + { + "epoch": 0.36115843270868825, + "grad_norm": 0.6045835018157959, + "learning_rate": 0.0002, + "loss": 1.7337, + "step": 530 + }, + { + "epoch": 0.3679727427597956, + "grad_norm": 0.3003896474838257, + "learning_rate": 0.0002, + "loss": 1.8661, + "step": 540 + }, + { + "epoch": 0.3747870528109029, + "grad_norm": 0.3678470551967621, + "learning_rate": 0.0002, + "loss": 1.8359, + "step": 550 + }, + { + "epoch": 0.38160136286201024, + "grad_norm": 0.35787615180015564, + "learning_rate": 0.0002, + "loss": 1.7512, + "step": 560 + }, + { + "epoch": 0.38841567291311757, + "grad_norm": 0.31882143020629883, + "learning_rate": 0.0002, + "loss": 1.8048, + "step": 570 + }, + { + "epoch": 0.39522998296422485, + "grad_norm": 0.3186313509941101, + "learning_rate": 0.0002, + "loss": 1.8108, + "step": 580 + }, + { + "epoch": 0.4020442930153322, + "grad_norm": 0.41443702578544617, + "learning_rate": 0.0002, + "loss": 1.8012, + "step": 590 + }, + { + "epoch": 0.4088586030664395, + "grad_norm": 0.28773069381713867, + "learning_rate": 0.0002, + "loss": 1.7794, + "step": 600 + }, + { + "epoch": 0.41567291311754684, + "grad_norm": 0.35743263363838196, + "learning_rate": 0.0002, + "loss": 1.8231, + "step": 610 + }, + { + "epoch": 0.42248722316865417, + "grad_norm": 0.3360286355018616, + "learning_rate": 0.0002, + "loss": 1.7531, + "step": 620 + }, + { + "epoch": 0.4293015332197615, + "grad_norm": 0.32838866114616394, + "learning_rate": 0.0002, + "loss": 1.7654, + "step": 630 + }, + { + "epoch": 0.43611584327086883, + "grad_norm": 0.2994388937950134, + "learning_rate": 0.0002, + "loss": 1.8176, + "step": 640 + }, + { + "epoch": 0.44293015332197616, + "grad_norm": 0.3306307792663574, + "learning_rate": 0.0002, + "loss": 1.7241, + "step": 650 + }, + { + "epoch": 0.4497444633730835, + "grad_norm": 0.3129560351371765, + "learning_rate": 0.0002, + "loss": 1.8201, + "step": 660 + }, + { + "epoch": 0.4565587734241908, + "grad_norm": 0.3244289457798004, + "learning_rate": 0.0002, + "loss": 1.803, + "step": 670 + }, + { + "epoch": 0.46337308347529815, + "grad_norm": 0.3196892738342285, + "learning_rate": 0.0002, + "loss": 1.8399, + "step": 680 + }, + { + "epoch": 0.47018739352640543, + "grad_norm": 0.3135230243206024, + "learning_rate": 0.0002, + "loss": 1.8291, + "step": 690 + }, + { + "epoch": 0.47700170357751276, + "grad_norm": 0.28677991032600403, + "learning_rate": 0.0002, + "loss": 1.7423, + "step": 700 + }, + { + "epoch": 0.4838160136286201, + "grad_norm": 0.3074065148830414, + "learning_rate": 0.0002, + "loss": 1.6982, + "step": 710 + }, + { + "epoch": 0.4906303236797274, + "grad_norm": 0.3354290723800659, + "learning_rate": 0.0002, + "loss": 1.8078, + "step": 720 + }, + { + "epoch": 0.49744463373083475, + "grad_norm": 0.324370801448822, + "learning_rate": 0.0002, + "loss": 1.7333, + "step": 730 + }, + { + "epoch": 0.504258943781942, + "grad_norm": 0.29496142268180847, + "learning_rate": 0.0002, + "loss": 1.8506, + "step": 740 + }, + { + "epoch": 0.5110732538330494, + "grad_norm": 0.30694130063056946, + "learning_rate": 0.0002, + "loss": 1.7761, + "step": 750 + }, + { + "epoch": 0.5178875638841567, + "grad_norm": 0.36168408393859863, + "learning_rate": 0.0002, + "loss": 1.7854, + "step": 760 + }, + { + "epoch": 0.524701873935264, + "grad_norm": 0.2930343449115753, + "learning_rate": 0.0002, + "loss": 1.7149, + "step": 770 + }, + { + "epoch": 0.5315161839863713, + "grad_norm": 0.3023432493209839, + "learning_rate": 0.0002, + "loss": 1.7924, + "step": 780 + }, + { + "epoch": 0.5383304940374787, + "grad_norm": 0.3272720277309418, + "learning_rate": 0.0002, + "loss": 1.8467, + "step": 790 + }, + { + "epoch": 0.545144804088586, + "grad_norm": 0.2907974421977997, + "learning_rate": 0.0002, + "loss": 1.8639, + "step": 800 + }, + { + "epoch": 0.5519591141396933, + "grad_norm": 0.32267168164253235, + "learning_rate": 0.0002, + "loss": 1.9018, + "step": 810 + }, + { + "epoch": 0.5587734241908007, + "grad_norm": 0.32059940695762634, + "learning_rate": 0.0002, + "loss": 1.8311, + "step": 820 + }, + { + "epoch": 0.565587734241908, + "grad_norm": 0.30951258540153503, + "learning_rate": 0.0002, + "loss": 1.7234, + "step": 830 + }, + { + "epoch": 0.5724020442930153, + "grad_norm": 0.33976122736930847, + "learning_rate": 0.0002, + "loss": 1.8063, + "step": 840 + }, + { + "epoch": 0.5792163543441227, + "grad_norm": 0.3195820450782776, + "learning_rate": 0.0002, + "loss": 1.7021, + "step": 850 + }, + { + "epoch": 0.58603066439523, + "grad_norm": 0.2828562557697296, + "learning_rate": 0.0002, + "loss": 1.8073, + "step": 860 + }, + { + "epoch": 0.5928449744463373, + "grad_norm": 0.29591670632362366, + "learning_rate": 0.0002, + "loss": 1.695, + "step": 870 + }, + { + "epoch": 0.5996592844974447, + "grad_norm": 0.3086104393005371, + "learning_rate": 0.0002, + "loss": 1.8109, + "step": 880 + }, + { + "epoch": 0.606473594548552, + "grad_norm": 0.3592929542064667, + "learning_rate": 0.0002, + "loss": 1.8207, + "step": 890 + }, + { + "epoch": 0.6132879045996593, + "grad_norm": 0.2830186188220978, + "learning_rate": 0.0002, + "loss": 1.7662, + "step": 900 + }, + { + "epoch": 0.6201022146507666, + "grad_norm": 0.3128598630428314, + "learning_rate": 0.0002, + "loss": 1.8344, + "step": 910 + }, + { + "epoch": 0.626916524701874, + "grad_norm": 0.31957563757896423, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 920 + }, + { + "epoch": 0.6337308347529813, + "grad_norm": 0.30994319915771484, + "learning_rate": 0.0002, + "loss": 1.7578, + "step": 930 + }, + { + "epoch": 0.6405451448040886, + "grad_norm": 0.3352845013141632, + "learning_rate": 0.0002, + "loss": 1.8494, + "step": 940 + }, + { + "epoch": 0.6473594548551959, + "grad_norm": 0.2960077226161957, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 950 + }, + { + "epoch": 0.6541737649063032, + "grad_norm": 0.32675081491470337, + "learning_rate": 0.0002, + "loss": 1.7209, + "step": 960 + }, + { + "epoch": 0.6609880749574105, + "grad_norm": 0.30042028427124023, + "learning_rate": 0.0002, + "loss": 1.7776, + "step": 970 + }, + { + "epoch": 0.6678023850085179, + "grad_norm": 0.3288673758506775, + "learning_rate": 0.0002, + "loss": 1.7597, + "step": 980 + }, + { + "epoch": 0.6746166950596252, + "grad_norm": 0.323215126991272, + "learning_rate": 0.0002, + "loss": 1.7962, + "step": 990 + }, + { + "epoch": 0.6814310051107325, + "grad_norm": 0.30041399598121643, + "learning_rate": 0.0002, + "loss": 1.821, + "step": 1000 + }, + { + "epoch": 0.6882453151618398, + "grad_norm": 0.3076179623603821, + "learning_rate": 0.0002, + "loss": 1.7716, + "step": 1010 + }, + { + "epoch": 0.6950596252129472, + "grad_norm": 0.2941909730434418, + "learning_rate": 0.0002, + "loss": 1.7827, + "step": 1020 + }, + { + "epoch": 0.7018739352640545, + "grad_norm": 0.32220420241355896, + "learning_rate": 0.0002, + "loss": 1.7964, + "step": 1030 + }, + { + "epoch": 0.7086882453151618, + "grad_norm": 0.2989702820777893, + "learning_rate": 0.0002, + "loss": 1.7101, + "step": 1040 + }, + { + "epoch": 0.7155025553662692, + "grad_norm": 0.3049640357494354, + "learning_rate": 0.0002, + "loss": 1.8749, + "step": 1050 + }, + { + "epoch": 0.7223168654173765, + "grad_norm": 0.3183284103870392, + "learning_rate": 0.0002, + "loss": 1.782, + "step": 1060 + }, + { + "epoch": 0.7291311754684838, + "grad_norm": 0.3070095181465149, + "learning_rate": 0.0002, + "loss": 1.785, + "step": 1070 + }, + { + "epoch": 0.7359454855195912, + "grad_norm": 0.33263063430786133, + "learning_rate": 0.0002, + "loss": 1.7832, + "step": 1080 + }, + { + "epoch": 0.7427597955706985, + "grad_norm": 0.28774312138557434, + "learning_rate": 0.0002, + "loss": 1.7627, + "step": 1090 + }, + { + "epoch": 0.7495741056218058, + "grad_norm": 0.29304224252700806, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1100 + }, + { + "epoch": 0.7563884156729132, + "grad_norm": 0.27529507875442505, + "learning_rate": 0.0002, + "loss": 1.8082, + "step": 1110 + }, + { + "epoch": 0.7632027257240205, + "grad_norm": 0.32319945096969604, + "learning_rate": 0.0002, + "loss": 1.8071, + "step": 1120 + }, + { + "epoch": 0.7700170357751278, + "grad_norm": 0.33777597546577454, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 1130 + }, + { + "epoch": 0.7768313458262351, + "grad_norm": 0.29847201704978943, + "learning_rate": 0.0002, + "loss": 1.8488, + "step": 1140 + }, + { + "epoch": 0.7836456558773425, + "grad_norm": 0.31370633840560913, + "learning_rate": 0.0002, + "loss": 1.7014, + "step": 1150 + }, + { + "epoch": 0.7904599659284497, + "grad_norm": 0.31633856892585754, + "learning_rate": 0.0002, + "loss": 1.8175, + "step": 1160 + }, + { + "epoch": 0.797274275979557, + "grad_norm": 0.33849263191223145, + "learning_rate": 0.0002, + "loss": 1.7637, + "step": 1170 + }, + { + "epoch": 0.8040885860306644, + "grad_norm": 0.3306333124637604, + "learning_rate": 0.0002, + "loss": 1.8264, + "step": 1180 + }, + { + "epoch": 0.8109028960817717, + "grad_norm": 0.3249678313732147, + "learning_rate": 0.0002, + "loss": 1.777, + "step": 1190 + }, + { + "epoch": 0.817717206132879, + "grad_norm": 0.3252817690372467, + "learning_rate": 0.0002, + "loss": 1.7631, + "step": 1200 + }, + { + "epoch": 0.8245315161839863, + "grad_norm": 0.31772559881210327, + "learning_rate": 0.0002, + "loss": 1.7877, + "step": 1210 + }, + { + "epoch": 0.8313458262350937, + "grad_norm": 0.2803371846675873, + "learning_rate": 0.0002, + "loss": 1.8068, + "step": 1220 + }, + { + "epoch": 0.838160136286201, + "grad_norm": 0.26700571179389954, + "learning_rate": 0.0002, + "loss": 1.738, + "step": 1230 + }, + { + "epoch": 0.8449744463373083, + "grad_norm": 0.3060092031955719, + "learning_rate": 0.0002, + "loss": 1.8301, + "step": 1240 + }, + { + "epoch": 0.8517887563884157, + "grad_norm": 0.28831684589385986, + "learning_rate": 0.0002, + "loss": 1.7993, + "step": 1250 + }, + { + "epoch": 0.858603066439523, + "grad_norm": 0.30708742141723633, + "learning_rate": 0.0002, + "loss": 1.6909, + "step": 1260 + }, + { + "epoch": 0.8654173764906303, + "grad_norm": 0.2915987968444824, + "learning_rate": 0.0002, + "loss": 1.8506, + "step": 1270 + }, + { + "epoch": 0.8722316865417377, + "grad_norm": 0.2893589735031128, + "learning_rate": 0.0002, + "loss": 1.7536, + "step": 1280 + }, + { + "epoch": 0.879045996592845, + "grad_norm": 0.29545632004737854, + "learning_rate": 0.0002, + "loss": 1.7437, + "step": 1290 + }, + { + "epoch": 0.8858603066439523, + "grad_norm": 0.3354771137237549, + "learning_rate": 0.0002, + "loss": 1.859, + "step": 1300 + }, + { + "epoch": 0.8926746166950597, + "grad_norm": 0.37715399265289307, + "learning_rate": 0.0002, + "loss": 1.7644, + "step": 1310 + }, + { + "epoch": 0.899488926746167, + "grad_norm": 0.28847193717956543, + "learning_rate": 0.0002, + "loss": 1.7731, + "step": 1320 + }, + { + "epoch": 0.9063032367972743, + "grad_norm": 0.2780889868736267, + "learning_rate": 0.0002, + "loss": 1.9077, + "step": 1330 + }, + { + "epoch": 0.9131175468483816, + "grad_norm": 0.2714342176914215, + "learning_rate": 0.0002, + "loss": 1.7861, + "step": 1340 + }, + { + "epoch": 0.919931856899489, + "grad_norm": 0.2950133979320526, + "learning_rate": 0.0002, + "loss": 1.8, + "step": 1350 + }, + { + "epoch": 0.9267461669505963, + "grad_norm": 0.29097145795822144, + "learning_rate": 0.0002, + "loss": 1.7368, + "step": 1360 + }, + { + "epoch": 0.9335604770017035, + "grad_norm": 0.32540133595466614, + "learning_rate": 0.0002, + "loss": 1.7864, + "step": 1370 + }, + { + "epoch": 0.9403747870528109, + "grad_norm": 0.3076636493206024, + "learning_rate": 0.0002, + "loss": 1.7571, + "step": 1380 + }, + { + "epoch": 0.9471890971039182, + "grad_norm": 0.2962130308151245, + "learning_rate": 0.0002, + "loss": 1.8312, + "step": 1390 + }, + { + "epoch": 0.9540034071550255, + "grad_norm": 0.30086860060691833, + "learning_rate": 0.0002, + "loss": 1.7858, + "step": 1400 + }, + { + "epoch": 0.9608177172061328, + "grad_norm": 0.28634947538375854, + "learning_rate": 0.0002, + "loss": 1.7991, + "step": 1410 + }, + { + "epoch": 0.9676320272572402, + "grad_norm": 0.35314416885375977, + "learning_rate": 0.0002, + "loss": 1.8385, + "step": 1420 + }, + { + "epoch": 0.9744463373083475, + "grad_norm": 0.2939317524433136, + "learning_rate": 0.0002, + "loss": 1.8054, + "step": 1430 + }, + { + "epoch": 0.9812606473594548, + "grad_norm": 0.3010196089744568, + "learning_rate": 0.0002, + "loss": 1.7582, + "step": 1440 + }, + { + "epoch": 0.9880749574105622, + "grad_norm": 0.30816152691841125, + "learning_rate": 0.0002, + "loss": 1.758, + "step": 1450 + }, + { + "epoch": 0.9948892674616695, + "grad_norm": 0.28152793645858765, + "learning_rate": 0.0002, + "loss": 1.8159, + "step": 1460 + }, + { + "epoch": 0.9996592844974447, + "eval_loss": 1.8034634590148926, + "eval_runtime": 53.6017, + "eval_samples_per_second": 9.459, + "eval_steps_per_second": 1.194, + "step": 1467 + }, + { + "epoch": 1.0017035775127767, + "grad_norm": 0.29246416687965393, + "learning_rate": 0.0002, + "loss": 1.7072, + "step": 1470 + }, + { + "epoch": 1.008517887563884, + "grad_norm": 0.2668602168560028, + "learning_rate": 0.0002, + "loss": 1.8159, + "step": 1480 + }, + { + "epoch": 1.0153321976149914, + "grad_norm": 0.2998567819595337, + "learning_rate": 0.0002, + "loss": 1.6868, + "step": 1490 + }, + { + "epoch": 1.0221465076660987, + "grad_norm": 0.3284934461116791, + "learning_rate": 0.0002, + "loss": 1.7331, + "step": 1500 + }, + { + "epoch": 1.028960817717206, + "grad_norm": 0.3275827169418335, + "learning_rate": 0.0002, + "loss": 1.689, + "step": 1510 + }, + { + "epoch": 1.0357751277683134, + "grad_norm": 0.3382718563079834, + "learning_rate": 0.0002, + "loss": 1.7092, + "step": 1520 + }, + { + "epoch": 1.0425894378194207, + "grad_norm": 0.36737215518951416, + "learning_rate": 0.0002, + "loss": 1.7215, + "step": 1530 + }, + { + "epoch": 1.049403747870528, + "grad_norm": 0.3442603647708893, + "learning_rate": 0.0002, + "loss": 1.6993, + "step": 1540 + }, + { + "epoch": 1.0562180579216354, + "grad_norm": 0.3323381245136261, + "learning_rate": 0.0002, + "loss": 1.6996, + "step": 1550 + }, + { + "epoch": 1.0630323679727427, + "grad_norm": 0.341227650642395, + "learning_rate": 0.0002, + "loss": 1.7444, + "step": 1560 + }, + { + "epoch": 1.06984667802385, + "grad_norm": 0.3361579477787018, + "learning_rate": 0.0002, + "loss": 1.7419, + "step": 1570 + }, + { + "epoch": 1.0766609880749574, + "grad_norm": 0.3556230962276459, + "learning_rate": 0.0002, + "loss": 1.7337, + "step": 1580 + }, + { + "epoch": 1.0834752981260647, + "grad_norm": 0.27130424976348877, + "learning_rate": 0.0002, + "loss": 1.6978, + "step": 1590 + }, + { + "epoch": 1.090289608177172, + "grad_norm": 0.29366323351860046, + "learning_rate": 0.0002, + "loss": 1.6087, + "step": 1600 + }, + { + "epoch": 1.0971039182282794, + "grad_norm": 0.3581245541572571, + "learning_rate": 0.0002, + "loss": 1.6721, + "step": 1610 + }, + { + "epoch": 1.1039182282793867, + "grad_norm": 0.3021670877933502, + "learning_rate": 0.0002, + "loss": 1.7639, + "step": 1620 + }, + { + "epoch": 1.110732538330494, + "grad_norm": 0.3145572543144226, + "learning_rate": 0.0002, + "loss": 1.7314, + "step": 1630 + }, + { + "epoch": 1.1175468483816013, + "grad_norm": 0.35362154245376587, + "learning_rate": 0.0002, + "loss": 1.7408, + "step": 1640 + }, + { + "epoch": 1.1243611584327087, + "grad_norm": 0.5413113236427307, + "learning_rate": 0.0002, + "loss": 1.7071, + "step": 1650 + }, + { + "epoch": 1.131175468483816, + "grad_norm": 0.6858654022216797, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 1660 + }, + { + "epoch": 1.1379897785349233, + "grad_norm": 0.3781903386116028, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 1670 + }, + { + "epoch": 1.1448040885860307, + "grad_norm": 0.404864102602005, + "learning_rate": 0.0002, + "loss": 1.659, + "step": 1680 + }, + { + "epoch": 1.151618398637138, + "grad_norm": 0.3595100939273834, + "learning_rate": 0.0002, + "loss": 1.7212, + "step": 1690 + }, + { + "epoch": 1.1584327086882453, + "grad_norm": 0.33682283759117126, + "learning_rate": 0.0002, + "loss": 1.7023, + "step": 1700 + }, + { + "epoch": 1.1652470187393527, + "grad_norm": 0.3877373933792114, + "learning_rate": 0.0002, + "loss": 1.7336, + "step": 1710 + }, + { + "epoch": 1.17206132879046, + "grad_norm": 0.34606897830963135, + "learning_rate": 0.0002, + "loss": 1.7676, + "step": 1720 + }, + { + "epoch": 1.1788756388415673, + "grad_norm": 0.3122918903827667, + "learning_rate": 0.0002, + "loss": 1.6889, + "step": 1730 + }, + { + "epoch": 1.1856899488926746, + "grad_norm": 0.34081900119781494, + "learning_rate": 0.0002, + "loss": 1.6585, + "step": 1740 + }, + { + "epoch": 1.192504258943782, + "grad_norm": 0.3418807089328766, + "learning_rate": 0.0002, + "loss": 1.7794, + "step": 1750 + }, + { + "epoch": 1.1993185689948893, + "grad_norm": 0.3495500981807709, + "learning_rate": 0.0002, + "loss": 1.7773, + "step": 1760 + }, + { + "epoch": 1.2061328790459966, + "grad_norm": 0.557288408279419, + "learning_rate": 0.0002, + "loss": 1.6578, + "step": 1770 + }, + { + "epoch": 1.212947189097104, + "grad_norm": 0.3193778693675995, + "learning_rate": 0.0002, + "loss": 1.7488, + "step": 1780 + }, + { + "epoch": 1.2197614991482113, + "grad_norm": 0.3306216895580292, + "learning_rate": 0.0002, + "loss": 1.6815, + "step": 1790 + }, + { + "epoch": 1.2265758091993186, + "grad_norm": 0.37998732924461365, + "learning_rate": 0.0002, + "loss": 1.6772, + "step": 1800 + }, + { + "epoch": 1.233390119250426, + "grad_norm": 0.3255669176578522, + "learning_rate": 0.0002, + "loss": 1.7094, + "step": 1810 + }, + { + "epoch": 1.2402044293015333, + "grad_norm": 0.3741287291049957, + "learning_rate": 0.0002, + "loss": 1.7096, + "step": 1820 + }, + { + "epoch": 1.2470187393526406, + "grad_norm": 0.36727291345596313, + "learning_rate": 0.0002, + "loss": 1.7317, + "step": 1830 + }, + { + "epoch": 1.253833049403748, + "grad_norm": 0.3479527235031128, + "learning_rate": 0.0002, + "loss": 1.7418, + "step": 1840 + }, + { + "epoch": 1.2606473594548553, + "grad_norm": 0.3472636640071869, + "learning_rate": 0.0002, + "loss": 1.7062, + "step": 1850 + }, + { + "epoch": 1.2674616695059626, + "grad_norm": 0.3702869415283203, + "learning_rate": 0.0002, + "loss": 1.6263, + "step": 1860 + }, + { + "epoch": 1.27427597955707, + "grad_norm": 0.3934040069580078, + "learning_rate": 0.0002, + "loss": 1.6222, + "step": 1870 + }, + { + "epoch": 1.2810902896081773, + "grad_norm": 0.46887534856796265, + "learning_rate": 0.0002, + "loss": 1.7616, + "step": 1880 + }, + { + "epoch": 1.2879045996592846, + "grad_norm": 0.3191998600959778, + "learning_rate": 0.0002, + "loss": 1.6071, + "step": 1890 + }, + { + "epoch": 1.294718909710392, + "grad_norm": 0.34032225608825684, + "learning_rate": 0.0002, + "loss": 1.7001, + "step": 1900 + }, + { + "epoch": 1.3015332197614993, + "grad_norm": 0.33453696966171265, + "learning_rate": 0.0002, + "loss": 1.8316, + "step": 1910 + }, + { + "epoch": 1.3083475298126066, + "grad_norm": 0.3451494872570038, + "learning_rate": 0.0002, + "loss": 1.6567, + "step": 1920 + }, + { + "epoch": 1.315161839863714, + "grad_norm": 0.36203092336654663, + "learning_rate": 0.0002, + "loss": 1.708, + "step": 1930 + }, + { + "epoch": 1.321976149914821, + "grad_norm": 0.43794456124305725, + "learning_rate": 0.0002, + "loss": 1.7095, + "step": 1940 + }, + { + "epoch": 1.3287904599659284, + "grad_norm": 0.3630591034889221, + "learning_rate": 0.0002, + "loss": 1.7264, + "step": 1950 + }, + { + "epoch": 1.3356047700170357, + "grad_norm": 0.36951911449432373, + "learning_rate": 0.0002, + "loss": 1.6529, + "step": 1960 + }, + { + "epoch": 1.342419080068143, + "grad_norm": 0.4001159965991974, + "learning_rate": 0.0002, + "loss": 1.651, + "step": 1970 + }, + { + "epoch": 1.3492333901192504, + "grad_norm": 0.3820836544036865, + "learning_rate": 0.0002, + "loss": 1.677, + "step": 1980 + }, + { + "epoch": 1.3560477001703577, + "grad_norm": 0.3705870807170868, + "learning_rate": 0.0002, + "loss": 1.583, + "step": 1990 + }, + { + "epoch": 1.362862010221465, + "grad_norm": 0.3557972013950348, + "learning_rate": 0.0002, + "loss": 1.6207, + "step": 2000 + }, + { + "epoch": 1.3696763202725724, + "grad_norm": 0.38546398282051086, + "learning_rate": 0.0002, + "loss": 1.7656, + "step": 2010 + }, + { + "epoch": 1.3764906303236797, + "grad_norm": 0.3908020853996277, + "learning_rate": 0.0002, + "loss": 1.6881, + "step": 2020 + }, + { + "epoch": 1.383304940374787, + "grad_norm": 0.3822040855884552, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 2030 + }, + { + "epoch": 1.3901192504258943, + "grad_norm": 0.425327867269516, + "learning_rate": 0.0002, + "loss": 1.7923, + "step": 2040 + }, + { + "epoch": 1.3969335604770017, + "grad_norm": 0.3436259329319, + "learning_rate": 0.0002, + "loss": 1.7032, + "step": 2050 + }, + { + "epoch": 1.403747870528109, + "grad_norm": 0.33124062418937683, + "learning_rate": 0.0002, + "loss": 1.6398, + "step": 2060 + }, + { + "epoch": 1.4105621805792163, + "grad_norm": 0.3662424683570862, + "learning_rate": 0.0002, + "loss": 1.6815, + "step": 2070 + }, + { + "epoch": 1.4173764906303237, + "grad_norm": 0.3720635175704956, + "learning_rate": 0.0002, + "loss": 1.7258, + "step": 2080 + }, + { + "epoch": 1.424190800681431, + "grad_norm": 0.3361680805683136, + "learning_rate": 0.0002, + "loss": 1.7186, + "step": 2090 + }, + { + "epoch": 1.4310051107325383, + "grad_norm": 0.32751724123954773, + "learning_rate": 0.0002, + "loss": 1.7606, + "step": 2100 + }, + { + "epoch": 1.4378194207836457, + "grad_norm": 0.34333378076553345, + "learning_rate": 0.0002, + "loss": 1.7051, + "step": 2110 + }, + { + "epoch": 1.444633730834753, + "grad_norm": 0.37777671217918396, + "learning_rate": 0.0002, + "loss": 1.6979, + "step": 2120 + }, + { + "epoch": 1.4514480408858603, + "grad_norm": 0.37126365303993225, + "learning_rate": 0.0002, + "loss": 1.7085, + "step": 2130 + }, + { + "epoch": 1.4582623509369677, + "grad_norm": 0.3602267801761627, + "learning_rate": 0.0002, + "loss": 1.721, + "step": 2140 + }, + { + "epoch": 1.465076660988075, + "grad_norm": 0.3287110924720764, + "learning_rate": 0.0002, + "loss": 1.8148, + "step": 2150 + }, + { + "epoch": 1.4718909710391823, + "grad_norm": 0.3562135100364685, + "learning_rate": 0.0002, + "loss": 1.6966, + "step": 2160 + }, + { + "epoch": 1.4787052810902896, + "grad_norm": 0.38292962312698364, + "learning_rate": 0.0002, + "loss": 1.713, + "step": 2170 + }, + { + "epoch": 1.485519591141397, + "grad_norm": 0.38220319151878357, + "learning_rate": 0.0002, + "loss": 1.7036, + "step": 2180 + }, + { + "epoch": 1.4923339011925043, + "grad_norm": 0.3570062220096588, + "learning_rate": 0.0002, + "loss": 1.7297, + "step": 2190 + }, + { + "epoch": 1.4991482112436116, + "grad_norm": 0.363146036863327, + "learning_rate": 0.0002, + "loss": 1.5652, + "step": 2200 + }, + { + "epoch": 1.5059625212947187, + "grad_norm": 0.37393274903297424, + "learning_rate": 0.0002, + "loss": 1.6716, + "step": 2210 + }, + { + "epoch": 1.512776831345826, + "grad_norm": 0.3628501892089844, + "learning_rate": 0.0002, + "loss": 1.6839, + "step": 2220 + }, + { + "epoch": 1.5195911413969334, + "grad_norm": 0.33430740237236023, + "learning_rate": 0.0002, + "loss": 1.6627, + "step": 2230 + }, + { + "epoch": 1.5264054514480407, + "grad_norm": 0.35363978147506714, + "learning_rate": 0.0002, + "loss": 1.7495, + "step": 2240 + }, + { + "epoch": 1.533219761499148, + "grad_norm": 0.37220337986946106, + "learning_rate": 0.0002, + "loss": 1.6364, + "step": 2250 + }, + { + "epoch": 1.5400340715502554, + "grad_norm": 0.35020262002944946, + "learning_rate": 0.0002, + "loss": 1.7773, + "step": 2260 + }, + { + "epoch": 1.5468483816013627, + "grad_norm": 0.35274937748908997, + "learning_rate": 0.0002, + "loss": 1.7453, + "step": 2270 + }, + { + "epoch": 1.55366269165247, + "grad_norm": 0.3797738552093506, + "learning_rate": 0.0002, + "loss": 1.7162, + "step": 2280 + }, + { + "epoch": 1.5604770017035774, + "grad_norm": 0.4160412847995758, + "learning_rate": 0.0002, + "loss": 1.6197, + "step": 2290 + }, + { + "epoch": 1.5672913117546847, + "grad_norm": 0.38779592514038086, + "learning_rate": 0.0002, + "loss": 1.7101, + "step": 2300 + }, + { + "epoch": 1.574105621805792, + "grad_norm": 0.39171287417411804, + "learning_rate": 0.0002, + "loss": 1.7254, + "step": 2310 + }, + { + "epoch": 1.5809199318568994, + "grad_norm": 0.3606826663017273, + "learning_rate": 0.0002, + "loss": 1.7087, + "step": 2320 + }, + { + "epoch": 1.5877342419080067, + "grad_norm": 0.3745017647743225, + "learning_rate": 0.0002, + "loss": 1.7269, + "step": 2330 + }, + { + "epoch": 1.594548551959114, + "grad_norm": 0.34933462738990784, + "learning_rate": 0.0002, + "loss": 1.7068, + "step": 2340 + }, + { + "epoch": 1.6013628620102214, + "grad_norm": 0.37268444895744324, + "learning_rate": 0.0002, + "loss": 1.7073, + "step": 2350 + }, + { + "epoch": 1.6081771720613287, + "grad_norm": 0.4603484869003296, + "learning_rate": 0.0002, + "loss": 1.728, + "step": 2360 + }, + { + "epoch": 1.614991482112436, + "grad_norm": 0.35689088702201843, + "learning_rate": 0.0002, + "loss": 1.7621, + "step": 2370 + }, + { + "epoch": 1.6218057921635434, + "grad_norm": 0.3392031490802765, + "learning_rate": 0.0002, + "loss": 1.6989, + "step": 2380 + }, + { + "epoch": 1.6286201022146507, + "grad_norm": 0.394653856754303, + "learning_rate": 0.0002, + "loss": 1.7077, + "step": 2390 + }, + { + "epoch": 1.635434412265758, + "grad_norm": 0.33972012996673584, + "learning_rate": 0.0002, + "loss": 1.7448, + "step": 2400 + }, + { + "epoch": 1.6422487223168654, + "grad_norm": 0.3854375183582306, + "learning_rate": 0.0002, + "loss": 1.7681, + "step": 2410 + }, + { + "epoch": 1.6490630323679727, + "grad_norm": 0.36143961548805237, + "learning_rate": 0.0002, + "loss": 1.7102, + "step": 2420 + }, + { + "epoch": 1.65587734241908, + "grad_norm": 0.35816189646720886, + "learning_rate": 0.0002, + "loss": 1.7432, + "step": 2430 + }, + { + "epoch": 1.6626916524701874, + "grad_norm": 0.36298736929893494, + "learning_rate": 0.0002, + "loss": 1.6407, + "step": 2440 + }, + { + "epoch": 1.6695059625212947, + "grad_norm": 0.36756929755210876, + "learning_rate": 0.0002, + "loss": 1.723, + "step": 2450 + }, + { + "epoch": 1.676320272572402, + "grad_norm": 0.35969603061676025, + "learning_rate": 0.0002, + "loss": 1.6824, + "step": 2460 + }, + { + "epoch": 1.6831345826235093, + "grad_norm": 0.38449376821517944, + "learning_rate": 0.0002, + "loss": 1.7014, + "step": 2470 + }, + { + "epoch": 1.6899488926746167, + "grad_norm": 0.44511452317237854, + "learning_rate": 0.0002, + "loss": 1.7261, + "step": 2480 + }, + { + "epoch": 1.696763202725724, + "grad_norm": 0.3831416368484497, + "learning_rate": 0.0002, + "loss": 1.7397, + "step": 2490 + }, + { + "epoch": 1.7035775127768313, + "grad_norm": 0.3795325756072998, + "learning_rate": 0.0002, + "loss": 1.7046, + "step": 2500 + }, + { + "epoch": 1.7103918228279387, + "grad_norm": 0.34978193044662476, + "learning_rate": 0.0002, + "loss": 1.6993, + "step": 2510 + }, + { + "epoch": 1.717206132879046, + "grad_norm": 0.35923877358436584, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 2520 + }, + { + "epoch": 1.7240204429301533, + "grad_norm": 0.352999746799469, + "learning_rate": 0.0002, + "loss": 1.6253, + "step": 2530 + }, + { + "epoch": 1.7308347529812607, + "grad_norm": 0.43673479557037354, + "learning_rate": 0.0002, + "loss": 1.6953, + "step": 2540 + }, + { + "epoch": 1.737649063032368, + "grad_norm": 0.4153687357902527, + "learning_rate": 0.0002, + "loss": 1.7079, + "step": 2550 + }, + { + "epoch": 1.7444633730834753, + "grad_norm": 0.35541167855262756, + "learning_rate": 0.0002, + "loss": 1.6714, + "step": 2560 + }, + { + "epoch": 1.7512776831345827, + "grad_norm": 0.3288775086402893, + "learning_rate": 0.0002, + "loss": 1.696, + "step": 2570 + }, + { + "epoch": 1.75809199318569, + "grad_norm": 0.3991123139858246, + "learning_rate": 0.0002, + "loss": 1.7486, + "step": 2580 + }, + { + "epoch": 1.7649063032367973, + "grad_norm": 0.39967241883277893, + "learning_rate": 0.0002, + "loss": 1.645, + "step": 2590 + }, + { + "epoch": 1.7717206132879046, + "grad_norm": 0.41104283928871155, + "learning_rate": 0.0002, + "loss": 1.6855, + "step": 2600 + }, + { + "epoch": 1.778534923339012, + "grad_norm": 0.44885286688804626, + "learning_rate": 0.0002, + "loss": 1.6993, + "step": 2610 + }, + { + "epoch": 1.7853492333901193, + "grad_norm": 0.38384467363357544, + "learning_rate": 0.0002, + "loss": 1.7224, + "step": 2620 + }, + { + "epoch": 1.7921635434412266, + "grad_norm": 0.35020917654037476, + "learning_rate": 0.0002, + "loss": 1.7213, + "step": 2630 + }, + { + "epoch": 1.798977853492334, + "grad_norm": 0.3360341489315033, + "learning_rate": 0.0002, + "loss": 1.6706, + "step": 2640 + }, + { + "epoch": 1.8057921635434413, + "grad_norm": 0.38875144720077515, + "learning_rate": 0.0002, + "loss": 1.7037, + "step": 2650 + }, + { + "epoch": 1.8126064735945486, + "grad_norm": 0.34876883029937744, + "learning_rate": 0.0002, + "loss": 1.693, + "step": 2660 + }, + { + "epoch": 1.819420783645656, + "grad_norm": 0.419979989528656, + "learning_rate": 0.0002, + "loss": 1.7743, + "step": 2670 + }, + { + "epoch": 1.8262350936967633, + "grad_norm": 0.3648919463157654, + "learning_rate": 0.0002, + "loss": 1.7089, + "step": 2680 + }, + { + "epoch": 1.8330494037478706, + "grad_norm": 0.3485383987426758, + "learning_rate": 0.0002, + "loss": 1.7828, + "step": 2690 + }, + { + "epoch": 1.839863713798978, + "grad_norm": 0.3647468686103821, + "learning_rate": 0.0002, + "loss": 1.705, + "step": 2700 + }, + { + "epoch": 1.8466780238500853, + "grad_norm": 0.37003210186958313, + "learning_rate": 0.0002, + "loss": 1.7318, + "step": 2710 + }, + { + "epoch": 1.8534923339011926, + "grad_norm": 0.37031617760658264, + "learning_rate": 0.0002, + "loss": 1.6647, + "step": 2720 + }, + { + "epoch": 1.8603066439523, + "grad_norm": 0.3438796103000641, + "learning_rate": 0.0002, + "loss": 1.69, + "step": 2730 + }, + { + "epoch": 1.8671209540034073, + "grad_norm": 0.41574627161026, + "learning_rate": 0.0002, + "loss": 1.5995, + "step": 2740 + }, + { + "epoch": 1.8739352640545146, + "grad_norm": 0.35049930214881897, + "learning_rate": 0.0002, + "loss": 1.6877, + "step": 2750 + }, + { + "epoch": 1.880749574105622, + "grad_norm": 0.3943989872932434, + "learning_rate": 0.0002, + "loss": 1.7048, + "step": 2760 + }, + { + "epoch": 1.8875638841567293, + "grad_norm": 0.3384978175163269, + "learning_rate": 0.0002, + "loss": 1.7047, + "step": 2770 + }, + { + "epoch": 1.8943781942078366, + "grad_norm": 0.3501328229904175, + "learning_rate": 0.0002, + "loss": 1.7848, + "step": 2780 + }, + { + "epoch": 1.901192504258944, + "grad_norm": 0.37484532594680786, + "learning_rate": 0.0002, + "loss": 1.6986, + "step": 2790 + }, + { + "epoch": 1.9080068143100513, + "grad_norm": 0.34497788548469543, + "learning_rate": 0.0002, + "loss": 1.6867, + "step": 2800 + }, + { + "epoch": 1.9148211243611586, + "grad_norm": 0.3530851900577545, + "learning_rate": 0.0002, + "loss": 1.6513, + "step": 2810 + }, + { + "epoch": 1.921635434412266, + "grad_norm": 0.3879254162311554, + "learning_rate": 0.0002, + "loss": 1.6369, + "step": 2820 + }, + { + "epoch": 1.9284497444633732, + "grad_norm": 0.3885590136051178, + "learning_rate": 0.0002, + "loss": 1.6786, + "step": 2830 + }, + { + "epoch": 1.9352640545144804, + "grad_norm": 0.3868715465068817, + "learning_rate": 0.0002, + "loss": 1.8049, + "step": 2840 + }, + { + "epoch": 1.9420783645655877, + "grad_norm": 0.4152422249317169, + "learning_rate": 0.0002, + "loss": 1.7099, + "step": 2850 + }, + { + "epoch": 1.948892674616695, + "grad_norm": 0.401714563369751, + "learning_rate": 0.0002, + "loss": 1.6696, + "step": 2860 + }, + { + "epoch": 1.9557069846678024, + "grad_norm": 0.34825265407562256, + "learning_rate": 0.0002, + "loss": 1.7182, + "step": 2870 + }, + { + "epoch": 1.9625212947189097, + "grad_norm": 0.3620675504207611, + "learning_rate": 0.0002, + "loss": 1.6612, + "step": 2880 + }, + { + "epoch": 1.969335604770017, + "grad_norm": 0.3977806866168976, + "learning_rate": 0.0002, + "loss": 1.7451, + "step": 2890 + }, + { + "epoch": 1.9761499148211243, + "grad_norm": 0.3687497079372406, + "learning_rate": 0.0002, + "loss": 1.6514, + "step": 2900 + }, + { + "epoch": 1.9829642248722317, + "grad_norm": 0.408640056848526, + "learning_rate": 0.0002, + "loss": 1.7712, + "step": 2910 + }, + { + "epoch": 1.989778534923339, + "grad_norm": 0.34510108828544617, + "learning_rate": 0.0002, + "loss": 1.695, + "step": 2920 + }, + { + "epoch": 1.9965928449744463, + "grad_norm": 0.3596334755420685, + "learning_rate": 0.0002, + "loss": 1.662, + "step": 2930 + }, + { + "epoch": 2.0, + "eval_loss": 1.8056600093841553, + "eval_runtime": 60.7049, + "eval_samples_per_second": 8.352, + "eval_steps_per_second": 1.054, + "step": 2935 + } + ], + "logging_steps": 10, + "max_steps": 11736, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.507525934186496e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-2935/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-2935/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2f734531f14829705714a5a2703d90c81f36eeb0 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-2935/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2af5d486d0b370637d5a3999ab7668f94f5d53b6497f018967564aef5e8ad133 +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-4402/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-4402/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-4402/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-4402/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-4402/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-4402/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-4402/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-4402/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6bee3b71e95f0a43b8ca574e73891b42fd79eb98 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-4402/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1119cfe3cb03edfb548e079f18e1345de95da69f41661941c75756159c726705 +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-4402/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-4402/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..27d7503edab0293c3438eb8a6dfd785a5ecc9ec6 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-4402/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aac90f87d8e03db18f03bf6c9ad1c5059c046e7fba5a0fa8e5a3d5f428ded113 +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-4402/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-4402/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..48fc1bb500daa1e3d3179e235ea4bc81cdfd1a58 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-4402/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43941c5d604916976d93eeef5d005b177b85c5f9ab835e629517be021403dc9e +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-4402/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-4402/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..57abc6027461e3535aa2fe48d873bcd201876696 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-4402/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f06319f7fba3d5447e00cccb8b0e8732e3211398e356d6a5bda2974882ed4a9 +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-4402/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-4402/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-4402/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-4402/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-4402/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-4402/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-4402/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-4402/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-4402/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-4402/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-4402/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-4402/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-4402/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-4402/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..068680b71f386477b481d24fc35389494c895fde --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-4402/trainer_state.json @@ -0,0 +1,3137 @@ +{ + "best_metric": 1.8034634590148926, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467", + "epoch": 2.9996592844974446, + "eval_steps": 10, + "global_step": 4402, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0068143100511073255, + "grad_norm": 0.635574460029602, + "learning_rate": 0.0002, + "loss": 3.0988, + "step": 10 + }, + { + "epoch": 0.013628620102214651, + "grad_norm": 1.0401769876480103, + "learning_rate": 0.0002, + "loss": 2.5594, + "step": 20 + }, + { + "epoch": 0.020442930153321975, + "grad_norm": 0.4514131247997284, + "learning_rate": 0.0002, + "loss": 2.3587, + "step": 30 + }, + { + "epoch": 0.027257240204429302, + "grad_norm": 0.6303355693817139, + "learning_rate": 0.0002, + "loss": 2.121, + "step": 40 + }, + { + "epoch": 0.034071550255536626, + "grad_norm": 0.4648270606994629, + "learning_rate": 0.0002, + "loss": 1.9833, + "step": 50 + }, + { + "epoch": 0.04088586030664395, + "grad_norm": 0.42953479290008545, + "learning_rate": 0.0002, + "loss": 1.9384, + "step": 60 + }, + { + "epoch": 0.04770017035775128, + "grad_norm": 0.433614581823349, + "learning_rate": 0.0002, + "loss": 1.9202, + "step": 70 + }, + { + "epoch": 0.054514480408858604, + "grad_norm": 0.45318254828453064, + "learning_rate": 0.0002, + "loss": 1.911, + "step": 80 + }, + { + "epoch": 0.06132879045996593, + "grad_norm": 0.4023568630218506, + "learning_rate": 0.0002, + "loss": 1.8895, + "step": 90 + }, + { + "epoch": 0.06814310051107325, + "grad_norm": 0.43260207772254944, + "learning_rate": 0.0002, + "loss": 1.9257, + "step": 100 + }, + { + "epoch": 0.07495741056218058, + "grad_norm": 0.43389809131622314, + "learning_rate": 0.0002, + "loss": 1.9178, + "step": 110 + }, + { + "epoch": 0.0817717206132879, + "grad_norm": 0.39307987689971924, + "learning_rate": 0.0002, + "loss": 1.9071, + "step": 120 + }, + { + "epoch": 0.08858603066439523, + "grad_norm": 0.7703037261962891, + "learning_rate": 0.0002, + "loss": 1.9482, + "step": 130 + }, + { + "epoch": 0.09540034071550256, + "grad_norm": 0.38547563552856445, + "learning_rate": 0.0002, + "loss": 1.8394, + "step": 140 + }, + { + "epoch": 0.10221465076660988, + "grad_norm": 0.37948688864707947, + "learning_rate": 0.0002, + "loss": 1.9037, + "step": 150 + }, + { + "epoch": 0.10902896081771721, + "grad_norm": 0.33022379875183105, + "learning_rate": 0.0002, + "loss": 1.8664, + "step": 160 + }, + { + "epoch": 0.11584327086882454, + "grad_norm": 0.33703792095184326, + "learning_rate": 0.0002, + "loss": 1.8659, + "step": 170 + }, + { + "epoch": 0.12265758091993186, + "grad_norm": 0.3259912431240082, + "learning_rate": 0.0002, + "loss": 1.8389, + "step": 180 + }, + { + "epoch": 0.12947189097103917, + "grad_norm": 0.3593858778476715, + "learning_rate": 0.0002, + "loss": 1.8424, + "step": 190 + }, + { + "epoch": 0.1362862010221465, + "grad_norm": 0.3401614725589752, + "learning_rate": 0.0002, + "loss": 1.8457, + "step": 200 + }, + { + "epoch": 0.14310051107325383, + "grad_norm": 0.3892078697681427, + "learning_rate": 0.0002, + "loss": 1.7721, + "step": 210 + }, + { + "epoch": 0.14991482112436116, + "grad_norm": 0.315374493598938, + "learning_rate": 0.0002, + "loss": 1.8351, + "step": 220 + }, + { + "epoch": 0.1567291311754685, + "grad_norm": 0.3679497539997101, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 230 + }, + { + "epoch": 0.1635434412265758, + "grad_norm": 0.336730033159256, + "learning_rate": 0.0002, + "loss": 1.816, + "step": 240 + }, + { + "epoch": 0.17035775127768313, + "grad_norm": 0.36472755670547485, + "learning_rate": 0.0002, + "loss": 1.7849, + "step": 250 + }, + { + "epoch": 0.17717206132879046, + "grad_norm": 0.34864023327827454, + "learning_rate": 0.0002, + "loss": 1.7766, + "step": 260 + }, + { + "epoch": 0.1839863713798978, + "grad_norm": 0.3350819945335388, + "learning_rate": 0.0002, + "loss": 1.896, + "step": 270 + }, + { + "epoch": 0.19080068143100512, + "grad_norm": 0.3844246566295624, + "learning_rate": 0.0002, + "loss": 1.8742, + "step": 280 + }, + { + "epoch": 0.19761499148211242, + "grad_norm": 0.38413065671920776, + "learning_rate": 0.0002, + "loss": 1.8531, + "step": 290 + }, + { + "epoch": 0.20442930153321975, + "grad_norm": 0.4080047607421875, + "learning_rate": 0.0002, + "loss": 1.8415, + "step": 300 + }, + { + "epoch": 0.21124361158432708, + "grad_norm": 0.3546800911426544, + "learning_rate": 0.0002, + "loss": 1.9009, + "step": 310 + }, + { + "epoch": 0.21805792163543442, + "grad_norm": 0.38577890396118164, + "learning_rate": 0.0002, + "loss": 1.8092, + "step": 320 + }, + { + "epoch": 0.22487223168654175, + "grad_norm": 0.38979098200798035, + "learning_rate": 0.0002, + "loss": 1.7113, + "step": 330 + }, + { + "epoch": 0.23168654173764908, + "grad_norm": 0.35105520486831665, + "learning_rate": 0.0002, + "loss": 1.8162, + "step": 340 + }, + { + "epoch": 0.23850085178875638, + "grad_norm": 0.31671223044395447, + "learning_rate": 0.0002, + "loss": 1.8408, + "step": 350 + }, + { + "epoch": 0.2453151618398637, + "grad_norm": 0.33034196496009827, + "learning_rate": 0.0002, + "loss": 1.8014, + "step": 360 + }, + { + "epoch": 0.252129471890971, + "grad_norm": 0.2990533709526062, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 370 + }, + { + "epoch": 0.25894378194207834, + "grad_norm": 0.362208753824234, + "learning_rate": 0.0002, + "loss": 1.8612, + "step": 380 + }, + { + "epoch": 0.2657580919931857, + "grad_norm": 0.3269096612930298, + "learning_rate": 0.0002, + "loss": 1.8001, + "step": 390 + }, + { + "epoch": 0.272572402044293, + "grad_norm": 0.30555954575538635, + "learning_rate": 0.0002, + "loss": 1.786, + "step": 400 + }, + { + "epoch": 0.27938671209540034, + "grad_norm": 0.332933247089386, + "learning_rate": 0.0002, + "loss": 1.8018, + "step": 410 + }, + { + "epoch": 0.28620102214650767, + "grad_norm": 0.39454060792922974, + "learning_rate": 0.0002, + "loss": 1.8157, + "step": 420 + }, + { + "epoch": 0.293015332197615, + "grad_norm": 0.34589633345603943, + "learning_rate": 0.0002, + "loss": 1.7862, + "step": 430 + }, + { + "epoch": 0.29982964224872233, + "grad_norm": 0.3747332990169525, + "learning_rate": 0.0002, + "loss": 1.7612, + "step": 440 + }, + { + "epoch": 0.30664395229982966, + "grad_norm": 0.34825369715690613, + "learning_rate": 0.0002, + "loss": 1.8476, + "step": 450 + }, + { + "epoch": 0.313458262350937, + "grad_norm": 0.32906976342201233, + "learning_rate": 0.0002, + "loss": 1.775, + "step": 460 + }, + { + "epoch": 0.3202725724020443, + "grad_norm": 0.33108609914779663, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 470 + }, + { + "epoch": 0.3270868824531516, + "grad_norm": 0.3170463442802429, + "learning_rate": 0.0002, + "loss": 1.8012, + "step": 480 + }, + { + "epoch": 0.3339011925042589, + "grad_norm": 0.30792081356048584, + "learning_rate": 0.0002, + "loss": 1.8264, + "step": 490 + }, + { + "epoch": 0.34071550255536626, + "grad_norm": 0.31772997975349426, + "learning_rate": 0.0002, + "loss": 1.7361, + "step": 500 + }, + { + "epoch": 0.3475298126064736, + "grad_norm": 0.32714012265205383, + "learning_rate": 0.0002, + "loss": 1.8147, + "step": 510 + }, + { + "epoch": 0.3543441226575809, + "grad_norm": 0.3021100163459778, + "learning_rate": 0.0002, + "loss": 1.8332, + "step": 520 + }, + { + "epoch": 0.36115843270868825, + "grad_norm": 0.6045835018157959, + "learning_rate": 0.0002, + "loss": 1.7337, + "step": 530 + }, + { + "epoch": 0.3679727427597956, + "grad_norm": 0.3003896474838257, + "learning_rate": 0.0002, + "loss": 1.8661, + "step": 540 + }, + { + "epoch": 0.3747870528109029, + "grad_norm": 0.3678470551967621, + "learning_rate": 0.0002, + "loss": 1.8359, + "step": 550 + }, + { + "epoch": 0.38160136286201024, + "grad_norm": 0.35787615180015564, + "learning_rate": 0.0002, + "loss": 1.7512, + "step": 560 + }, + { + "epoch": 0.38841567291311757, + "grad_norm": 0.31882143020629883, + "learning_rate": 0.0002, + "loss": 1.8048, + "step": 570 + }, + { + "epoch": 0.39522998296422485, + "grad_norm": 0.3186313509941101, + "learning_rate": 0.0002, + "loss": 1.8108, + "step": 580 + }, + { + "epoch": 0.4020442930153322, + "grad_norm": 0.41443702578544617, + "learning_rate": 0.0002, + "loss": 1.8012, + "step": 590 + }, + { + "epoch": 0.4088586030664395, + "grad_norm": 0.28773069381713867, + "learning_rate": 0.0002, + "loss": 1.7794, + "step": 600 + }, + { + "epoch": 0.41567291311754684, + "grad_norm": 0.35743263363838196, + "learning_rate": 0.0002, + "loss": 1.8231, + "step": 610 + }, + { + "epoch": 0.42248722316865417, + "grad_norm": 0.3360286355018616, + "learning_rate": 0.0002, + "loss": 1.7531, + "step": 620 + }, + { + "epoch": 0.4293015332197615, + "grad_norm": 0.32838866114616394, + "learning_rate": 0.0002, + "loss": 1.7654, + "step": 630 + }, + { + "epoch": 0.43611584327086883, + "grad_norm": 0.2994388937950134, + "learning_rate": 0.0002, + "loss": 1.8176, + "step": 640 + }, + { + "epoch": 0.44293015332197616, + "grad_norm": 0.3306307792663574, + "learning_rate": 0.0002, + "loss": 1.7241, + "step": 650 + }, + { + "epoch": 0.4497444633730835, + "grad_norm": 0.3129560351371765, + "learning_rate": 0.0002, + "loss": 1.8201, + "step": 660 + }, + { + "epoch": 0.4565587734241908, + "grad_norm": 0.3244289457798004, + "learning_rate": 0.0002, + "loss": 1.803, + "step": 670 + }, + { + "epoch": 0.46337308347529815, + "grad_norm": 0.3196892738342285, + "learning_rate": 0.0002, + "loss": 1.8399, + "step": 680 + }, + { + "epoch": 0.47018739352640543, + "grad_norm": 0.3135230243206024, + "learning_rate": 0.0002, + "loss": 1.8291, + "step": 690 + }, + { + "epoch": 0.47700170357751276, + "grad_norm": 0.28677991032600403, + "learning_rate": 0.0002, + "loss": 1.7423, + "step": 700 + }, + { + "epoch": 0.4838160136286201, + "grad_norm": 0.3074065148830414, + "learning_rate": 0.0002, + "loss": 1.6982, + "step": 710 + }, + { + "epoch": 0.4906303236797274, + "grad_norm": 0.3354290723800659, + "learning_rate": 0.0002, + "loss": 1.8078, + "step": 720 + }, + { + "epoch": 0.49744463373083475, + "grad_norm": 0.324370801448822, + "learning_rate": 0.0002, + "loss": 1.7333, + "step": 730 + }, + { + "epoch": 0.504258943781942, + "grad_norm": 0.29496142268180847, + "learning_rate": 0.0002, + "loss": 1.8506, + "step": 740 + }, + { + "epoch": 0.5110732538330494, + "grad_norm": 0.30694130063056946, + "learning_rate": 0.0002, + "loss": 1.7761, + "step": 750 + }, + { + "epoch": 0.5178875638841567, + "grad_norm": 0.36168408393859863, + "learning_rate": 0.0002, + "loss": 1.7854, + "step": 760 + }, + { + "epoch": 0.524701873935264, + "grad_norm": 0.2930343449115753, + "learning_rate": 0.0002, + "loss": 1.7149, + "step": 770 + }, + { + "epoch": 0.5315161839863713, + "grad_norm": 0.3023432493209839, + "learning_rate": 0.0002, + "loss": 1.7924, + "step": 780 + }, + { + "epoch": 0.5383304940374787, + "grad_norm": 0.3272720277309418, + "learning_rate": 0.0002, + "loss": 1.8467, + "step": 790 + }, + { + "epoch": 0.545144804088586, + "grad_norm": 0.2907974421977997, + "learning_rate": 0.0002, + "loss": 1.8639, + "step": 800 + }, + { + "epoch": 0.5519591141396933, + "grad_norm": 0.32267168164253235, + "learning_rate": 0.0002, + "loss": 1.9018, + "step": 810 + }, + { + "epoch": 0.5587734241908007, + "grad_norm": 0.32059940695762634, + "learning_rate": 0.0002, + "loss": 1.8311, + "step": 820 + }, + { + "epoch": 0.565587734241908, + "grad_norm": 0.30951258540153503, + "learning_rate": 0.0002, + "loss": 1.7234, + "step": 830 + }, + { + "epoch": 0.5724020442930153, + "grad_norm": 0.33976122736930847, + "learning_rate": 0.0002, + "loss": 1.8063, + "step": 840 + }, + { + "epoch": 0.5792163543441227, + "grad_norm": 0.3195820450782776, + "learning_rate": 0.0002, + "loss": 1.7021, + "step": 850 + }, + { + "epoch": 0.58603066439523, + "grad_norm": 0.2828562557697296, + "learning_rate": 0.0002, + "loss": 1.8073, + "step": 860 + }, + { + "epoch": 0.5928449744463373, + "grad_norm": 0.29591670632362366, + "learning_rate": 0.0002, + "loss": 1.695, + "step": 870 + }, + { + "epoch": 0.5996592844974447, + "grad_norm": 0.3086104393005371, + "learning_rate": 0.0002, + "loss": 1.8109, + "step": 880 + }, + { + "epoch": 0.606473594548552, + "grad_norm": 0.3592929542064667, + "learning_rate": 0.0002, + "loss": 1.8207, + "step": 890 + }, + { + "epoch": 0.6132879045996593, + "grad_norm": 0.2830186188220978, + "learning_rate": 0.0002, + "loss": 1.7662, + "step": 900 + }, + { + "epoch": 0.6201022146507666, + "grad_norm": 0.3128598630428314, + "learning_rate": 0.0002, + "loss": 1.8344, + "step": 910 + }, + { + "epoch": 0.626916524701874, + "grad_norm": 0.31957563757896423, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 920 + }, + { + "epoch": 0.6337308347529813, + "grad_norm": 0.30994319915771484, + "learning_rate": 0.0002, + "loss": 1.7578, + "step": 930 + }, + { + "epoch": 0.6405451448040886, + "grad_norm": 0.3352845013141632, + "learning_rate": 0.0002, + "loss": 1.8494, + "step": 940 + }, + { + "epoch": 0.6473594548551959, + "grad_norm": 0.2960077226161957, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 950 + }, + { + "epoch": 0.6541737649063032, + "grad_norm": 0.32675081491470337, + "learning_rate": 0.0002, + "loss": 1.7209, + "step": 960 + }, + { + "epoch": 0.6609880749574105, + "grad_norm": 0.30042028427124023, + "learning_rate": 0.0002, + "loss": 1.7776, + "step": 970 + }, + { + "epoch": 0.6678023850085179, + "grad_norm": 0.3288673758506775, + "learning_rate": 0.0002, + "loss": 1.7597, + "step": 980 + }, + { + "epoch": 0.6746166950596252, + "grad_norm": 0.323215126991272, + "learning_rate": 0.0002, + "loss": 1.7962, + "step": 990 + }, + { + "epoch": 0.6814310051107325, + "grad_norm": 0.30041399598121643, + "learning_rate": 0.0002, + "loss": 1.821, + "step": 1000 + }, + { + "epoch": 0.6882453151618398, + "grad_norm": 0.3076179623603821, + "learning_rate": 0.0002, + "loss": 1.7716, + "step": 1010 + }, + { + "epoch": 0.6950596252129472, + "grad_norm": 0.2941909730434418, + "learning_rate": 0.0002, + "loss": 1.7827, + "step": 1020 + }, + { + "epoch": 0.7018739352640545, + "grad_norm": 0.32220420241355896, + "learning_rate": 0.0002, + "loss": 1.7964, + "step": 1030 + }, + { + "epoch": 0.7086882453151618, + "grad_norm": 0.2989702820777893, + "learning_rate": 0.0002, + "loss": 1.7101, + "step": 1040 + }, + { + "epoch": 0.7155025553662692, + "grad_norm": 0.3049640357494354, + "learning_rate": 0.0002, + "loss": 1.8749, + "step": 1050 + }, + { + "epoch": 0.7223168654173765, + "grad_norm": 0.3183284103870392, + "learning_rate": 0.0002, + "loss": 1.782, + "step": 1060 + }, + { + "epoch": 0.7291311754684838, + "grad_norm": 0.3070095181465149, + "learning_rate": 0.0002, + "loss": 1.785, + "step": 1070 + }, + { + "epoch": 0.7359454855195912, + "grad_norm": 0.33263063430786133, + "learning_rate": 0.0002, + "loss": 1.7832, + "step": 1080 + }, + { + "epoch": 0.7427597955706985, + "grad_norm": 0.28774312138557434, + "learning_rate": 0.0002, + "loss": 1.7627, + "step": 1090 + }, + { + "epoch": 0.7495741056218058, + "grad_norm": 0.29304224252700806, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1100 + }, + { + "epoch": 0.7563884156729132, + "grad_norm": 0.27529507875442505, + "learning_rate": 0.0002, + "loss": 1.8082, + "step": 1110 + }, + { + "epoch": 0.7632027257240205, + "grad_norm": 0.32319945096969604, + "learning_rate": 0.0002, + "loss": 1.8071, + "step": 1120 + }, + { + "epoch": 0.7700170357751278, + "grad_norm": 0.33777597546577454, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 1130 + }, + { + "epoch": 0.7768313458262351, + "grad_norm": 0.29847201704978943, + "learning_rate": 0.0002, + "loss": 1.8488, + "step": 1140 + }, + { + "epoch": 0.7836456558773425, + "grad_norm": 0.31370633840560913, + "learning_rate": 0.0002, + "loss": 1.7014, + "step": 1150 + }, + { + "epoch": 0.7904599659284497, + "grad_norm": 0.31633856892585754, + "learning_rate": 0.0002, + "loss": 1.8175, + "step": 1160 + }, + { + "epoch": 0.797274275979557, + "grad_norm": 0.33849263191223145, + "learning_rate": 0.0002, + "loss": 1.7637, + "step": 1170 + }, + { + "epoch": 0.8040885860306644, + "grad_norm": 0.3306333124637604, + "learning_rate": 0.0002, + "loss": 1.8264, + "step": 1180 + }, + { + "epoch": 0.8109028960817717, + "grad_norm": 0.3249678313732147, + "learning_rate": 0.0002, + "loss": 1.777, + "step": 1190 + }, + { + "epoch": 0.817717206132879, + "grad_norm": 0.3252817690372467, + "learning_rate": 0.0002, + "loss": 1.7631, + "step": 1200 + }, + { + "epoch": 0.8245315161839863, + "grad_norm": 0.31772559881210327, + "learning_rate": 0.0002, + "loss": 1.7877, + "step": 1210 + }, + { + "epoch": 0.8313458262350937, + "grad_norm": 0.2803371846675873, + "learning_rate": 0.0002, + "loss": 1.8068, + "step": 1220 + }, + { + "epoch": 0.838160136286201, + "grad_norm": 0.26700571179389954, + "learning_rate": 0.0002, + "loss": 1.738, + "step": 1230 + }, + { + "epoch": 0.8449744463373083, + "grad_norm": 0.3060092031955719, + "learning_rate": 0.0002, + "loss": 1.8301, + "step": 1240 + }, + { + "epoch": 0.8517887563884157, + "grad_norm": 0.28831684589385986, + "learning_rate": 0.0002, + "loss": 1.7993, + "step": 1250 + }, + { + "epoch": 0.858603066439523, + "grad_norm": 0.30708742141723633, + "learning_rate": 0.0002, + "loss": 1.6909, + "step": 1260 + }, + { + "epoch": 0.8654173764906303, + "grad_norm": 0.2915987968444824, + "learning_rate": 0.0002, + "loss": 1.8506, + "step": 1270 + }, + { + "epoch": 0.8722316865417377, + "grad_norm": 0.2893589735031128, + "learning_rate": 0.0002, + "loss": 1.7536, + "step": 1280 + }, + { + "epoch": 0.879045996592845, + "grad_norm": 0.29545632004737854, + "learning_rate": 0.0002, + "loss": 1.7437, + "step": 1290 + }, + { + "epoch": 0.8858603066439523, + "grad_norm": 0.3354771137237549, + "learning_rate": 0.0002, + "loss": 1.859, + "step": 1300 + }, + { + "epoch": 0.8926746166950597, + "grad_norm": 0.37715399265289307, + "learning_rate": 0.0002, + "loss": 1.7644, + "step": 1310 + }, + { + "epoch": 0.899488926746167, + "grad_norm": 0.28847193717956543, + "learning_rate": 0.0002, + "loss": 1.7731, + "step": 1320 + }, + { + "epoch": 0.9063032367972743, + "grad_norm": 0.2780889868736267, + "learning_rate": 0.0002, + "loss": 1.9077, + "step": 1330 + }, + { + "epoch": 0.9131175468483816, + "grad_norm": 0.2714342176914215, + "learning_rate": 0.0002, + "loss": 1.7861, + "step": 1340 + }, + { + "epoch": 0.919931856899489, + "grad_norm": 0.2950133979320526, + "learning_rate": 0.0002, + "loss": 1.8, + "step": 1350 + }, + { + "epoch": 0.9267461669505963, + "grad_norm": 0.29097145795822144, + "learning_rate": 0.0002, + "loss": 1.7368, + "step": 1360 + }, + { + "epoch": 0.9335604770017035, + "grad_norm": 0.32540133595466614, + "learning_rate": 0.0002, + "loss": 1.7864, + "step": 1370 + }, + { + "epoch": 0.9403747870528109, + "grad_norm": 0.3076636493206024, + "learning_rate": 0.0002, + "loss": 1.7571, + "step": 1380 + }, + { + "epoch": 0.9471890971039182, + "grad_norm": 0.2962130308151245, + "learning_rate": 0.0002, + "loss": 1.8312, + "step": 1390 + }, + { + "epoch": 0.9540034071550255, + "grad_norm": 0.30086860060691833, + "learning_rate": 0.0002, + "loss": 1.7858, + "step": 1400 + }, + { + "epoch": 0.9608177172061328, + "grad_norm": 0.28634947538375854, + "learning_rate": 0.0002, + "loss": 1.7991, + "step": 1410 + }, + { + "epoch": 0.9676320272572402, + "grad_norm": 0.35314416885375977, + "learning_rate": 0.0002, + "loss": 1.8385, + "step": 1420 + }, + { + "epoch": 0.9744463373083475, + "grad_norm": 0.2939317524433136, + "learning_rate": 0.0002, + "loss": 1.8054, + "step": 1430 + }, + { + "epoch": 0.9812606473594548, + "grad_norm": 0.3010196089744568, + "learning_rate": 0.0002, + "loss": 1.7582, + "step": 1440 + }, + { + "epoch": 0.9880749574105622, + "grad_norm": 0.30816152691841125, + "learning_rate": 0.0002, + "loss": 1.758, + "step": 1450 + }, + { + "epoch": 0.9948892674616695, + "grad_norm": 0.28152793645858765, + "learning_rate": 0.0002, + "loss": 1.8159, + "step": 1460 + }, + { + "epoch": 0.9996592844974447, + "eval_loss": 1.8034634590148926, + "eval_runtime": 53.6017, + "eval_samples_per_second": 9.459, + "eval_steps_per_second": 1.194, + "step": 1467 + }, + { + "epoch": 1.0017035775127767, + "grad_norm": 0.29246416687965393, + "learning_rate": 0.0002, + "loss": 1.7072, + "step": 1470 + }, + { + "epoch": 1.008517887563884, + "grad_norm": 0.2668602168560028, + "learning_rate": 0.0002, + "loss": 1.8159, + "step": 1480 + }, + { + "epoch": 1.0153321976149914, + "grad_norm": 0.2998567819595337, + "learning_rate": 0.0002, + "loss": 1.6868, + "step": 1490 + }, + { + "epoch": 1.0221465076660987, + "grad_norm": 0.3284934461116791, + "learning_rate": 0.0002, + "loss": 1.7331, + "step": 1500 + }, + { + "epoch": 1.028960817717206, + "grad_norm": 0.3275827169418335, + "learning_rate": 0.0002, + "loss": 1.689, + "step": 1510 + }, + { + "epoch": 1.0357751277683134, + "grad_norm": 0.3382718563079834, + "learning_rate": 0.0002, + "loss": 1.7092, + "step": 1520 + }, + { + "epoch": 1.0425894378194207, + "grad_norm": 0.36737215518951416, + "learning_rate": 0.0002, + "loss": 1.7215, + "step": 1530 + }, + { + "epoch": 1.049403747870528, + "grad_norm": 0.3442603647708893, + "learning_rate": 0.0002, + "loss": 1.6993, + "step": 1540 + }, + { + "epoch": 1.0562180579216354, + "grad_norm": 0.3323381245136261, + "learning_rate": 0.0002, + "loss": 1.6996, + "step": 1550 + }, + { + "epoch": 1.0630323679727427, + "grad_norm": 0.341227650642395, + "learning_rate": 0.0002, + "loss": 1.7444, + "step": 1560 + }, + { + "epoch": 1.06984667802385, + "grad_norm": 0.3361579477787018, + "learning_rate": 0.0002, + "loss": 1.7419, + "step": 1570 + }, + { + "epoch": 1.0766609880749574, + "grad_norm": 0.3556230962276459, + "learning_rate": 0.0002, + "loss": 1.7337, + "step": 1580 + }, + { + "epoch": 1.0834752981260647, + "grad_norm": 0.27130424976348877, + "learning_rate": 0.0002, + "loss": 1.6978, + "step": 1590 + }, + { + "epoch": 1.090289608177172, + "grad_norm": 0.29366323351860046, + "learning_rate": 0.0002, + "loss": 1.6087, + "step": 1600 + }, + { + "epoch": 1.0971039182282794, + "grad_norm": 0.3581245541572571, + "learning_rate": 0.0002, + "loss": 1.6721, + "step": 1610 + }, + { + "epoch": 1.1039182282793867, + "grad_norm": 0.3021670877933502, + "learning_rate": 0.0002, + "loss": 1.7639, + "step": 1620 + }, + { + "epoch": 1.110732538330494, + "grad_norm": 0.3145572543144226, + "learning_rate": 0.0002, + "loss": 1.7314, + "step": 1630 + }, + { + "epoch": 1.1175468483816013, + "grad_norm": 0.35362154245376587, + "learning_rate": 0.0002, + "loss": 1.7408, + "step": 1640 + }, + { + "epoch": 1.1243611584327087, + "grad_norm": 0.5413113236427307, + "learning_rate": 0.0002, + "loss": 1.7071, + "step": 1650 + }, + { + "epoch": 1.131175468483816, + "grad_norm": 0.6858654022216797, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 1660 + }, + { + "epoch": 1.1379897785349233, + "grad_norm": 0.3781903386116028, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 1670 + }, + { + "epoch": 1.1448040885860307, + "grad_norm": 0.404864102602005, + "learning_rate": 0.0002, + "loss": 1.659, + "step": 1680 + }, + { + "epoch": 1.151618398637138, + "grad_norm": 0.3595100939273834, + "learning_rate": 0.0002, + "loss": 1.7212, + "step": 1690 + }, + { + "epoch": 1.1584327086882453, + "grad_norm": 0.33682283759117126, + "learning_rate": 0.0002, + "loss": 1.7023, + "step": 1700 + }, + { + "epoch": 1.1652470187393527, + "grad_norm": 0.3877373933792114, + "learning_rate": 0.0002, + "loss": 1.7336, + "step": 1710 + }, + { + "epoch": 1.17206132879046, + "grad_norm": 0.34606897830963135, + "learning_rate": 0.0002, + "loss": 1.7676, + "step": 1720 + }, + { + "epoch": 1.1788756388415673, + "grad_norm": 0.3122918903827667, + "learning_rate": 0.0002, + "loss": 1.6889, + "step": 1730 + }, + { + "epoch": 1.1856899488926746, + "grad_norm": 0.34081900119781494, + "learning_rate": 0.0002, + "loss": 1.6585, + "step": 1740 + }, + { + "epoch": 1.192504258943782, + "grad_norm": 0.3418807089328766, + "learning_rate": 0.0002, + "loss": 1.7794, + "step": 1750 + }, + { + "epoch": 1.1993185689948893, + "grad_norm": 0.3495500981807709, + "learning_rate": 0.0002, + "loss": 1.7773, + "step": 1760 + }, + { + "epoch": 1.2061328790459966, + "grad_norm": 0.557288408279419, + "learning_rate": 0.0002, + "loss": 1.6578, + "step": 1770 + }, + { + "epoch": 1.212947189097104, + "grad_norm": 0.3193778693675995, + "learning_rate": 0.0002, + "loss": 1.7488, + "step": 1780 + }, + { + "epoch": 1.2197614991482113, + "grad_norm": 0.3306216895580292, + "learning_rate": 0.0002, + "loss": 1.6815, + "step": 1790 + }, + { + "epoch": 1.2265758091993186, + "grad_norm": 0.37998732924461365, + "learning_rate": 0.0002, + "loss": 1.6772, + "step": 1800 + }, + { + "epoch": 1.233390119250426, + "grad_norm": 0.3255669176578522, + "learning_rate": 0.0002, + "loss": 1.7094, + "step": 1810 + }, + { + "epoch": 1.2402044293015333, + "grad_norm": 0.3741287291049957, + "learning_rate": 0.0002, + "loss": 1.7096, + "step": 1820 + }, + { + "epoch": 1.2470187393526406, + "grad_norm": 0.36727291345596313, + "learning_rate": 0.0002, + "loss": 1.7317, + "step": 1830 + }, + { + "epoch": 1.253833049403748, + "grad_norm": 0.3479527235031128, + "learning_rate": 0.0002, + "loss": 1.7418, + "step": 1840 + }, + { + "epoch": 1.2606473594548553, + "grad_norm": 0.3472636640071869, + "learning_rate": 0.0002, + "loss": 1.7062, + "step": 1850 + }, + { + "epoch": 1.2674616695059626, + "grad_norm": 0.3702869415283203, + "learning_rate": 0.0002, + "loss": 1.6263, + "step": 1860 + }, + { + "epoch": 1.27427597955707, + "grad_norm": 0.3934040069580078, + "learning_rate": 0.0002, + "loss": 1.6222, + "step": 1870 + }, + { + "epoch": 1.2810902896081773, + "grad_norm": 0.46887534856796265, + "learning_rate": 0.0002, + "loss": 1.7616, + "step": 1880 + }, + { + "epoch": 1.2879045996592846, + "grad_norm": 0.3191998600959778, + "learning_rate": 0.0002, + "loss": 1.6071, + "step": 1890 + }, + { + "epoch": 1.294718909710392, + "grad_norm": 0.34032225608825684, + "learning_rate": 0.0002, + "loss": 1.7001, + "step": 1900 + }, + { + "epoch": 1.3015332197614993, + "grad_norm": 0.33453696966171265, + "learning_rate": 0.0002, + "loss": 1.8316, + "step": 1910 + }, + { + "epoch": 1.3083475298126066, + "grad_norm": 0.3451494872570038, + "learning_rate": 0.0002, + "loss": 1.6567, + "step": 1920 + }, + { + "epoch": 1.315161839863714, + "grad_norm": 0.36203092336654663, + "learning_rate": 0.0002, + "loss": 1.708, + "step": 1930 + }, + { + "epoch": 1.321976149914821, + "grad_norm": 0.43794456124305725, + "learning_rate": 0.0002, + "loss": 1.7095, + "step": 1940 + }, + { + "epoch": 1.3287904599659284, + "grad_norm": 0.3630591034889221, + "learning_rate": 0.0002, + "loss": 1.7264, + "step": 1950 + }, + { + "epoch": 1.3356047700170357, + "grad_norm": 0.36951911449432373, + "learning_rate": 0.0002, + "loss": 1.6529, + "step": 1960 + }, + { + "epoch": 1.342419080068143, + "grad_norm": 0.4001159965991974, + "learning_rate": 0.0002, + "loss": 1.651, + "step": 1970 + }, + { + "epoch": 1.3492333901192504, + "grad_norm": 0.3820836544036865, + "learning_rate": 0.0002, + "loss": 1.677, + "step": 1980 + }, + { + "epoch": 1.3560477001703577, + "grad_norm": 0.3705870807170868, + "learning_rate": 0.0002, + "loss": 1.583, + "step": 1990 + }, + { + "epoch": 1.362862010221465, + "grad_norm": 0.3557972013950348, + "learning_rate": 0.0002, + "loss": 1.6207, + "step": 2000 + }, + { + "epoch": 1.3696763202725724, + "grad_norm": 0.38546398282051086, + "learning_rate": 0.0002, + "loss": 1.7656, + "step": 2010 + }, + { + "epoch": 1.3764906303236797, + "grad_norm": 0.3908020853996277, + "learning_rate": 0.0002, + "loss": 1.6881, + "step": 2020 + }, + { + "epoch": 1.383304940374787, + "grad_norm": 0.3822040855884552, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 2030 + }, + { + "epoch": 1.3901192504258943, + "grad_norm": 0.425327867269516, + "learning_rate": 0.0002, + "loss": 1.7923, + "step": 2040 + }, + { + "epoch": 1.3969335604770017, + "grad_norm": 0.3436259329319, + "learning_rate": 0.0002, + "loss": 1.7032, + "step": 2050 + }, + { + "epoch": 1.403747870528109, + "grad_norm": 0.33124062418937683, + "learning_rate": 0.0002, + "loss": 1.6398, + "step": 2060 + }, + { + "epoch": 1.4105621805792163, + "grad_norm": 0.3662424683570862, + "learning_rate": 0.0002, + "loss": 1.6815, + "step": 2070 + }, + { + "epoch": 1.4173764906303237, + "grad_norm": 0.3720635175704956, + "learning_rate": 0.0002, + "loss": 1.7258, + "step": 2080 + }, + { + "epoch": 1.424190800681431, + "grad_norm": 0.3361680805683136, + "learning_rate": 0.0002, + "loss": 1.7186, + "step": 2090 + }, + { + "epoch": 1.4310051107325383, + "grad_norm": 0.32751724123954773, + "learning_rate": 0.0002, + "loss": 1.7606, + "step": 2100 + }, + { + "epoch": 1.4378194207836457, + "grad_norm": 0.34333378076553345, + "learning_rate": 0.0002, + "loss": 1.7051, + "step": 2110 + }, + { + "epoch": 1.444633730834753, + "grad_norm": 0.37777671217918396, + "learning_rate": 0.0002, + "loss": 1.6979, + "step": 2120 + }, + { + "epoch": 1.4514480408858603, + "grad_norm": 0.37126365303993225, + "learning_rate": 0.0002, + "loss": 1.7085, + "step": 2130 + }, + { + "epoch": 1.4582623509369677, + "grad_norm": 0.3602267801761627, + "learning_rate": 0.0002, + "loss": 1.721, + "step": 2140 + }, + { + "epoch": 1.465076660988075, + "grad_norm": 0.3287110924720764, + "learning_rate": 0.0002, + "loss": 1.8148, + "step": 2150 + }, + { + "epoch": 1.4718909710391823, + "grad_norm": 0.3562135100364685, + "learning_rate": 0.0002, + "loss": 1.6966, + "step": 2160 + }, + { + "epoch": 1.4787052810902896, + "grad_norm": 0.38292962312698364, + "learning_rate": 0.0002, + "loss": 1.713, + "step": 2170 + }, + { + "epoch": 1.485519591141397, + "grad_norm": 0.38220319151878357, + "learning_rate": 0.0002, + "loss": 1.7036, + "step": 2180 + }, + { + "epoch": 1.4923339011925043, + "grad_norm": 0.3570062220096588, + "learning_rate": 0.0002, + "loss": 1.7297, + "step": 2190 + }, + { + "epoch": 1.4991482112436116, + "grad_norm": 0.363146036863327, + "learning_rate": 0.0002, + "loss": 1.5652, + "step": 2200 + }, + { + "epoch": 1.5059625212947187, + "grad_norm": 0.37393274903297424, + "learning_rate": 0.0002, + "loss": 1.6716, + "step": 2210 + }, + { + "epoch": 1.512776831345826, + "grad_norm": 0.3628501892089844, + "learning_rate": 0.0002, + "loss": 1.6839, + "step": 2220 + }, + { + "epoch": 1.5195911413969334, + "grad_norm": 0.33430740237236023, + "learning_rate": 0.0002, + "loss": 1.6627, + "step": 2230 + }, + { + "epoch": 1.5264054514480407, + "grad_norm": 0.35363978147506714, + "learning_rate": 0.0002, + "loss": 1.7495, + "step": 2240 + }, + { + "epoch": 1.533219761499148, + "grad_norm": 0.37220337986946106, + "learning_rate": 0.0002, + "loss": 1.6364, + "step": 2250 + }, + { + "epoch": 1.5400340715502554, + "grad_norm": 0.35020262002944946, + "learning_rate": 0.0002, + "loss": 1.7773, + "step": 2260 + }, + { + "epoch": 1.5468483816013627, + "grad_norm": 0.35274937748908997, + "learning_rate": 0.0002, + "loss": 1.7453, + "step": 2270 + }, + { + "epoch": 1.55366269165247, + "grad_norm": 0.3797738552093506, + "learning_rate": 0.0002, + "loss": 1.7162, + "step": 2280 + }, + { + "epoch": 1.5604770017035774, + "grad_norm": 0.4160412847995758, + "learning_rate": 0.0002, + "loss": 1.6197, + "step": 2290 + }, + { + "epoch": 1.5672913117546847, + "grad_norm": 0.38779592514038086, + "learning_rate": 0.0002, + "loss": 1.7101, + "step": 2300 + }, + { + "epoch": 1.574105621805792, + "grad_norm": 0.39171287417411804, + "learning_rate": 0.0002, + "loss": 1.7254, + "step": 2310 + }, + { + "epoch": 1.5809199318568994, + "grad_norm": 0.3606826663017273, + "learning_rate": 0.0002, + "loss": 1.7087, + "step": 2320 + }, + { + "epoch": 1.5877342419080067, + "grad_norm": 0.3745017647743225, + "learning_rate": 0.0002, + "loss": 1.7269, + "step": 2330 + }, + { + "epoch": 1.594548551959114, + "grad_norm": 0.34933462738990784, + "learning_rate": 0.0002, + "loss": 1.7068, + "step": 2340 + }, + { + "epoch": 1.6013628620102214, + "grad_norm": 0.37268444895744324, + "learning_rate": 0.0002, + "loss": 1.7073, + "step": 2350 + }, + { + "epoch": 1.6081771720613287, + "grad_norm": 0.4603484869003296, + "learning_rate": 0.0002, + "loss": 1.728, + "step": 2360 + }, + { + "epoch": 1.614991482112436, + "grad_norm": 0.35689088702201843, + "learning_rate": 0.0002, + "loss": 1.7621, + "step": 2370 + }, + { + "epoch": 1.6218057921635434, + "grad_norm": 0.3392031490802765, + "learning_rate": 0.0002, + "loss": 1.6989, + "step": 2380 + }, + { + "epoch": 1.6286201022146507, + "grad_norm": 0.394653856754303, + "learning_rate": 0.0002, + "loss": 1.7077, + "step": 2390 + }, + { + "epoch": 1.635434412265758, + "grad_norm": 0.33972012996673584, + "learning_rate": 0.0002, + "loss": 1.7448, + "step": 2400 + }, + { + "epoch": 1.6422487223168654, + "grad_norm": 0.3854375183582306, + "learning_rate": 0.0002, + "loss": 1.7681, + "step": 2410 + }, + { + "epoch": 1.6490630323679727, + "grad_norm": 0.36143961548805237, + "learning_rate": 0.0002, + "loss": 1.7102, + "step": 2420 + }, + { + "epoch": 1.65587734241908, + "grad_norm": 0.35816189646720886, + "learning_rate": 0.0002, + "loss": 1.7432, + "step": 2430 + }, + { + "epoch": 1.6626916524701874, + "grad_norm": 0.36298736929893494, + "learning_rate": 0.0002, + "loss": 1.6407, + "step": 2440 + }, + { + "epoch": 1.6695059625212947, + "grad_norm": 0.36756929755210876, + "learning_rate": 0.0002, + "loss": 1.723, + "step": 2450 + }, + { + "epoch": 1.676320272572402, + "grad_norm": 0.35969603061676025, + "learning_rate": 0.0002, + "loss": 1.6824, + "step": 2460 + }, + { + "epoch": 1.6831345826235093, + "grad_norm": 0.38449376821517944, + "learning_rate": 0.0002, + "loss": 1.7014, + "step": 2470 + }, + { + "epoch": 1.6899488926746167, + "grad_norm": 0.44511452317237854, + "learning_rate": 0.0002, + "loss": 1.7261, + "step": 2480 + }, + { + "epoch": 1.696763202725724, + "grad_norm": 0.3831416368484497, + "learning_rate": 0.0002, + "loss": 1.7397, + "step": 2490 + }, + { + "epoch": 1.7035775127768313, + "grad_norm": 0.3795325756072998, + "learning_rate": 0.0002, + "loss": 1.7046, + "step": 2500 + }, + { + "epoch": 1.7103918228279387, + "grad_norm": 0.34978193044662476, + "learning_rate": 0.0002, + "loss": 1.6993, + "step": 2510 + }, + { + "epoch": 1.717206132879046, + "grad_norm": 0.35923877358436584, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 2520 + }, + { + "epoch": 1.7240204429301533, + "grad_norm": 0.352999746799469, + "learning_rate": 0.0002, + "loss": 1.6253, + "step": 2530 + }, + { + "epoch": 1.7308347529812607, + "grad_norm": 0.43673479557037354, + "learning_rate": 0.0002, + "loss": 1.6953, + "step": 2540 + }, + { + "epoch": 1.737649063032368, + "grad_norm": 0.4153687357902527, + "learning_rate": 0.0002, + "loss": 1.7079, + "step": 2550 + }, + { + "epoch": 1.7444633730834753, + "grad_norm": 0.35541167855262756, + "learning_rate": 0.0002, + "loss": 1.6714, + "step": 2560 + }, + { + "epoch": 1.7512776831345827, + "grad_norm": 0.3288775086402893, + "learning_rate": 0.0002, + "loss": 1.696, + "step": 2570 + }, + { + "epoch": 1.75809199318569, + "grad_norm": 0.3991123139858246, + "learning_rate": 0.0002, + "loss": 1.7486, + "step": 2580 + }, + { + "epoch": 1.7649063032367973, + "grad_norm": 0.39967241883277893, + "learning_rate": 0.0002, + "loss": 1.645, + "step": 2590 + }, + { + "epoch": 1.7717206132879046, + "grad_norm": 0.41104283928871155, + "learning_rate": 0.0002, + "loss": 1.6855, + "step": 2600 + }, + { + "epoch": 1.778534923339012, + "grad_norm": 0.44885286688804626, + "learning_rate": 0.0002, + "loss": 1.6993, + "step": 2610 + }, + { + "epoch": 1.7853492333901193, + "grad_norm": 0.38384467363357544, + "learning_rate": 0.0002, + "loss": 1.7224, + "step": 2620 + }, + { + "epoch": 1.7921635434412266, + "grad_norm": 0.35020917654037476, + "learning_rate": 0.0002, + "loss": 1.7213, + "step": 2630 + }, + { + "epoch": 1.798977853492334, + "grad_norm": 0.3360341489315033, + "learning_rate": 0.0002, + "loss": 1.6706, + "step": 2640 + }, + { + "epoch": 1.8057921635434413, + "grad_norm": 0.38875144720077515, + "learning_rate": 0.0002, + "loss": 1.7037, + "step": 2650 + }, + { + "epoch": 1.8126064735945486, + "grad_norm": 0.34876883029937744, + "learning_rate": 0.0002, + "loss": 1.693, + "step": 2660 + }, + { + "epoch": 1.819420783645656, + "grad_norm": 0.419979989528656, + "learning_rate": 0.0002, + "loss": 1.7743, + "step": 2670 + }, + { + "epoch": 1.8262350936967633, + "grad_norm": 0.3648919463157654, + "learning_rate": 0.0002, + "loss": 1.7089, + "step": 2680 + }, + { + "epoch": 1.8330494037478706, + "grad_norm": 0.3485383987426758, + "learning_rate": 0.0002, + "loss": 1.7828, + "step": 2690 + }, + { + "epoch": 1.839863713798978, + "grad_norm": 0.3647468686103821, + "learning_rate": 0.0002, + "loss": 1.705, + "step": 2700 + }, + { + "epoch": 1.8466780238500853, + "grad_norm": 0.37003210186958313, + "learning_rate": 0.0002, + "loss": 1.7318, + "step": 2710 + }, + { + "epoch": 1.8534923339011926, + "grad_norm": 0.37031617760658264, + "learning_rate": 0.0002, + "loss": 1.6647, + "step": 2720 + }, + { + "epoch": 1.8603066439523, + "grad_norm": 0.3438796103000641, + "learning_rate": 0.0002, + "loss": 1.69, + "step": 2730 + }, + { + "epoch": 1.8671209540034073, + "grad_norm": 0.41574627161026, + "learning_rate": 0.0002, + "loss": 1.5995, + "step": 2740 + }, + { + "epoch": 1.8739352640545146, + "grad_norm": 0.35049930214881897, + "learning_rate": 0.0002, + "loss": 1.6877, + "step": 2750 + }, + { + "epoch": 1.880749574105622, + "grad_norm": 0.3943989872932434, + "learning_rate": 0.0002, + "loss": 1.7048, + "step": 2760 + }, + { + "epoch": 1.8875638841567293, + "grad_norm": 0.3384978175163269, + "learning_rate": 0.0002, + "loss": 1.7047, + "step": 2770 + }, + { + "epoch": 1.8943781942078366, + "grad_norm": 0.3501328229904175, + "learning_rate": 0.0002, + "loss": 1.7848, + "step": 2780 + }, + { + "epoch": 1.901192504258944, + "grad_norm": 0.37484532594680786, + "learning_rate": 0.0002, + "loss": 1.6986, + "step": 2790 + }, + { + "epoch": 1.9080068143100513, + "grad_norm": 0.34497788548469543, + "learning_rate": 0.0002, + "loss": 1.6867, + "step": 2800 + }, + { + "epoch": 1.9148211243611586, + "grad_norm": 0.3530851900577545, + "learning_rate": 0.0002, + "loss": 1.6513, + "step": 2810 + }, + { + "epoch": 1.921635434412266, + "grad_norm": 0.3879254162311554, + "learning_rate": 0.0002, + "loss": 1.6369, + "step": 2820 + }, + { + "epoch": 1.9284497444633732, + "grad_norm": 0.3885590136051178, + "learning_rate": 0.0002, + "loss": 1.6786, + "step": 2830 + }, + { + "epoch": 1.9352640545144804, + "grad_norm": 0.3868715465068817, + "learning_rate": 0.0002, + "loss": 1.8049, + "step": 2840 + }, + { + "epoch": 1.9420783645655877, + "grad_norm": 0.4152422249317169, + "learning_rate": 0.0002, + "loss": 1.7099, + "step": 2850 + }, + { + "epoch": 1.948892674616695, + "grad_norm": 0.401714563369751, + "learning_rate": 0.0002, + "loss": 1.6696, + "step": 2860 + }, + { + "epoch": 1.9557069846678024, + "grad_norm": 0.34825265407562256, + "learning_rate": 0.0002, + "loss": 1.7182, + "step": 2870 + }, + { + "epoch": 1.9625212947189097, + "grad_norm": 0.3620675504207611, + "learning_rate": 0.0002, + "loss": 1.6612, + "step": 2880 + }, + { + "epoch": 1.969335604770017, + "grad_norm": 0.3977806866168976, + "learning_rate": 0.0002, + "loss": 1.7451, + "step": 2890 + }, + { + "epoch": 1.9761499148211243, + "grad_norm": 0.3687497079372406, + "learning_rate": 0.0002, + "loss": 1.6514, + "step": 2900 + }, + { + "epoch": 1.9829642248722317, + "grad_norm": 0.408640056848526, + "learning_rate": 0.0002, + "loss": 1.7712, + "step": 2910 + }, + { + "epoch": 1.989778534923339, + "grad_norm": 0.34510108828544617, + "learning_rate": 0.0002, + "loss": 1.695, + "step": 2920 + }, + { + "epoch": 1.9965928449744463, + "grad_norm": 0.3596334755420685, + "learning_rate": 0.0002, + "loss": 1.662, + "step": 2930 + }, + { + "epoch": 2.0, + "eval_loss": 1.8056600093841553, + "eval_runtime": 60.7049, + "eval_samples_per_second": 8.352, + "eval_steps_per_second": 1.054, + "step": 2935 + }, + { + "epoch": 2.0034071550255534, + "grad_norm": 0.3460802137851715, + "learning_rate": 0.0002, + "loss": 1.6918, + "step": 2940 + }, + { + "epoch": 2.0102214650766608, + "grad_norm": 0.4038620591163635, + "learning_rate": 0.0002, + "loss": 1.6036, + "step": 2950 + }, + { + "epoch": 2.017035775127768, + "grad_norm": 0.3950219750404358, + "learning_rate": 0.0002, + "loss": 1.5386, + "step": 2960 + }, + { + "epoch": 2.0238500851788754, + "grad_norm": 0.519116997718811, + "learning_rate": 0.0002, + "loss": 1.5649, + "step": 2970 + }, + { + "epoch": 2.0306643952299828, + "grad_norm": 0.4097684919834137, + "learning_rate": 0.0002, + "loss": 1.5397, + "step": 2980 + }, + { + "epoch": 2.03747870528109, + "grad_norm": 0.4153544306755066, + "learning_rate": 0.0002, + "loss": 1.6044, + "step": 2990 + }, + { + "epoch": 2.0442930153321974, + "grad_norm": 0.4351160526275635, + "learning_rate": 0.0002, + "loss": 1.6006, + "step": 3000 + }, + { + "epoch": 2.0511073253833048, + "grad_norm": 0.42036259174346924, + "learning_rate": 0.0002, + "loss": 1.5091, + "step": 3010 + }, + { + "epoch": 2.057921635434412, + "grad_norm": 0.4433218836784363, + "learning_rate": 0.0002, + "loss": 1.5686, + "step": 3020 + }, + { + "epoch": 2.0647359454855194, + "grad_norm": 0.46511581540107727, + "learning_rate": 0.0002, + "loss": 1.5478, + "step": 3030 + }, + { + "epoch": 2.0715502555366268, + "grad_norm": 0.4567560851573944, + "learning_rate": 0.0002, + "loss": 1.5554, + "step": 3040 + }, + { + "epoch": 2.078364565587734, + "grad_norm": 0.45671048760414124, + "learning_rate": 0.0002, + "loss": 1.5561, + "step": 3050 + }, + { + "epoch": 2.0851788756388414, + "grad_norm": 0.4598552882671356, + "learning_rate": 0.0002, + "loss": 1.606, + "step": 3060 + }, + { + "epoch": 2.0919931856899487, + "grad_norm": 0.4582861661911011, + "learning_rate": 0.0002, + "loss": 1.6357, + "step": 3070 + }, + { + "epoch": 2.098807495741056, + "grad_norm": 0.4366969168186188, + "learning_rate": 0.0002, + "loss": 1.5853, + "step": 3080 + }, + { + "epoch": 2.1056218057921634, + "grad_norm": 0.495917409658432, + "learning_rate": 0.0002, + "loss": 1.5574, + "step": 3090 + }, + { + "epoch": 2.1124361158432707, + "grad_norm": 1.6846044063568115, + "learning_rate": 0.0002, + "loss": 1.5798, + "step": 3100 + }, + { + "epoch": 2.119250425894378, + "grad_norm": 0.4765092134475708, + "learning_rate": 0.0002, + "loss": 1.5877, + "step": 3110 + }, + { + "epoch": 2.1260647359454854, + "grad_norm": 0.45029810070991516, + "learning_rate": 0.0002, + "loss": 1.6106, + "step": 3120 + }, + { + "epoch": 2.1328790459965927, + "grad_norm": 0.5706973075866699, + "learning_rate": 0.0002, + "loss": 1.5707, + "step": 3130 + }, + { + "epoch": 2.1396933560477, + "grad_norm": 0.4606274366378784, + "learning_rate": 0.0002, + "loss": 1.589, + "step": 3140 + }, + { + "epoch": 2.1465076660988074, + "grad_norm": 2.199115753173828, + "learning_rate": 0.0002, + "loss": 1.564, + "step": 3150 + }, + { + "epoch": 2.1533219761499147, + "grad_norm": 0.6231027245521545, + "learning_rate": 0.0002, + "loss": 1.5808, + "step": 3160 + }, + { + "epoch": 2.160136286201022, + "grad_norm": 0.46918219327926636, + "learning_rate": 0.0002, + "loss": 1.5397, + "step": 3170 + }, + { + "epoch": 2.1669505962521294, + "grad_norm": 0.5006393194198608, + "learning_rate": 0.0002, + "loss": 1.6082, + "step": 3180 + }, + { + "epoch": 2.1737649063032367, + "grad_norm": 0.4745093286037445, + "learning_rate": 0.0002, + "loss": 1.591, + "step": 3190 + }, + { + "epoch": 2.180579216354344, + "grad_norm": 0.511576771736145, + "learning_rate": 0.0002, + "loss": 1.6459, + "step": 3200 + }, + { + "epoch": 2.1873935264054514, + "grad_norm": 0.38622918725013733, + "learning_rate": 0.0002, + "loss": 1.6019, + "step": 3210 + }, + { + "epoch": 2.1942078364565587, + "grad_norm": 0.4425644278526306, + "learning_rate": 0.0002, + "loss": 1.6187, + "step": 3220 + }, + { + "epoch": 2.201022146507666, + "grad_norm": 0.45643091201782227, + "learning_rate": 0.0002, + "loss": 1.6114, + "step": 3230 + }, + { + "epoch": 2.2078364565587734, + "grad_norm": 0.4975406229496002, + "learning_rate": 0.0002, + "loss": 1.6224, + "step": 3240 + }, + { + "epoch": 2.2146507666098807, + "grad_norm": 0.4673331081867218, + "learning_rate": 0.0002, + "loss": 1.6654, + "step": 3250 + }, + { + "epoch": 2.221465076660988, + "grad_norm": 0.5081731081008911, + "learning_rate": 0.0002, + "loss": 1.6155, + "step": 3260 + }, + { + "epoch": 2.2282793867120954, + "grad_norm": 0.4790806770324707, + "learning_rate": 0.0002, + "loss": 1.53, + "step": 3270 + }, + { + "epoch": 2.2350936967632027, + "grad_norm": 0.5184140801429749, + "learning_rate": 0.0002, + "loss": 1.6362, + "step": 3280 + }, + { + "epoch": 2.24190800681431, + "grad_norm": 0.5159083604812622, + "learning_rate": 0.0002, + "loss": 1.5646, + "step": 3290 + }, + { + "epoch": 2.2487223168654173, + "grad_norm": 0.4876042604446411, + "learning_rate": 0.0002, + "loss": 1.6577, + "step": 3300 + }, + { + "epoch": 2.2555366269165247, + "grad_norm": 0.5454957485198975, + "learning_rate": 0.0002, + "loss": 1.5718, + "step": 3310 + }, + { + "epoch": 2.262350936967632, + "grad_norm": 0.49866822361946106, + "learning_rate": 0.0002, + "loss": 1.5885, + "step": 3320 + }, + { + "epoch": 2.2691652470187393, + "grad_norm": 0.42674365639686584, + "learning_rate": 0.0002, + "loss": 1.5923, + "step": 3330 + }, + { + "epoch": 2.2759795570698467, + "grad_norm": 0.5202316641807556, + "learning_rate": 0.0002, + "loss": 1.5588, + "step": 3340 + }, + { + "epoch": 2.282793867120954, + "grad_norm": 0.4849465489387512, + "learning_rate": 0.0002, + "loss": 1.6032, + "step": 3350 + }, + { + "epoch": 2.2896081771720613, + "grad_norm": 0.47202569246292114, + "learning_rate": 0.0002, + "loss": 1.6853, + "step": 3360 + }, + { + "epoch": 2.2964224872231687, + "grad_norm": 0.5311620235443115, + "learning_rate": 0.0002, + "loss": 1.6164, + "step": 3370 + }, + { + "epoch": 2.303236797274276, + "grad_norm": 0.49011409282684326, + "learning_rate": 0.0002, + "loss": 1.6004, + "step": 3380 + }, + { + "epoch": 2.3100511073253833, + "grad_norm": 0.4789247512817383, + "learning_rate": 0.0002, + "loss": 1.5484, + "step": 3390 + }, + { + "epoch": 2.3168654173764907, + "grad_norm": 0.46646103262901306, + "learning_rate": 0.0002, + "loss": 1.5866, + "step": 3400 + }, + { + "epoch": 2.323679727427598, + "grad_norm": 0.5552441477775574, + "learning_rate": 0.0002, + "loss": 1.5308, + "step": 3410 + }, + { + "epoch": 2.3304940374787053, + "grad_norm": 0.4530351758003235, + "learning_rate": 0.0002, + "loss": 1.5761, + "step": 3420 + }, + { + "epoch": 2.3373083475298126, + "grad_norm": 0.4806232750415802, + "learning_rate": 0.0002, + "loss": 1.5919, + "step": 3430 + }, + { + "epoch": 2.34412265758092, + "grad_norm": 0.5998363494873047, + "learning_rate": 0.0002, + "loss": 1.5569, + "step": 3440 + }, + { + "epoch": 2.3509369676320273, + "grad_norm": 0.4918554425239563, + "learning_rate": 0.0002, + "loss": 1.513, + "step": 3450 + }, + { + "epoch": 2.3577512776831346, + "grad_norm": 0.5359559655189514, + "learning_rate": 0.0002, + "loss": 1.6323, + "step": 3460 + }, + { + "epoch": 2.364565587734242, + "grad_norm": 0.5053277611732483, + "learning_rate": 0.0002, + "loss": 1.5973, + "step": 3470 + }, + { + "epoch": 2.3713798977853493, + "grad_norm": 0.5058915019035339, + "learning_rate": 0.0002, + "loss": 1.5673, + "step": 3480 + }, + { + "epoch": 2.3781942078364566, + "grad_norm": 0.5314899682998657, + "learning_rate": 0.0002, + "loss": 1.5434, + "step": 3490 + }, + { + "epoch": 2.385008517887564, + "grad_norm": 0.48035913705825806, + "learning_rate": 0.0002, + "loss": 1.5203, + "step": 3500 + }, + { + "epoch": 2.3918228279386713, + "grad_norm": 0.45864903926849365, + "learning_rate": 0.0002, + "loss": 1.5936, + "step": 3510 + }, + { + "epoch": 2.3986371379897786, + "grad_norm": 0.4553050398826599, + "learning_rate": 0.0002, + "loss": 1.6285, + "step": 3520 + }, + { + "epoch": 2.405451448040886, + "grad_norm": 0.4483442008495331, + "learning_rate": 0.0002, + "loss": 1.5444, + "step": 3530 + }, + { + "epoch": 2.4122657580919933, + "grad_norm": 0.5043742060661316, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 3540 + }, + { + "epoch": 2.4190800681431006, + "grad_norm": 0.44807168841362, + "learning_rate": 0.0002, + "loss": 1.5692, + "step": 3550 + }, + { + "epoch": 2.425894378194208, + "grad_norm": 0.5065137147903442, + "learning_rate": 0.0002, + "loss": 1.6306, + "step": 3560 + }, + { + "epoch": 2.4327086882453153, + "grad_norm": 0.5186443328857422, + "learning_rate": 0.0002, + "loss": 1.5842, + "step": 3570 + }, + { + "epoch": 2.4395229982964226, + "grad_norm": 0.49743232131004333, + "learning_rate": 0.0002, + "loss": 1.5956, + "step": 3580 + }, + { + "epoch": 2.44633730834753, + "grad_norm": 0.524450421333313, + "learning_rate": 0.0002, + "loss": 1.6021, + "step": 3590 + }, + { + "epoch": 2.4531516183986373, + "grad_norm": 0.5053797364234924, + "learning_rate": 0.0002, + "loss": 1.6283, + "step": 3600 + }, + { + "epoch": 2.4599659284497446, + "grad_norm": 0.5223091840744019, + "learning_rate": 0.0002, + "loss": 1.6335, + "step": 3610 + }, + { + "epoch": 2.466780238500852, + "grad_norm": 0.4763810932636261, + "learning_rate": 0.0002, + "loss": 1.6315, + "step": 3620 + }, + { + "epoch": 2.4735945485519593, + "grad_norm": 0.5097282528877258, + "learning_rate": 0.0002, + "loss": 1.5623, + "step": 3630 + }, + { + "epoch": 2.4804088586030666, + "grad_norm": 0.5831942558288574, + "learning_rate": 0.0002, + "loss": 1.605, + "step": 3640 + }, + { + "epoch": 2.487223168654174, + "grad_norm": 0.47573572397232056, + "learning_rate": 0.0002, + "loss": 1.6074, + "step": 3650 + }, + { + "epoch": 2.4940374787052813, + "grad_norm": 0.49602726101875305, + "learning_rate": 0.0002, + "loss": 1.6411, + "step": 3660 + }, + { + "epoch": 2.500851788756388, + "grad_norm": 0.5468524694442749, + "learning_rate": 0.0002, + "loss": 1.571, + "step": 3670 + }, + { + "epoch": 2.507666098807496, + "grad_norm": 0.45899373292922974, + "learning_rate": 0.0002, + "loss": 1.5782, + "step": 3680 + }, + { + "epoch": 2.514480408858603, + "grad_norm": 0.5031567215919495, + "learning_rate": 0.0002, + "loss": 1.5114, + "step": 3690 + }, + { + "epoch": 2.5212947189097106, + "grad_norm": 0.5224900841712952, + "learning_rate": 0.0002, + "loss": 1.538, + "step": 3700 + }, + { + "epoch": 2.5281090289608175, + "grad_norm": 0.504769504070282, + "learning_rate": 0.0002, + "loss": 1.5269, + "step": 3710 + }, + { + "epoch": 2.5349233390119252, + "grad_norm": 0.6120529770851135, + "learning_rate": 0.0002, + "loss": 1.5141, + "step": 3720 + }, + { + "epoch": 2.541737649063032, + "grad_norm": 0.47930678725242615, + "learning_rate": 0.0002, + "loss": 1.5666, + "step": 3730 + }, + { + "epoch": 2.54855195911414, + "grad_norm": 0.5039092302322388, + "learning_rate": 0.0002, + "loss": 1.6156, + "step": 3740 + }, + { + "epoch": 2.555366269165247, + "grad_norm": 0.49758994579315186, + "learning_rate": 0.0002, + "loss": 1.5585, + "step": 3750 + }, + { + "epoch": 2.5621805792163546, + "grad_norm": 0.44739171862602234, + "learning_rate": 0.0002, + "loss": 1.5351, + "step": 3760 + }, + { + "epoch": 2.5689948892674614, + "grad_norm": 0.47056373953819275, + "learning_rate": 0.0002, + "loss": 1.5099, + "step": 3770 + }, + { + "epoch": 2.575809199318569, + "grad_norm": 0.5077595114707947, + "learning_rate": 0.0002, + "loss": 1.5524, + "step": 3780 + }, + { + "epoch": 2.582623509369676, + "grad_norm": 0.4981902837753296, + "learning_rate": 0.0002, + "loss": 1.5524, + "step": 3790 + }, + { + "epoch": 2.589437819420784, + "grad_norm": 0.5736238360404968, + "learning_rate": 0.0002, + "loss": 1.5381, + "step": 3800 + }, + { + "epoch": 2.5962521294718908, + "grad_norm": 0.4898384213447571, + "learning_rate": 0.0002, + "loss": 1.67, + "step": 3810 + }, + { + "epoch": 2.6030664395229985, + "grad_norm": 0.4611325263977051, + "learning_rate": 0.0002, + "loss": 1.5411, + "step": 3820 + }, + { + "epoch": 2.6098807495741054, + "grad_norm": 0.5285341739654541, + "learning_rate": 0.0002, + "loss": 1.5662, + "step": 3830 + }, + { + "epoch": 2.616695059625213, + "grad_norm": 0.5679430961608887, + "learning_rate": 0.0002, + "loss": 1.5875, + "step": 3840 + }, + { + "epoch": 2.62350936967632, + "grad_norm": 0.48532548546791077, + "learning_rate": 0.0002, + "loss": 1.5544, + "step": 3850 + }, + { + "epoch": 2.630323679727428, + "grad_norm": 0.45506492257118225, + "learning_rate": 0.0002, + "loss": 1.579, + "step": 3860 + }, + { + "epoch": 2.6371379897785348, + "grad_norm": 0.6552556753158569, + "learning_rate": 0.0002, + "loss": 1.5775, + "step": 3870 + }, + { + "epoch": 2.643952299829642, + "grad_norm": 0.537874698638916, + "learning_rate": 0.0002, + "loss": 1.608, + "step": 3880 + }, + { + "epoch": 2.6507666098807494, + "grad_norm": 0.46102389693260193, + "learning_rate": 0.0002, + "loss": 1.5653, + "step": 3890 + }, + { + "epoch": 2.6575809199318567, + "grad_norm": 0.45531195402145386, + "learning_rate": 0.0002, + "loss": 1.5293, + "step": 3900 + }, + { + "epoch": 2.664395229982964, + "grad_norm": 0.5327293872833252, + "learning_rate": 0.0002, + "loss": 1.5492, + "step": 3910 + }, + { + "epoch": 2.6712095400340714, + "grad_norm": 0.4968956410884857, + "learning_rate": 0.0002, + "loss": 1.5921, + "step": 3920 + }, + { + "epoch": 2.6780238500851787, + "grad_norm": 0.4790082275867462, + "learning_rate": 0.0002, + "loss": 1.5823, + "step": 3930 + }, + { + "epoch": 2.684838160136286, + "grad_norm": 0.5392967462539673, + "learning_rate": 0.0002, + "loss": 1.615, + "step": 3940 + }, + { + "epoch": 2.6916524701873934, + "grad_norm": 0.5076649785041809, + "learning_rate": 0.0002, + "loss": 1.6218, + "step": 3950 + }, + { + "epoch": 2.6984667802385007, + "grad_norm": 0.5628064274787903, + "learning_rate": 0.0002, + "loss": 1.6478, + "step": 3960 + }, + { + "epoch": 2.705281090289608, + "grad_norm": 0.5012659430503845, + "learning_rate": 0.0002, + "loss": 1.5417, + "step": 3970 + }, + { + "epoch": 2.7120954003407154, + "grad_norm": 0.4947647452354431, + "learning_rate": 0.0002, + "loss": 1.5339, + "step": 3980 + }, + { + "epoch": 2.7189097103918227, + "grad_norm": 0.4890969693660736, + "learning_rate": 0.0002, + "loss": 1.5724, + "step": 3990 + }, + { + "epoch": 2.72572402044293, + "grad_norm": 0.4471694231033325, + "learning_rate": 0.0002, + "loss": 1.5746, + "step": 4000 + }, + { + "epoch": 2.7325383304940374, + "grad_norm": 0.5116439461708069, + "learning_rate": 0.0002, + "loss": 1.6669, + "step": 4010 + }, + { + "epoch": 2.7393526405451447, + "grad_norm": 0.5720411539077759, + "learning_rate": 0.0002, + "loss": 1.584, + "step": 4020 + }, + { + "epoch": 2.746166950596252, + "grad_norm": 0.5529406070709229, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 4030 + }, + { + "epoch": 2.7529812606473594, + "grad_norm": 0.5229396820068359, + "learning_rate": 0.0002, + "loss": 1.6296, + "step": 4040 + }, + { + "epoch": 2.7597955706984667, + "grad_norm": 0.5270276069641113, + "learning_rate": 0.0002, + "loss": 1.5363, + "step": 4050 + }, + { + "epoch": 2.766609880749574, + "grad_norm": 0.48413026332855225, + "learning_rate": 0.0002, + "loss": 1.6305, + "step": 4060 + }, + { + "epoch": 2.7734241908006814, + "grad_norm": 0.5145403742790222, + "learning_rate": 0.0002, + "loss": 1.5488, + "step": 4070 + }, + { + "epoch": 2.7802385008517887, + "grad_norm": 0.48626071214675903, + "learning_rate": 0.0002, + "loss": 1.6023, + "step": 4080 + }, + { + "epoch": 2.787052810902896, + "grad_norm": 0.5018984079360962, + "learning_rate": 0.0002, + "loss": 1.6082, + "step": 4090 + }, + { + "epoch": 2.7938671209540034, + "grad_norm": 0.4881938695907593, + "learning_rate": 0.0002, + "loss": 1.7166, + "step": 4100 + }, + { + "epoch": 2.8006814310051107, + "grad_norm": 0.5151546001434326, + "learning_rate": 0.0002, + "loss": 1.515, + "step": 4110 + }, + { + "epoch": 2.807495741056218, + "grad_norm": 0.5109850764274597, + "learning_rate": 0.0002, + "loss": 1.6069, + "step": 4120 + }, + { + "epoch": 2.8143100511073254, + "grad_norm": 0.5109251141548157, + "learning_rate": 0.0002, + "loss": 1.6153, + "step": 4130 + }, + { + "epoch": 2.8211243611584327, + "grad_norm": 0.5025496482849121, + "learning_rate": 0.0002, + "loss": 1.6365, + "step": 4140 + }, + { + "epoch": 2.82793867120954, + "grad_norm": 0.49027004837989807, + "learning_rate": 0.0002, + "loss": 1.6292, + "step": 4150 + }, + { + "epoch": 2.8347529812606473, + "grad_norm": 0.4957362413406372, + "learning_rate": 0.0002, + "loss": 1.5591, + "step": 4160 + }, + { + "epoch": 2.8415672913117547, + "grad_norm": 0.5159927606582642, + "learning_rate": 0.0002, + "loss": 1.6759, + "step": 4170 + }, + { + "epoch": 2.848381601362862, + "grad_norm": 0.6040670871734619, + "learning_rate": 0.0002, + "loss": 1.577, + "step": 4180 + }, + { + "epoch": 2.8551959114139693, + "grad_norm": 0.5489953756332397, + "learning_rate": 0.0002, + "loss": 1.5295, + "step": 4190 + }, + { + "epoch": 2.8620102214650767, + "grad_norm": 0.5416634678840637, + "learning_rate": 0.0002, + "loss": 1.5909, + "step": 4200 + }, + { + "epoch": 2.868824531516184, + "grad_norm": 0.5278245210647583, + "learning_rate": 0.0002, + "loss": 1.6014, + "step": 4210 + }, + { + "epoch": 2.8756388415672913, + "grad_norm": 0.43382319808006287, + "learning_rate": 0.0002, + "loss": 1.4848, + "step": 4220 + }, + { + "epoch": 2.8824531516183987, + "grad_norm": 0.4724387228488922, + "learning_rate": 0.0002, + "loss": 1.6092, + "step": 4230 + }, + { + "epoch": 2.889267461669506, + "grad_norm": 0.49824780225753784, + "learning_rate": 0.0002, + "loss": 1.5748, + "step": 4240 + }, + { + "epoch": 2.8960817717206133, + "grad_norm": 0.5360262989997864, + "learning_rate": 0.0002, + "loss": 1.6163, + "step": 4250 + }, + { + "epoch": 2.9028960817717206, + "grad_norm": 0.49090322852134705, + "learning_rate": 0.0002, + "loss": 1.5934, + "step": 4260 + }, + { + "epoch": 2.909710391822828, + "grad_norm": 0.5613328218460083, + "learning_rate": 0.0002, + "loss": 1.624, + "step": 4270 + }, + { + "epoch": 2.9165247018739353, + "grad_norm": 0.4611356258392334, + "learning_rate": 0.0002, + "loss": 1.5713, + "step": 4280 + }, + { + "epoch": 2.9233390119250426, + "grad_norm": 0.550897479057312, + "learning_rate": 0.0002, + "loss": 1.5457, + "step": 4290 + }, + { + "epoch": 2.93015332197615, + "grad_norm": 0.5089612603187561, + "learning_rate": 0.0002, + "loss": 1.6225, + "step": 4300 + }, + { + "epoch": 2.9369676320272573, + "grad_norm": 0.5210904479026794, + "learning_rate": 0.0002, + "loss": 1.5897, + "step": 4310 + }, + { + "epoch": 2.9437819420783646, + "grad_norm": 0.5506424903869629, + "learning_rate": 0.0002, + "loss": 1.6198, + "step": 4320 + }, + { + "epoch": 2.950596252129472, + "grad_norm": 0.5118561387062073, + "learning_rate": 0.0002, + "loss": 1.6395, + "step": 4330 + }, + { + "epoch": 2.9574105621805793, + "grad_norm": 0.5034464597702026, + "learning_rate": 0.0002, + "loss": 1.704, + "step": 4340 + }, + { + "epoch": 2.9642248722316866, + "grad_norm": 0.5019990801811218, + "learning_rate": 0.0002, + "loss": 1.6314, + "step": 4350 + }, + { + "epoch": 2.971039182282794, + "grad_norm": 0.5423325300216675, + "learning_rate": 0.0002, + "loss": 1.6161, + "step": 4360 + }, + { + "epoch": 2.9778534923339013, + "grad_norm": 0.5287469625473022, + "learning_rate": 0.0002, + "loss": 1.6144, + "step": 4370 + }, + { + "epoch": 2.9846678023850086, + "grad_norm": 0.5206913352012634, + "learning_rate": 0.0002, + "loss": 1.5227, + "step": 4380 + }, + { + "epoch": 2.991482112436116, + "grad_norm": 0.5407394170761108, + "learning_rate": 0.0002, + "loss": 1.6026, + "step": 4390 + }, + { + "epoch": 2.9982964224872233, + "grad_norm": 0.5244600176811218, + "learning_rate": 0.0002, + "loss": 1.5908, + "step": 4400 + }, + { + "epoch": 2.9996592844974446, + "eval_loss": 1.8412635326385498, + "eval_runtime": 65.5583, + "eval_samples_per_second": 7.734, + "eval_steps_per_second": 0.976, + "step": 4402 + } + ], + "logging_steps": 10, + "max_steps": 11736, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.261288901279744e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-4402/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-4402/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2f734531f14829705714a5a2703d90c81f36eeb0 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-4402/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2af5d486d0b370637d5a3999ab7668f94f5d53b6497f018967564aef5e8ad133 +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-5870/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-5870/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-5870/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-5870/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-5870/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-5870/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-5870/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-5870/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a4745d64f147ed042cb05bdc4b6efd04b43d6f98 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-5870/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:465bf8d18c6e7a0814771e8c6ec2f7df1567876c4717f3b1b69918921c6db018 +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-5870/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-5870/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..3dbb2bd2d77af3d031940b1130d493cd20167aa1 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-5870/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbe8b03b73961461031da7a2a4ad7d2593899f11628d6c9d9cfe89e1ac3edb3b +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-5870/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-5870/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..dbc4835cafa3dc24cf614ef208d03c655d3edbb8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-5870/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2c021bdfc3a2f218a7d10e335a0edf96400d6ec30dc645ced8d08cb6379bf7f +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-5870/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-5870/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..809571415aaf958a756b0b234ccc0cde1d99f07b --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-5870/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:789a8a7aa2f2d9ebf17b91ddca6a65acf66173aebb642caa9560880f331b0a43 +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-5870/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-5870/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-5870/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-5870/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-5870/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-5870/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-5870/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-5870/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-5870/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-5870/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-5870/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-5870/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-5870/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-5870/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..505bba01c1372f129cff1f36d033805ee6e0867f --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-5870/trainer_state.json @@ -0,0 +1,4174 @@ +{ + "best_metric": 1.8034634590148926, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467", + "epoch": 4.0, + "eval_steps": 10, + "global_step": 5870, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0068143100511073255, + "grad_norm": 0.635574460029602, + "learning_rate": 0.0002, + "loss": 3.0988, + "step": 10 + }, + { + "epoch": 0.013628620102214651, + "grad_norm": 1.0401769876480103, + "learning_rate": 0.0002, + "loss": 2.5594, + "step": 20 + }, + { + "epoch": 0.020442930153321975, + "grad_norm": 0.4514131247997284, + "learning_rate": 0.0002, + "loss": 2.3587, + "step": 30 + }, + { + "epoch": 0.027257240204429302, + "grad_norm": 0.6303355693817139, + "learning_rate": 0.0002, + "loss": 2.121, + "step": 40 + }, + { + "epoch": 0.034071550255536626, + "grad_norm": 0.4648270606994629, + "learning_rate": 0.0002, + "loss": 1.9833, + "step": 50 + }, + { + "epoch": 0.04088586030664395, + "grad_norm": 0.42953479290008545, + "learning_rate": 0.0002, + "loss": 1.9384, + "step": 60 + }, + { + "epoch": 0.04770017035775128, + "grad_norm": 0.433614581823349, + "learning_rate": 0.0002, + "loss": 1.9202, + "step": 70 + }, + { + "epoch": 0.054514480408858604, + "grad_norm": 0.45318254828453064, + "learning_rate": 0.0002, + "loss": 1.911, + "step": 80 + }, + { + "epoch": 0.06132879045996593, + "grad_norm": 0.4023568630218506, + "learning_rate": 0.0002, + "loss": 1.8895, + "step": 90 + }, + { + "epoch": 0.06814310051107325, + "grad_norm": 0.43260207772254944, + "learning_rate": 0.0002, + "loss": 1.9257, + "step": 100 + }, + { + "epoch": 0.07495741056218058, + "grad_norm": 0.43389809131622314, + "learning_rate": 0.0002, + "loss": 1.9178, + "step": 110 + }, + { + "epoch": 0.0817717206132879, + "grad_norm": 0.39307987689971924, + "learning_rate": 0.0002, + "loss": 1.9071, + "step": 120 + }, + { + "epoch": 0.08858603066439523, + "grad_norm": 0.7703037261962891, + "learning_rate": 0.0002, + "loss": 1.9482, + "step": 130 + }, + { + "epoch": 0.09540034071550256, + "grad_norm": 0.38547563552856445, + "learning_rate": 0.0002, + "loss": 1.8394, + "step": 140 + }, + { + "epoch": 0.10221465076660988, + "grad_norm": 0.37948688864707947, + "learning_rate": 0.0002, + "loss": 1.9037, + "step": 150 + }, + { + "epoch": 0.10902896081771721, + "grad_norm": 0.33022379875183105, + "learning_rate": 0.0002, + "loss": 1.8664, + "step": 160 + }, + { + "epoch": 0.11584327086882454, + "grad_norm": 0.33703792095184326, + "learning_rate": 0.0002, + "loss": 1.8659, + "step": 170 + }, + { + "epoch": 0.12265758091993186, + "grad_norm": 0.3259912431240082, + "learning_rate": 0.0002, + "loss": 1.8389, + "step": 180 + }, + { + "epoch": 0.12947189097103917, + "grad_norm": 0.3593858778476715, + "learning_rate": 0.0002, + "loss": 1.8424, + "step": 190 + }, + { + "epoch": 0.1362862010221465, + "grad_norm": 0.3401614725589752, + "learning_rate": 0.0002, + "loss": 1.8457, + "step": 200 + }, + { + "epoch": 0.14310051107325383, + "grad_norm": 0.3892078697681427, + "learning_rate": 0.0002, + "loss": 1.7721, + "step": 210 + }, + { + "epoch": 0.14991482112436116, + "grad_norm": 0.315374493598938, + "learning_rate": 0.0002, + "loss": 1.8351, + "step": 220 + }, + { + "epoch": 0.1567291311754685, + "grad_norm": 0.3679497539997101, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 230 + }, + { + "epoch": 0.1635434412265758, + "grad_norm": 0.336730033159256, + "learning_rate": 0.0002, + "loss": 1.816, + "step": 240 + }, + { + "epoch": 0.17035775127768313, + "grad_norm": 0.36472755670547485, + "learning_rate": 0.0002, + "loss": 1.7849, + "step": 250 + }, + { + "epoch": 0.17717206132879046, + "grad_norm": 0.34864023327827454, + "learning_rate": 0.0002, + "loss": 1.7766, + "step": 260 + }, + { + "epoch": 0.1839863713798978, + "grad_norm": 0.3350819945335388, + "learning_rate": 0.0002, + "loss": 1.896, + "step": 270 + }, + { + "epoch": 0.19080068143100512, + "grad_norm": 0.3844246566295624, + "learning_rate": 0.0002, + "loss": 1.8742, + "step": 280 + }, + { + "epoch": 0.19761499148211242, + "grad_norm": 0.38413065671920776, + "learning_rate": 0.0002, + "loss": 1.8531, + "step": 290 + }, + { + "epoch": 0.20442930153321975, + "grad_norm": 0.4080047607421875, + "learning_rate": 0.0002, + "loss": 1.8415, + "step": 300 + }, + { + "epoch": 0.21124361158432708, + "grad_norm": 0.3546800911426544, + "learning_rate": 0.0002, + "loss": 1.9009, + "step": 310 + }, + { + "epoch": 0.21805792163543442, + "grad_norm": 0.38577890396118164, + "learning_rate": 0.0002, + "loss": 1.8092, + "step": 320 + }, + { + "epoch": 0.22487223168654175, + "grad_norm": 0.38979098200798035, + "learning_rate": 0.0002, + "loss": 1.7113, + "step": 330 + }, + { + "epoch": 0.23168654173764908, + "grad_norm": 0.35105520486831665, + "learning_rate": 0.0002, + "loss": 1.8162, + "step": 340 + }, + { + "epoch": 0.23850085178875638, + "grad_norm": 0.31671223044395447, + "learning_rate": 0.0002, + "loss": 1.8408, + "step": 350 + }, + { + "epoch": 0.2453151618398637, + "grad_norm": 0.33034196496009827, + "learning_rate": 0.0002, + "loss": 1.8014, + "step": 360 + }, + { + "epoch": 0.252129471890971, + "grad_norm": 0.2990533709526062, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 370 + }, + { + "epoch": 0.25894378194207834, + "grad_norm": 0.362208753824234, + "learning_rate": 0.0002, + "loss": 1.8612, + "step": 380 + }, + { + "epoch": 0.2657580919931857, + "grad_norm": 0.3269096612930298, + "learning_rate": 0.0002, + "loss": 1.8001, + "step": 390 + }, + { + "epoch": 0.272572402044293, + "grad_norm": 0.30555954575538635, + "learning_rate": 0.0002, + "loss": 1.786, + "step": 400 + }, + { + "epoch": 0.27938671209540034, + "grad_norm": 0.332933247089386, + "learning_rate": 0.0002, + "loss": 1.8018, + "step": 410 + }, + { + "epoch": 0.28620102214650767, + "grad_norm": 0.39454060792922974, + "learning_rate": 0.0002, + "loss": 1.8157, + "step": 420 + }, + { + "epoch": 0.293015332197615, + "grad_norm": 0.34589633345603943, + "learning_rate": 0.0002, + "loss": 1.7862, + "step": 430 + }, + { + "epoch": 0.29982964224872233, + "grad_norm": 0.3747332990169525, + "learning_rate": 0.0002, + "loss": 1.7612, + "step": 440 + }, + { + "epoch": 0.30664395229982966, + "grad_norm": 0.34825369715690613, + "learning_rate": 0.0002, + "loss": 1.8476, + "step": 450 + }, + { + "epoch": 0.313458262350937, + "grad_norm": 0.32906976342201233, + "learning_rate": 0.0002, + "loss": 1.775, + "step": 460 + }, + { + "epoch": 0.3202725724020443, + "grad_norm": 0.33108609914779663, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 470 + }, + { + "epoch": 0.3270868824531516, + "grad_norm": 0.3170463442802429, + "learning_rate": 0.0002, + "loss": 1.8012, + "step": 480 + }, + { + "epoch": 0.3339011925042589, + "grad_norm": 0.30792081356048584, + "learning_rate": 0.0002, + "loss": 1.8264, + "step": 490 + }, + { + "epoch": 0.34071550255536626, + "grad_norm": 0.31772997975349426, + "learning_rate": 0.0002, + "loss": 1.7361, + "step": 500 + }, + { + "epoch": 0.3475298126064736, + "grad_norm": 0.32714012265205383, + "learning_rate": 0.0002, + "loss": 1.8147, + "step": 510 + }, + { + "epoch": 0.3543441226575809, + "grad_norm": 0.3021100163459778, + "learning_rate": 0.0002, + "loss": 1.8332, + "step": 520 + }, + { + "epoch": 0.36115843270868825, + "grad_norm": 0.6045835018157959, + "learning_rate": 0.0002, + "loss": 1.7337, + "step": 530 + }, + { + "epoch": 0.3679727427597956, + "grad_norm": 0.3003896474838257, + "learning_rate": 0.0002, + "loss": 1.8661, + "step": 540 + }, + { + "epoch": 0.3747870528109029, + "grad_norm": 0.3678470551967621, + "learning_rate": 0.0002, + "loss": 1.8359, + "step": 550 + }, + { + "epoch": 0.38160136286201024, + "grad_norm": 0.35787615180015564, + "learning_rate": 0.0002, + "loss": 1.7512, + "step": 560 + }, + { + "epoch": 0.38841567291311757, + "grad_norm": 0.31882143020629883, + "learning_rate": 0.0002, + "loss": 1.8048, + "step": 570 + }, + { + "epoch": 0.39522998296422485, + "grad_norm": 0.3186313509941101, + "learning_rate": 0.0002, + "loss": 1.8108, + "step": 580 + }, + { + "epoch": 0.4020442930153322, + "grad_norm": 0.41443702578544617, + "learning_rate": 0.0002, + "loss": 1.8012, + "step": 590 + }, + { + "epoch": 0.4088586030664395, + "grad_norm": 0.28773069381713867, + "learning_rate": 0.0002, + "loss": 1.7794, + "step": 600 + }, + { + "epoch": 0.41567291311754684, + "grad_norm": 0.35743263363838196, + "learning_rate": 0.0002, + "loss": 1.8231, + "step": 610 + }, + { + "epoch": 0.42248722316865417, + "grad_norm": 0.3360286355018616, + "learning_rate": 0.0002, + "loss": 1.7531, + "step": 620 + }, + { + "epoch": 0.4293015332197615, + "grad_norm": 0.32838866114616394, + "learning_rate": 0.0002, + "loss": 1.7654, + "step": 630 + }, + { + "epoch": 0.43611584327086883, + "grad_norm": 0.2994388937950134, + "learning_rate": 0.0002, + "loss": 1.8176, + "step": 640 + }, + { + "epoch": 0.44293015332197616, + "grad_norm": 0.3306307792663574, + "learning_rate": 0.0002, + "loss": 1.7241, + "step": 650 + }, + { + "epoch": 0.4497444633730835, + "grad_norm": 0.3129560351371765, + "learning_rate": 0.0002, + "loss": 1.8201, + "step": 660 + }, + { + "epoch": 0.4565587734241908, + "grad_norm": 0.3244289457798004, + "learning_rate": 0.0002, + "loss": 1.803, + "step": 670 + }, + { + "epoch": 0.46337308347529815, + "grad_norm": 0.3196892738342285, + "learning_rate": 0.0002, + "loss": 1.8399, + "step": 680 + }, + { + "epoch": 0.47018739352640543, + "grad_norm": 0.3135230243206024, + "learning_rate": 0.0002, + "loss": 1.8291, + "step": 690 + }, + { + "epoch": 0.47700170357751276, + "grad_norm": 0.28677991032600403, + "learning_rate": 0.0002, + "loss": 1.7423, + "step": 700 + }, + { + "epoch": 0.4838160136286201, + "grad_norm": 0.3074065148830414, + "learning_rate": 0.0002, + "loss": 1.6982, + "step": 710 + }, + { + "epoch": 0.4906303236797274, + "grad_norm": 0.3354290723800659, + "learning_rate": 0.0002, + "loss": 1.8078, + "step": 720 + }, + { + "epoch": 0.49744463373083475, + "grad_norm": 0.324370801448822, + "learning_rate": 0.0002, + "loss": 1.7333, + "step": 730 + }, + { + "epoch": 0.504258943781942, + "grad_norm": 0.29496142268180847, + "learning_rate": 0.0002, + "loss": 1.8506, + "step": 740 + }, + { + "epoch": 0.5110732538330494, + "grad_norm": 0.30694130063056946, + "learning_rate": 0.0002, + "loss": 1.7761, + "step": 750 + }, + { + "epoch": 0.5178875638841567, + "grad_norm": 0.36168408393859863, + "learning_rate": 0.0002, + "loss": 1.7854, + "step": 760 + }, + { + "epoch": 0.524701873935264, + "grad_norm": 0.2930343449115753, + "learning_rate": 0.0002, + "loss": 1.7149, + "step": 770 + }, + { + "epoch": 0.5315161839863713, + "grad_norm": 0.3023432493209839, + "learning_rate": 0.0002, + "loss": 1.7924, + "step": 780 + }, + { + "epoch": 0.5383304940374787, + "grad_norm": 0.3272720277309418, + "learning_rate": 0.0002, + "loss": 1.8467, + "step": 790 + }, + { + "epoch": 0.545144804088586, + "grad_norm": 0.2907974421977997, + "learning_rate": 0.0002, + "loss": 1.8639, + "step": 800 + }, + { + "epoch": 0.5519591141396933, + "grad_norm": 0.32267168164253235, + "learning_rate": 0.0002, + "loss": 1.9018, + "step": 810 + }, + { + "epoch": 0.5587734241908007, + "grad_norm": 0.32059940695762634, + "learning_rate": 0.0002, + "loss": 1.8311, + "step": 820 + }, + { + "epoch": 0.565587734241908, + "grad_norm": 0.30951258540153503, + "learning_rate": 0.0002, + "loss": 1.7234, + "step": 830 + }, + { + "epoch": 0.5724020442930153, + "grad_norm": 0.33976122736930847, + "learning_rate": 0.0002, + "loss": 1.8063, + "step": 840 + }, + { + "epoch": 0.5792163543441227, + "grad_norm": 0.3195820450782776, + "learning_rate": 0.0002, + "loss": 1.7021, + "step": 850 + }, + { + "epoch": 0.58603066439523, + "grad_norm": 0.2828562557697296, + "learning_rate": 0.0002, + "loss": 1.8073, + "step": 860 + }, + { + "epoch": 0.5928449744463373, + "grad_norm": 0.29591670632362366, + "learning_rate": 0.0002, + "loss": 1.695, + "step": 870 + }, + { + "epoch": 0.5996592844974447, + "grad_norm": 0.3086104393005371, + "learning_rate": 0.0002, + "loss": 1.8109, + "step": 880 + }, + { + "epoch": 0.606473594548552, + "grad_norm": 0.3592929542064667, + "learning_rate": 0.0002, + "loss": 1.8207, + "step": 890 + }, + { + "epoch": 0.6132879045996593, + "grad_norm": 0.2830186188220978, + "learning_rate": 0.0002, + "loss": 1.7662, + "step": 900 + }, + { + "epoch": 0.6201022146507666, + "grad_norm": 0.3128598630428314, + "learning_rate": 0.0002, + "loss": 1.8344, + "step": 910 + }, + { + "epoch": 0.626916524701874, + "grad_norm": 0.31957563757896423, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 920 + }, + { + "epoch": 0.6337308347529813, + "grad_norm": 0.30994319915771484, + "learning_rate": 0.0002, + "loss": 1.7578, + "step": 930 + }, + { + "epoch": 0.6405451448040886, + "grad_norm": 0.3352845013141632, + "learning_rate": 0.0002, + "loss": 1.8494, + "step": 940 + }, + { + "epoch": 0.6473594548551959, + "grad_norm": 0.2960077226161957, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 950 + }, + { + "epoch": 0.6541737649063032, + "grad_norm": 0.32675081491470337, + "learning_rate": 0.0002, + "loss": 1.7209, + "step": 960 + }, + { + "epoch": 0.6609880749574105, + "grad_norm": 0.30042028427124023, + "learning_rate": 0.0002, + "loss": 1.7776, + "step": 970 + }, + { + "epoch": 0.6678023850085179, + "grad_norm": 0.3288673758506775, + "learning_rate": 0.0002, + "loss": 1.7597, + "step": 980 + }, + { + "epoch": 0.6746166950596252, + "grad_norm": 0.323215126991272, + "learning_rate": 0.0002, + "loss": 1.7962, + "step": 990 + }, + { + "epoch": 0.6814310051107325, + "grad_norm": 0.30041399598121643, + "learning_rate": 0.0002, + "loss": 1.821, + "step": 1000 + }, + { + "epoch": 0.6882453151618398, + "grad_norm": 0.3076179623603821, + "learning_rate": 0.0002, + "loss": 1.7716, + "step": 1010 + }, + { + "epoch": 0.6950596252129472, + "grad_norm": 0.2941909730434418, + "learning_rate": 0.0002, + "loss": 1.7827, + "step": 1020 + }, + { + "epoch": 0.7018739352640545, + "grad_norm": 0.32220420241355896, + "learning_rate": 0.0002, + "loss": 1.7964, + "step": 1030 + }, + { + "epoch": 0.7086882453151618, + "grad_norm": 0.2989702820777893, + "learning_rate": 0.0002, + "loss": 1.7101, + "step": 1040 + }, + { + "epoch": 0.7155025553662692, + "grad_norm": 0.3049640357494354, + "learning_rate": 0.0002, + "loss": 1.8749, + "step": 1050 + }, + { + "epoch": 0.7223168654173765, + "grad_norm": 0.3183284103870392, + "learning_rate": 0.0002, + "loss": 1.782, + "step": 1060 + }, + { + "epoch": 0.7291311754684838, + "grad_norm": 0.3070095181465149, + "learning_rate": 0.0002, + "loss": 1.785, + "step": 1070 + }, + { + "epoch": 0.7359454855195912, + "grad_norm": 0.33263063430786133, + "learning_rate": 0.0002, + "loss": 1.7832, + "step": 1080 + }, + { + "epoch": 0.7427597955706985, + "grad_norm": 0.28774312138557434, + "learning_rate": 0.0002, + "loss": 1.7627, + "step": 1090 + }, + { + "epoch": 0.7495741056218058, + "grad_norm": 0.29304224252700806, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1100 + }, + { + "epoch": 0.7563884156729132, + "grad_norm": 0.27529507875442505, + "learning_rate": 0.0002, + "loss": 1.8082, + "step": 1110 + }, + { + "epoch": 0.7632027257240205, + "grad_norm": 0.32319945096969604, + "learning_rate": 0.0002, + "loss": 1.8071, + "step": 1120 + }, + { + "epoch": 0.7700170357751278, + "grad_norm": 0.33777597546577454, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 1130 + }, + { + "epoch": 0.7768313458262351, + "grad_norm": 0.29847201704978943, + "learning_rate": 0.0002, + "loss": 1.8488, + "step": 1140 + }, + { + "epoch": 0.7836456558773425, + "grad_norm": 0.31370633840560913, + "learning_rate": 0.0002, + "loss": 1.7014, + "step": 1150 + }, + { + "epoch": 0.7904599659284497, + "grad_norm": 0.31633856892585754, + "learning_rate": 0.0002, + "loss": 1.8175, + "step": 1160 + }, + { + "epoch": 0.797274275979557, + "grad_norm": 0.33849263191223145, + "learning_rate": 0.0002, + "loss": 1.7637, + "step": 1170 + }, + { + "epoch": 0.8040885860306644, + "grad_norm": 0.3306333124637604, + "learning_rate": 0.0002, + "loss": 1.8264, + "step": 1180 + }, + { + "epoch": 0.8109028960817717, + "grad_norm": 0.3249678313732147, + "learning_rate": 0.0002, + "loss": 1.777, + "step": 1190 + }, + { + "epoch": 0.817717206132879, + "grad_norm": 0.3252817690372467, + "learning_rate": 0.0002, + "loss": 1.7631, + "step": 1200 + }, + { + "epoch": 0.8245315161839863, + "grad_norm": 0.31772559881210327, + "learning_rate": 0.0002, + "loss": 1.7877, + "step": 1210 + }, + { + "epoch": 0.8313458262350937, + "grad_norm": 0.2803371846675873, + "learning_rate": 0.0002, + "loss": 1.8068, + "step": 1220 + }, + { + "epoch": 0.838160136286201, + "grad_norm": 0.26700571179389954, + "learning_rate": 0.0002, + "loss": 1.738, + "step": 1230 + }, + { + "epoch": 0.8449744463373083, + "grad_norm": 0.3060092031955719, + "learning_rate": 0.0002, + "loss": 1.8301, + "step": 1240 + }, + { + "epoch": 0.8517887563884157, + "grad_norm": 0.28831684589385986, + "learning_rate": 0.0002, + "loss": 1.7993, + "step": 1250 + }, + { + "epoch": 0.858603066439523, + "grad_norm": 0.30708742141723633, + "learning_rate": 0.0002, + "loss": 1.6909, + "step": 1260 + }, + { + "epoch": 0.8654173764906303, + "grad_norm": 0.2915987968444824, + "learning_rate": 0.0002, + "loss": 1.8506, + "step": 1270 + }, + { + "epoch": 0.8722316865417377, + "grad_norm": 0.2893589735031128, + "learning_rate": 0.0002, + "loss": 1.7536, + "step": 1280 + }, + { + "epoch": 0.879045996592845, + "grad_norm": 0.29545632004737854, + "learning_rate": 0.0002, + "loss": 1.7437, + "step": 1290 + }, + { + "epoch": 0.8858603066439523, + "grad_norm": 0.3354771137237549, + "learning_rate": 0.0002, + "loss": 1.859, + "step": 1300 + }, + { + "epoch": 0.8926746166950597, + "grad_norm": 0.37715399265289307, + "learning_rate": 0.0002, + "loss": 1.7644, + "step": 1310 + }, + { + "epoch": 0.899488926746167, + "grad_norm": 0.28847193717956543, + "learning_rate": 0.0002, + "loss": 1.7731, + "step": 1320 + }, + { + "epoch": 0.9063032367972743, + "grad_norm": 0.2780889868736267, + "learning_rate": 0.0002, + "loss": 1.9077, + "step": 1330 + }, + { + "epoch": 0.9131175468483816, + "grad_norm": 0.2714342176914215, + "learning_rate": 0.0002, + "loss": 1.7861, + "step": 1340 + }, + { + "epoch": 0.919931856899489, + "grad_norm": 0.2950133979320526, + "learning_rate": 0.0002, + "loss": 1.8, + "step": 1350 + }, + { + "epoch": 0.9267461669505963, + "grad_norm": 0.29097145795822144, + "learning_rate": 0.0002, + "loss": 1.7368, + "step": 1360 + }, + { + "epoch": 0.9335604770017035, + "grad_norm": 0.32540133595466614, + "learning_rate": 0.0002, + "loss": 1.7864, + "step": 1370 + }, + { + "epoch": 0.9403747870528109, + "grad_norm": 0.3076636493206024, + "learning_rate": 0.0002, + "loss": 1.7571, + "step": 1380 + }, + { + "epoch": 0.9471890971039182, + "grad_norm": 0.2962130308151245, + "learning_rate": 0.0002, + "loss": 1.8312, + "step": 1390 + }, + { + "epoch": 0.9540034071550255, + "grad_norm": 0.30086860060691833, + "learning_rate": 0.0002, + "loss": 1.7858, + "step": 1400 + }, + { + "epoch": 0.9608177172061328, + "grad_norm": 0.28634947538375854, + "learning_rate": 0.0002, + "loss": 1.7991, + "step": 1410 + }, + { + "epoch": 0.9676320272572402, + "grad_norm": 0.35314416885375977, + "learning_rate": 0.0002, + "loss": 1.8385, + "step": 1420 + }, + { + "epoch": 0.9744463373083475, + "grad_norm": 0.2939317524433136, + "learning_rate": 0.0002, + "loss": 1.8054, + "step": 1430 + }, + { + "epoch": 0.9812606473594548, + "grad_norm": 0.3010196089744568, + "learning_rate": 0.0002, + "loss": 1.7582, + "step": 1440 + }, + { + "epoch": 0.9880749574105622, + "grad_norm": 0.30816152691841125, + "learning_rate": 0.0002, + "loss": 1.758, + "step": 1450 + }, + { + "epoch": 0.9948892674616695, + "grad_norm": 0.28152793645858765, + "learning_rate": 0.0002, + "loss": 1.8159, + "step": 1460 + }, + { + "epoch": 0.9996592844974447, + "eval_loss": 1.8034634590148926, + "eval_runtime": 53.6017, + "eval_samples_per_second": 9.459, + "eval_steps_per_second": 1.194, + "step": 1467 + }, + { + "epoch": 1.0017035775127767, + "grad_norm": 0.29246416687965393, + "learning_rate": 0.0002, + "loss": 1.7072, + "step": 1470 + }, + { + "epoch": 1.008517887563884, + "grad_norm": 0.2668602168560028, + "learning_rate": 0.0002, + "loss": 1.8159, + "step": 1480 + }, + { + "epoch": 1.0153321976149914, + "grad_norm": 0.2998567819595337, + "learning_rate": 0.0002, + "loss": 1.6868, + "step": 1490 + }, + { + "epoch": 1.0221465076660987, + "grad_norm": 0.3284934461116791, + "learning_rate": 0.0002, + "loss": 1.7331, + "step": 1500 + }, + { + "epoch": 1.028960817717206, + "grad_norm": 0.3275827169418335, + "learning_rate": 0.0002, + "loss": 1.689, + "step": 1510 + }, + { + "epoch": 1.0357751277683134, + "grad_norm": 0.3382718563079834, + "learning_rate": 0.0002, + "loss": 1.7092, + "step": 1520 + }, + { + "epoch": 1.0425894378194207, + "grad_norm": 0.36737215518951416, + "learning_rate": 0.0002, + "loss": 1.7215, + "step": 1530 + }, + { + "epoch": 1.049403747870528, + "grad_norm": 0.3442603647708893, + "learning_rate": 0.0002, + "loss": 1.6993, + "step": 1540 + }, + { + "epoch": 1.0562180579216354, + "grad_norm": 0.3323381245136261, + "learning_rate": 0.0002, + "loss": 1.6996, + "step": 1550 + }, + { + "epoch": 1.0630323679727427, + "grad_norm": 0.341227650642395, + "learning_rate": 0.0002, + "loss": 1.7444, + "step": 1560 + }, + { + "epoch": 1.06984667802385, + "grad_norm": 0.3361579477787018, + "learning_rate": 0.0002, + "loss": 1.7419, + "step": 1570 + }, + { + "epoch": 1.0766609880749574, + "grad_norm": 0.3556230962276459, + "learning_rate": 0.0002, + "loss": 1.7337, + "step": 1580 + }, + { + "epoch": 1.0834752981260647, + "grad_norm": 0.27130424976348877, + "learning_rate": 0.0002, + "loss": 1.6978, + "step": 1590 + }, + { + "epoch": 1.090289608177172, + "grad_norm": 0.29366323351860046, + "learning_rate": 0.0002, + "loss": 1.6087, + "step": 1600 + }, + { + "epoch": 1.0971039182282794, + "grad_norm": 0.3581245541572571, + "learning_rate": 0.0002, + "loss": 1.6721, + "step": 1610 + }, + { + "epoch": 1.1039182282793867, + "grad_norm": 0.3021670877933502, + "learning_rate": 0.0002, + "loss": 1.7639, + "step": 1620 + }, + { + "epoch": 1.110732538330494, + "grad_norm": 0.3145572543144226, + "learning_rate": 0.0002, + "loss": 1.7314, + "step": 1630 + }, + { + "epoch": 1.1175468483816013, + "grad_norm": 0.35362154245376587, + "learning_rate": 0.0002, + "loss": 1.7408, + "step": 1640 + }, + { + "epoch": 1.1243611584327087, + "grad_norm": 0.5413113236427307, + "learning_rate": 0.0002, + "loss": 1.7071, + "step": 1650 + }, + { + "epoch": 1.131175468483816, + "grad_norm": 0.6858654022216797, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 1660 + }, + { + "epoch": 1.1379897785349233, + "grad_norm": 0.3781903386116028, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 1670 + }, + { + "epoch": 1.1448040885860307, + "grad_norm": 0.404864102602005, + "learning_rate": 0.0002, + "loss": 1.659, + "step": 1680 + }, + { + "epoch": 1.151618398637138, + "grad_norm": 0.3595100939273834, + "learning_rate": 0.0002, + "loss": 1.7212, + "step": 1690 + }, + { + "epoch": 1.1584327086882453, + "grad_norm": 0.33682283759117126, + "learning_rate": 0.0002, + "loss": 1.7023, + "step": 1700 + }, + { + "epoch": 1.1652470187393527, + "grad_norm": 0.3877373933792114, + "learning_rate": 0.0002, + "loss": 1.7336, + "step": 1710 + }, + { + "epoch": 1.17206132879046, + "grad_norm": 0.34606897830963135, + "learning_rate": 0.0002, + "loss": 1.7676, + "step": 1720 + }, + { + "epoch": 1.1788756388415673, + "grad_norm": 0.3122918903827667, + "learning_rate": 0.0002, + "loss": 1.6889, + "step": 1730 + }, + { + "epoch": 1.1856899488926746, + "grad_norm": 0.34081900119781494, + "learning_rate": 0.0002, + "loss": 1.6585, + "step": 1740 + }, + { + "epoch": 1.192504258943782, + "grad_norm": 0.3418807089328766, + "learning_rate": 0.0002, + "loss": 1.7794, + "step": 1750 + }, + { + "epoch": 1.1993185689948893, + "grad_norm": 0.3495500981807709, + "learning_rate": 0.0002, + "loss": 1.7773, + "step": 1760 + }, + { + "epoch": 1.2061328790459966, + "grad_norm": 0.557288408279419, + "learning_rate": 0.0002, + "loss": 1.6578, + "step": 1770 + }, + { + "epoch": 1.212947189097104, + "grad_norm": 0.3193778693675995, + "learning_rate": 0.0002, + "loss": 1.7488, + "step": 1780 + }, + { + "epoch": 1.2197614991482113, + "grad_norm": 0.3306216895580292, + "learning_rate": 0.0002, + "loss": 1.6815, + "step": 1790 + }, + { + "epoch": 1.2265758091993186, + "grad_norm": 0.37998732924461365, + "learning_rate": 0.0002, + "loss": 1.6772, + "step": 1800 + }, + { + "epoch": 1.233390119250426, + "grad_norm": 0.3255669176578522, + "learning_rate": 0.0002, + "loss": 1.7094, + "step": 1810 + }, + { + "epoch": 1.2402044293015333, + "grad_norm": 0.3741287291049957, + "learning_rate": 0.0002, + "loss": 1.7096, + "step": 1820 + }, + { + "epoch": 1.2470187393526406, + "grad_norm": 0.36727291345596313, + "learning_rate": 0.0002, + "loss": 1.7317, + "step": 1830 + }, + { + "epoch": 1.253833049403748, + "grad_norm": 0.3479527235031128, + "learning_rate": 0.0002, + "loss": 1.7418, + "step": 1840 + }, + { + "epoch": 1.2606473594548553, + "grad_norm": 0.3472636640071869, + "learning_rate": 0.0002, + "loss": 1.7062, + "step": 1850 + }, + { + "epoch": 1.2674616695059626, + "grad_norm": 0.3702869415283203, + "learning_rate": 0.0002, + "loss": 1.6263, + "step": 1860 + }, + { + "epoch": 1.27427597955707, + "grad_norm": 0.3934040069580078, + "learning_rate": 0.0002, + "loss": 1.6222, + "step": 1870 + }, + { + "epoch": 1.2810902896081773, + "grad_norm": 0.46887534856796265, + "learning_rate": 0.0002, + "loss": 1.7616, + "step": 1880 + }, + { + "epoch": 1.2879045996592846, + "grad_norm": 0.3191998600959778, + "learning_rate": 0.0002, + "loss": 1.6071, + "step": 1890 + }, + { + "epoch": 1.294718909710392, + "grad_norm": 0.34032225608825684, + "learning_rate": 0.0002, + "loss": 1.7001, + "step": 1900 + }, + { + "epoch": 1.3015332197614993, + "grad_norm": 0.33453696966171265, + "learning_rate": 0.0002, + "loss": 1.8316, + "step": 1910 + }, + { + "epoch": 1.3083475298126066, + "grad_norm": 0.3451494872570038, + "learning_rate": 0.0002, + "loss": 1.6567, + "step": 1920 + }, + { + "epoch": 1.315161839863714, + "grad_norm": 0.36203092336654663, + "learning_rate": 0.0002, + "loss": 1.708, + "step": 1930 + }, + { + "epoch": 1.321976149914821, + "grad_norm": 0.43794456124305725, + "learning_rate": 0.0002, + "loss": 1.7095, + "step": 1940 + }, + { + "epoch": 1.3287904599659284, + "grad_norm": 0.3630591034889221, + "learning_rate": 0.0002, + "loss": 1.7264, + "step": 1950 + }, + { + "epoch": 1.3356047700170357, + "grad_norm": 0.36951911449432373, + "learning_rate": 0.0002, + "loss": 1.6529, + "step": 1960 + }, + { + "epoch": 1.342419080068143, + "grad_norm": 0.4001159965991974, + "learning_rate": 0.0002, + "loss": 1.651, + "step": 1970 + }, + { + "epoch": 1.3492333901192504, + "grad_norm": 0.3820836544036865, + "learning_rate": 0.0002, + "loss": 1.677, + "step": 1980 + }, + { + "epoch": 1.3560477001703577, + "grad_norm": 0.3705870807170868, + "learning_rate": 0.0002, + "loss": 1.583, + "step": 1990 + }, + { + "epoch": 1.362862010221465, + "grad_norm": 0.3557972013950348, + "learning_rate": 0.0002, + "loss": 1.6207, + "step": 2000 + }, + { + "epoch": 1.3696763202725724, + "grad_norm": 0.38546398282051086, + "learning_rate": 0.0002, + "loss": 1.7656, + "step": 2010 + }, + { + "epoch": 1.3764906303236797, + "grad_norm": 0.3908020853996277, + "learning_rate": 0.0002, + "loss": 1.6881, + "step": 2020 + }, + { + "epoch": 1.383304940374787, + "grad_norm": 0.3822040855884552, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 2030 + }, + { + "epoch": 1.3901192504258943, + "grad_norm": 0.425327867269516, + "learning_rate": 0.0002, + "loss": 1.7923, + "step": 2040 + }, + { + "epoch": 1.3969335604770017, + "grad_norm": 0.3436259329319, + "learning_rate": 0.0002, + "loss": 1.7032, + "step": 2050 + }, + { + "epoch": 1.403747870528109, + "grad_norm": 0.33124062418937683, + "learning_rate": 0.0002, + "loss": 1.6398, + "step": 2060 + }, + { + "epoch": 1.4105621805792163, + "grad_norm": 0.3662424683570862, + "learning_rate": 0.0002, + "loss": 1.6815, + "step": 2070 + }, + { + "epoch": 1.4173764906303237, + "grad_norm": 0.3720635175704956, + "learning_rate": 0.0002, + "loss": 1.7258, + "step": 2080 + }, + { + "epoch": 1.424190800681431, + "grad_norm": 0.3361680805683136, + "learning_rate": 0.0002, + "loss": 1.7186, + "step": 2090 + }, + { + "epoch": 1.4310051107325383, + "grad_norm": 0.32751724123954773, + "learning_rate": 0.0002, + "loss": 1.7606, + "step": 2100 + }, + { + "epoch": 1.4378194207836457, + "grad_norm": 0.34333378076553345, + "learning_rate": 0.0002, + "loss": 1.7051, + "step": 2110 + }, + { + "epoch": 1.444633730834753, + "grad_norm": 0.37777671217918396, + "learning_rate": 0.0002, + "loss": 1.6979, + "step": 2120 + }, + { + "epoch": 1.4514480408858603, + "grad_norm": 0.37126365303993225, + "learning_rate": 0.0002, + "loss": 1.7085, + "step": 2130 + }, + { + "epoch": 1.4582623509369677, + "grad_norm": 0.3602267801761627, + "learning_rate": 0.0002, + "loss": 1.721, + "step": 2140 + }, + { + "epoch": 1.465076660988075, + "grad_norm": 0.3287110924720764, + "learning_rate": 0.0002, + "loss": 1.8148, + "step": 2150 + }, + { + "epoch": 1.4718909710391823, + "grad_norm": 0.3562135100364685, + "learning_rate": 0.0002, + "loss": 1.6966, + "step": 2160 + }, + { + "epoch": 1.4787052810902896, + "grad_norm": 0.38292962312698364, + "learning_rate": 0.0002, + "loss": 1.713, + "step": 2170 + }, + { + "epoch": 1.485519591141397, + "grad_norm": 0.38220319151878357, + "learning_rate": 0.0002, + "loss": 1.7036, + "step": 2180 + }, + { + "epoch": 1.4923339011925043, + "grad_norm": 0.3570062220096588, + "learning_rate": 0.0002, + "loss": 1.7297, + "step": 2190 + }, + { + "epoch": 1.4991482112436116, + "grad_norm": 0.363146036863327, + "learning_rate": 0.0002, + "loss": 1.5652, + "step": 2200 + }, + { + "epoch": 1.5059625212947187, + "grad_norm": 0.37393274903297424, + "learning_rate": 0.0002, + "loss": 1.6716, + "step": 2210 + }, + { + "epoch": 1.512776831345826, + "grad_norm": 0.3628501892089844, + "learning_rate": 0.0002, + "loss": 1.6839, + "step": 2220 + }, + { + "epoch": 1.5195911413969334, + "grad_norm": 0.33430740237236023, + "learning_rate": 0.0002, + "loss": 1.6627, + "step": 2230 + }, + { + "epoch": 1.5264054514480407, + "grad_norm": 0.35363978147506714, + "learning_rate": 0.0002, + "loss": 1.7495, + "step": 2240 + }, + { + "epoch": 1.533219761499148, + "grad_norm": 0.37220337986946106, + "learning_rate": 0.0002, + "loss": 1.6364, + "step": 2250 + }, + { + "epoch": 1.5400340715502554, + "grad_norm": 0.35020262002944946, + "learning_rate": 0.0002, + "loss": 1.7773, + "step": 2260 + }, + { + "epoch": 1.5468483816013627, + "grad_norm": 0.35274937748908997, + "learning_rate": 0.0002, + "loss": 1.7453, + "step": 2270 + }, + { + "epoch": 1.55366269165247, + "grad_norm": 0.3797738552093506, + "learning_rate": 0.0002, + "loss": 1.7162, + "step": 2280 + }, + { + "epoch": 1.5604770017035774, + "grad_norm": 0.4160412847995758, + "learning_rate": 0.0002, + "loss": 1.6197, + "step": 2290 + }, + { + "epoch": 1.5672913117546847, + "grad_norm": 0.38779592514038086, + "learning_rate": 0.0002, + "loss": 1.7101, + "step": 2300 + }, + { + "epoch": 1.574105621805792, + "grad_norm": 0.39171287417411804, + "learning_rate": 0.0002, + "loss": 1.7254, + "step": 2310 + }, + { + "epoch": 1.5809199318568994, + "grad_norm": 0.3606826663017273, + "learning_rate": 0.0002, + "loss": 1.7087, + "step": 2320 + }, + { + "epoch": 1.5877342419080067, + "grad_norm": 0.3745017647743225, + "learning_rate": 0.0002, + "loss": 1.7269, + "step": 2330 + }, + { + "epoch": 1.594548551959114, + "grad_norm": 0.34933462738990784, + "learning_rate": 0.0002, + "loss": 1.7068, + "step": 2340 + }, + { + "epoch": 1.6013628620102214, + "grad_norm": 0.37268444895744324, + "learning_rate": 0.0002, + "loss": 1.7073, + "step": 2350 + }, + { + "epoch": 1.6081771720613287, + "grad_norm": 0.4603484869003296, + "learning_rate": 0.0002, + "loss": 1.728, + "step": 2360 + }, + { + "epoch": 1.614991482112436, + "grad_norm": 0.35689088702201843, + "learning_rate": 0.0002, + "loss": 1.7621, + "step": 2370 + }, + { + "epoch": 1.6218057921635434, + "grad_norm": 0.3392031490802765, + "learning_rate": 0.0002, + "loss": 1.6989, + "step": 2380 + }, + { + "epoch": 1.6286201022146507, + "grad_norm": 0.394653856754303, + "learning_rate": 0.0002, + "loss": 1.7077, + "step": 2390 + }, + { + "epoch": 1.635434412265758, + "grad_norm": 0.33972012996673584, + "learning_rate": 0.0002, + "loss": 1.7448, + "step": 2400 + }, + { + "epoch": 1.6422487223168654, + "grad_norm": 0.3854375183582306, + "learning_rate": 0.0002, + "loss": 1.7681, + "step": 2410 + }, + { + "epoch": 1.6490630323679727, + "grad_norm": 0.36143961548805237, + "learning_rate": 0.0002, + "loss": 1.7102, + "step": 2420 + }, + { + "epoch": 1.65587734241908, + "grad_norm": 0.35816189646720886, + "learning_rate": 0.0002, + "loss": 1.7432, + "step": 2430 + }, + { + "epoch": 1.6626916524701874, + "grad_norm": 0.36298736929893494, + "learning_rate": 0.0002, + "loss": 1.6407, + "step": 2440 + }, + { + "epoch": 1.6695059625212947, + "grad_norm": 0.36756929755210876, + "learning_rate": 0.0002, + "loss": 1.723, + "step": 2450 + }, + { + "epoch": 1.676320272572402, + "grad_norm": 0.35969603061676025, + "learning_rate": 0.0002, + "loss": 1.6824, + "step": 2460 + }, + { + "epoch": 1.6831345826235093, + "grad_norm": 0.38449376821517944, + "learning_rate": 0.0002, + "loss": 1.7014, + "step": 2470 + }, + { + "epoch": 1.6899488926746167, + "grad_norm": 0.44511452317237854, + "learning_rate": 0.0002, + "loss": 1.7261, + "step": 2480 + }, + { + "epoch": 1.696763202725724, + "grad_norm": 0.3831416368484497, + "learning_rate": 0.0002, + "loss": 1.7397, + "step": 2490 + }, + { + "epoch": 1.7035775127768313, + "grad_norm": 0.3795325756072998, + "learning_rate": 0.0002, + "loss": 1.7046, + "step": 2500 + }, + { + "epoch": 1.7103918228279387, + "grad_norm": 0.34978193044662476, + "learning_rate": 0.0002, + "loss": 1.6993, + "step": 2510 + }, + { + "epoch": 1.717206132879046, + "grad_norm": 0.35923877358436584, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 2520 + }, + { + "epoch": 1.7240204429301533, + "grad_norm": 0.352999746799469, + "learning_rate": 0.0002, + "loss": 1.6253, + "step": 2530 + }, + { + "epoch": 1.7308347529812607, + "grad_norm": 0.43673479557037354, + "learning_rate": 0.0002, + "loss": 1.6953, + "step": 2540 + }, + { + "epoch": 1.737649063032368, + "grad_norm": 0.4153687357902527, + "learning_rate": 0.0002, + "loss": 1.7079, + "step": 2550 + }, + { + "epoch": 1.7444633730834753, + "grad_norm": 0.35541167855262756, + "learning_rate": 0.0002, + "loss": 1.6714, + "step": 2560 + }, + { + "epoch": 1.7512776831345827, + "grad_norm": 0.3288775086402893, + "learning_rate": 0.0002, + "loss": 1.696, + "step": 2570 + }, + { + "epoch": 1.75809199318569, + "grad_norm": 0.3991123139858246, + "learning_rate": 0.0002, + "loss": 1.7486, + "step": 2580 + }, + { + "epoch": 1.7649063032367973, + "grad_norm": 0.39967241883277893, + "learning_rate": 0.0002, + "loss": 1.645, + "step": 2590 + }, + { + "epoch": 1.7717206132879046, + "grad_norm": 0.41104283928871155, + "learning_rate": 0.0002, + "loss": 1.6855, + "step": 2600 + }, + { + "epoch": 1.778534923339012, + "grad_norm": 0.44885286688804626, + "learning_rate": 0.0002, + "loss": 1.6993, + "step": 2610 + }, + { + "epoch": 1.7853492333901193, + "grad_norm": 0.38384467363357544, + "learning_rate": 0.0002, + "loss": 1.7224, + "step": 2620 + }, + { + "epoch": 1.7921635434412266, + "grad_norm": 0.35020917654037476, + "learning_rate": 0.0002, + "loss": 1.7213, + "step": 2630 + }, + { + "epoch": 1.798977853492334, + "grad_norm": 0.3360341489315033, + "learning_rate": 0.0002, + "loss": 1.6706, + "step": 2640 + }, + { + "epoch": 1.8057921635434413, + "grad_norm": 0.38875144720077515, + "learning_rate": 0.0002, + "loss": 1.7037, + "step": 2650 + }, + { + "epoch": 1.8126064735945486, + "grad_norm": 0.34876883029937744, + "learning_rate": 0.0002, + "loss": 1.693, + "step": 2660 + }, + { + "epoch": 1.819420783645656, + "grad_norm": 0.419979989528656, + "learning_rate": 0.0002, + "loss": 1.7743, + "step": 2670 + }, + { + "epoch": 1.8262350936967633, + "grad_norm": 0.3648919463157654, + "learning_rate": 0.0002, + "loss": 1.7089, + "step": 2680 + }, + { + "epoch": 1.8330494037478706, + "grad_norm": 0.3485383987426758, + "learning_rate": 0.0002, + "loss": 1.7828, + "step": 2690 + }, + { + "epoch": 1.839863713798978, + "grad_norm": 0.3647468686103821, + "learning_rate": 0.0002, + "loss": 1.705, + "step": 2700 + }, + { + "epoch": 1.8466780238500853, + "grad_norm": 0.37003210186958313, + "learning_rate": 0.0002, + "loss": 1.7318, + "step": 2710 + }, + { + "epoch": 1.8534923339011926, + "grad_norm": 0.37031617760658264, + "learning_rate": 0.0002, + "loss": 1.6647, + "step": 2720 + }, + { + "epoch": 1.8603066439523, + "grad_norm": 0.3438796103000641, + "learning_rate": 0.0002, + "loss": 1.69, + "step": 2730 + }, + { + "epoch": 1.8671209540034073, + "grad_norm": 0.41574627161026, + "learning_rate": 0.0002, + "loss": 1.5995, + "step": 2740 + }, + { + "epoch": 1.8739352640545146, + "grad_norm": 0.35049930214881897, + "learning_rate": 0.0002, + "loss": 1.6877, + "step": 2750 + }, + { + "epoch": 1.880749574105622, + "grad_norm": 0.3943989872932434, + "learning_rate": 0.0002, + "loss": 1.7048, + "step": 2760 + }, + { + "epoch": 1.8875638841567293, + "grad_norm": 0.3384978175163269, + "learning_rate": 0.0002, + "loss": 1.7047, + "step": 2770 + }, + { + "epoch": 1.8943781942078366, + "grad_norm": 0.3501328229904175, + "learning_rate": 0.0002, + "loss": 1.7848, + "step": 2780 + }, + { + "epoch": 1.901192504258944, + "grad_norm": 0.37484532594680786, + "learning_rate": 0.0002, + "loss": 1.6986, + "step": 2790 + }, + { + "epoch": 1.9080068143100513, + "grad_norm": 0.34497788548469543, + "learning_rate": 0.0002, + "loss": 1.6867, + "step": 2800 + }, + { + "epoch": 1.9148211243611586, + "grad_norm": 0.3530851900577545, + "learning_rate": 0.0002, + "loss": 1.6513, + "step": 2810 + }, + { + "epoch": 1.921635434412266, + "grad_norm": 0.3879254162311554, + "learning_rate": 0.0002, + "loss": 1.6369, + "step": 2820 + }, + { + "epoch": 1.9284497444633732, + "grad_norm": 0.3885590136051178, + "learning_rate": 0.0002, + "loss": 1.6786, + "step": 2830 + }, + { + "epoch": 1.9352640545144804, + "grad_norm": 0.3868715465068817, + "learning_rate": 0.0002, + "loss": 1.8049, + "step": 2840 + }, + { + "epoch": 1.9420783645655877, + "grad_norm": 0.4152422249317169, + "learning_rate": 0.0002, + "loss": 1.7099, + "step": 2850 + }, + { + "epoch": 1.948892674616695, + "grad_norm": 0.401714563369751, + "learning_rate": 0.0002, + "loss": 1.6696, + "step": 2860 + }, + { + "epoch": 1.9557069846678024, + "grad_norm": 0.34825265407562256, + "learning_rate": 0.0002, + "loss": 1.7182, + "step": 2870 + }, + { + "epoch": 1.9625212947189097, + "grad_norm": 0.3620675504207611, + "learning_rate": 0.0002, + "loss": 1.6612, + "step": 2880 + }, + { + "epoch": 1.969335604770017, + "grad_norm": 0.3977806866168976, + "learning_rate": 0.0002, + "loss": 1.7451, + "step": 2890 + }, + { + "epoch": 1.9761499148211243, + "grad_norm": 0.3687497079372406, + "learning_rate": 0.0002, + "loss": 1.6514, + "step": 2900 + }, + { + "epoch": 1.9829642248722317, + "grad_norm": 0.408640056848526, + "learning_rate": 0.0002, + "loss": 1.7712, + "step": 2910 + }, + { + "epoch": 1.989778534923339, + "grad_norm": 0.34510108828544617, + "learning_rate": 0.0002, + "loss": 1.695, + "step": 2920 + }, + { + "epoch": 1.9965928449744463, + "grad_norm": 0.3596334755420685, + "learning_rate": 0.0002, + "loss": 1.662, + "step": 2930 + }, + { + "epoch": 2.0, + "eval_loss": 1.8056600093841553, + "eval_runtime": 60.7049, + "eval_samples_per_second": 8.352, + "eval_steps_per_second": 1.054, + "step": 2935 + }, + { + "epoch": 2.0034071550255534, + "grad_norm": 0.3460802137851715, + "learning_rate": 0.0002, + "loss": 1.6918, + "step": 2940 + }, + { + "epoch": 2.0102214650766608, + "grad_norm": 0.4038620591163635, + "learning_rate": 0.0002, + "loss": 1.6036, + "step": 2950 + }, + { + "epoch": 2.017035775127768, + "grad_norm": 0.3950219750404358, + "learning_rate": 0.0002, + "loss": 1.5386, + "step": 2960 + }, + { + "epoch": 2.0238500851788754, + "grad_norm": 0.519116997718811, + "learning_rate": 0.0002, + "loss": 1.5649, + "step": 2970 + }, + { + "epoch": 2.0306643952299828, + "grad_norm": 0.4097684919834137, + "learning_rate": 0.0002, + "loss": 1.5397, + "step": 2980 + }, + { + "epoch": 2.03747870528109, + "grad_norm": 0.4153544306755066, + "learning_rate": 0.0002, + "loss": 1.6044, + "step": 2990 + }, + { + "epoch": 2.0442930153321974, + "grad_norm": 0.4351160526275635, + "learning_rate": 0.0002, + "loss": 1.6006, + "step": 3000 + }, + { + "epoch": 2.0511073253833048, + "grad_norm": 0.42036259174346924, + "learning_rate": 0.0002, + "loss": 1.5091, + "step": 3010 + }, + { + "epoch": 2.057921635434412, + "grad_norm": 0.4433218836784363, + "learning_rate": 0.0002, + "loss": 1.5686, + "step": 3020 + }, + { + "epoch": 2.0647359454855194, + "grad_norm": 0.46511581540107727, + "learning_rate": 0.0002, + "loss": 1.5478, + "step": 3030 + }, + { + "epoch": 2.0715502555366268, + "grad_norm": 0.4567560851573944, + "learning_rate": 0.0002, + "loss": 1.5554, + "step": 3040 + }, + { + "epoch": 2.078364565587734, + "grad_norm": 0.45671048760414124, + "learning_rate": 0.0002, + "loss": 1.5561, + "step": 3050 + }, + { + "epoch": 2.0851788756388414, + "grad_norm": 0.4598552882671356, + "learning_rate": 0.0002, + "loss": 1.606, + "step": 3060 + }, + { + "epoch": 2.0919931856899487, + "grad_norm": 0.4582861661911011, + "learning_rate": 0.0002, + "loss": 1.6357, + "step": 3070 + }, + { + "epoch": 2.098807495741056, + "grad_norm": 0.4366969168186188, + "learning_rate": 0.0002, + "loss": 1.5853, + "step": 3080 + }, + { + "epoch": 2.1056218057921634, + "grad_norm": 0.495917409658432, + "learning_rate": 0.0002, + "loss": 1.5574, + "step": 3090 + }, + { + "epoch": 2.1124361158432707, + "grad_norm": 1.6846044063568115, + "learning_rate": 0.0002, + "loss": 1.5798, + "step": 3100 + }, + { + "epoch": 2.119250425894378, + "grad_norm": 0.4765092134475708, + "learning_rate": 0.0002, + "loss": 1.5877, + "step": 3110 + }, + { + "epoch": 2.1260647359454854, + "grad_norm": 0.45029810070991516, + "learning_rate": 0.0002, + "loss": 1.6106, + "step": 3120 + }, + { + "epoch": 2.1328790459965927, + "grad_norm": 0.5706973075866699, + "learning_rate": 0.0002, + "loss": 1.5707, + "step": 3130 + }, + { + "epoch": 2.1396933560477, + "grad_norm": 0.4606274366378784, + "learning_rate": 0.0002, + "loss": 1.589, + "step": 3140 + }, + { + "epoch": 2.1465076660988074, + "grad_norm": 2.199115753173828, + "learning_rate": 0.0002, + "loss": 1.564, + "step": 3150 + }, + { + "epoch": 2.1533219761499147, + "grad_norm": 0.6231027245521545, + "learning_rate": 0.0002, + "loss": 1.5808, + "step": 3160 + }, + { + "epoch": 2.160136286201022, + "grad_norm": 0.46918219327926636, + "learning_rate": 0.0002, + "loss": 1.5397, + "step": 3170 + }, + { + "epoch": 2.1669505962521294, + "grad_norm": 0.5006393194198608, + "learning_rate": 0.0002, + "loss": 1.6082, + "step": 3180 + }, + { + "epoch": 2.1737649063032367, + "grad_norm": 0.4745093286037445, + "learning_rate": 0.0002, + "loss": 1.591, + "step": 3190 + }, + { + "epoch": 2.180579216354344, + "grad_norm": 0.511576771736145, + "learning_rate": 0.0002, + "loss": 1.6459, + "step": 3200 + }, + { + "epoch": 2.1873935264054514, + "grad_norm": 0.38622918725013733, + "learning_rate": 0.0002, + "loss": 1.6019, + "step": 3210 + }, + { + "epoch": 2.1942078364565587, + "grad_norm": 0.4425644278526306, + "learning_rate": 0.0002, + "loss": 1.6187, + "step": 3220 + }, + { + "epoch": 2.201022146507666, + "grad_norm": 0.45643091201782227, + "learning_rate": 0.0002, + "loss": 1.6114, + "step": 3230 + }, + { + "epoch": 2.2078364565587734, + "grad_norm": 0.4975406229496002, + "learning_rate": 0.0002, + "loss": 1.6224, + "step": 3240 + }, + { + "epoch": 2.2146507666098807, + "grad_norm": 0.4673331081867218, + "learning_rate": 0.0002, + "loss": 1.6654, + "step": 3250 + }, + { + "epoch": 2.221465076660988, + "grad_norm": 0.5081731081008911, + "learning_rate": 0.0002, + "loss": 1.6155, + "step": 3260 + }, + { + "epoch": 2.2282793867120954, + "grad_norm": 0.4790806770324707, + "learning_rate": 0.0002, + "loss": 1.53, + "step": 3270 + }, + { + "epoch": 2.2350936967632027, + "grad_norm": 0.5184140801429749, + "learning_rate": 0.0002, + "loss": 1.6362, + "step": 3280 + }, + { + "epoch": 2.24190800681431, + "grad_norm": 0.5159083604812622, + "learning_rate": 0.0002, + "loss": 1.5646, + "step": 3290 + }, + { + "epoch": 2.2487223168654173, + "grad_norm": 0.4876042604446411, + "learning_rate": 0.0002, + "loss": 1.6577, + "step": 3300 + }, + { + "epoch": 2.2555366269165247, + "grad_norm": 0.5454957485198975, + "learning_rate": 0.0002, + "loss": 1.5718, + "step": 3310 + }, + { + "epoch": 2.262350936967632, + "grad_norm": 0.49866822361946106, + "learning_rate": 0.0002, + "loss": 1.5885, + "step": 3320 + }, + { + "epoch": 2.2691652470187393, + "grad_norm": 0.42674365639686584, + "learning_rate": 0.0002, + "loss": 1.5923, + "step": 3330 + }, + { + "epoch": 2.2759795570698467, + "grad_norm": 0.5202316641807556, + "learning_rate": 0.0002, + "loss": 1.5588, + "step": 3340 + }, + { + "epoch": 2.282793867120954, + "grad_norm": 0.4849465489387512, + "learning_rate": 0.0002, + "loss": 1.6032, + "step": 3350 + }, + { + "epoch": 2.2896081771720613, + "grad_norm": 0.47202569246292114, + "learning_rate": 0.0002, + "loss": 1.6853, + "step": 3360 + }, + { + "epoch": 2.2964224872231687, + "grad_norm": 0.5311620235443115, + "learning_rate": 0.0002, + "loss": 1.6164, + "step": 3370 + }, + { + "epoch": 2.303236797274276, + "grad_norm": 0.49011409282684326, + "learning_rate": 0.0002, + "loss": 1.6004, + "step": 3380 + }, + { + "epoch": 2.3100511073253833, + "grad_norm": 0.4789247512817383, + "learning_rate": 0.0002, + "loss": 1.5484, + "step": 3390 + }, + { + "epoch": 2.3168654173764907, + "grad_norm": 0.46646103262901306, + "learning_rate": 0.0002, + "loss": 1.5866, + "step": 3400 + }, + { + "epoch": 2.323679727427598, + "grad_norm": 0.5552441477775574, + "learning_rate": 0.0002, + "loss": 1.5308, + "step": 3410 + }, + { + "epoch": 2.3304940374787053, + "grad_norm": 0.4530351758003235, + "learning_rate": 0.0002, + "loss": 1.5761, + "step": 3420 + }, + { + "epoch": 2.3373083475298126, + "grad_norm": 0.4806232750415802, + "learning_rate": 0.0002, + "loss": 1.5919, + "step": 3430 + }, + { + "epoch": 2.34412265758092, + "grad_norm": 0.5998363494873047, + "learning_rate": 0.0002, + "loss": 1.5569, + "step": 3440 + }, + { + "epoch": 2.3509369676320273, + "grad_norm": 0.4918554425239563, + "learning_rate": 0.0002, + "loss": 1.513, + "step": 3450 + }, + { + "epoch": 2.3577512776831346, + "grad_norm": 0.5359559655189514, + "learning_rate": 0.0002, + "loss": 1.6323, + "step": 3460 + }, + { + "epoch": 2.364565587734242, + "grad_norm": 0.5053277611732483, + "learning_rate": 0.0002, + "loss": 1.5973, + "step": 3470 + }, + { + "epoch": 2.3713798977853493, + "grad_norm": 0.5058915019035339, + "learning_rate": 0.0002, + "loss": 1.5673, + "step": 3480 + }, + { + "epoch": 2.3781942078364566, + "grad_norm": 0.5314899682998657, + "learning_rate": 0.0002, + "loss": 1.5434, + "step": 3490 + }, + { + "epoch": 2.385008517887564, + "grad_norm": 0.48035913705825806, + "learning_rate": 0.0002, + "loss": 1.5203, + "step": 3500 + }, + { + "epoch": 2.3918228279386713, + "grad_norm": 0.45864903926849365, + "learning_rate": 0.0002, + "loss": 1.5936, + "step": 3510 + }, + { + "epoch": 2.3986371379897786, + "grad_norm": 0.4553050398826599, + "learning_rate": 0.0002, + "loss": 1.6285, + "step": 3520 + }, + { + "epoch": 2.405451448040886, + "grad_norm": 0.4483442008495331, + "learning_rate": 0.0002, + "loss": 1.5444, + "step": 3530 + }, + { + "epoch": 2.4122657580919933, + "grad_norm": 0.5043742060661316, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 3540 + }, + { + "epoch": 2.4190800681431006, + "grad_norm": 0.44807168841362, + "learning_rate": 0.0002, + "loss": 1.5692, + "step": 3550 + }, + { + "epoch": 2.425894378194208, + "grad_norm": 0.5065137147903442, + "learning_rate": 0.0002, + "loss": 1.6306, + "step": 3560 + }, + { + "epoch": 2.4327086882453153, + "grad_norm": 0.5186443328857422, + "learning_rate": 0.0002, + "loss": 1.5842, + "step": 3570 + }, + { + "epoch": 2.4395229982964226, + "grad_norm": 0.49743232131004333, + "learning_rate": 0.0002, + "loss": 1.5956, + "step": 3580 + }, + { + "epoch": 2.44633730834753, + "grad_norm": 0.524450421333313, + "learning_rate": 0.0002, + "loss": 1.6021, + "step": 3590 + }, + { + "epoch": 2.4531516183986373, + "grad_norm": 0.5053797364234924, + "learning_rate": 0.0002, + "loss": 1.6283, + "step": 3600 + }, + { + "epoch": 2.4599659284497446, + "grad_norm": 0.5223091840744019, + "learning_rate": 0.0002, + "loss": 1.6335, + "step": 3610 + }, + { + "epoch": 2.466780238500852, + "grad_norm": 0.4763810932636261, + "learning_rate": 0.0002, + "loss": 1.6315, + "step": 3620 + }, + { + "epoch": 2.4735945485519593, + "grad_norm": 0.5097282528877258, + "learning_rate": 0.0002, + "loss": 1.5623, + "step": 3630 + }, + { + "epoch": 2.4804088586030666, + "grad_norm": 0.5831942558288574, + "learning_rate": 0.0002, + "loss": 1.605, + "step": 3640 + }, + { + "epoch": 2.487223168654174, + "grad_norm": 0.47573572397232056, + "learning_rate": 0.0002, + "loss": 1.6074, + "step": 3650 + }, + { + "epoch": 2.4940374787052813, + "grad_norm": 0.49602726101875305, + "learning_rate": 0.0002, + "loss": 1.6411, + "step": 3660 + }, + { + "epoch": 2.500851788756388, + "grad_norm": 0.5468524694442749, + "learning_rate": 0.0002, + "loss": 1.571, + "step": 3670 + }, + { + "epoch": 2.507666098807496, + "grad_norm": 0.45899373292922974, + "learning_rate": 0.0002, + "loss": 1.5782, + "step": 3680 + }, + { + "epoch": 2.514480408858603, + "grad_norm": 0.5031567215919495, + "learning_rate": 0.0002, + "loss": 1.5114, + "step": 3690 + }, + { + "epoch": 2.5212947189097106, + "grad_norm": 0.5224900841712952, + "learning_rate": 0.0002, + "loss": 1.538, + "step": 3700 + }, + { + "epoch": 2.5281090289608175, + "grad_norm": 0.504769504070282, + "learning_rate": 0.0002, + "loss": 1.5269, + "step": 3710 + }, + { + "epoch": 2.5349233390119252, + "grad_norm": 0.6120529770851135, + "learning_rate": 0.0002, + "loss": 1.5141, + "step": 3720 + }, + { + "epoch": 2.541737649063032, + "grad_norm": 0.47930678725242615, + "learning_rate": 0.0002, + "loss": 1.5666, + "step": 3730 + }, + { + "epoch": 2.54855195911414, + "grad_norm": 0.5039092302322388, + "learning_rate": 0.0002, + "loss": 1.6156, + "step": 3740 + }, + { + "epoch": 2.555366269165247, + "grad_norm": 0.49758994579315186, + "learning_rate": 0.0002, + "loss": 1.5585, + "step": 3750 + }, + { + "epoch": 2.5621805792163546, + "grad_norm": 0.44739171862602234, + "learning_rate": 0.0002, + "loss": 1.5351, + "step": 3760 + }, + { + "epoch": 2.5689948892674614, + "grad_norm": 0.47056373953819275, + "learning_rate": 0.0002, + "loss": 1.5099, + "step": 3770 + }, + { + "epoch": 2.575809199318569, + "grad_norm": 0.5077595114707947, + "learning_rate": 0.0002, + "loss": 1.5524, + "step": 3780 + }, + { + "epoch": 2.582623509369676, + "grad_norm": 0.4981902837753296, + "learning_rate": 0.0002, + "loss": 1.5524, + "step": 3790 + }, + { + "epoch": 2.589437819420784, + "grad_norm": 0.5736238360404968, + "learning_rate": 0.0002, + "loss": 1.5381, + "step": 3800 + }, + { + "epoch": 2.5962521294718908, + "grad_norm": 0.4898384213447571, + "learning_rate": 0.0002, + "loss": 1.67, + "step": 3810 + }, + { + "epoch": 2.6030664395229985, + "grad_norm": 0.4611325263977051, + "learning_rate": 0.0002, + "loss": 1.5411, + "step": 3820 + }, + { + "epoch": 2.6098807495741054, + "grad_norm": 0.5285341739654541, + "learning_rate": 0.0002, + "loss": 1.5662, + "step": 3830 + }, + { + "epoch": 2.616695059625213, + "grad_norm": 0.5679430961608887, + "learning_rate": 0.0002, + "loss": 1.5875, + "step": 3840 + }, + { + "epoch": 2.62350936967632, + "grad_norm": 0.48532548546791077, + "learning_rate": 0.0002, + "loss": 1.5544, + "step": 3850 + }, + { + "epoch": 2.630323679727428, + "grad_norm": 0.45506492257118225, + "learning_rate": 0.0002, + "loss": 1.579, + "step": 3860 + }, + { + "epoch": 2.6371379897785348, + "grad_norm": 0.6552556753158569, + "learning_rate": 0.0002, + "loss": 1.5775, + "step": 3870 + }, + { + "epoch": 2.643952299829642, + "grad_norm": 0.537874698638916, + "learning_rate": 0.0002, + "loss": 1.608, + "step": 3880 + }, + { + "epoch": 2.6507666098807494, + "grad_norm": 0.46102389693260193, + "learning_rate": 0.0002, + "loss": 1.5653, + "step": 3890 + }, + { + "epoch": 2.6575809199318567, + "grad_norm": 0.45531195402145386, + "learning_rate": 0.0002, + "loss": 1.5293, + "step": 3900 + }, + { + "epoch": 2.664395229982964, + "grad_norm": 0.5327293872833252, + "learning_rate": 0.0002, + "loss": 1.5492, + "step": 3910 + }, + { + "epoch": 2.6712095400340714, + "grad_norm": 0.4968956410884857, + "learning_rate": 0.0002, + "loss": 1.5921, + "step": 3920 + }, + { + "epoch": 2.6780238500851787, + "grad_norm": 0.4790082275867462, + "learning_rate": 0.0002, + "loss": 1.5823, + "step": 3930 + }, + { + "epoch": 2.684838160136286, + "grad_norm": 0.5392967462539673, + "learning_rate": 0.0002, + "loss": 1.615, + "step": 3940 + }, + { + "epoch": 2.6916524701873934, + "grad_norm": 0.5076649785041809, + "learning_rate": 0.0002, + "loss": 1.6218, + "step": 3950 + }, + { + "epoch": 2.6984667802385007, + "grad_norm": 0.5628064274787903, + "learning_rate": 0.0002, + "loss": 1.6478, + "step": 3960 + }, + { + "epoch": 2.705281090289608, + "grad_norm": 0.5012659430503845, + "learning_rate": 0.0002, + "loss": 1.5417, + "step": 3970 + }, + { + "epoch": 2.7120954003407154, + "grad_norm": 0.4947647452354431, + "learning_rate": 0.0002, + "loss": 1.5339, + "step": 3980 + }, + { + "epoch": 2.7189097103918227, + "grad_norm": 0.4890969693660736, + "learning_rate": 0.0002, + "loss": 1.5724, + "step": 3990 + }, + { + "epoch": 2.72572402044293, + "grad_norm": 0.4471694231033325, + "learning_rate": 0.0002, + "loss": 1.5746, + "step": 4000 + }, + { + "epoch": 2.7325383304940374, + "grad_norm": 0.5116439461708069, + "learning_rate": 0.0002, + "loss": 1.6669, + "step": 4010 + }, + { + "epoch": 2.7393526405451447, + "grad_norm": 0.5720411539077759, + "learning_rate": 0.0002, + "loss": 1.584, + "step": 4020 + }, + { + "epoch": 2.746166950596252, + "grad_norm": 0.5529406070709229, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 4030 + }, + { + "epoch": 2.7529812606473594, + "grad_norm": 0.5229396820068359, + "learning_rate": 0.0002, + "loss": 1.6296, + "step": 4040 + }, + { + "epoch": 2.7597955706984667, + "grad_norm": 0.5270276069641113, + "learning_rate": 0.0002, + "loss": 1.5363, + "step": 4050 + }, + { + "epoch": 2.766609880749574, + "grad_norm": 0.48413026332855225, + "learning_rate": 0.0002, + "loss": 1.6305, + "step": 4060 + }, + { + "epoch": 2.7734241908006814, + "grad_norm": 0.5145403742790222, + "learning_rate": 0.0002, + "loss": 1.5488, + "step": 4070 + }, + { + "epoch": 2.7802385008517887, + "grad_norm": 0.48626071214675903, + "learning_rate": 0.0002, + "loss": 1.6023, + "step": 4080 + }, + { + "epoch": 2.787052810902896, + "grad_norm": 0.5018984079360962, + "learning_rate": 0.0002, + "loss": 1.6082, + "step": 4090 + }, + { + "epoch": 2.7938671209540034, + "grad_norm": 0.4881938695907593, + "learning_rate": 0.0002, + "loss": 1.7166, + "step": 4100 + }, + { + "epoch": 2.8006814310051107, + "grad_norm": 0.5151546001434326, + "learning_rate": 0.0002, + "loss": 1.515, + "step": 4110 + }, + { + "epoch": 2.807495741056218, + "grad_norm": 0.5109850764274597, + "learning_rate": 0.0002, + "loss": 1.6069, + "step": 4120 + }, + { + "epoch": 2.8143100511073254, + "grad_norm": 0.5109251141548157, + "learning_rate": 0.0002, + "loss": 1.6153, + "step": 4130 + }, + { + "epoch": 2.8211243611584327, + "grad_norm": 0.5025496482849121, + "learning_rate": 0.0002, + "loss": 1.6365, + "step": 4140 + }, + { + "epoch": 2.82793867120954, + "grad_norm": 0.49027004837989807, + "learning_rate": 0.0002, + "loss": 1.6292, + "step": 4150 + }, + { + "epoch": 2.8347529812606473, + "grad_norm": 0.4957362413406372, + "learning_rate": 0.0002, + "loss": 1.5591, + "step": 4160 + }, + { + "epoch": 2.8415672913117547, + "grad_norm": 0.5159927606582642, + "learning_rate": 0.0002, + "loss": 1.6759, + "step": 4170 + }, + { + "epoch": 2.848381601362862, + "grad_norm": 0.6040670871734619, + "learning_rate": 0.0002, + "loss": 1.577, + "step": 4180 + }, + { + "epoch": 2.8551959114139693, + "grad_norm": 0.5489953756332397, + "learning_rate": 0.0002, + "loss": 1.5295, + "step": 4190 + }, + { + "epoch": 2.8620102214650767, + "grad_norm": 0.5416634678840637, + "learning_rate": 0.0002, + "loss": 1.5909, + "step": 4200 + }, + { + "epoch": 2.868824531516184, + "grad_norm": 0.5278245210647583, + "learning_rate": 0.0002, + "loss": 1.6014, + "step": 4210 + }, + { + "epoch": 2.8756388415672913, + "grad_norm": 0.43382319808006287, + "learning_rate": 0.0002, + "loss": 1.4848, + "step": 4220 + }, + { + "epoch": 2.8824531516183987, + "grad_norm": 0.4724387228488922, + "learning_rate": 0.0002, + "loss": 1.6092, + "step": 4230 + }, + { + "epoch": 2.889267461669506, + "grad_norm": 0.49824780225753784, + "learning_rate": 0.0002, + "loss": 1.5748, + "step": 4240 + }, + { + "epoch": 2.8960817717206133, + "grad_norm": 0.5360262989997864, + "learning_rate": 0.0002, + "loss": 1.6163, + "step": 4250 + }, + { + "epoch": 2.9028960817717206, + "grad_norm": 0.49090322852134705, + "learning_rate": 0.0002, + "loss": 1.5934, + "step": 4260 + }, + { + "epoch": 2.909710391822828, + "grad_norm": 0.5613328218460083, + "learning_rate": 0.0002, + "loss": 1.624, + "step": 4270 + }, + { + "epoch": 2.9165247018739353, + "grad_norm": 0.4611356258392334, + "learning_rate": 0.0002, + "loss": 1.5713, + "step": 4280 + }, + { + "epoch": 2.9233390119250426, + "grad_norm": 0.550897479057312, + "learning_rate": 0.0002, + "loss": 1.5457, + "step": 4290 + }, + { + "epoch": 2.93015332197615, + "grad_norm": 0.5089612603187561, + "learning_rate": 0.0002, + "loss": 1.6225, + "step": 4300 + }, + { + "epoch": 2.9369676320272573, + "grad_norm": 0.5210904479026794, + "learning_rate": 0.0002, + "loss": 1.5897, + "step": 4310 + }, + { + "epoch": 2.9437819420783646, + "grad_norm": 0.5506424903869629, + "learning_rate": 0.0002, + "loss": 1.6198, + "step": 4320 + }, + { + "epoch": 2.950596252129472, + "grad_norm": 0.5118561387062073, + "learning_rate": 0.0002, + "loss": 1.6395, + "step": 4330 + }, + { + "epoch": 2.9574105621805793, + "grad_norm": 0.5034464597702026, + "learning_rate": 0.0002, + "loss": 1.704, + "step": 4340 + }, + { + "epoch": 2.9642248722316866, + "grad_norm": 0.5019990801811218, + "learning_rate": 0.0002, + "loss": 1.6314, + "step": 4350 + }, + { + "epoch": 2.971039182282794, + "grad_norm": 0.5423325300216675, + "learning_rate": 0.0002, + "loss": 1.6161, + "step": 4360 + }, + { + "epoch": 2.9778534923339013, + "grad_norm": 0.5287469625473022, + "learning_rate": 0.0002, + "loss": 1.6144, + "step": 4370 + }, + { + "epoch": 2.9846678023850086, + "grad_norm": 0.5206913352012634, + "learning_rate": 0.0002, + "loss": 1.5227, + "step": 4380 + }, + { + "epoch": 2.991482112436116, + "grad_norm": 0.5407394170761108, + "learning_rate": 0.0002, + "loss": 1.6026, + "step": 4390 + }, + { + "epoch": 2.9982964224872233, + "grad_norm": 0.5244600176811218, + "learning_rate": 0.0002, + "loss": 1.5908, + "step": 4400 + }, + { + "epoch": 2.9996592844974446, + "eval_loss": 1.8412635326385498, + "eval_runtime": 65.5583, + "eval_samples_per_second": 7.734, + "eval_steps_per_second": 0.976, + "step": 4402 + }, + { + "epoch": 3.0051107325383306, + "grad_norm": 0.5172150731086731, + "learning_rate": 0.0002, + "loss": 1.5157, + "step": 4410 + }, + { + "epoch": 3.011925042589438, + "grad_norm": 0.6882525086402893, + "learning_rate": 0.0002, + "loss": 1.398, + "step": 4420 + }, + { + "epoch": 3.0187393526405453, + "grad_norm": 0.6435003280639648, + "learning_rate": 0.0002, + "loss": 1.3884, + "step": 4430 + }, + { + "epoch": 3.0255536626916526, + "grad_norm": 0.7126057147979736, + "learning_rate": 0.0002, + "loss": 1.4493, + "step": 4440 + }, + { + "epoch": 3.03236797274276, + "grad_norm": 0.6634385585784912, + "learning_rate": 0.0002, + "loss": 1.4397, + "step": 4450 + }, + { + "epoch": 3.0391822827938673, + "grad_norm": 0.6468435525894165, + "learning_rate": 0.0002, + "loss": 1.3674, + "step": 4460 + }, + { + "epoch": 3.0459965928449746, + "grad_norm": 0.5690478086471558, + "learning_rate": 0.0002, + "loss": 1.4045, + "step": 4470 + }, + { + "epoch": 3.052810902896082, + "grad_norm": 0.7323708534240723, + "learning_rate": 0.0002, + "loss": 1.3742, + "step": 4480 + }, + { + "epoch": 3.0596252129471893, + "grad_norm": 0.6989302039146423, + "learning_rate": 0.0002, + "loss": 1.3281, + "step": 4490 + }, + { + "epoch": 3.0664395229982966, + "grad_norm": 0.6704450845718384, + "learning_rate": 0.0002, + "loss": 1.379, + "step": 4500 + }, + { + "epoch": 3.073253833049404, + "grad_norm": 0.769137442111969, + "learning_rate": 0.0002, + "loss": 1.4028, + "step": 4510 + }, + { + "epoch": 3.0800681431005112, + "grad_norm": 0.6556448936462402, + "learning_rate": 0.0002, + "loss": 1.4295, + "step": 4520 + }, + { + "epoch": 3.0868824531516186, + "grad_norm": 0.7143950462341309, + "learning_rate": 0.0002, + "loss": 1.2763, + "step": 4530 + }, + { + "epoch": 3.093696763202726, + "grad_norm": 0.7060510516166687, + "learning_rate": 0.0002, + "loss": 1.4806, + "step": 4540 + }, + { + "epoch": 3.1005110732538332, + "grad_norm": 0.6637526750564575, + "learning_rate": 0.0002, + "loss": 1.4097, + "step": 4550 + }, + { + "epoch": 3.1073253833049406, + "grad_norm": 0.822989284992218, + "learning_rate": 0.0002, + "loss": 1.4752, + "step": 4560 + }, + { + "epoch": 3.114139693356048, + "grad_norm": 0.5542152523994446, + "learning_rate": 0.0002, + "loss": 1.4994, + "step": 4570 + }, + { + "epoch": 3.1209540034071552, + "grad_norm": 0.7780306935310364, + "learning_rate": 0.0002, + "loss": 1.4306, + "step": 4580 + }, + { + "epoch": 3.1277683134582626, + "grad_norm": 0.7372637987136841, + "learning_rate": 0.0002, + "loss": 1.3909, + "step": 4590 + }, + { + "epoch": 3.1345826235093694, + "grad_norm": 0.6730087995529175, + "learning_rate": 0.0002, + "loss": 1.3989, + "step": 4600 + }, + { + "epoch": 3.1413969335604772, + "grad_norm": 0.6687398552894592, + "learning_rate": 0.0002, + "loss": 1.3591, + "step": 4610 + }, + { + "epoch": 3.148211243611584, + "grad_norm": 0.7645083665847778, + "learning_rate": 0.0002, + "loss": 1.436, + "step": 4620 + }, + { + "epoch": 3.155025553662692, + "grad_norm": 0.6770380139350891, + "learning_rate": 0.0002, + "loss": 1.3681, + "step": 4630 + }, + { + "epoch": 3.1618398637137988, + "grad_norm": 0.7200576663017273, + "learning_rate": 0.0002, + "loss": 1.405, + "step": 4640 + }, + { + "epoch": 3.168654173764906, + "grad_norm": 0.6663638949394226, + "learning_rate": 0.0002, + "loss": 1.3752, + "step": 4650 + }, + { + "epoch": 3.1754684838160134, + "grad_norm": 0.6602960228919983, + "learning_rate": 0.0002, + "loss": 1.4099, + "step": 4660 + }, + { + "epoch": 3.1822827938671208, + "grad_norm": 0.7838228344917297, + "learning_rate": 0.0002, + "loss": 1.4003, + "step": 4670 + }, + { + "epoch": 3.189097103918228, + "grad_norm": 0.7559184432029724, + "learning_rate": 0.0002, + "loss": 1.3853, + "step": 4680 + }, + { + "epoch": 3.1959114139693354, + "grad_norm": 0.6609814167022705, + "learning_rate": 0.0002, + "loss": 1.4516, + "step": 4690 + }, + { + "epoch": 3.2027257240204428, + "grad_norm": 0.8470419645309448, + "learning_rate": 0.0002, + "loss": 1.4464, + "step": 4700 + }, + { + "epoch": 3.20954003407155, + "grad_norm": 0.7282822728157043, + "learning_rate": 0.0002, + "loss": 1.428, + "step": 4710 + }, + { + "epoch": 3.2163543441226574, + "grad_norm": 0.6722773313522339, + "learning_rate": 0.0002, + "loss": 1.5261, + "step": 4720 + }, + { + "epoch": 3.2231686541737647, + "grad_norm": 0.7630265355110168, + "learning_rate": 0.0002, + "loss": 1.3809, + "step": 4730 + }, + { + "epoch": 3.229982964224872, + "grad_norm": 0.7102773785591125, + "learning_rate": 0.0002, + "loss": 1.42, + "step": 4740 + }, + { + "epoch": 3.2367972742759794, + "grad_norm": 0.7778299450874329, + "learning_rate": 0.0002, + "loss": 1.3529, + "step": 4750 + }, + { + "epoch": 3.2436115843270867, + "grad_norm": 0.7189921736717224, + "learning_rate": 0.0002, + "loss": 1.4715, + "step": 4760 + }, + { + "epoch": 3.250425894378194, + "grad_norm": 0.7708092331886292, + "learning_rate": 0.0002, + "loss": 1.4328, + "step": 4770 + }, + { + "epoch": 3.2572402044293014, + "grad_norm": 0.7208452224731445, + "learning_rate": 0.0002, + "loss": 1.3855, + "step": 4780 + }, + { + "epoch": 3.2640545144804087, + "grad_norm": 0.7220432758331299, + "learning_rate": 0.0002, + "loss": 1.3206, + "step": 4790 + }, + { + "epoch": 3.270868824531516, + "grad_norm": 0.7064954042434692, + "learning_rate": 0.0002, + "loss": 1.463, + "step": 4800 + }, + { + "epoch": 3.2776831345826234, + "grad_norm": 0.6618382334709167, + "learning_rate": 0.0002, + "loss": 1.4236, + "step": 4810 + }, + { + "epoch": 3.2844974446337307, + "grad_norm": 0.6854256391525269, + "learning_rate": 0.0002, + "loss": 1.3878, + "step": 4820 + }, + { + "epoch": 3.291311754684838, + "grad_norm": 0.6036319136619568, + "learning_rate": 0.0002, + "loss": 1.4236, + "step": 4830 + }, + { + "epoch": 3.2981260647359454, + "grad_norm": 0.714678943157196, + "learning_rate": 0.0002, + "loss": 1.4796, + "step": 4840 + }, + { + "epoch": 3.3049403747870527, + "grad_norm": 0.7218600511550903, + "learning_rate": 0.0002, + "loss": 1.4273, + "step": 4850 + }, + { + "epoch": 3.31175468483816, + "grad_norm": 0.7243074774742126, + "learning_rate": 0.0002, + "loss": 1.3915, + "step": 4860 + }, + { + "epoch": 3.3185689948892674, + "grad_norm": 0.7058630585670471, + "learning_rate": 0.0002, + "loss": 1.4088, + "step": 4870 + }, + { + "epoch": 3.3253833049403747, + "grad_norm": 0.7091076970100403, + "learning_rate": 0.0002, + "loss": 1.3837, + "step": 4880 + }, + { + "epoch": 3.332197614991482, + "grad_norm": 0.7375147342681885, + "learning_rate": 0.0002, + "loss": 1.4745, + "step": 4890 + }, + { + "epoch": 3.3390119250425894, + "grad_norm": 0.9426755309104919, + "learning_rate": 0.0002, + "loss": 1.4826, + "step": 4900 + }, + { + "epoch": 3.3458262350936967, + "grad_norm": 0.6508213877677917, + "learning_rate": 0.0002, + "loss": 1.369, + "step": 4910 + }, + { + "epoch": 3.352640545144804, + "grad_norm": 0.6945043206214905, + "learning_rate": 0.0002, + "loss": 1.3839, + "step": 4920 + }, + { + "epoch": 3.3594548551959114, + "grad_norm": 0.6335888504981995, + "learning_rate": 0.0002, + "loss": 1.3571, + "step": 4930 + }, + { + "epoch": 3.3662691652470187, + "grad_norm": 0.6947107911109924, + "learning_rate": 0.0002, + "loss": 1.4391, + "step": 4940 + }, + { + "epoch": 3.373083475298126, + "grad_norm": 0.8204733729362488, + "learning_rate": 0.0002, + "loss": 1.3885, + "step": 4950 + }, + { + "epoch": 3.3798977853492334, + "grad_norm": 0.7212244868278503, + "learning_rate": 0.0002, + "loss": 1.4886, + "step": 4960 + }, + { + "epoch": 3.3867120954003407, + "grad_norm": 0.6053042411804199, + "learning_rate": 0.0002, + "loss": 1.4581, + "step": 4970 + }, + { + "epoch": 3.393526405451448, + "grad_norm": 0.7820029854774475, + "learning_rate": 0.0002, + "loss": 1.3863, + "step": 4980 + }, + { + "epoch": 3.4003407155025553, + "grad_norm": 0.6866770386695862, + "learning_rate": 0.0002, + "loss": 1.4326, + "step": 4990 + }, + { + "epoch": 3.4071550255536627, + "grad_norm": 0.6652463674545288, + "learning_rate": 0.0002, + "loss": 1.4287, + "step": 5000 + }, + { + "epoch": 3.41396933560477, + "grad_norm": 1.1209032535552979, + "learning_rate": 0.0002, + "loss": 1.3667, + "step": 5010 + }, + { + "epoch": 3.4207836456558773, + "grad_norm": 0.8390814661979675, + "learning_rate": 0.0002, + "loss": 1.4461, + "step": 5020 + }, + { + "epoch": 3.4275979557069847, + "grad_norm": 0.7541858553886414, + "learning_rate": 0.0002, + "loss": 1.4556, + "step": 5030 + }, + { + "epoch": 3.434412265758092, + "grad_norm": 0.6902772784233093, + "learning_rate": 0.0002, + "loss": 1.4245, + "step": 5040 + }, + { + "epoch": 3.4412265758091993, + "grad_norm": 0.7070329785346985, + "learning_rate": 0.0002, + "loss": 1.3953, + "step": 5050 + }, + { + "epoch": 3.4480408858603067, + "grad_norm": 0.8075643181800842, + "learning_rate": 0.0002, + "loss": 1.3903, + "step": 5060 + }, + { + "epoch": 3.454855195911414, + "grad_norm": 0.7133861780166626, + "learning_rate": 0.0002, + "loss": 1.3929, + "step": 5070 + }, + { + "epoch": 3.4616695059625213, + "grad_norm": 0.6631823182106018, + "learning_rate": 0.0002, + "loss": 1.4632, + "step": 5080 + }, + { + "epoch": 3.4684838160136287, + "grad_norm": 0.673870325088501, + "learning_rate": 0.0002, + "loss": 1.4162, + "step": 5090 + }, + { + "epoch": 3.475298126064736, + "grad_norm": 0.6438634395599365, + "learning_rate": 0.0002, + "loss": 1.4247, + "step": 5100 + }, + { + "epoch": 3.4821124361158433, + "grad_norm": 0.7560495734214783, + "learning_rate": 0.0002, + "loss": 1.4421, + "step": 5110 + }, + { + "epoch": 3.4889267461669506, + "grad_norm": 0.6877814531326294, + "learning_rate": 0.0002, + "loss": 1.4125, + "step": 5120 + }, + { + "epoch": 3.495741056218058, + "grad_norm": 0.7031328678131104, + "learning_rate": 0.0002, + "loss": 1.4308, + "step": 5130 + }, + { + "epoch": 3.5025553662691653, + "grad_norm": 0.6797195672988892, + "learning_rate": 0.0002, + "loss": 1.3705, + "step": 5140 + }, + { + "epoch": 3.5093696763202726, + "grad_norm": 0.6766413450241089, + "learning_rate": 0.0002, + "loss": 1.4687, + "step": 5150 + }, + { + "epoch": 3.51618398637138, + "grad_norm": 0.666656494140625, + "learning_rate": 0.0002, + "loss": 1.4194, + "step": 5160 + }, + { + "epoch": 3.5229982964224873, + "grad_norm": 0.74996417760849, + "learning_rate": 0.0002, + "loss": 1.469, + "step": 5170 + }, + { + "epoch": 3.5298126064735946, + "grad_norm": 0.7370911836624146, + "learning_rate": 0.0002, + "loss": 1.4848, + "step": 5180 + }, + { + "epoch": 3.536626916524702, + "grad_norm": 0.9063456654548645, + "learning_rate": 0.0002, + "loss": 1.4523, + "step": 5190 + }, + { + "epoch": 3.5434412265758093, + "grad_norm": 0.6861422657966614, + "learning_rate": 0.0002, + "loss": 1.4726, + "step": 5200 + }, + { + "epoch": 3.5502555366269166, + "grad_norm": 0.7104039788246155, + "learning_rate": 0.0002, + "loss": 1.4803, + "step": 5210 + }, + { + "epoch": 3.557069846678024, + "grad_norm": 0.6578653454780579, + "learning_rate": 0.0002, + "loss": 1.4313, + "step": 5220 + }, + { + "epoch": 3.5638841567291313, + "grad_norm": 0.7336562275886536, + "learning_rate": 0.0002, + "loss": 1.4596, + "step": 5230 + }, + { + "epoch": 3.5706984667802386, + "grad_norm": 0.7163010835647583, + "learning_rate": 0.0002, + "loss": 1.4591, + "step": 5240 + }, + { + "epoch": 3.577512776831346, + "grad_norm": 0.8112391233444214, + "learning_rate": 0.0002, + "loss": 1.3814, + "step": 5250 + }, + { + "epoch": 3.5843270868824533, + "grad_norm": 0.7260391116142273, + "learning_rate": 0.0002, + "loss": 1.4249, + "step": 5260 + }, + { + "epoch": 3.5911413969335606, + "grad_norm": 0.7038731575012207, + "learning_rate": 0.0002, + "loss": 1.4249, + "step": 5270 + }, + { + "epoch": 3.597955706984668, + "grad_norm": 0.7864376902580261, + "learning_rate": 0.0002, + "loss": 1.4172, + "step": 5280 + }, + { + "epoch": 3.6047700170357753, + "grad_norm": 0.6968383193016052, + "learning_rate": 0.0002, + "loss": 1.4637, + "step": 5290 + }, + { + "epoch": 3.6115843270868826, + "grad_norm": 0.6726206541061401, + "learning_rate": 0.0002, + "loss": 1.5269, + "step": 5300 + }, + { + "epoch": 3.61839863713799, + "grad_norm": 0.6716854572296143, + "learning_rate": 0.0002, + "loss": 1.4199, + "step": 5310 + }, + { + "epoch": 3.6252129471890973, + "grad_norm": 0.7229742407798767, + "learning_rate": 0.0002, + "loss": 1.4686, + "step": 5320 + }, + { + "epoch": 3.6320272572402046, + "grad_norm": 0.7338683009147644, + "learning_rate": 0.0002, + "loss": 1.4441, + "step": 5330 + }, + { + "epoch": 3.638841567291312, + "grad_norm": 0.771672785282135, + "learning_rate": 0.0002, + "loss": 1.4116, + "step": 5340 + }, + { + "epoch": 3.645655877342419, + "grad_norm": 0.7024078369140625, + "learning_rate": 0.0002, + "loss": 1.4007, + "step": 5350 + }, + { + "epoch": 3.6524701873935266, + "grad_norm": 0.6847538352012634, + "learning_rate": 0.0002, + "loss": 1.4996, + "step": 5360 + }, + { + "epoch": 3.6592844974446335, + "grad_norm": 0.71802818775177, + "learning_rate": 0.0002, + "loss": 1.4111, + "step": 5370 + }, + { + "epoch": 3.6660988074957412, + "grad_norm": 0.78530353307724, + "learning_rate": 0.0002, + "loss": 1.4224, + "step": 5380 + }, + { + "epoch": 3.672913117546848, + "grad_norm": 0.7262226939201355, + "learning_rate": 0.0002, + "loss": 1.4582, + "step": 5390 + }, + { + "epoch": 3.679727427597956, + "grad_norm": 0.7608316540718079, + "learning_rate": 0.0002, + "loss": 1.4704, + "step": 5400 + }, + { + "epoch": 3.686541737649063, + "grad_norm": 0.6994926333427429, + "learning_rate": 0.0002, + "loss": 1.3742, + "step": 5410 + }, + { + "epoch": 3.6933560477001706, + "grad_norm": 0.7888479828834534, + "learning_rate": 0.0002, + "loss": 1.4738, + "step": 5420 + }, + { + "epoch": 3.7001703577512775, + "grad_norm": 0.7053858041763306, + "learning_rate": 0.0002, + "loss": 1.4213, + "step": 5430 + }, + { + "epoch": 3.7069846678023852, + "grad_norm": 0.7063165903091431, + "learning_rate": 0.0002, + "loss": 1.4988, + "step": 5440 + }, + { + "epoch": 3.713798977853492, + "grad_norm": 0.6603744626045227, + "learning_rate": 0.0002, + "loss": 1.4386, + "step": 5450 + }, + { + "epoch": 3.7206132879046, + "grad_norm": 0.7043602466583252, + "learning_rate": 0.0002, + "loss": 1.4695, + "step": 5460 + }, + { + "epoch": 3.7274275979557068, + "grad_norm": 0.7026081681251526, + "learning_rate": 0.0002, + "loss": 1.5051, + "step": 5470 + }, + { + "epoch": 3.7342419080068145, + "grad_norm": 0.7200090289115906, + "learning_rate": 0.0002, + "loss": 1.5613, + "step": 5480 + }, + { + "epoch": 3.7410562180579214, + "grad_norm": 0.7170904278755188, + "learning_rate": 0.0002, + "loss": 1.4182, + "step": 5490 + }, + { + "epoch": 3.747870528109029, + "grad_norm": 0.7489104866981506, + "learning_rate": 0.0002, + "loss": 1.4344, + "step": 5500 + }, + { + "epoch": 3.754684838160136, + "grad_norm": 0.6540989875793457, + "learning_rate": 0.0002, + "loss": 1.4911, + "step": 5510 + }, + { + "epoch": 3.761499148211244, + "grad_norm": 0.6654048562049866, + "learning_rate": 0.0002, + "loss": 1.4955, + "step": 5520 + }, + { + "epoch": 3.7683134582623508, + "grad_norm": 0.6577395796775818, + "learning_rate": 0.0002, + "loss": 1.4487, + "step": 5530 + }, + { + "epoch": 3.7751277683134585, + "grad_norm": 0.7762192487716675, + "learning_rate": 0.0002, + "loss": 1.4283, + "step": 5540 + }, + { + "epoch": 3.7819420783645654, + "grad_norm": 0.6336314678192139, + "learning_rate": 0.0002, + "loss": 1.4727, + "step": 5550 + }, + { + "epoch": 3.7887563884156727, + "grad_norm": 0.7098057866096497, + "learning_rate": 0.0002, + "loss": 1.4588, + "step": 5560 + }, + { + "epoch": 3.79557069846678, + "grad_norm": 0.7379715442657471, + "learning_rate": 0.0002, + "loss": 1.4679, + "step": 5570 + }, + { + "epoch": 3.8023850085178874, + "grad_norm": 0.6726924777030945, + "learning_rate": 0.0002, + "loss": 1.4633, + "step": 5580 + }, + { + "epoch": 3.8091993185689947, + "grad_norm": 1.1212009191513062, + "learning_rate": 0.0002, + "loss": 1.4751, + "step": 5590 + }, + { + "epoch": 3.816013628620102, + "grad_norm": 0.6503795981407166, + "learning_rate": 0.0002, + "loss": 1.4503, + "step": 5600 + }, + { + "epoch": 3.8228279386712094, + "grad_norm": 0.7041325569152832, + "learning_rate": 0.0002, + "loss": 1.4754, + "step": 5610 + }, + { + "epoch": 3.8296422487223167, + "grad_norm": 0.7962933778762817, + "learning_rate": 0.0002, + "loss": 1.4199, + "step": 5620 + }, + { + "epoch": 3.836456558773424, + "grad_norm": 0.6613591909408569, + "learning_rate": 0.0002, + "loss": 1.4672, + "step": 5630 + }, + { + "epoch": 3.8432708688245314, + "grad_norm": 0.7293516397476196, + "learning_rate": 0.0002, + "loss": 1.5688, + "step": 5640 + }, + { + "epoch": 3.8500851788756387, + "grad_norm": 0.7388607859611511, + "learning_rate": 0.0002, + "loss": 1.4149, + "step": 5650 + }, + { + "epoch": 3.856899488926746, + "grad_norm": 0.6440677642822266, + "learning_rate": 0.0002, + "loss": 1.4743, + "step": 5660 + }, + { + "epoch": 3.8637137989778534, + "grad_norm": 0.7729013562202454, + "learning_rate": 0.0002, + "loss": 1.5082, + "step": 5670 + }, + { + "epoch": 3.8705281090289607, + "grad_norm": 0.6696794033050537, + "learning_rate": 0.0002, + "loss": 1.4608, + "step": 5680 + }, + { + "epoch": 3.877342419080068, + "grad_norm": 0.7151781320571899, + "learning_rate": 0.0002, + "loss": 1.472, + "step": 5690 + }, + { + "epoch": 3.8841567291311754, + "grad_norm": 0.6736966371536255, + "learning_rate": 0.0002, + "loss": 1.4923, + "step": 5700 + }, + { + "epoch": 3.8909710391822827, + "grad_norm": 0.7444243431091309, + "learning_rate": 0.0002, + "loss": 1.4453, + "step": 5710 + }, + { + "epoch": 3.89778534923339, + "grad_norm": 0.6701464653015137, + "learning_rate": 0.0002, + "loss": 1.4562, + "step": 5720 + }, + { + "epoch": 3.9045996592844974, + "grad_norm": 0.7231952548027039, + "learning_rate": 0.0002, + "loss": 1.4478, + "step": 5730 + }, + { + "epoch": 3.9114139693356047, + "grad_norm": 0.831954300403595, + "learning_rate": 0.0002, + "loss": 1.4539, + "step": 5740 + }, + { + "epoch": 3.918228279386712, + "grad_norm": 0.7697733640670776, + "learning_rate": 0.0002, + "loss": 1.5122, + "step": 5750 + }, + { + "epoch": 3.9250425894378194, + "grad_norm": 0.6964395046234131, + "learning_rate": 0.0002, + "loss": 1.4552, + "step": 5760 + }, + { + "epoch": 3.9318568994889267, + "grad_norm": 0.6942925453186035, + "learning_rate": 0.0002, + "loss": 1.4688, + "step": 5770 + }, + { + "epoch": 3.938671209540034, + "grad_norm": 0.6491202712059021, + "learning_rate": 0.0002, + "loss": 1.4668, + "step": 5780 + }, + { + "epoch": 3.9454855195911414, + "grad_norm": 0.7004382610321045, + "learning_rate": 0.0002, + "loss": 1.4404, + "step": 5790 + }, + { + "epoch": 3.9522998296422487, + "grad_norm": 0.7337747812271118, + "learning_rate": 0.0002, + "loss": 1.5022, + "step": 5800 + }, + { + "epoch": 3.959114139693356, + "grad_norm": 0.6923640966415405, + "learning_rate": 0.0002, + "loss": 1.5314, + "step": 5810 + }, + { + "epoch": 3.9659284497444633, + "grad_norm": 0.6815266609191895, + "learning_rate": 0.0002, + "loss": 1.4811, + "step": 5820 + }, + { + "epoch": 3.9727427597955707, + "grad_norm": 0.6755654811859131, + "learning_rate": 0.0002, + "loss": 1.437, + "step": 5830 + }, + { + "epoch": 3.979557069846678, + "grad_norm": 0.6912487149238586, + "learning_rate": 0.0002, + "loss": 1.4277, + "step": 5840 + }, + { + "epoch": 3.9863713798977853, + "grad_norm": 0.6948044896125793, + "learning_rate": 0.0002, + "loss": 1.4654, + "step": 5850 + }, + { + "epoch": 3.9931856899488927, + "grad_norm": 0.6735455989837646, + "learning_rate": 0.0002, + "loss": 1.4779, + "step": 5860 + }, + { + "epoch": 4.0, + "grad_norm": 0.7005048990249634, + "learning_rate": 0.0002, + "loss": 1.5102, + "step": 5870 + }, + { + "epoch": 4.0, + "eval_loss": 1.923058032989502, + "eval_runtime": 58.9903, + "eval_samples_per_second": 8.595, + "eval_steps_per_second": 1.085, + "step": 5870 + } + ], + "logging_steps": 10, + "max_steps": 11736, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.015051868372992e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-5870/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-5870/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2f734531f14829705714a5a2703d90c81f36eeb0 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-5870/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2af5d486d0b370637d5a3999ab7668f94f5d53b6497f018967564aef5e8ad133 +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-7337/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-7337/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-7337/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-7337/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-7337/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-7337/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-7337/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-7337/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5d67121bd706ee47d737871d6ebfc27034281821 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-7337/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:710c07c9e2af41cdcc67dd4c06ce27184fe644e4ced3fad348c95b99e38931d8 +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-7337/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-7337/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..caf0160a63f77d245e033df4363db3e31e26d7d1 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-7337/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6dce5cad797ff8d9b76ff56f55ebe89d995b602de58d5ec7a4eaeb448d6879f +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-7337/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-7337/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..3027848c19766d4060a7b3041131850480fc2e6e --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-7337/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fa32d90e1407128079864510e4ed0565289de00560eedbe2bf517e41e7dd1ef +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-7337/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-7337/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c571c4169cb08ac5f3154716e788b54d494cf608 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-7337/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d8009e5519616ea44f02b4c094a3795fd6b0e28febea52fa33b18733acb1acd +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-7337/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-7337/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-7337/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-7337/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-7337/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-7337/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-7337/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-7337/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-7337/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-7337/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-7337/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-7337/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-7337/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-7337/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..afd586dff5e1c6b0d81c6eed3b550b5de3874163 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-7337/trainer_state.json @@ -0,0 +1,5204 @@ +{ + "best_metric": 1.8034634590148926, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467", + "epoch": 4.999659284497445, + "eval_steps": 10, + "global_step": 7337, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0068143100511073255, + "grad_norm": 0.635574460029602, + "learning_rate": 0.0002, + "loss": 3.0988, + "step": 10 + }, + { + "epoch": 0.013628620102214651, + "grad_norm": 1.0401769876480103, + "learning_rate": 0.0002, + "loss": 2.5594, + "step": 20 + }, + { + "epoch": 0.020442930153321975, + "grad_norm": 0.4514131247997284, + "learning_rate": 0.0002, + "loss": 2.3587, + "step": 30 + }, + { + "epoch": 0.027257240204429302, + "grad_norm": 0.6303355693817139, + "learning_rate": 0.0002, + "loss": 2.121, + "step": 40 + }, + { + "epoch": 0.034071550255536626, + "grad_norm": 0.4648270606994629, + "learning_rate": 0.0002, + "loss": 1.9833, + "step": 50 + }, + { + "epoch": 0.04088586030664395, + "grad_norm": 0.42953479290008545, + "learning_rate": 0.0002, + "loss": 1.9384, + "step": 60 + }, + { + "epoch": 0.04770017035775128, + "grad_norm": 0.433614581823349, + "learning_rate": 0.0002, + "loss": 1.9202, + "step": 70 + }, + { + "epoch": 0.054514480408858604, + "grad_norm": 0.45318254828453064, + "learning_rate": 0.0002, + "loss": 1.911, + "step": 80 + }, + { + "epoch": 0.06132879045996593, + "grad_norm": 0.4023568630218506, + "learning_rate": 0.0002, + "loss": 1.8895, + "step": 90 + }, + { + "epoch": 0.06814310051107325, + "grad_norm": 0.43260207772254944, + "learning_rate": 0.0002, + "loss": 1.9257, + "step": 100 + }, + { + "epoch": 0.07495741056218058, + "grad_norm": 0.43389809131622314, + "learning_rate": 0.0002, + "loss": 1.9178, + "step": 110 + }, + { + "epoch": 0.0817717206132879, + "grad_norm": 0.39307987689971924, + "learning_rate": 0.0002, + "loss": 1.9071, + "step": 120 + }, + { + "epoch": 0.08858603066439523, + "grad_norm": 0.7703037261962891, + "learning_rate": 0.0002, + "loss": 1.9482, + "step": 130 + }, + { + "epoch": 0.09540034071550256, + "grad_norm": 0.38547563552856445, + "learning_rate": 0.0002, + "loss": 1.8394, + "step": 140 + }, + { + "epoch": 0.10221465076660988, + "grad_norm": 0.37948688864707947, + "learning_rate": 0.0002, + "loss": 1.9037, + "step": 150 + }, + { + "epoch": 0.10902896081771721, + "grad_norm": 0.33022379875183105, + "learning_rate": 0.0002, + "loss": 1.8664, + "step": 160 + }, + { + "epoch": 0.11584327086882454, + "grad_norm": 0.33703792095184326, + "learning_rate": 0.0002, + "loss": 1.8659, + "step": 170 + }, + { + "epoch": 0.12265758091993186, + "grad_norm": 0.3259912431240082, + "learning_rate": 0.0002, + "loss": 1.8389, + "step": 180 + }, + { + "epoch": 0.12947189097103917, + "grad_norm": 0.3593858778476715, + "learning_rate": 0.0002, + "loss": 1.8424, + "step": 190 + }, + { + "epoch": 0.1362862010221465, + "grad_norm": 0.3401614725589752, + "learning_rate": 0.0002, + "loss": 1.8457, + "step": 200 + }, + { + "epoch": 0.14310051107325383, + "grad_norm": 0.3892078697681427, + "learning_rate": 0.0002, + "loss": 1.7721, + "step": 210 + }, + { + "epoch": 0.14991482112436116, + "grad_norm": 0.315374493598938, + "learning_rate": 0.0002, + "loss": 1.8351, + "step": 220 + }, + { + "epoch": 0.1567291311754685, + "grad_norm": 0.3679497539997101, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 230 + }, + { + "epoch": 0.1635434412265758, + "grad_norm": 0.336730033159256, + "learning_rate": 0.0002, + "loss": 1.816, + "step": 240 + }, + { + "epoch": 0.17035775127768313, + "grad_norm": 0.36472755670547485, + "learning_rate": 0.0002, + "loss": 1.7849, + "step": 250 + }, + { + "epoch": 0.17717206132879046, + "grad_norm": 0.34864023327827454, + "learning_rate": 0.0002, + "loss": 1.7766, + "step": 260 + }, + { + "epoch": 0.1839863713798978, + "grad_norm": 0.3350819945335388, + "learning_rate": 0.0002, + "loss": 1.896, + "step": 270 + }, + { + "epoch": 0.19080068143100512, + "grad_norm": 0.3844246566295624, + "learning_rate": 0.0002, + "loss": 1.8742, + "step": 280 + }, + { + "epoch": 0.19761499148211242, + "grad_norm": 0.38413065671920776, + "learning_rate": 0.0002, + "loss": 1.8531, + "step": 290 + }, + { + "epoch": 0.20442930153321975, + "grad_norm": 0.4080047607421875, + "learning_rate": 0.0002, + "loss": 1.8415, + "step": 300 + }, + { + "epoch": 0.21124361158432708, + "grad_norm": 0.3546800911426544, + "learning_rate": 0.0002, + "loss": 1.9009, + "step": 310 + }, + { + "epoch": 0.21805792163543442, + "grad_norm": 0.38577890396118164, + "learning_rate": 0.0002, + "loss": 1.8092, + "step": 320 + }, + { + "epoch": 0.22487223168654175, + "grad_norm": 0.38979098200798035, + "learning_rate": 0.0002, + "loss": 1.7113, + "step": 330 + }, + { + "epoch": 0.23168654173764908, + "grad_norm": 0.35105520486831665, + "learning_rate": 0.0002, + "loss": 1.8162, + "step": 340 + }, + { + "epoch": 0.23850085178875638, + "grad_norm": 0.31671223044395447, + "learning_rate": 0.0002, + "loss": 1.8408, + "step": 350 + }, + { + "epoch": 0.2453151618398637, + "grad_norm": 0.33034196496009827, + "learning_rate": 0.0002, + "loss": 1.8014, + "step": 360 + }, + { + "epoch": 0.252129471890971, + "grad_norm": 0.2990533709526062, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 370 + }, + { + "epoch": 0.25894378194207834, + "grad_norm": 0.362208753824234, + "learning_rate": 0.0002, + "loss": 1.8612, + "step": 380 + }, + { + "epoch": 0.2657580919931857, + "grad_norm": 0.3269096612930298, + "learning_rate": 0.0002, + "loss": 1.8001, + "step": 390 + }, + { + "epoch": 0.272572402044293, + "grad_norm": 0.30555954575538635, + "learning_rate": 0.0002, + "loss": 1.786, + "step": 400 + }, + { + "epoch": 0.27938671209540034, + "grad_norm": 0.332933247089386, + "learning_rate": 0.0002, + "loss": 1.8018, + "step": 410 + }, + { + "epoch": 0.28620102214650767, + "grad_norm": 0.39454060792922974, + "learning_rate": 0.0002, + "loss": 1.8157, + "step": 420 + }, + { + "epoch": 0.293015332197615, + "grad_norm": 0.34589633345603943, + "learning_rate": 0.0002, + "loss": 1.7862, + "step": 430 + }, + { + "epoch": 0.29982964224872233, + "grad_norm": 0.3747332990169525, + "learning_rate": 0.0002, + "loss": 1.7612, + "step": 440 + }, + { + "epoch": 0.30664395229982966, + "grad_norm": 0.34825369715690613, + "learning_rate": 0.0002, + "loss": 1.8476, + "step": 450 + }, + { + "epoch": 0.313458262350937, + "grad_norm": 0.32906976342201233, + "learning_rate": 0.0002, + "loss": 1.775, + "step": 460 + }, + { + "epoch": 0.3202725724020443, + "grad_norm": 0.33108609914779663, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 470 + }, + { + "epoch": 0.3270868824531516, + "grad_norm": 0.3170463442802429, + "learning_rate": 0.0002, + "loss": 1.8012, + "step": 480 + }, + { + "epoch": 0.3339011925042589, + "grad_norm": 0.30792081356048584, + "learning_rate": 0.0002, + "loss": 1.8264, + "step": 490 + }, + { + "epoch": 0.34071550255536626, + "grad_norm": 0.31772997975349426, + "learning_rate": 0.0002, + "loss": 1.7361, + "step": 500 + }, + { + "epoch": 0.3475298126064736, + "grad_norm": 0.32714012265205383, + "learning_rate": 0.0002, + "loss": 1.8147, + "step": 510 + }, + { + "epoch": 0.3543441226575809, + "grad_norm": 0.3021100163459778, + "learning_rate": 0.0002, + "loss": 1.8332, + "step": 520 + }, + { + "epoch": 0.36115843270868825, + "grad_norm": 0.6045835018157959, + "learning_rate": 0.0002, + "loss": 1.7337, + "step": 530 + }, + { + "epoch": 0.3679727427597956, + "grad_norm": 0.3003896474838257, + "learning_rate": 0.0002, + "loss": 1.8661, + "step": 540 + }, + { + "epoch": 0.3747870528109029, + "grad_norm": 0.3678470551967621, + "learning_rate": 0.0002, + "loss": 1.8359, + "step": 550 + }, + { + "epoch": 0.38160136286201024, + "grad_norm": 0.35787615180015564, + "learning_rate": 0.0002, + "loss": 1.7512, + "step": 560 + }, + { + "epoch": 0.38841567291311757, + "grad_norm": 0.31882143020629883, + "learning_rate": 0.0002, + "loss": 1.8048, + "step": 570 + }, + { + "epoch": 0.39522998296422485, + "grad_norm": 0.3186313509941101, + "learning_rate": 0.0002, + "loss": 1.8108, + "step": 580 + }, + { + "epoch": 0.4020442930153322, + "grad_norm": 0.41443702578544617, + "learning_rate": 0.0002, + "loss": 1.8012, + "step": 590 + }, + { + "epoch": 0.4088586030664395, + "grad_norm": 0.28773069381713867, + "learning_rate": 0.0002, + "loss": 1.7794, + "step": 600 + }, + { + "epoch": 0.41567291311754684, + "grad_norm": 0.35743263363838196, + "learning_rate": 0.0002, + "loss": 1.8231, + "step": 610 + }, + { + "epoch": 0.42248722316865417, + "grad_norm": 0.3360286355018616, + "learning_rate": 0.0002, + "loss": 1.7531, + "step": 620 + }, + { + "epoch": 0.4293015332197615, + "grad_norm": 0.32838866114616394, + "learning_rate": 0.0002, + "loss": 1.7654, + "step": 630 + }, + { + "epoch": 0.43611584327086883, + "grad_norm": 0.2994388937950134, + "learning_rate": 0.0002, + "loss": 1.8176, + "step": 640 + }, + { + "epoch": 0.44293015332197616, + "grad_norm": 0.3306307792663574, + "learning_rate": 0.0002, + "loss": 1.7241, + "step": 650 + }, + { + "epoch": 0.4497444633730835, + "grad_norm": 0.3129560351371765, + "learning_rate": 0.0002, + "loss": 1.8201, + "step": 660 + }, + { + "epoch": 0.4565587734241908, + "grad_norm": 0.3244289457798004, + "learning_rate": 0.0002, + "loss": 1.803, + "step": 670 + }, + { + "epoch": 0.46337308347529815, + "grad_norm": 0.3196892738342285, + "learning_rate": 0.0002, + "loss": 1.8399, + "step": 680 + }, + { + "epoch": 0.47018739352640543, + "grad_norm": 0.3135230243206024, + "learning_rate": 0.0002, + "loss": 1.8291, + "step": 690 + }, + { + "epoch": 0.47700170357751276, + "grad_norm": 0.28677991032600403, + "learning_rate": 0.0002, + "loss": 1.7423, + "step": 700 + }, + { + "epoch": 0.4838160136286201, + "grad_norm": 0.3074065148830414, + "learning_rate": 0.0002, + "loss": 1.6982, + "step": 710 + }, + { + "epoch": 0.4906303236797274, + "grad_norm": 0.3354290723800659, + "learning_rate": 0.0002, + "loss": 1.8078, + "step": 720 + }, + { + "epoch": 0.49744463373083475, + "grad_norm": 0.324370801448822, + "learning_rate": 0.0002, + "loss": 1.7333, + "step": 730 + }, + { + "epoch": 0.504258943781942, + "grad_norm": 0.29496142268180847, + "learning_rate": 0.0002, + "loss": 1.8506, + "step": 740 + }, + { + "epoch": 0.5110732538330494, + "grad_norm": 0.30694130063056946, + "learning_rate": 0.0002, + "loss": 1.7761, + "step": 750 + }, + { + "epoch": 0.5178875638841567, + "grad_norm": 0.36168408393859863, + "learning_rate": 0.0002, + "loss": 1.7854, + "step": 760 + }, + { + "epoch": 0.524701873935264, + "grad_norm": 0.2930343449115753, + "learning_rate": 0.0002, + "loss": 1.7149, + "step": 770 + }, + { + "epoch": 0.5315161839863713, + "grad_norm": 0.3023432493209839, + "learning_rate": 0.0002, + "loss": 1.7924, + "step": 780 + }, + { + "epoch": 0.5383304940374787, + "grad_norm": 0.3272720277309418, + "learning_rate": 0.0002, + "loss": 1.8467, + "step": 790 + }, + { + "epoch": 0.545144804088586, + "grad_norm": 0.2907974421977997, + "learning_rate": 0.0002, + "loss": 1.8639, + "step": 800 + }, + { + "epoch": 0.5519591141396933, + "grad_norm": 0.32267168164253235, + "learning_rate": 0.0002, + "loss": 1.9018, + "step": 810 + }, + { + "epoch": 0.5587734241908007, + "grad_norm": 0.32059940695762634, + "learning_rate": 0.0002, + "loss": 1.8311, + "step": 820 + }, + { + "epoch": 0.565587734241908, + "grad_norm": 0.30951258540153503, + "learning_rate": 0.0002, + "loss": 1.7234, + "step": 830 + }, + { + "epoch": 0.5724020442930153, + "grad_norm": 0.33976122736930847, + "learning_rate": 0.0002, + "loss": 1.8063, + "step": 840 + }, + { + "epoch": 0.5792163543441227, + "grad_norm": 0.3195820450782776, + "learning_rate": 0.0002, + "loss": 1.7021, + "step": 850 + }, + { + "epoch": 0.58603066439523, + "grad_norm": 0.2828562557697296, + "learning_rate": 0.0002, + "loss": 1.8073, + "step": 860 + }, + { + "epoch": 0.5928449744463373, + "grad_norm": 0.29591670632362366, + "learning_rate": 0.0002, + "loss": 1.695, + "step": 870 + }, + { + "epoch": 0.5996592844974447, + "grad_norm": 0.3086104393005371, + "learning_rate": 0.0002, + "loss": 1.8109, + "step": 880 + }, + { + "epoch": 0.606473594548552, + "grad_norm": 0.3592929542064667, + "learning_rate": 0.0002, + "loss": 1.8207, + "step": 890 + }, + { + "epoch": 0.6132879045996593, + "grad_norm": 0.2830186188220978, + "learning_rate": 0.0002, + "loss": 1.7662, + "step": 900 + }, + { + "epoch": 0.6201022146507666, + "grad_norm": 0.3128598630428314, + "learning_rate": 0.0002, + "loss": 1.8344, + "step": 910 + }, + { + "epoch": 0.626916524701874, + "grad_norm": 0.31957563757896423, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 920 + }, + { + "epoch": 0.6337308347529813, + "grad_norm": 0.30994319915771484, + "learning_rate": 0.0002, + "loss": 1.7578, + "step": 930 + }, + { + "epoch": 0.6405451448040886, + "grad_norm": 0.3352845013141632, + "learning_rate": 0.0002, + "loss": 1.8494, + "step": 940 + }, + { + "epoch": 0.6473594548551959, + "grad_norm": 0.2960077226161957, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 950 + }, + { + "epoch": 0.6541737649063032, + "grad_norm": 0.32675081491470337, + "learning_rate": 0.0002, + "loss": 1.7209, + "step": 960 + }, + { + "epoch": 0.6609880749574105, + "grad_norm": 0.30042028427124023, + "learning_rate": 0.0002, + "loss": 1.7776, + "step": 970 + }, + { + "epoch": 0.6678023850085179, + "grad_norm": 0.3288673758506775, + "learning_rate": 0.0002, + "loss": 1.7597, + "step": 980 + }, + { + "epoch": 0.6746166950596252, + "grad_norm": 0.323215126991272, + "learning_rate": 0.0002, + "loss": 1.7962, + "step": 990 + }, + { + "epoch": 0.6814310051107325, + "grad_norm": 0.30041399598121643, + "learning_rate": 0.0002, + "loss": 1.821, + "step": 1000 + }, + { + "epoch": 0.6882453151618398, + "grad_norm": 0.3076179623603821, + "learning_rate": 0.0002, + "loss": 1.7716, + "step": 1010 + }, + { + "epoch": 0.6950596252129472, + "grad_norm": 0.2941909730434418, + "learning_rate": 0.0002, + "loss": 1.7827, + "step": 1020 + }, + { + "epoch": 0.7018739352640545, + "grad_norm": 0.32220420241355896, + "learning_rate": 0.0002, + "loss": 1.7964, + "step": 1030 + }, + { + "epoch": 0.7086882453151618, + "grad_norm": 0.2989702820777893, + "learning_rate": 0.0002, + "loss": 1.7101, + "step": 1040 + }, + { + "epoch": 0.7155025553662692, + "grad_norm": 0.3049640357494354, + "learning_rate": 0.0002, + "loss": 1.8749, + "step": 1050 + }, + { + "epoch": 0.7223168654173765, + "grad_norm": 0.3183284103870392, + "learning_rate": 0.0002, + "loss": 1.782, + "step": 1060 + }, + { + "epoch": 0.7291311754684838, + "grad_norm": 0.3070095181465149, + "learning_rate": 0.0002, + "loss": 1.785, + "step": 1070 + }, + { + "epoch": 0.7359454855195912, + "grad_norm": 0.33263063430786133, + "learning_rate": 0.0002, + "loss": 1.7832, + "step": 1080 + }, + { + "epoch": 0.7427597955706985, + "grad_norm": 0.28774312138557434, + "learning_rate": 0.0002, + "loss": 1.7627, + "step": 1090 + }, + { + "epoch": 0.7495741056218058, + "grad_norm": 0.29304224252700806, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1100 + }, + { + "epoch": 0.7563884156729132, + "grad_norm": 0.27529507875442505, + "learning_rate": 0.0002, + "loss": 1.8082, + "step": 1110 + }, + { + "epoch": 0.7632027257240205, + "grad_norm": 0.32319945096969604, + "learning_rate": 0.0002, + "loss": 1.8071, + "step": 1120 + }, + { + "epoch": 0.7700170357751278, + "grad_norm": 0.33777597546577454, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 1130 + }, + { + "epoch": 0.7768313458262351, + "grad_norm": 0.29847201704978943, + "learning_rate": 0.0002, + "loss": 1.8488, + "step": 1140 + }, + { + "epoch": 0.7836456558773425, + "grad_norm": 0.31370633840560913, + "learning_rate": 0.0002, + "loss": 1.7014, + "step": 1150 + }, + { + "epoch": 0.7904599659284497, + "grad_norm": 0.31633856892585754, + "learning_rate": 0.0002, + "loss": 1.8175, + "step": 1160 + }, + { + "epoch": 0.797274275979557, + "grad_norm": 0.33849263191223145, + "learning_rate": 0.0002, + "loss": 1.7637, + "step": 1170 + }, + { + "epoch": 0.8040885860306644, + "grad_norm": 0.3306333124637604, + "learning_rate": 0.0002, + "loss": 1.8264, + "step": 1180 + }, + { + "epoch": 0.8109028960817717, + "grad_norm": 0.3249678313732147, + "learning_rate": 0.0002, + "loss": 1.777, + "step": 1190 + }, + { + "epoch": 0.817717206132879, + "grad_norm": 0.3252817690372467, + "learning_rate": 0.0002, + "loss": 1.7631, + "step": 1200 + }, + { + "epoch": 0.8245315161839863, + "grad_norm": 0.31772559881210327, + "learning_rate": 0.0002, + "loss": 1.7877, + "step": 1210 + }, + { + "epoch": 0.8313458262350937, + "grad_norm": 0.2803371846675873, + "learning_rate": 0.0002, + "loss": 1.8068, + "step": 1220 + }, + { + "epoch": 0.838160136286201, + "grad_norm": 0.26700571179389954, + "learning_rate": 0.0002, + "loss": 1.738, + "step": 1230 + }, + { + "epoch": 0.8449744463373083, + "grad_norm": 0.3060092031955719, + "learning_rate": 0.0002, + "loss": 1.8301, + "step": 1240 + }, + { + "epoch": 0.8517887563884157, + "grad_norm": 0.28831684589385986, + "learning_rate": 0.0002, + "loss": 1.7993, + "step": 1250 + }, + { + "epoch": 0.858603066439523, + "grad_norm": 0.30708742141723633, + "learning_rate": 0.0002, + "loss": 1.6909, + "step": 1260 + }, + { + "epoch": 0.8654173764906303, + "grad_norm": 0.2915987968444824, + "learning_rate": 0.0002, + "loss": 1.8506, + "step": 1270 + }, + { + "epoch": 0.8722316865417377, + "grad_norm": 0.2893589735031128, + "learning_rate": 0.0002, + "loss": 1.7536, + "step": 1280 + }, + { + "epoch": 0.879045996592845, + "grad_norm": 0.29545632004737854, + "learning_rate": 0.0002, + "loss": 1.7437, + "step": 1290 + }, + { + "epoch": 0.8858603066439523, + "grad_norm": 0.3354771137237549, + "learning_rate": 0.0002, + "loss": 1.859, + "step": 1300 + }, + { + "epoch": 0.8926746166950597, + "grad_norm": 0.37715399265289307, + "learning_rate": 0.0002, + "loss": 1.7644, + "step": 1310 + }, + { + "epoch": 0.899488926746167, + "grad_norm": 0.28847193717956543, + "learning_rate": 0.0002, + "loss": 1.7731, + "step": 1320 + }, + { + "epoch": 0.9063032367972743, + "grad_norm": 0.2780889868736267, + "learning_rate": 0.0002, + "loss": 1.9077, + "step": 1330 + }, + { + "epoch": 0.9131175468483816, + "grad_norm": 0.2714342176914215, + "learning_rate": 0.0002, + "loss": 1.7861, + "step": 1340 + }, + { + "epoch": 0.919931856899489, + "grad_norm": 0.2950133979320526, + "learning_rate": 0.0002, + "loss": 1.8, + "step": 1350 + }, + { + "epoch": 0.9267461669505963, + "grad_norm": 0.29097145795822144, + "learning_rate": 0.0002, + "loss": 1.7368, + "step": 1360 + }, + { + "epoch": 0.9335604770017035, + "grad_norm": 0.32540133595466614, + "learning_rate": 0.0002, + "loss": 1.7864, + "step": 1370 + }, + { + "epoch": 0.9403747870528109, + "grad_norm": 0.3076636493206024, + "learning_rate": 0.0002, + "loss": 1.7571, + "step": 1380 + }, + { + "epoch": 0.9471890971039182, + "grad_norm": 0.2962130308151245, + "learning_rate": 0.0002, + "loss": 1.8312, + "step": 1390 + }, + { + "epoch": 0.9540034071550255, + "grad_norm": 0.30086860060691833, + "learning_rate": 0.0002, + "loss": 1.7858, + "step": 1400 + }, + { + "epoch": 0.9608177172061328, + "grad_norm": 0.28634947538375854, + "learning_rate": 0.0002, + "loss": 1.7991, + "step": 1410 + }, + { + "epoch": 0.9676320272572402, + "grad_norm": 0.35314416885375977, + "learning_rate": 0.0002, + "loss": 1.8385, + "step": 1420 + }, + { + "epoch": 0.9744463373083475, + "grad_norm": 0.2939317524433136, + "learning_rate": 0.0002, + "loss": 1.8054, + "step": 1430 + }, + { + "epoch": 0.9812606473594548, + "grad_norm": 0.3010196089744568, + "learning_rate": 0.0002, + "loss": 1.7582, + "step": 1440 + }, + { + "epoch": 0.9880749574105622, + "grad_norm": 0.30816152691841125, + "learning_rate": 0.0002, + "loss": 1.758, + "step": 1450 + }, + { + "epoch": 0.9948892674616695, + "grad_norm": 0.28152793645858765, + "learning_rate": 0.0002, + "loss": 1.8159, + "step": 1460 + }, + { + "epoch": 0.9996592844974447, + "eval_loss": 1.8034634590148926, + "eval_runtime": 53.6017, + "eval_samples_per_second": 9.459, + "eval_steps_per_second": 1.194, + "step": 1467 + }, + { + "epoch": 1.0017035775127767, + "grad_norm": 0.29246416687965393, + "learning_rate": 0.0002, + "loss": 1.7072, + "step": 1470 + }, + { + "epoch": 1.008517887563884, + "grad_norm": 0.2668602168560028, + "learning_rate": 0.0002, + "loss": 1.8159, + "step": 1480 + }, + { + "epoch": 1.0153321976149914, + "grad_norm": 0.2998567819595337, + "learning_rate": 0.0002, + "loss": 1.6868, + "step": 1490 + }, + { + "epoch": 1.0221465076660987, + "grad_norm": 0.3284934461116791, + "learning_rate": 0.0002, + "loss": 1.7331, + "step": 1500 + }, + { + "epoch": 1.028960817717206, + "grad_norm": 0.3275827169418335, + "learning_rate": 0.0002, + "loss": 1.689, + "step": 1510 + }, + { + "epoch": 1.0357751277683134, + "grad_norm": 0.3382718563079834, + "learning_rate": 0.0002, + "loss": 1.7092, + "step": 1520 + }, + { + "epoch": 1.0425894378194207, + "grad_norm": 0.36737215518951416, + "learning_rate": 0.0002, + "loss": 1.7215, + "step": 1530 + }, + { + "epoch": 1.049403747870528, + "grad_norm": 0.3442603647708893, + "learning_rate": 0.0002, + "loss": 1.6993, + "step": 1540 + }, + { + "epoch": 1.0562180579216354, + "grad_norm": 0.3323381245136261, + "learning_rate": 0.0002, + "loss": 1.6996, + "step": 1550 + }, + { + "epoch": 1.0630323679727427, + "grad_norm": 0.341227650642395, + "learning_rate": 0.0002, + "loss": 1.7444, + "step": 1560 + }, + { + "epoch": 1.06984667802385, + "grad_norm": 0.3361579477787018, + "learning_rate": 0.0002, + "loss": 1.7419, + "step": 1570 + }, + { + "epoch": 1.0766609880749574, + "grad_norm": 0.3556230962276459, + "learning_rate": 0.0002, + "loss": 1.7337, + "step": 1580 + }, + { + "epoch": 1.0834752981260647, + "grad_norm": 0.27130424976348877, + "learning_rate": 0.0002, + "loss": 1.6978, + "step": 1590 + }, + { + "epoch": 1.090289608177172, + "grad_norm": 0.29366323351860046, + "learning_rate": 0.0002, + "loss": 1.6087, + "step": 1600 + }, + { + "epoch": 1.0971039182282794, + "grad_norm": 0.3581245541572571, + "learning_rate": 0.0002, + "loss": 1.6721, + "step": 1610 + }, + { + "epoch": 1.1039182282793867, + "grad_norm": 0.3021670877933502, + "learning_rate": 0.0002, + "loss": 1.7639, + "step": 1620 + }, + { + "epoch": 1.110732538330494, + "grad_norm": 0.3145572543144226, + "learning_rate": 0.0002, + "loss": 1.7314, + "step": 1630 + }, + { + "epoch": 1.1175468483816013, + "grad_norm": 0.35362154245376587, + "learning_rate": 0.0002, + "loss": 1.7408, + "step": 1640 + }, + { + "epoch": 1.1243611584327087, + "grad_norm": 0.5413113236427307, + "learning_rate": 0.0002, + "loss": 1.7071, + "step": 1650 + }, + { + "epoch": 1.131175468483816, + "grad_norm": 0.6858654022216797, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 1660 + }, + { + "epoch": 1.1379897785349233, + "grad_norm": 0.3781903386116028, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 1670 + }, + { + "epoch": 1.1448040885860307, + "grad_norm": 0.404864102602005, + "learning_rate": 0.0002, + "loss": 1.659, + "step": 1680 + }, + { + "epoch": 1.151618398637138, + "grad_norm": 0.3595100939273834, + "learning_rate": 0.0002, + "loss": 1.7212, + "step": 1690 + }, + { + "epoch": 1.1584327086882453, + "grad_norm": 0.33682283759117126, + "learning_rate": 0.0002, + "loss": 1.7023, + "step": 1700 + }, + { + "epoch": 1.1652470187393527, + "grad_norm": 0.3877373933792114, + "learning_rate": 0.0002, + "loss": 1.7336, + "step": 1710 + }, + { + "epoch": 1.17206132879046, + "grad_norm": 0.34606897830963135, + "learning_rate": 0.0002, + "loss": 1.7676, + "step": 1720 + }, + { + "epoch": 1.1788756388415673, + "grad_norm": 0.3122918903827667, + "learning_rate": 0.0002, + "loss": 1.6889, + "step": 1730 + }, + { + "epoch": 1.1856899488926746, + "grad_norm": 0.34081900119781494, + "learning_rate": 0.0002, + "loss": 1.6585, + "step": 1740 + }, + { + "epoch": 1.192504258943782, + "grad_norm": 0.3418807089328766, + "learning_rate": 0.0002, + "loss": 1.7794, + "step": 1750 + }, + { + "epoch": 1.1993185689948893, + "grad_norm": 0.3495500981807709, + "learning_rate": 0.0002, + "loss": 1.7773, + "step": 1760 + }, + { + "epoch": 1.2061328790459966, + "grad_norm": 0.557288408279419, + "learning_rate": 0.0002, + "loss": 1.6578, + "step": 1770 + }, + { + "epoch": 1.212947189097104, + "grad_norm": 0.3193778693675995, + "learning_rate": 0.0002, + "loss": 1.7488, + "step": 1780 + }, + { + "epoch": 1.2197614991482113, + "grad_norm": 0.3306216895580292, + "learning_rate": 0.0002, + "loss": 1.6815, + "step": 1790 + }, + { + "epoch": 1.2265758091993186, + "grad_norm": 0.37998732924461365, + "learning_rate": 0.0002, + "loss": 1.6772, + "step": 1800 + }, + { + "epoch": 1.233390119250426, + "grad_norm": 0.3255669176578522, + "learning_rate": 0.0002, + "loss": 1.7094, + "step": 1810 + }, + { + "epoch": 1.2402044293015333, + "grad_norm": 0.3741287291049957, + "learning_rate": 0.0002, + "loss": 1.7096, + "step": 1820 + }, + { + "epoch": 1.2470187393526406, + "grad_norm": 0.36727291345596313, + "learning_rate": 0.0002, + "loss": 1.7317, + "step": 1830 + }, + { + "epoch": 1.253833049403748, + "grad_norm": 0.3479527235031128, + "learning_rate": 0.0002, + "loss": 1.7418, + "step": 1840 + }, + { + "epoch": 1.2606473594548553, + "grad_norm": 0.3472636640071869, + "learning_rate": 0.0002, + "loss": 1.7062, + "step": 1850 + }, + { + "epoch": 1.2674616695059626, + "grad_norm": 0.3702869415283203, + "learning_rate": 0.0002, + "loss": 1.6263, + "step": 1860 + }, + { + "epoch": 1.27427597955707, + "grad_norm": 0.3934040069580078, + "learning_rate": 0.0002, + "loss": 1.6222, + "step": 1870 + }, + { + "epoch": 1.2810902896081773, + "grad_norm": 0.46887534856796265, + "learning_rate": 0.0002, + "loss": 1.7616, + "step": 1880 + }, + { + "epoch": 1.2879045996592846, + "grad_norm": 0.3191998600959778, + "learning_rate": 0.0002, + "loss": 1.6071, + "step": 1890 + }, + { + "epoch": 1.294718909710392, + "grad_norm": 0.34032225608825684, + "learning_rate": 0.0002, + "loss": 1.7001, + "step": 1900 + }, + { + "epoch": 1.3015332197614993, + "grad_norm": 0.33453696966171265, + "learning_rate": 0.0002, + "loss": 1.8316, + "step": 1910 + }, + { + "epoch": 1.3083475298126066, + "grad_norm": 0.3451494872570038, + "learning_rate": 0.0002, + "loss": 1.6567, + "step": 1920 + }, + { + "epoch": 1.315161839863714, + "grad_norm": 0.36203092336654663, + "learning_rate": 0.0002, + "loss": 1.708, + "step": 1930 + }, + { + "epoch": 1.321976149914821, + "grad_norm": 0.43794456124305725, + "learning_rate": 0.0002, + "loss": 1.7095, + "step": 1940 + }, + { + "epoch": 1.3287904599659284, + "grad_norm": 0.3630591034889221, + "learning_rate": 0.0002, + "loss": 1.7264, + "step": 1950 + }, + { + "epoch": 1.3356047700170357, + "grad_norm": 0.36951911449432373, + "learning_rate": 0.0002, + "loss": 1.6529, + "step": 1960 + }, + { + "epoch": 1.342419080068143, + "grad_norm": 0.4001159965991974, + "learning_rate": 0.0002, + "loss": 1.651, + "step": 1970 + }, + { + "epoch": 1.3492333901192504, + "grad_norm": 0.3820836544036865, + "learning_rate": 0.0002, + "loss": 1.677, + "step": 1980 + }, + { + "epoch": 1.3560477001703577, + "grad_norm": 0.3705870807170868, + "learning_rate": 0.0002, + "loss": 1.583, + "step": 1990 + }, + { + "epoch": 1.362862010221465, + "grad_norm": 0.3557972013950348, + "learning_rate": 0.0002, + "loss": 1.6207, + "step": 2000 + }, + { + "epoch": 1.3696763202725724, + "grad_norm": 0.38546398282051086, + "learning_rate": 0.0002, + "loss": 1.7656, + "step": 2010 + }, + { + "epoch": 1.3764906303236797, + "grad_norm": 0.3908020853996277, + "learning_rate": 0.0002, + "loss": 1.6881, + "step": 2020 + }, + { + "epoch": 1.383304940374787, + "grad_norm": 0.3822040855884552, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 2030 + }, + { + "epoch": 1.3901192504258943, + "grad_norm": 0.425327867269516, + "learning_rate": 0.0002, + "loss": 1.7923, + "step": 2040 + }, + { + "epoch": 1.3969335604770017, + "grad_norm": 0.3436259329319, + "learning_rate": 0.0002, + "loss": 1.7032, + "step": 2050 + }, + { + "epoch": 1.403747870528109, + "grad_norm": 0.33124062418937683, + "learning_rate": 0.0002, + "loss": 1.6398, + "step": 2060 + }, + { + "epoch": 1.4105621805792163, + "grad_norm": 0.3662424683570862, + "learning_rate": 0.0002, + "loss": 1.6815, + "step": 2070 + }, + { + "epoch": 1.4173764906303237, + "grad_norm": 0.3720635175704956, + "learning_rate": 0.0002, + "loss": 1.7258, + "step": 2080 + }, + { + "epoch": 1.424190800681431, + "grad_norm": 0.3361680805683136, + "learning_rate": 0.0002, + "loss": 1.7186, + "step": 2090 + }, + { + "epoch": 1.4310051107325383, + "grad_norm": 0.32751724123954773, + "learning_rate": 0.0002, + "loss": 1.7606, + "step": 2100 + }, + { + "epoch": 1.4378194207836457, + "grad_norm": 0.34333378076553345, + "learning_rate": 0.0002, + "loss": 1.7051, + "step": 2110 + }, + { + "epoch": 1.444633730834753, + "grad_norm": 0.37777671217918396, + "learning_rate": 0.0002, + "loss": 1.6979, + "step": 2120 + }, + { + "epoch": 1.4514480408858603, + "grad_norm": 0.37126365303993225, + "learning_rate": 0.0002, + "loss": 1.7085, + "step": 2130 + }, + { + "epoch": 1.4582623509369677, + "grad_norm": 0.3602267801761627, + "learning_rate": 0.0002, + "loss": 1.721, + "step": 2140 + }, + { + "epoch": 1.465076660988075, + "grad_norm": 0.3287110924720764, + "learning_rate": 0.0002, + "loss": 1.8148, + "step": 2150 + }, + { + "epoch": 1.4718909710391823, + "grad_norm": 0.3562135100364685, + "learning_rate": 0.0002, + "loss": 1.6966, + "step": 2160 + }, + { + "epoch": 1.4787052810902896, + "grad_norm": 0.38292962312698364, + "learning_rate": 0.0002, + "loss": 1.713, + "step": 2170 + }, + { + "epoch": 1.485519591141397, + "grad_norm": 0.38220319151878357, + "learning_rate": 0.0002, + "loss": 1.7036, + "step": 2180 + }, + { + "epoch": 1.4923339011925043, + "grad_norm": 0.3570062220096588, + "learning_rate": 0.0002, + "loss": 1.7297, + "step": 2190 + }, + { + "epoch": 1.4991482112436116, + "grad_norm": 0.363146036863327, + "learning_rate": 0.0002, + "loss": 1.5652, + "step": 2200 + }, + { + "epoch": 1.5059625212947187, + "grad_norm": 0.37393274903297424, + "learning_rate": 0.0002, + "loss": 1.6716, + "step": 2210 + }, + { + "epoch": 1.512776831345826, + "grad_norm": 0.3628501892089844, + "learning_rate": 0.0002, + "loss": 1.6839, + "step": 2220 + }, + { + "epoch": 1.5195911413969334, + "grad_norm": 0.33430740237236023, + "learning_rate": 0.0002, + "loss": 1.6627, + "step": 2230 + }, + { + "epoch": 1.5264054514480407, + "grad_norm": 0.35363978147506714, + "learning_rate": 0.0002, + "loss": 1.7495, + "step": 2240 + }, + { + "epoch": 1.533219761499148, + "grad_norm": 0.37220337986946106, + "learning_rate": 0.0002, + "loss": 1.6364, + "step": 2250 + }, + { + "epoch": 1.5400340715502554, + "grad_norm": 0.35020262002944946, + "learning_rate": 0.0002, + "loss": 1.7773, + "step": 2260 + }, + { + "epoch": 1.5468483816013627, + "grad_norm": 0.35274937748908997, + "learning_rate": 0.0002, + "loss": 1.7453, + "step": 2270 + }, + { + "epoch": 1.55366269165247, + "grad_norm": 0.3797738552093506, + "learning_rate": 0.0002, + "loss": 1.7162, + "step": 2280 + }, + { + "epoch": 1.5604770017035774, + "grad_norm": 0.4160412847995758, + "learning_rate": 0.0002, + "loss": 1.6197, + "step": 2290 + }, + { + "epoch": 1.5672913117546847, + "grad_norm": 0.38779592514038086, + "learning_rate": 0.0002, + "loss": 1.7101, + "step": 2300 + }, + { + "epoch": 1.574105621805792, + "grad_norm": 0.39171287417411804, + "learning_rate": 0.0002, + "loss": 1.7254, + "step": 2310 + }, + { + "epoch": 1.5809199318568994, + "grad_norm": 0.3606826663017273, + "learning_rate": 0.0002, + "loss": 1.7087, + "step": 2320 + }, + { + "epoch": 1.5877342419080067, + "grad_norm": 0.3745017647743225, + "learning_rate": 0.0002, + "loss": 1.7269, + "step": 2330 + }, + { + "epoch": 1.594548551959114, + "grad_norm": 0.34933462738990784, + "learning_rate": 0.0002, + "loss": 1.7068, + "step": 2340 + }, + { + "epoch": 1.6013628620102214, + "grad_norm": 0.37268444895744324, + "learning_rate": 0.0002, + "loss": 1.7073, + "step": 2350 + }, + { + "epoch": 1.6081771720613287, + "grad_norm": 0.4603484869003296, + "learning_rate": 0.0002, + "loss": 1.728, + "step": 2360 + }, + { + "epoch": 1.614991482112436, + "grad_norm": 0.35689088702201843, + "learning_rate": 0.0002, + "loss": 1.7621, + "step": 2370 + }, + { + "epoch": 1.6218057921635434, + "grad_norm": 0.3392031490802765, + "learning_rate": 0.0002, + "loss": 1.6989, + "step": 2380 + }, + { + "epoch": 1.6286201022146507, + "grad_norm": 0.394653856754303, + "learning_rate": 0.0002, + "loss": 1.7077, + "step": 2390 + }, + { + "epoch": 1.635434412265758, + "grad_norm": 0.33972012996673584, + "learning_rate": 0.0002, + "loss": 1.7448, + "step": 2400 + }, + { + "epoch": 1.6422487223168654, + "grad_norm": 0.3854375183582306, + "learning_rate": 0.0002, + "loss": 1.7681, + "step": 2410 + }, + { + "epoch": 1.6490630323679727, + "grad_norm": 0.36143961548805237, + "learning_rate": 0.0002, + "loss": 1.7102, + "step": 2420 + }, + { + "epoch": 1.65587734241908, + "grad_norm": 0.35816189646720886, + "learning_rate": 0.0002, + "loss": 1.7432, + "step": 2430 + }, + { + "epoch": 1.6626916524701874, + "grad_norm": 0.36298736929893494, + "learning_rate": 0.0002, + "loss": 1.6407, + "step": 2440 + }, + { + "epoch": 1.6695059625212947, + "grad_norm": 0.36756929755210876, + "learning_rate": 0.0002, + "loss": 1.723, + "step": 2450 + }, + { + "epoch": 1.676320272572402, + "grad_norm": 0.35969603061676025, + "learning_rate": 0.0002, + "loss": 1.6824, + "step": 2460 + }, + { + "epoch": 1.6831345826235093, + "grad_norm": 0.38449376821517944, + "learning_rate": 0.0002, + "loss": 1.7014, + "step": 2470 + }, + { + "epoch": 1.6899488926746167, + "grad_norm": 0.44511452317237854, + "learning_rate": 0.0002, + "loss": 1.7261, + "step": 2480 + }, + { + "epoch": 1.696763202725724, + "grad_norm": 0.3831416368484497, + "learning_rate": 0.0002, + "loss": 1.7397, + "step": 2490 + }, + { + "epoch": 1.7035775127768313, + "grad_norm": 0.3795325756072998, + "learning_rate": 0.0002, + "loss": 1.7046, + "step": 2500 + }, + { + "epoch": 1.7103918228279387, + "grad_norm": 0.34978193044662476, + "learning_rate": 0.0002, + "loss": 1.6993, + "step": 2510 + }, + { + "epoch": 1.717206132879046, + "grad_norm": 0.35923877358436584, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 2520 + }, + { + "epoch": 1.7240204429301533, + "grad_norm": 0.352999746799469, + "learning_rate": 0.0002, + "loss": 1.6253, + "step": 2530 + }, + { + "epoch": 1.7308347529812607, + "grad_norm": 0.43673479557037354, + "learning_rate": 0.0002, + "loss": 1.6953, + "step": 2540 + }, + { + "epoch": 1.737649063032368, + "grad_norm": 0.4153687357902527, + "learning_rate": 0.0002, + "loss": 1.7079, + "step": 2550 + }, + { + "epoch": 1.7444633730834753, + "grad_norm": 0.35541167855262756, + "learning_rate": 0.0002, + "loss": 1.6714, + "step": 2560 + }, + { + "epoch": 1.7512776831345827, + "grad_norm": 0.3288775086402893, + "learning_rate": 0.0002, + "loss": 1.696, + "step": 2570 + }, + { + "epoch": 1.75809199318569, + "grad_norm": 0.3991123139858246, + "learning_rate": 0.0002, + "loss": 1.7486, + "step": 2580 + }, + { + "epoch": 1.7649063032367973, + "grad_norm": 0.39967241883277893, + "learning_rate": 0.0002, + "loss": 1.645, + "step": 2590 + }, + { + "epoch": 1.7717206132879046, + "grad_norm": 0.41104283928871155, + "learning_rate": 0.0002, + "loss": 1.6855, + "step": 2600 + }, + { + "epoch": 1.778534923339012, + "grad_norm": 0.44885286688804626, + "learning_rate": 0.0002, + "loss": 1.6993, + "step": 2610 + }, + { + "epoch": 1.7853492333901193, + "grad_norm": 0.38384467363357544, + "learning_rate": 0.0002, + "loss": 1.7224, + "step": 2620 + }, + { + "epoch": 1.7921635434412266, + "grad_norm": 0.35020917654037476, + "learning_rate": 0.0002, + "loss": 1.7213, + "step": 2630 + }, + { + "epoch": 1.798977853492334, + "grad_norm": 0.3360341489315033, + "learning_rate": 0.0002, + "loss": 1.6706, + "step": 2640 + }, + { + "epoch": 1.8057921635434413, + "grad_norm": 0.38875144720077515, + "learning_rate": 0.0002, + "loss": 1.7037, + "step": 2650 + }, + { + "epoch": 1.8126064735945486, + "grad_norm": 0.34876883029937744, + "learning_rate": 0.0002, + "loss": 1.693, + "step": 2660 + }, + { + "epoch": 1.819420783645656, + "grad_norm": 0.419979989528656, + "learning_rate": 0.0002, + "loss": 1.7743, + "step": 2670 + }, + { + "epoch": 1.8262350936967633, + "grad_norm": 0.3648919463157654, + "learning_rate": 0.0002, + "loss": 1.7089, + "step": 2680 + }, + { + "epoch": 1.8330494037478706, + "grad_norm": 0.3485383987426758, + "learning_rate": 0.0002, + "loss": 1.7828, + "step": 2690 + }, + { + "epoch": 1.839863713798978, + "grad_norm": 0.3647468686103821, + "learning_rate": 0.0002, + "loss": 1.705, + "step": 2700 + }, + { + "epoch": 1.8466780238500853, + "grad_norm": 0.37003210186958313, + "learning_rate": 0.0002, + "loss": 1.7318, + "step": 2710 + }, + { + "epoch": 1.8534923339011926, + "grad_norm": 0.37031617760658264, + "learning_rate": 0.0002, + "loss": 1.6647, + "step": 2720 + }, + { + "epoch": 1.8603066439523, + "grad_norm": 0.3438796103000641, + "learning_rate": 0.0002, + "loss": 1.69, + "step": 2730 + }, + { + "epoch": 1.8671209540034073, + "grad_norm": 0.41574627161026, + "learning_rate": 0.0002, + "loss": 1.5995, + "step": 2740 + }, + { + "epoch": 1.8739352640545146, + "grad_norm": 0.35049930214881897, + "learning_rate": 0.0002, + "loss": 1.6877, + "step": 2750 + }, + { + "epoch": 1.880749574105622, + "grad_norm": 0.3943989872932434, + "learning_rate": 0.0002, + "loss": 1.7048, + "step": 2760 + }, + { + "epoch": 1.8875638841567293, + "grad_norm": 0.3384978175163269, + "learning_rate": 0.0002, + "loss": 1.7047, + "step": 2770 + }, + { + "epoch": 1.8943781942078366, + "grad_norm": 0.3501328229904175, + "learning_rate": 0.0002, + "loss": 1.7848, + "step": 2780 + }, + { + "epoch": 1.901192504258944, + "grad_norm": 0.37484532594680786, + "learning_rate": 0.0002, + "loss": 1.6986, + "step": 2790 + }, + { + "epoch": 1.9080068143100513, + "grad_norm": 0.34497788548469543, + "learning_rate": 0.0002, + "loss": 1.6867, + "step": 2800 + }, + { + "epoch": 1.9148211243611586, + "grad_norm": 0.3530851900577545, + "learning_rate": 0.0002, + "loss": 1.6513, + "step": 2810 + }, + { + "epoch": 1.921635434412266, + "grad_norm": 0.3879254162311554, + "learning_rate": 0.0002, + "loss": 1.6369, + "step": 2820 + }, + { + "epoch": 1.9284497444633732, + "grad_norm": 0.3885590136051178, + "learning_rate": 0.0002, + "loss": 1.6786, + "step": 2830 + }, + { + "epoch": 1.9352640545144804, + "grad_norm": 0.3868715465068817, + "learning_rate": 0.0002, + "loss": 1.8049, + "step": 2840 + }, + { + "epoch": 1.9420783645655877, + "grad_norm": 0.4152422249317169, + "learning_rate": 0.0002, + "loss": 1.7099, + "step": 2850 + }, + { + "epoch": 1.948892674616695, + "grad_norm": 0.401714563369751, + "learning_rate": 0.0002, + "loss": 1.6696, + "step": 2860 + }, + { + "epoch": 1.9557069846678024, + "grad_norm": 0.34825265407562256, + "learning_rate": 0.0002, + "loss": 1.7182, + "step": 2870 + }, + { + "epoch": 1.9625212947189097, + "grad_norm": 0.3620675504207611, + "learning_rate": 0.0002, + "loss": 1.6612, + "step": 2880 + }, + { + "epoch": 1.969335604770017, + "grad_norm": 0.3977806866168976, + "learning_rate": 0.0002, + "loss": 1.7451, + "step": 2890 + }, + { + "epoch": 1.9761499148211243, + "grad_norm": 0.3687497079372406, + "learning_rate": 0.0002, + "loss": 1.6514, + "step": 2900 + }, + { + "epoch": 1.9829642248722317, + "grad_norm": 0.408640056848526, + "learning_rate": 0.0002, + "loss": 1.7712, + "step": 2910 + }, + { + "epoch": 1.989778534923339, + "grad_norm": 0.34510108828544617, + "learning_rate": 0.0002, + "loss": 1.695, + "step": 2920 + }, + { + "epoch": 1.9965928449744463, + "grad_norm": 0.3596334755420685, + "learning_rate": 0.0002, + "loss": 1.662, + "step": 2930 + }, + { + "epoch": 2.0, + "eval_loss": 1.8056600093841553, + "eval_runtime": 60.7049, + "eval_samples_per_second": 8.352, + "eval_steps_per_second": 1.054, + "step": 2935 + }, + { + "epoch": 2.0034071550255534, + "grad_norm": 0.3460802137851715, + "learning_rate": 0.0002, + "loss": 1.6918, + "step": 2940 + }, + { + "epoch": 2.0102214650766608, + "grad_norm": 0.4038620591163635, + "learning_rate": 0.0002, + "loss": 1.6036, + "step": 2950 + }, + { + "epoch": 2.017035775127768, + "grad_norm": 0.3950219750404358, + "learning_rate": 0.0002, + "loss": 1.5386, + "step": 2960 + }, + { + "epoch": 2.0238500851788754, + "grad_norm": 0.519116997718811, + "learning_rate": 0.0002, + "loss": 1.5649, + "step": 2970 + }, + { + "epoch": 2.0306643952299828, + "grad_norm": 0.4097684919834137, + "learning_rate": 0.0002, + "loss": 1.5397, + "step": 2980 + }, + { + "epoch": 2.03747870528109, + "grad_norm": 0.4153544306755066, + "learning_rate": 0.0002, + "loss": 1.6044, + "step": 2990 + }, + { + "epoch": 2.0442930153321974, + "grad_norm": 0.4351160526275635, + "learning_rate": 0.0002, + "loss": 1.6006, + "step": 3000 + }, + { + "epoch": 2.0511073253833048, + "grad_norm": 0.42036259174346924, + "learning_rate": 0.0002, + "loss": 1.5091, + "step": 3010 + }, + { + "epoch": 2.057921635434412, + "grad_norm": 0.4433218836784363, + "learning_rate": 0.0002, + "loss": 1.5686, + "step": 3020 + }, + { + "epoch": 2.0647359454855194, + "grad_norm": 0.46511581540107727, + "learning_rate": 0.0002, + "loss": 1.5478, + "step": 3030 + }, + { + "epoch": 2.0715502555366268, + "grad_norm": 0.4567560851573944, + "learning_rate": 0.0002, + "loss": 1.5554, + "step": 3040 + }, + { + "epoch": 2.078364565587734, + "grad_norm": 0.45671048760414124, + "learning_rate": 0.0002, + "loss": 1.5561, + "step": 3050 + }, + { + "epoch": 2.0851788756388414, + "grad_norm": 0.4598552882671356, + "learning_rate": 0.0002, + "loss": 1.606, + "step": 3060 + }, + { + "epoch": 2.0919931856899487, + "grad_norm": 0.4582861661911011, + "learning_rate": 0.0002, + "loss": 1.6357, + "step": 3070 + }, + { + "epoch": 2.098807495741056, + "grad_norm": 0.4366969168186188, + "learning_rate": 0.0002, + "loss": 1.5853, + "step": 3080 + }, + { + "epoch": 2.1056218057921634, + "grad_norm": 0.495917409658432, + "learning_rate": 0.0002, + "loss": 1.5574, + "step": 3090 + }, + { + "epoch": 2.1124361158432707, + "grad_norm": 1.6846044063568115, + "learning_rate": 0.0002, + "loss": 1.5798, + "step": 3100 + }, + { + "epoch": 2.119250425894378, + "grad_norm": 0.4765092134475708, + "learning_rate": 0.0002, + "loss": 1.5877, + "step": 3110 + }, + { + "epoch": 2.1260647359454854, + "grad_norm": 0.45029810070991516, + "learning_rate": 0.0002, + "loss": 1.6106, + "step": 3120 + }, + { + "epoch": 2.1328790459965927, + "grad_norm": 0.5706973075866699, + "learning_rate": 0.0002, + "loss": 1.5707, + "step": 3130 + }, + { + "epoch": 2.1396933560477, + "grad_norm": 0.4606274366378784, + "learning_rate": 0.0002, + "loss": 1.589, + "step": 3140 + }, + { + "epoch": 2.1465076660988074, + "grad_norm": 2.199115753173828, + "learning_rate": 0.0002, + "loss": 1.564, + "step": 3150 + }, + { + "epoch": 2.1533219761499147, + "grad_norm": 0.6231027245521545, + "learning_rate": 0.0002, + "loss": 1.5808, + "step": 3160 + }, + { + "epoch": 2.160136286201022, + "grad_norm": 0.46918219327926636, + "learning_rate": 0.0002, + "loss": 1.5397, + "step": 3170 + }, + { + "epoch": 2.1669505962521294, + "grad_norm": 0.5006393194198608, + "learning_rate": 0.0002, + "loss": 1.6082, + "step": 3180 + }, + { + "epoch": 2.1737649063032367, + "grad_norm": 0.4745093286037445, + "learning_rate": 0.0002, + "loss": 1.591, + "step": 3190 + }, + { + "epoch": 2.180579216354344, + "grad_norm": 0.511576771736145, + "learning_rate": 0.0002, + "loss": 1.6459, + "step": 3200 + }, + { + "epoch": 2.1873935264054514, + "grad_norm": 0.38622918725013733, + "learning_rate": 0.0002, + "loss": 1.6019, + "step": 3210 + }, + { + "epoch": 2.1942078364565587, + "grad_norm": 0.4425644278526306, + "learning_rate": 0.0002, + "loss": 1.6187, + "step": 3220 + }, + { + "epoch": 2.201022146507666, + "grad_norm": 0.45643091201782227, + "learning_rate": 0.0002, + "loss": 1.6114, + "step": 3230 + }, + { + "epoch": 2.2078364565587734, + "grad_norm": 0.4975406229496002, + "learning_rate": 0.0002, + "loss": 1.6224, + "step": 3240 + }, + { + "epoch": 2.2146507666098807, + "grad_norm": 0.4673331081867218, + "learning_rate": 0.0002, + "loss": 1.6654, + "step": 3250 + }, + { + "epoch": 2.221465076660988, + "grad_norm": 0.5081731081008911, + "learning_rate": 0.0002, + "loss": 1.6155, + "step": 3260 + }, + { + "epoch": 2.2282793867120954, + "grad_norm": 0.4790806770324707, + "learning_rate": 0.0002, + "loss": 1.53, + "step": 3270 + }, + { + "epoch": 2.2350936967632027, + "grad_norm": 0.5184140801429749, + "learning_rate": 0.0002, + "loss": 1.6362, + "step": 3280 + }, + { + "epoch": 2.24190800681431, + "grad_norm": 0.5159083604812622, + "learning_rate": 0.0002, + "loss": 1.5646, + "step": 3290 + }, + { + "epoch": 2.2487223168654173, + "grad_norm": 0.4876042604446411, + "learning_rate": 0.0002, + "loss": 1.6577, + "step": 3300 + }, + { + "epoch": 2.2555366269165247, + "grad_norm": 0.5454957485198975, + "learning_rate": 0.0002, + "loss": 1.5718, + "step": 3310 + }, + { + "epoch": 2.262350936967632, + "grad_norm": 0.49866822361946106, + "learning_rate": 0.0002, + "loss": 1.5885, + "step": 3320 + }, + { + "epoch": 2.2691652470187393, + "grad_norm": 0.42674365639686584, + "learning_rate": 0.0002, + "loss": 1.5923, + "step": 3330 + }, + { + "epoch": 2.2759795570698467, + "grad_norm": 0.5202316641807556, + "learning_rate": 0.0002, + "loss": 1.5588, + "step": 3340 + }, + { + "epoch": 2.282793867120954, + "grad_norm": 0.4849465489387512, + "learning_rate": 0.0002, + "loss": 1.6032, + "step": 3350 + }, + { + "epoch": 2.2896081771720613, + "grad_norm": 0.47202569246292114, + "learning_rate": 0.0002, + "loss": 1.6853, + "step": 3360 + }, + { + "epoch": 2.2964224872231687, + "grad_norm": 0.5311620235443115, + "learning_rate": 0.0002, + "loss": 1.6164, + "step": 3370 + }, + { + "epoch": 2.303236797274276, + "grad_norm": 0.49011409282684326, + "learning_rate": 0.0002, + "loss": 1.6004, + "step": 3380 + }, + { + "epoch": 2.3100511073253833, + "grad_norm": 0.4789247512817383, + "learning_rate": 0.0002, + "loss": 1.5484, + "step": 3390 + }, + { + "epoch": 2.3168654173764907, + "grad_norm": 0.46646103262901306, + "learning_rate": 0.0002, + "loss": 1.5866, + "step": 3400 + }, + { + "epoch": 2.323679727427598, + "grad_norm": 0.5552441477775574, + "learning_rate": 0.0002, + "loss": 1.5308, + "step": 3410 + }, + { + "epoch": 2.3304940374787053, + "grad_norm": 0.4530351758003235, + "learning_rate": 0.0002, + "loss": 1.5761, + "step": 3420 + }, + { + "epoch": 2.3373083475298126, + "grad_norm": 0.4806232750415802, + "learning_rate": 0.0002, + "loss": 1.5919, + "step": 3430 + }, + { + "epoch": 2.34412265758092, + "grad_norm": 0.5998363494873047, + "learning_rate": 0.0002, + "loss": 1.5569, + "step": 3440 + }, + { + "epoch": 2.3509369676320273, + "grad_norm": 0.4918554425239563, + "learning_rate": 0.0002, + "loss": 1.513, + "step": 3450 + }, + { + "epoch": 2.3577512776831346, + "grad_norm": 0.5359559655189514, + "learning_rate": 0.0002, + "loss": 1.6323, + "step": 3460 + }, + { + "epoch": 2.364565587734242, + "grad_norm": 0.5053277611732483, + "learning_rate": 0.0002, + "loss": 1.5973, + "step": 3470 + }, + { + "epoch": 2.3713798977853493, + "grad_norm": 0.5058915019035339, + "learning_rate": 0.0002, + "loss": 1.5673, + "step": 3480 + }, + { + "epoch": 2.3781942078364566, + "grad_norm": 0.5314899682998657, + "learning_rate": 0.0002, + "loss": 1.5434, + "step": 3490 + }, + { + "epoch": 2.385008517887564, + "grad_norm": 0.48035913705825806, + "learning_rate": 0.0002, + "loss": 1.5203, + "step": 3500 + }, + { + "epoch": 2.3918228279386713, + "grad_norm": 0.45864903926849365, + "learning_rate": 0.0002, + "loss": 1.5936, + "step": 3510 + }, + { + "epoch": 2.3986371379897786, + "grad_norm": 0.4553050398826599, + "learning_rate": 0.0002, + "loss": 1.6285, + "step": 3520 + }, + { + "epoch": 2.405451448040886, + "grad_norm": 0.4483442008495331, + "learning_rate": 0.0002, + "loss": 1.5444, + "step": 3530 + }, + { + "epoch": 2.4122657580919933, + "grad_norm": 0.5043742060661316, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 3540 + }, + { + "epoch": 2.4190800681431006, + "grad_norm": 0.44807168841362, + "learning_rate": 0.0002, + "loss": 1.5692, + "step": 3550 + }, + { + "epoch": 2.425894378194208, + "grad_norm": 0.5065137147903442, + "learning_rate": 0.0002, + "loss": 1.6306, + "step": 3560 + }, + { + "epoch": 2.4327086882453153, + "grad_norm": 0.5186443328857422, + "learning_rate": 0.0002, + "loss": 1.5842, + "step": 3570 + }, + { + "epoch": 2.4395229982964226, + "grad_norm": 0.49743232131004333, + "learning_rate": 0.0002, + "loss": 1.5956, + "step": 3580 + }, + { + "epoch": 2.44633730834753, + "grad_norm": 0.524450421333313, + "learning_rate": 0.0002, + "loss": 1.6021, + "step": 3590 + }, + { + "epoch": 2.4531516183986373, + "grad_norm": 0.5053797364234924, + "learning_rate": 0.0002, + "loss": 1.6283, + "step": 3600 + }, + { + "epoch": 2.4599659284497446, + "grad_norm": 0.5223091840744019, + "learning_rate": 0.0002, + "loss": 1.6335, + "step": 3610 + }, + { + "epoch": 2.466780238500852, + "grad_norm": 0.4763810932636261, + "learning_rate": 0.0002, + "loss": 1.6315, + "step": 3620 + }, + { + "epoch": 2.4735945485519593, + "grad_norm": 0.5097282528877258, + "learning_rate": 0.0002, + "loss": 1.5623, + "step": 3630 + }, + { + "epoch": 2.4804088586030666, + "grad_norm": 0.5831942558288574, + "learning_rate": 0.0002, + "loss": 1.605, + "step": 3640 + }, + { + "epoch": 2.487223168654174, + "grad_norm": 0.47573572397232056, + "learning_rate": 0.0002, + "loss": 1.6074, + "step": 3650 + }, + { + "epoch": 2.4940374787052813, + "grad_norm": 0.49602726101875305, + "learning_rate": 0.0002, + "loss": 1.6411, + "step": 3660 + }, + { + "epoch": 2.500851788756388, + "grad_norm": 0.5468524694442749, + "learning_rate": 0.0002, + "loss": 1.571, + "step": 3670 + }, + { + "epoch": 2.507666098807496, + "grad_norm": 0.45899373292922974, + "learning_rate": 0.0002, + "loss": 1.5782, + "step": 3680 + }, + { + "epoch": 2.514480408858603, + "grad_norm": 0.5031567215919495, + "learning_rate": 0.0002, + "loss": 1.5114, + "step": 3690 + }, + { + "epoch": 2.5212947189097106, + "grad_norm": 0.5224900841712952, + "learning_rate": 0.0002, + "loss": 1.538, + "step": 3700 + }, + { + "epoch": 2.5281090289608175, + "grad_norm": 0.504769504070282, + "learning_rate": 0.0002, + "loss": 1.5269, + "step": 3710 + }, + { + "epoch": 2.5349233390119252, + "grad_norm": 0.6120529770851135, + "learning_rate": 0.0002, + "loss": 1.5141, + "step": 3720 + }, + { + "epoch": 2.541737649063032, + "grad_norm": 0.47930678725242615, + "learning_rate": 0.0002, + "loss": 1.5666, + "step": 3730 + }, + { + "epoch": 2.54855195911414, + "grad_norm": 0.5039092302322388, + "learning_rate": 0.0002, + "loss": 1.6156, + "step": 3740 + }, + { + "epoch": 2.555366269165247, + "grad_norm": 0.49758994579315186, + "learning_rate": 0.0002, + "loss": 1.5585, + "step": 3750 + }, + { + "epoch": 2.5621805792163546, + "grad_norm": 0.44739171862602234, + "learning_rate": 0.0002, + "loss": 1.5351, + "step": 3760 + }, + { + "epoch": 2.5689948892674614, + "grad_norm": 0.47056373953819275, + "learning_rate": 0.0002, + "loss": 1.5099, + "step": 3770 + }, + { + "epoch": 2.575809199318569, + "grad_norm": 0.5077595114707947, + "learning_rate": 0.0002, + "loss": 1.5524, + "step": 3780 + }, + { + "epoch": 2.582623509369676, + "grad_norm": 0.4981902837753296, + "learning_rate": 0.0002, + "loss": 1.5524, + "step": 3790 + }, + { + "epoch": 2.589437819420784, + "grad_norm": 0.5736238360404968, + "learning_rate": 0.0002, + "loss": 1.5381, + "step": 3800 + }, + { + "epoch": 2.5962521294718908, + "grad_norm": 0.4898384213447571, + "learning_rate": 0.0002, + "loss": 1.67, + "step": 3810 + }, + { + "epoch": 2.6030664395229985, + "grad_norm": 0.4611325263977051, + "learning_rate": 0.0002, + "loss": 1.5411, + "step": 3820 + }, + { + "epoch": 2.6098807495741054, + "grad_norm": 0.5285341739654541, + "learning_rate": 0.0002, + "loss": 1.5662, + "step": 3830 + }, + { + "epoch": 2.616695059625213, + "grad_norm": 0.5679430961608887, + "learning_rate": 0.0002, + "loss": 1.5875, + "step": 3840 + }, + { + "epoch": 2.62350936967632, + "grad_norm": 0.48532548546791077, + "learning_rate": 0.0002, + "loss": 1.5544, + "step": 3850 + }, + { + "epoch": 2.630323679727428, + "grad_norm": 0.45506492257118225, + "learning_rate": 0.0002, + "loss": 1.579, + "step": 3860 + }, + { + "epoch": 2.6371379897785348, + "grad_norm": 0.6552556753158569, + "learning_rate": 0.0002, + "loss": 1.5775, + "step": 3870 + }, + { + "epoch": 2.643952299829642, + "grad_norm": 0.537874698638916, + "learning_rate": 0.0002, + "loss": 1.608, + "step": 3880 + }, + { + "epoch": 2.6507666098807494, + "grad_norm": 0.46102389693260193, + "learning_rate": 0.0002, + "loss": 1.5653, + "step": 3890 + }, + { + "epoch": 2.6575809199318567, + "grad_norm": 0.45531195402145386, + "learning_rate": 0.0002, + "loss": 1.5293, + "step": 3900 + }, + { + "epoch": 2.664395229982964, + "grad_norm": 0.5327293872833252, + "learning_rate": 0.0002, + "loss": 1.5492, + "step": 3910 + }, + { + "epoch": 2.6712095400340714, + "grad_norm": 0.4968956410884857, + "learning_rate": 0.0002, + "loss": 1.5921, + "step": 3920 + }, + { + "epoch": 2.6780238500851787, + "grad_norm": 0.4790082275867462, + "learning_rate": 0.0002, + "loss": 1.5823, + "step": 3930 + }, + { + "epoch": 2.684838160136286, + "grad_norm": 0.5392967462539673, + "learning_rate": 0.0002, + "loss": 1.615, + "step": 3940 + }, + { + "epoch": 2.6916524701873934, + "grad_norm": 0.5076649785041809, + "learning_rate": 0.0002, + "loss": 1.6218, + "step": 3950 + }, + { + "epoch": 2.6984667802385007, + "grad_norm": 0.5628064274787903, + "learning_rate": 0.0002, + "loss": 1.6478, + "step": 3960 + }, + { + "epoch": 2.705281090289608, + "grad_norm": 0.5012659430503845, + "learning_rate": 0.0002, + "loss": 1.5417, + "step": 3970 + }, + { + "epoch": 2.7120954003407154, + "grad_norm": 0.4947647452354431, + "learning_rate": 0.0002, + "loss": 1.5339, + "step": 3980 + }, + { + "epoch": 2.7189097103918227, + "grad_norm": 0.4890969693660736, + "learning_rate": 0.0002, + "loss": 1.5724, + "step": 3990 + }, + { + "epoch": 2.72572402044293, + "grad_norm": 0.4471694231033325, + "learning_rate": 0.0002, + "loss": 1.5746, + "step": 4000 + }, + { + "epoch": 2.7325383304940374, + "grad_norm": 0.5116439461708069, + "learning_rate": 0.0002, + "loss": 1.6669, + "step": 4010 + }, + { + "epoch": 2.7393526405451447, + "grad_norm": 0.5720411539077759, + "learning_rate": 0.0002, + "loss": 1.584, + "step": 4020 + }, + { + "epoch": 2.746166950596252, + "grad_norm": 0.5529406070709229, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 4030 + }, + { + "epoch": 2.7529812606473594, + "grad_norm": 0.5229396820068359, + "learning_rate": 0.0002, + "loss": 1.6296, + "step": 4040 + }, + { + "epoch": 2.7597955706984667, + "grad_norm": 0.5270276069641113, + "learning_rate": 0.0002, + "loss": 1.5363, + "step": 4050 + }, + { + "epoch": 2.766609880749574, + "grad_norm": 0.48413026332855225, + "learning_rate": 0.0002, + "loss": 1.6305, + "step": 4060 + }, + { + "epoch": 2.7734241908006814, + "grad_norm": 0.5145403742790222, + "learning_rate": 0.0002, + "loss": 1.5488, + "step": 4070 + }, + { + "epoch": 2.7802385008517887, + "grad_norm": 0.48626071214675903, + "learning_rate": 0.0002, + "loss": 1.6023, + "step": 4080 + }, + { + "epoch": 2.787052810902896, + "grad_norm": 0.5018984079360962, + "learning_rate": 0.0002, + "loss": 1.6082, + "step": 4090 + }, + { + "epoch": 2.7938671209540034, + "grad_norm": 0.4881938695907593, + "learning_rate": 0.0002, + "loss": 1.7166, + "step": 4100 + }, + { + "epoch": 2.8006814310051107, + "grad_norm": 0.5151546001434326, + "learning_rate": 0.0002, + "loss": 1.515, + "step": 4110 + }, + { + "epoch": 2.807495741056218, + "grad_norm": 0.5109850764274597, + "learning_rate": 0.0002, + "loss": 1.6069, + "step": 4120 + }, + { + "epoch": 2.8143100511073254, + "grad_norm": 0.5109251141548157, + "learning_rate": 0.0002, + "loss": 1.6153, + "step": 4130 + }, + { + "epoch": 2.8211243611584327, + "grad_norm": 0.5025496482849121, + "learning_rate": 0.0002, + "loss": 1.6365, + "step": 4140 + }, + { + "epoch": 2.82793867120954, + "grad_norm": 0.49027004837989807, + "learning_rate": 0.0002, + "loss": 1.6292, + "step": 4150 + }, + { + "epoch": 2.8347529812606473, + "grad_norm": 0.4957362413406372, + "learning_rate": 0.0002, + "loss": 1.5591, + "step": 4160 + }, + { + "epoch": 2.8415672913117547, + "grad_norm": 0.5159927606582642, + "learning_rate": 0.0002, + "loss": 1.6759, + "step": 4170 + }, + { + "epoch": 2.848381601362862, + "grad_norm": 0.6040670871734619, + "learning_rate": 0.0002, + "loss": 1.577, + "step": 4180 + }, + { + "epoch": 2.8551959114139693, + "grad_norm": 0.5489953756332397, + "learning_rate": 0.0002, + "loss": 1.5295, + "step": 4190 + }, + { + "epoch": 2.8620102214650767, + "grad_norm": 0.5416634678840637, + "learning_rate": 0.0002, + "loss": 1.5909, + "step": 4200 + }, + { + "epoch": 2.868824531516184, + "grad_norm": 0.5278245210647583, + "learning_rate": 0.0002, + "loss": 1.6014, + "step": 4210 + }, + { + "epoch": 2.8756388415672913, + "grad_norm": 0.43382319808006287, + "learning_rate": 0.0002, + "loss": 1.4848, + "step": 4220 + }, + { + "epoch": 2.8824531516183987, + "grad_norm": 0.4724387228488922, + "learning_rate": 0.0002, + "loss": 1.6092, + "step": 4230 + }, + { + "epoch": 2.889267461669506, + "grad_norm": 0.49824780225753784, + "learning_rate": 0.0002, + "loss": 1.5748, + "step": 4240 + }, + { + "epoch": 2.8960817717206133, + "grad_norm": 0.5360262989997864, + "learning_rate": 0.0002, + "loss": 1.6163, + "step": 4250 + }, + { + "epoch": 2.9028960817717206, + "grad_norm": 0.49090322852134705, + "learning_rate": 0.0002, + "loss": 1.5934, + "step": 4260 + }, + { + "epoch": 2.909710391822828, + "grad_norm": 0.5613328218460083, + "learning_rate": 0.0002, + "loss": 1.624, + "step": 4270 + }, + { + "epoch": 2.9165247018739353, + "grad_norm": 0.4611356258392334, + "learning_rate": 0.0002, + "loss": 1.5713, + "step": 4280 + }, + { + "epoch": 2.9233390119250426, + "grad_norm": 0.550897479057312, + "learning_rate": 0.0002, + "loss": 1.5457, + "step": 4290 + }, + { + "epoch": 2.93015332197615, + "grad_norm": 0.5089612603187561, + "learning_rate": 0.0002, + "loss": 1.6225, + "step": 4300 + }, + { + "epoch": 2.9369676320272573, + "grad_norm": 0.5210904479026794, + "learning_rate": 0.0002, + "loss": 1.5897, + "step": 4310 + }, + { + "epoch": 2.9437819420783646, + "grad_norm": 0.5506424903869629, + "learning_rate": 0.0002, + "loss": 1.6198, + "step": 4320 + }, + { + "epoch": 2.950596252129472, + "grad_norm": 0.5118561387062073, + "learning_rate": 0.0002, + "loss": 1.6395, + "step": 4330 + }, + { + "epoch": 2.9574105621805793, + "grad_norm": 0.5034464597702026, + "learning_rate": 0.0002, + "loss": 1.704, + "step": 4340 + }, + { + "epoch": 2.9642248722316866, + "grad_norm": 0.5019990801811218, + "learning_rate": 0.0002, + "loss": 1.6314, + "step": 4350 + }, + { + "epoch": 2.971039182282794, + "grad_norm": 0.5423325300216675, + "learning_rate": 0.0002, + "loss": 1.6161, + "step": 4360 + }, + { + "epoch": 2.9778534923339013, + "grad_norm": 0.5287469625473022, + "learning_rate": 0.0002, + "loss": 1.6144, + "step": 4370 + }, + { + "epoch": 2.9846678023850086, + "grad_norm": 0.5206913352012634, + "learning_rate": 0.0002, + "loss": 1.5227, + "step": 4380 + }, + { + "epoch": 2.991482112436116, + "grad_norm": 0.5407394170761108, + "learning_rate": 0.0002, + "loss": 1.6026, + "step": 4390 + }, + { + "epoch": 2.9982964224872233, + "grad_norm": 0.5244600176811218, + "learning_rate": 0.0002, + "loss": 1.5908, + "step": 4400 + }, + { + "epoch": 2.9996592844974446, + "eval_loss": 1.8412635326385498, + "eval_runtime": 65.5583, + "eval_samples_per_second": 7.734, + "eval_steps_per_second": 0.976, + "step": 4402 + }, + { + "epoch": 3.0051107325383306, + "grad_norm": 0.5172150731086731, + "learning_rate": 0.0002, + "loss": 1.5157, + "step": 4410 + }, + { + "epoch": 3.011925042589438, + "grad_norm": 0.6882525086402893, + "learning_rate": 0.0002, + "loss": 1.398, + "step": 4420 + }, + { + "epoch": 3.0187393526405453, + "grad_norm": 0.6435003280639648, + "learning_rate": 0.0002, + "loss": 1.3884, + "step": 4430 + }, + { + "epoch": 3.0255536626916526, + "grad_norm": 0.7126057147979736, + "learning_rate": 0.0002, + "loss": 1.4493, + "step": 4440 + }, + { + "epoch": 3.03236797274276, + "grad_norm": 0.6634385585784912, + "learning_rate": 0.0002, + "loss": 1.4397, + "step": 4450 + }, + { + "epoch": 3.0391822827938673, + "grad_norm": 0.6468435525894165, + "learning_rate": 0.0002, + "loss": 1.3674, + "step": 4460 + }, + { + "epoch": 3.0459965928449746, + "grad_norm": 0.5690478086471558, + "learning_rate": 0.0002, + "loss": 1.4045, + "step": 4470 + }, + { + "epoch": 3.052810902896082, + "grad_norm": 0.7323708534240723, + "learning_rate": 0.0002, + "loss": 1.3742, + "step": 4480 + }, + { + "epoch": 3.0596252129471893, + "grad_norm": 0.6989302039146423, + "learning_rate": 0.0002, + "loss": 1.3281, + "step": 4490 + }, + { + "epoch": 3.0664395229982966, + "grad_norm": 0.6704450845718384, + "learning_rate": 0.0002, + "loss": 1.379, + "step": 4500 + }, + { + "epoch": 3.073253833049404, + "grad_norm": 0.769137442111969, + "learning_rate": 0.0002, + "loss": 1.4028, + "step": 4510 + }, + { + "epoch": 3.0800681431005112, + "grad_norm": 0.6556448936462402, + "learning_rate": 0.0002, + "loss": 1.4295, + "step": 4520 + }, + { + "epoch": 3.0868824531516186, + "grad_norm": 0.7143950462341309, + "learning_rate": 0.0002, + "loss": 1.2763, + "step": 4530 + }, + { + "epoch": 3.093696763202726, + "grad_norm": 0.7060510516166687, + "learning_rate": 0.0002, + "loss": 1.4806, + "step": 4540 + }, + { + "epoch": 3.1005110732538332, + "grad_norm": 0.6637526750564575, + "learning_rate": 0.0002, + "loss": 1.4097, + "step": 4550 + }, + { + "epoch": 3.1073253833049406, + "grad_norm": 0.822989284992218, + "learning_rate": 0.0002, + "loss": 1.4752, + "step": 4560 + }, + { + "epoch": 3.114139693356048, + "grad_norm": 0.5542152523994446, + "learning_rate": 0.0002, + "loss": 1.4994, + "step": 4570 + }, + { + "epoch": 3.1209540034071552, + "grad_norm": 0.7780306935310364, + "learning_rate": 0.0002, + "loss": 1.4306, + "step": 4580 + }, + { + "epoch": 3.1277683134582626, + "grad_norm": 0.7372637987136841, + "learning_rate": 0.0002, + "loss": 1.3909, + "step": 4590 + }, + { + "epoch": 3.1345826235093694, + "grad_norm": 0.6730087995529175, + "learning_rate": 0.0002, + "loss": 1.3989, + "step": 4600 + }, + { + "epoch": 3.1413969335604772, + "grad_norm": 0.6687398552894592, + "learning_rate": 0.0002, + "loss": 1.3591, + "step": 4610 + }, + { + "epoch": 3.148211243611584, + "grad_norm": 0.7645083665847778, + "learning_rate": 0.0002, + "loss": 1.436, + "step": 4620 + }, + { + "epoch": 3.155025553662692, + "grad_norm": 0.6770380139350891, + "learning_rate": 0.0002, + "loss": 1.3681, + "step": 4630 + }, + { + "epoch": 3.1618398637137988, + "grad_norm": 0.7200576663017273, + "learning_rate": 0.0002, + "loss": 1.405, + "step": 4640 + }, + { + "epoch": 3.168654173764906, + "grad_norm": 0.6663638949394226, + "learning_rate": 0.0002, + "loss": 1.3752, + "step": 4650 + }, + { + "epoch": 3.1754684838160134, + "grad_norm": 0.6602960228919983, + "learning_rate": 0.0002, + "loss": 1.4099, + "step": 4660 + }, + { + "epoch": 3.1822827938671208, + "grad_norm": 0.7838228344917297, + "learning_rate": 0.0002, + "loss": 1.4003, + "step": 4670 + }, + { + "epoch": 3.189097103918228, + "grad_norm": 0.7559184432029724, + "learning_rate": 0.0002, + "loss": 1.3853, + "step": 4680 + }, + { + "epoch": 3.1959114139693354, + "grad_norm": 0.6609814167022705, + "learning_rate": 0.0002, + "loss": 1.4516, + "step": 4690 + }, + { + "epoch": 3.2027257240204428, + "grad_norm": 0.8470419645309448, + "learning_rate": 0.0002, + "loss": 1.4464, + "step": 4700 + }, + { + "epoch": 3.20954003407155, + "grad_norm": 0.7282822728157043, + "learning_rate": 0.0002, + "loss": 1.428, + "step": 4710 + }, + { + "epoch": 3.2163543441226574, + "grad_norm": 0.6722773313522339, + "learning_rate": 0.0002, + "loss": 1.5261, + "step": 4720 + }, + { + "epoch": 3.2231686541737647, + "grad_norm": 0.7630265355110168, + "learning_rate": 0.0002, + "loss": 1.3809, + "step": 4730 + }, + { + "epoch": 3.229982964224872, + "grad_norm": 0.7102773785591125, + "learning_rate": 0.0002, + "loss": 1.42, + "step": 4740 + }, + { + "epoch": 3.2367972742759794, + "grad_norm": 0.7778299450874329, + "learning_rate": 0.0002, + "loss": 1.3529, + "step": 4750 + }, + { + "epoch": 3.2436115843270867, + "grad_norm": 0.7189921736717224, + "learning_rate": 0.0002, + "loss": 1.4715, + "step": 4760 + }, + { + "epoch": 3.250425894378194, + "grad_norm": 0.7708092331886292, + "learning_rate": 0.0002, + "loss": 1.4328, + "step": 4770 + }, + { + "epoch": 3.2572402044293014, + "grad_norm": 0.7208452224731445, + "learning_rate": 0.0002, + "loss": 1.3855, + "step": 4780 + }, + { + "epoch": 3.2640545144804087, + "grad_norm": 0.7220432758331299, + "learning_rate": 0.0002, + "loss": 1.3206, + "step": 4790 + }, + { + "epoch": 3.270868824531516, + "grad_norm": 0.7064954042434692, + "learning_rate": 0.0002, + "loss": 1.463, + "step": 4800 + }, + { + "epoch": 3.2776831345826234, + "grad_norm": 0.6618382334709167, + "learning_rate": 0.0002, + "loss": 1.4236, + "step": 4810 + }, + { + "epoch": 3.2844974446337307, + "grad_norm": 0.6854256391525269, + "learning_rate": 0.0002, + "loss": 1.3878, + "step": 4820 + }, + { + "epoch": 3.291311754684838, + "grad_norm": 0.6036319136619568, + "learning_rate": 0.0002, + "loss": 1.4236, + "step": 4830 + }, + { + "epoch": 3.2981260647359454, + "grad_norm": 0.714678943157196, + "learning_rate": 0.0002, + "loss": 1.4796, + "step": 4840 + }, + { + "epoch": 3.3049403747870527, + "grad_norm": 0.7218600511550903, + "learning_rate": 0.0002, + "loss": 1.4273, + "step": 4850 + }, + { + "epoch": 3.31175468483816, + "grad_norm": 0.7243074774742126, + "learning_rate": 0.0002, + "loss": 1.3915, + "step": 4860 + }, + { + "epoch": 3.3185689948892674, + "grad_norm": 0.7058630585670471, + "learning_rate": 0.0002, + "loss": 1.4088, + "step": 4870 + }, + { + "epoch": 3.3253833049403747, + "grad_norm": 0.7091076970100403, + "learning_rate": 0.0002, + "loss": 1.3837, + "step": 4880 + }, + { + "epoch": 3.332197614991482, + "grad_norm": 0.7375147342681885, + "learning_rate": 0.0002, + "loss": 1.4745, + "step": 4890 + }, + { + "epoch": 3.3390119250425894, + "grad_norm": 0.9426755309104919, + "learning_rate": 0.0002, + "loss": 1.4826, + "step": 4900 + }, + { + "epoch": 3.3458262350936967, + "grad_norm": 0.6508213877677917, + "learning_rate": 0.0002, + "loss": 1.369, + "step": 4910 + }, + { + "epoch": 3.352640545144804, + "grad_norm": 0.6945043206214905, + "learning_rate": 0.0002, + "loss": 1.3839, + "step": 4920 + }, + { + "epoch": 3.3594548551959114, + "grad_norm": 0.6335888504981995, + "learning_rate": 0.0002, + "loss": 1.3571, + "step": 4930 + }, + { + "epoch": 3.3662691652470187, + "grad_norm": 0.6947107911109924, + "learning_rate": 0.0002, + "loss": 1.4391, + "step": 4940 + }, + { + "epoch": 3.373083475298126, + "grad_norm": 0.8204733729362488, + "learning_rate": 0.0002, + "loss": 1.3885, + "step": 4950 + }, + { + "epoch": 3.3798977853492334, + "grad_norm": 0.7212244868278503, + "learning_rate": 0.0002, + "loss": 1.4886, + "step": 4960 + }, + { + "epoch": 3.3867120954003407, + "grad_norm": 0.6053042411804199, + "learning_rate": 0.0002, + "loss": 1.4581, + "step": 4970 + }, + { + "epoch": 3.393526405451448, + "grad_norm": 0.7820029854774475, + "learning_rate": 0.0002, + "loss": 1.3863, + "step": 4980 + }, + { + "epoch": 3.4003407155025553, + "grad_norm": 0.6866770386695862, + "learning_rate": 0.0002, + "loss": 1.4326, + "step": 4990 + }, + { + "epoch": 3.4071550255536627, + "grad_norm": 0.6652463674545288, + "learning_rate": 0.0002, + "loss": 1.4287, + "step": 5000 + }, + { + "epoch": 3.41396933560477, + "grad_norm": 1.1209032535552979, + "learning_rate": 0.0002, + "loss": 1.3667, + "step": 5010 + }, + { + "epoch": 3.4207836456558773, + "grad_norm": 0.8390814661979675, + "learning_rate": 0.0002, + "loss": 1.4461, + "step": 5020 + }, + { + "epoch": 3.4275979557069847, + "grad_norm": 0.7541858553886414, + "learning_rate": 0.0002, + "loss": 1.4556, + "step": 5030 + }, + { + "epoch": 3.434412265758092, + "grad_norm": 0.6902772784233093, + "learning_rate": 0.0002, + "loss": 1.4245, + "step": 5040 + }, + { + "epoch": 3.4412265758091993, + "grad_norm": 0.7070329785346985, + "learning_rate": 0.0002, + "loss": 1.3953, + "step": 5050 + }, + { + "epoch": 3.4480408858603067, + "grad_norm": 0.8075643181800842, + "learning_rate": 0.0002, + "loss": 1.3903, + "step": 5060 + }, + { + "epoch": 3.454855195911414, + "grad_norm": 0.7133861780166626, + "learning_rate": 0.0002, + "loss": 1.3929, + "step": 5070 + }, + { + "epoch": 3.4616695059625213, + "grad_norm": 0.6631823182106018, + "learning_rate": 0.0002, + "loss": 1.4632, + "step": 5080 + }, + { + "epoch": 3.4684838160136287, + "grad_norm": 0.673870325088501, + "learning_rate": 0.0002, + "loss": 1.4162, + "step": 5090 + }, + { + "epoch": 3.475298126064736, + "grad_norm": 0.6438634395599365, + "learning_rate": 0.0002, + "loss": 1.4247, + "step": 5100 + }, + { + "epoch": 3.4821124361158433, + "grad_norm": 0.7560495734214783, + "learning_rate": 0.0002, + "loss": 1.4421, + "step": 5110 + }, + { + "epoch": 3.4889267461669506, + "grad_norm": 0.6877814531326294, + "learning_rate": 0.0002, + "loss": 1.4125, + "step": 5120 + }, + { + "epoch": 3.495741056218058, + "grad_norm": 0.7031328678131104, + "learning_rate": 0.0002, + "loss": 1.4308, + "step": 5130 + }, + { + "epoch": 3.5025553662691653, + "grad_norm": 0.6797195672988892, + "learning_rate": 0.0002, + "loss": 1.3705, + "step": 5140 + }, + { + "epoch": 3.5093696763202726, + "grad_norm": 0.6766413450241089, + "learning_rate": 0.0002, + "loss": 1.4687, + "step": 5150 + }, + { + "epoch": 3.51618398637138, + "grad_norm": 0.666656494140625, + "learning_rate": 0.0002, + "loss": 1.4194, + "step": 5160 + }, + { + "epoch": 3.5229982964224873, + "grad_norm": 0.74996417760849, + "learning_rate": 0.0002, + "loss": 1.469, + "step": 5170 + }, + { + "epoch": 3.5298126064735946, + "grad_norm": 0.7370911836624146, + "learning_rate": 0.0002, + "loss": 1.4848, + "step": 5180 + }, + { + "epoch": 3.536626916524702, + "grad_norm": 0.9063456654548645, + "learning_rate": 0.0002, + "loss": 1.4523, + "step": 5190 + }, + { + "epoch": 3.5434412265758093, + "grad_norm": 0.6861422657966614, + "learning_rate": 0.0002, + "loss": 1.4726, + "step": 5200 + }, + { + "epoch": 3.5502555366269166, + "grad_norm": 0.7104039788246155, + "learning_rate": 0.0002, + "loss": 1.4803, + "step": 5210 + }, + { + "epoch": 3.557069846678024, + "grad_norm": 0.6578653454780579, + "learning_rate": 0.0002, + "loss": 1.4313, + "step": 5220 + }, + { + "epoch": 3.5638841567291313, + "grad_norm": 0.7336562275886536, + "learning_rate": 0.0002, + "loss": 1.4596, + "step": 5230 + }, + { + "epoch": 3.5706984667802386, + "grad_norm": 0.7163010835647583, + "learning_rate": 0.0002, + "loss": 1.4591, + "step": 5240 + }, + { + "epoch": 3.577512776831346, + "grad_norm": 0.8112391233444214, + "learning_rate": 0.0002, + "loss": 1.3814, + "step": 5250 + }, + { + "epoch": 3.5843270868824533, + "grad_norm": 0.7260391116142273, + "learning_rate": 0.0002, + "loss": 1.4249, + "step": 5260 + }, + { + "epoch": 3.5911413969335606, + "grad_norm": 0.7038731575012207, + "learning_rate": 0.0002, + "loss": 1.4249, + "step": 5270 + }, + { + "epoch": 3.597955706984668, + "grad_norm": 0.7864376902580261, + "learning_rate": 0.0002, + "loss": 1.4172, + "step": 5280 + }, + { + "epoch": 3.6047700170357753, + "grad_norm": 0.6968383193016052, + "learning_rate": 0.0002, + "loss": 1.4637, + "step": 5290 + }, + { + "epoch": 3.6115843270868826, + "grad_norm": 0.6726206541061401, + "learning_rate": 0.0002, + "loss": 1.5269, + "step": 5300 + }, + { + "epoch": 3.61839863713799, + "grad_norm": 0.6716854572296143, + "learning_rate": 0.0002, + "loss": 1.4199, + "step": 5310 + }, + { + "epoch": 3.6252129471890973, + "grad_norm": 0.7229742407798767, + "learning_rate": 0.0002, + "loss": 1.4686, + "step": 5320 + }, + { + "epoch": 3.6320272572402046, + "grad_norm": 0.7338683009147644, + "learning_rate": 0.0002, + "loss": 1.4441, + "step": 5330 + }, + { + "epoch": 3.638841567291312, + "grad_norm": 0.771672785282135, + "learning_rate": 0.0002, + "loss": 1.4116, + "step": 5340 + }, + { + "epoch": 3.645655877342419, + "grad_norm": 0.7024078369140625, + "learning_rate": 0.0002, + "loss": 1.4007, + "step": 5350 + }, + { + "epoch": 3.6524701873935266, + "grad_norm": 0.6847538352012634, + "learning_rate": 0.0002, + "loss": 1.4996, + "step": 5360 + }, + { + "epoch": 3.6592844974446335, + "grad_norm": 0.71802818775177, + "learning_rate": 0.0002, + "loss": 1.4111, + "step": 5370 + }, + { + "epoch": 3.6660988074957412, + "grad_norm": 0.78530353307724, + "learning_rate": 0.0002, + "loss": 1.4224, + "step": 5380 + }, + { + "epoch": 3.672913117546848, + "grad_norm": 0.7262226939201355, + "learning_rate": 0.0002, + "loss": 1.4582, + "step": 5390 + }, + { + "epoch": 3.679727427597956, + "grad_norm": 0.7608316540718079, + "learning_rate": 0.0002, + "loss": 1.4704, + "step": 5400 + }, + { + "epoch": 3.686541737649063, + "grad_norm": 0.6994926333427429, + "learning_rate": 0.0002, + "loss": 1.3742, + "step": 5410 + }, + { + "epoch": 3.6933560477001706, + "grad_norm": 0.7888479828834534, + "learning_rate": 0.0002, + "loss": 1.4738, + "step": 5420 + }, + { + "epoch": 3.7001703577512775, + "grad_norm": 0.7053858041763306, + "learning_rate": 0.0002, + "loss": 1.4213, + "step": 5430 + }, + { + "epoch": 3.7069846678023852, + "grad_norm": 0.7063165903091431, + "learning_rate": 0.0002, + "loss": 1.4988, + "step": 5440 + }, + { + "epoch": 3.713798977853492, + "grad_norm": 0.6603744626045227, + "learning_rate": 0.0002, + "loss": 1.4386, + "step": 5450 + }, + { + "epoch": 3.7206132879046, + "grad_norm": 0.7043602466583252, + "learning_rate": 0.0002, + "loss": 1.4695, + "step": 5460 + }, + { + "epoch": 3.7274275979557068, + "grad_norm": 0.7026081681251526, + "learning_rate": 0.0002, + "loss": 1.5051, + "step": 5470 + }, + { + "epoch": 3.7342419080068145, + "grad_norm": 0.7200090289115906, + "learning_rate": 0.0002, + "loss": 1.5613, + "step": 5480 + }, + { + "epoch": 3.7410562180579214, + "grad_norm": 0.7170904278755188, + "learning_rate": 0.0002, + "loss": 1.4182, + "step": 5490 + }, + { + "epoch": 3.747870528109029, + "grad_norm": 0.7489104866981506, + "learning_rate": 0.0002, + "loss": 1.4344, + "step": 5500 + }, + { + "epoch": 3.754684838160136, + "grad_norm": 0.6540989875793457, + "learning_rate": 0.0002, + "loss": 1.4911, + "step": 5510 + }, + { + "epoch": 3.761499148211244, + "grad_norm": 0.6654048562049866, + "learning_rate": 0.0002, + "loss": 1.4955, + "step": 5520 + }, + { + "epoch": 3.7683134582623508, + "grad_norm": 0.6577395796775818, + "learning_rate": 0.0002, + "loss": 1.4487, + "step": 5530 + }, + { + "epoch": 3.7751277683134585, + "grad_norm": 0.7762192487716675, + "learning_rate": 0.0002, + "loss": 1.4283, + "step": 5540 + }, + { + "epoch": 3.7819420783645654, + "grad_norm": 0.6336314678192139, + "learning_rate": 0.0002, + "loss": 1.4727, + "step": 5550 + }, + { + "epoch": 3.7887563884156727, + "grad_norm": 0.7098057866096497, + "learning_rate": 0.0002, + "loss": 1.4588, + "step": 5560 + }, + { + "epoch": 3.79557069846678, + "grad_norm": 0.7379715442657471, + "learning_rate": 0.0002, + "loss": 1.4679, + "step": 5570 + }, + { + "epoch": 3.8023850085178874, + "grad_norm": 0.6726924777030945, + "learning_rate": 0.0002, + "loss": 1.4633, + "step": 5580 + }, + { + "epoch": 3.8091993185689947, + "grad_norm": 1.1212009191513062, + "learning_rate": 0.0002, + "loss": 1.4751, + "step": 5590 + }, + { + "epoch": 3.816013628620102, + "grad_norm": 0.6503795981407166, + "learning_rate": 0.0002, + "loss": 1.4503, + "step": 5600 + }, + { + "epoch": 3.8228279386712094, + "grad_norm": 0.7041325569152832, + "learning_rate": 0.0002, + "loss": 1.4754, + "step": 5610 + }, + { + "epoch": 3.8296422487223167, + "grad_norm": 0.7962933778762817, + "learning_rate": 0.0002, + "loss": 1.4199, + "step": 5620 + }, + { + "epoch": 3.836456558773424, + "grad_norm": 0.6613591909408569, + "learning_rate": 0.0002, + "loss": 1.4672, + "step": 5630 + }, + { + "epoch": 3.8432708688245314, + "grad_norm": 0.7293516397476196, + "learning_rate": 0.0002, + "loss": 1.5688, + "step": 5640 + }, + { + "epoch": 3.8500851788756387, + "grad_norm": 0.7388607859611511, + "learning_rate": 0.0002, + "loss": 1.4149, + "step": 5650 + }, + { + "epoch": 3.856899488926746, + "grad_norm": 0.6440677642822266, + "learning_rate": 0.0002, + "loss": 1.4743, + "step": 5660 + }, + { + "epoch": 3.8637137989778534, + "grad_norm": 0.7729013562202454, + "learning_rate": 0.0002, + "loss": 1.5082, + "step": 5670 + }, + { + "epoch": 3.8705281090289607, + "grad_norm": 0.6696794033050537, + "learning_rate": 0.0002, + "loss": 1.4608, + "step": 5680 + }, + { + "epoch": 3.877342419080068, + "grad_norm": 0.7151781320571899, + "learning_rate": 0.0002, + "loss": 1.472, + "step": 5690 + }, + { + "epoch": 3.8841567291311754, + "grad_norm": 0.6736966371536255, + "learning_rate": 0.0002, + "loss": 1.4923, + "step": 5700 + }, + { + "epoch": 3.8909710391822827, + "grad_norm": 0.7444243431091309, + "learning_rate": 0.0002, + "loss": 1.4453, + "step": 5710 + }, + { + "epoch": 3.89778534923339, + "grad_norm": 0.6701464653015137, + "learning_rate": 0.0002, + "loss": 1.4562, + "step": 5720 + }, + { + "epoch": 3.9045996592844974, + "grad_norm": 0.7231952548027039, + "learning_rate": 0.0002, + "loss": 1.4478, + "step": 5730 + }, + { + "epoch": 3.9114139693356047, + "grad_norm": 0.831954300403595, + "learning_rate": 0.0002, + "loss": 1.4539, + "step": 5740 + }, + { + "epoch": 3.918228279386712, + "grad_norm": 0.7697733640670776, + "learning_rate": 0.0002, + "loss": 1.5122, + "step": 5750 + }, + { + "epoch": 3.9250425894378194, + "grad_norm": 0.6964395046234131, + "learning_rate": 0.0002, + "loss": 1.4552, + "step": 5760 + }, + { + "epoch": 3.9318568994889267, + "grad_norm": 0.6942925453186035, + "learning_rate": 0.0002, + "loss": 1.4688, + "step": 5770 + }, + { + "epoch": 3.938671209540034, + "grad_norm": 0.6491202712059021, + "learning_rate": 0.0002, + "loss": 1.4668, + "step": 5780 + }, + { + "epoch": 3.9454855195911414, + "grad_norm": 0.7004382610321045, + "learning_rate": 0.0002, + "loss": 1.4404, + "step": 5790 + }, + { + "epoch": 3.9522998296422487, + "grad_norm": 0.7337747812271118, + "learning_rate": 0.0002, + "loss": 1.5022, + "step": 5800 + }, + { + "epoch": 3.959114139693356, + "grad_norm": 0.6923640966415405, + "learning_rate": 0.0002, + "loss": 1.5314, + "step": 5810 + }, + { + "epoch": 3.9659284497444633, + "grad_norm": 0.6815266609191895, + "learning_rate": 0.0002, + "loss": 1.4811, + "step": 5820 + }, + { + "epoch": 3.9727427597955707, + "grad_norm": 0.6755654811859131, + "learning_rate": 0.0002, + "loss": 1.437, + "step": 5830 + }, + { + "epoch": 3.979557069846678, + "grad_norm": 0.6912487149238586, + "learning_rate": 0.0002, + "loss": 1.4277, + "step": 5840 + }, + { + "epoch": 3.9863713798977853, + "grad_norm": 0.6948044896125793, + "learning_rate": 0.0002, + "loss": 1.4654, + "step": 5850 + }, + { + "epoch": 3.9931856899488927, + "grad_norm": 0.6735455989837646, + "learning_rate": 0.0002, + "loss": 1.4779, + "step": 5860 + }, + { + "epoch": 4.0, + "grad_norm": 0.7005048990249634, + "learning_rate": 0.0002, + "loss": 1.5102, + "step": 5870 + }, + { + "epoch": 4.0, + "eval_loss": 1.923058032989502, + "eval_runtime": 58.9903, + "eval_samples_per_second": 8.595, + "eval_steps_per_second": 1.085, + "step": 5870 + }, + { + "epoch": 4.006814310051107, + "grad_norm": 0.809018075466156, + "learning_rate": 0.0002, + "loss": 1.2417, + "step": 5880 + }, + { + "epoch": 4.013628620102215, + "grad_norm": 0.9499403238296509, + "learning_rate": 0.0002, + "loss": 1.2874, + "step": 5890 + }, + { + "epoch": 4.0204429301533215, + "grad_norm": 0.7944574356079102, + "learning_rate": 0.0002, + "loss": 1.2245, + "step": 5900 + }, + { + "epoch": 4.027257240204429, + "grad_norm": 0.9501046538352966, + "learning_rate": 0.0002, + "loss": 1.2751, + "step": 5910 + }, + { + "epoch": 4.034071550255536, + "grad_norm": 0.8247923254966736, + "learning_rate": 0.0002, + "loss": 1.2706, + "step": 5920 + }, + { + "epoch": 4.040885860306644, + "grad_norm": 0.9358038902282715, + "learning_rate": 0.0002, + "loss": 1.2762, + "step": 5930 + }, + { + "epoch": 4.047700170357751, + "grad_norm": 1.0102452039718628, + "learning_rate": 0.0002, + "loss": 1.2953, + "step": 5940 + }, + { + "epoch": 4.054514480408859, + "grad_norm": 1.0248252153396606, + "learning_rate": 0.0002, + "loss": 1.216, + "step": 5950 + }, + { + "epoch": 4.0613287904599655, + "grad_norm": 1.0438553094863892, + "learning_rate": 0.0002, + "loss": 1.2115, + "step": 5960 + }, + { + "epoch": 4.068143100511073, + "grad_norm": 0.7964957356452942, + "learning_rate": 0.0002, + "loss": 1.2516, + "step": 5970 + }, + { + "epoch": 4.07495741056218, + "grad_norm": 0.9757015109062195, + "learning_rate": 0.0002, + "loss": 1.1555, + "step": 5980 + }, + { + "epoch": 4.081771720613288, + "grad_norm": 0.9157161116600037, + "learning_rate": 0.0002, + "loss": 1.2243, + "step": 5990 + }, + { + "epoch": 4.088586030664395, + "grad_norm": 0.9372851848602295, + "learning_rate": 0.0002, + "loss": 1.2481, + "step": 6000 + }, + { + "epoch": 4.095400340715503, + "grad_norm": 1.240779995918274, + "learning_rate": 0.0002, + "loss": 1.2091, + "step": 6010 + }, + { + "epoch": 4.1022146507666095, + "grad_norm": 0.8394840359687805, + "learning_rate": 0.0002, + "loss": 1.1727, + "step": 6020 + }, + { + "epoch": 4.109028960817717, + "grad_norm": 1.1081455945968628, + "learning_rate": 0.0002, + "loss": 1.2926, + "step": 6030 + }, + { + "epoch": 4.115843270868824, + "grad_norm": 0.9227745532989502, + "learning_rate": 0.0002, + "loss": 1.2417, + "step": 6040 + }, + { + "epoch": 4.122657580919932, + "grad_norm": 0.8487664461135864, + "learning_rate": 0.0002, + "loss": 1.1994, + "step": 6050 + }, + { + "epoch": 4.129471890971039, + "grad_norm": 0.9643339514732361, + "learning_rate": 0.0002, + "loss": 1.2378, + "step": 6060 + }, + { + "epoch": 4.136286201022147, + "grad_norm": 1.0296099185943604, + "learning_rate": 0.0002, + "loss": 1.2254, + "step": 6070 + }, + { + "epoch": 4.1431005110732535, + "grad_norm": 0.9534215927124023, + "learning_rate": 0.0002, + "loss": 1.2419, + "step": 6080 + }, + { + "epoch": 4.149914821124361, + "grad_norm": 0.9647086262702942, + "learning_rate": 0.0002, + "loss": 1.1849, + "step": 6090 + }, + { + "epoch": 4.156729131175468, + "grad_norm": 1.084836721420288, + "learning_rate": 0.0002, + "loss": 1.2713, + "step": 6100 + }, + { + "epoch": 4.163543441226576, + "grad_norm": 0.9315235614776611, + "learning_rate": 0.0002, + "loss": 1.1788, + "step": 6110 + }, + { + "epoch": 4.170357751277683, + "grad_norm": 0.9541679620742798, + "learning_rate": 0.0002, + "loss": 1.17, + "step": 6120 + }, + { + "epoch": 4.177172061328791, + "grad_norm": 0.9792100191116333, + "learning_rate": 0.0002, + "loss": 1.1407, + "step": 6130 + }, + { + "epoch": 4.1839863713798975, + "grad_norm": 1.065783143043518, + "learning_rate": 0.0002, + "loss": 1.2069, + "step": 6140 + }, + { + "epoch": 4.190800681431005, + "grad_norm": 1.036161184310913, + "learning_rate": 0.0002, + "loss": 1.2512, + "step": 6150 + }, + { + "epoch": 4.197614991482112, + "grad_norm": 0.8979679942131042, + "learning_rate": 0.0002, + "loss": 1.2371, + "step": 6160 + }, + { + "epoch": 4.20442930153322, + "grad_norm": 0.7584333419799805, + "learning_rate": 0.0002, + "loss": 1.2212, + "step": 6170 + }, + { + "epoch": 4.211243611584327, + "grad_norm": 1.1970131397247314, + "learning_rate": 0.0002, + "loss": 1.2128, + "step": 6180 + }, + { + "epoch": 4.218057921635435, + "grad_norm": 2.6447298526763916, + "learning_rate": 0.0002, + "loss": 1.1982, + "step": 6190 + }, + { + "epoch": 4.2248722316865415, + "grad_norm": 0.9357487559318542, + "learning_rate": 0.0002, + "loss": 1.2465, + "step": 6200 + }, + { + "epoch": 4.231686541737649, + "grad_norm": 0.9141183495521545, + "learning_rate": 0.0002, + "loss": 1.2963, + "step": 6210 + }, + { + "epoch": 4.238500851788756, + "grad_norm": 1.0606296062469482, + "learning_rate": 0.0002, + "loss": 1.1959, + "step": 6220 + }, + { + "epoch": 4.245315161839864, + "grad_norm": 0.9999088048934937, + "learning_rate": 0.0002, + "loss": 1.2629, + "step": 6230 + }, + { + "epoch": 4.252129471890971, + "grad_norm": 0.9469764232635498, + "learning_rate": 0.0002, + "loss": 1.1471, + "step": 6240 + }, + { + "epoch": 4.258943781942079, + "grad_norm": 1.1508198976516724, + "learning_rate": 0.0002, + "loss": 1.223, + "step": 6250 + }, + { + "epoch": 4.2657580919931855, + "grad_norm": 1.2576130628585815, + "learning_rate": 0.0002, + "loss": 1.2677, + "step": 6260 + }, + { + "epoch": 4.272572402044293, + "grad_norm": 0.9435968399047852, + "learning_rate": 0.0002, + "loss": 1.2216, + "step": 6270 + }, + { + "epoch": 4.2793867120954, + "grad_norm": 0.9290348887443542, + "learning_rate": 0.0002, + "loss": 1.2788, + "step": 6280 + }, + { + "epoch": 4.286201022146508, + "grad_norm": 0.9973701238632202, + "learning_rate": 0.0002, + "loss": 1.2631, + "step": 6290 + }, + { + "epoch": 4.293015332197615, + "grad_norm": 1.012855887413025, + "learning_rate": 0.0002, + "loss": 1.2276, + "step": 6300 + }, + { + "epoch": 4.2998296422487225, + "grad_norm": 0.8371705412864685, + "learning_rate": 0.0002, + "loss": 1.2115, + "step": 6310 + }, + { + "epoch": 4.306643952299829, + "grad_norm": 1.0867925882339478, + "learning_rate": 0.0002, + "loss": 1.2423, + "step": 6320 + }, + { + "epoch": 4.313458262350937, + "grad_norm": 0.9763767123222351, + "learning_rate": 0.0002, + "loss": 1.2262, + "step": 6330 + }, + { + "epoch": 4.320272572402044, + "grad_norm": 1.1844252347946167, + "learning_rate": 0.0002, + "loss": 1.2557, + "step": 6340 + }, + { + "epoch": 4.327086882453152, + "grad_norm": 0.8292830586433411, + "learning_rate": 0.0002, + "loss": 1.2635, + "step": 6350 + }, + { + "epoch": 4.333901192504259, + "grad_norm": 0.9351436495780945, + "learning_rate": 0.0002, + "loss": 1.262, + "step": 6360 + }, + { + "epoch": 4.3407155025553665, + "grad_norm": 1.0425835847854614, + "learning_rate": 0.0002, + "loss": 1.2678, + "step": 6370 + }, + { + "epoch": 4.347529812606473, + "grad_norm": 0.8894261121749878, + "learning_rate": 0.0002, + "loss": 1.2476, + "step": 6380 + }, + { + "epoch": 4.354344122657581, + "grad_norm": 0.9663366079330444, + "learning_rate": 0.0002, + "loss": 1.2965, + "step": 6390 + }, + { + "epoch": 4.361158432708688, + "grad_norm": 0.8915578126907349, + "learning_rate": 0.0002, + "loss": 1.2529, + "step": 6400 + }, + { + "epoch": 4.367972742759796, + "grad_norm": 1.0393000841140747, + "learning_rate": 0.0002, + "loss": 1.2573, + "step": 6410 + }, + { + "epoch": 4.374787052810903, + "grad_norm": 0.917398989200592, + "learning_rate": 0.0002, + "loss": 1.2254, + "step": 6420 + }, + { + "epoch": 4.3816013628620105, + "grad_norm": 1.0496646165847778, + "learning_rate": 0.0002, + "loss": 1.3115, + "step": 6430 + }, + { + "epoch": 4.388415672913117, + "grad_norm": 0.9349859356880188, + "learning_rate": 0.0002, + "loss": 1.2607, + "step": 6440 + }, + { + "epoch": 4.395229982964225, + "grad_norm": 1.0981004238128662, + "learning_rate": 0.0002, + "loss": 1.3414, + "step": 6450 + }, + { + "epoch": 4.402044293015332, + "grad_norm": 0.9794871807098389, + "learning_rate": 0.0002, + "loss": 1.2391, + "step": 6460 + }, + { + "epoch": 4.40885860306644, + "grad_norm": 0.9321421384811401, + "learning_rate": 0.0002, + "loss": 1.208, + "step": 6470 + }, + { + "epoch": 4.415672913117547, + "grad_norm": 0.9158342480659485, + "learning_rate": 0.0002, + "loss": 1.3398, + "step": 6480 + }, + { + "epoch": 4.4224872231686545, + "grad_norm": 0.9462087750434875, + "learning_rate": 0.0002, + "loss": 1.1832, + "step": 6490 + }, + { + "epoch": 4.429301533219761, + "grad_norm": 0.9740175604820251, + "learning_rate": 0.0002, + "loss": 1.2366, + "step": 6500 + }, + { + "epoch": 4.436115843270869, + "grad_norm": 0.8477463126182556, + "learning_rate": 0.0002, + "loss": 1.3074, + "step": 6510 + }, + { + "epoch": 4.442930153321976, + "grad_norm": 1.0296647548675537, + "learning_rate": 0.0002, + "loss": 1.2719, + "step": 6520 + }, + { + "epoch": 4.449744463373084, + "grad_norm": 0.9437751173973083, + "learning_rate": 0.0002, + "loss": 1.2647, + "step": 6530 + }, + { + "epoch": 4.456558773424191, + "grad_norm": 1.011192798614502, + "learning_rate": 0.0002, + "loss": 1.2043, + "step": 6540 + }, + { + "epoch": 4.4633730834752985, + "grad_norm": 0.8836222290992737, + "learning_rate": 0.0002, + "loss": 1.3673, + "step": 6550 + }, + { + "epoch": 4.470187393526405, + "grad_norm": 1.2799941301345825, + "learning_rate": 0.0002, + "loss": 1.3028, + "step": 6560 + }, + { + "epoch": 4.477001703577513, + "grad_norm": 0.925910472869873, + "learning_rate": 0.0002, + "loss": 1.2789, + "step": 6570 + }, + { + "epoch": 4.48381601362862, + "grad_norm": 0.957401692867279, + "learning_rate": 0.0002, + "loss": 1.2723, + "step": 6580 + }, + { + "epoch": 4.490630323679728, + "grad_norm": 1.0789544582366943, + "learning_rate": 0.0002, + "loss": 1.242, + "step": 6590 + }, + { + "epoch": 4.497444633730835, + "grad_norm": 0.8874586820602417, + "learning_rate": 0.0002, + "loss": 1.2553, + "step": 6600 + }, + { + "epoch": 4.504258943781942, + "grad_norm": 0.9394784569740295, + "learning_rate": 0.0002, + "loss": 1.2779, + "step": 6610 + }, + { + "epoch": 4.511073253833049, + "grad_norm": 1.029640793800354, + "learning_rate": 0.0002, + "loss": 1.2744, + "step": 6620 + }, + { + "epoch": 4.517887563884157, + "grad_norm": 0.9510841965675354, + "learning_rate": 0.0002, + "loss": 1.2634, + "step": 6630 + }, + { + "epoch": 4.524701873935264, + "grad_norm": 0.9992963671684265, + "learning_rate": 0.0002, + "loss": 1.2562, + "step": 6640 + }, + { + "epoch": 4.531516183986371, + "grad_norm": 0.9312878847122192, + "learning_rate": 0.0002, + "loss": 1.2942, + "step": 6650 + }, + { + "epoch": 4.538330494037479, + "grad_norm": 0.9406482577323914, + "learning_rate": 0.0002, + "loss": 1.2572, + "step": 6660 + }, + { + "epoch": 4.5451448040885865, + "grad_norm": 1.1058286428451538, + "learning_rate": 0.0002, + "loss": 1.2283, + "step": 6670 + }, + { + "epoch": 4.551959114139693, + "grad_norm": 0.9389635920524597, + "learning_rate": 0.0002, + "loss": 1.2391, + "step": 6680 + }, + { + "epoch": 4.5587734241908, + "grad_norm": 1.0356028079986572, + "learning_rate": 0.0002, + "loss": 1.2696, + "step": 6690 + }, + { + "epoch": 4.565587734241908, + "grad_norm": 0.9370909929275513, + "learning_rate": 0.0002, + "loss": 1.2935, + "step": 6700 + }, + { + "epoch": 4.572402044293016, + "grad_norm": 0.9917567372322083, + "learning_rate": 0.0002, + "loss": 1.2914, + "step": 6710 + }, + { + "epoch": 4.579216354344123, + "grad_norm": 0.9065384864807129, + "learning_rate": 0.0002, + "loss": 1.3318, + "step": 6720 + }, + { + "epoch": 4.5860306643952296, + "grad_norm": 1.3347833156585693, + "learning_rate": 0.0002, + "loss": 1.2909, + "step": 6730 + }, + { + "epoch": 4.592844974446337, + "grad_norm": 0.910632312297821, + "learning_rate": 0.0002, + "loss": 1.3322, + "step": 6740 + }, + { + "epoch": 4.599659284497445, + "grad_norm": 0.8874805569648743, + "learning_rate": 0.0002, + "loss": 1.2584, + "step": 6750 + }, + { + "epoch": 4.606473594548552, + "grad_norm": 0.9355664253234863, + "learning_rate": 0.0002, + "loss": 1.3173, + "step": 6760 + }, + { + "epoch": 4.613287904599659, + "grad_norm": 0.9360204339027405, + "learning_rate": 0.0002, + "loss": 1.3515, + "step": 6770 + }, + { + "epoch": 4.620102214650767, + "grad_norm": 0.9931750893592834, + "learning_rate": 0.0002, + "loss": 1.2326, + "step": 6780 + }, + { + "epoch": 4.626916524701874, + "grad_norm": 0.9195131063461304, + "learning_rate": 0.0002, + "loss": 1.2677, + "step": 6790 + }, + { + "epoch": 4.633730834752981, + "grad_norm": 0.9448373913764954, + "learning_rate": 0.0002, + "loss": 1.3417, + "step": 6800 + }, + { + "epoch": 4.640545144804088, + "grad_norm": 1.162890911102295, + "learning_rate": 0.0002, + "loss": 1.2658, + "step": 6810 + }, + { + "epoch": 4.647359454855196, + "grad_norm": 0.9739466905593872, + "learning_rate": 0.0002, + "loss": 1.2841, + "step": 6820 + }, + { + "epoch": 4.654173764906303, + "grad_norm": 0.9462909698486328, + "learning_rate": 0.0002, + "loss": 1.3068, + "step": 6830 + }, + { + "epoch": 4.660988074957411, + "grad_norm": 1.042639970779419, + "learning_rate": 0.0002, + "loss": 1.284, + "step": 6840 + }, + { + "epoch": 4.6678023850085175, + "grad_norm": 0.8910539150238037, + "learning_rate": 0.0002, + "loss": 1.3337, + "step": 6850 + }, + { + "epoch": 4.674616695059625, + "grad_norm": 1.0806447267532349, + "learning_rate": 0.0002, + "loss": 1.3025, + "step": 6860 + }, + { + "epoch": 4.681431005110732, + "grad_norm": 1.0054864883422852, + "learning_rate": 0.0002, + "loss": 1.2258, + "step": 6870 + }, + { + "epoch": 4.68824531516184, + "grad_norm": 0.7774158120155334, + "learning_rate": 0.0002, + "loss": 1.3261, + "step": 6880 + }, + { + "epoch": 4.695059625212947, + "grad_norm": 0.9729512333869934, + "learning_rate": 0.0002, + "loss": 1.2545, + "step": 6890 + }, + { + "epoch": 4.701873935264055, + "grad_norm": 1.2025411128997803, + "learning_rate": 0.0002, + "loss": 1.3251, + "step": 6900 + }, + { + "epoch": 4.7086882453151615, + "grad_norm": 1.1654069423675537, + "learning_rate": 0.0002, + "loss": 1.3418, + "step": 6910 + }, + { + "epoch": 4.715502555366269, + "grad_norm": 1.1501442193984985, + "learning_rate": 0.0002, + "loss": 1.3091, + "step": 6920 + }, + { + "epoch": 4.722316865417376, + "grad_norm": 1.1083979606628418, + "learning_rate": 0.0002, + "loss": 1.2627, + "step": 6930 + }, + { + "epoch": 4.729131175468484, + "grad_norm": 0.9431378841400146, + "learning_rate": 0.0002, + "loss": 1.2836, + "step": 6940 + }, + { + "epoch": 4.735945485519591, + "grad_norm": 0.9722502827644348, + "learning_rate": 0.0002, + "loss": 1.3381, + "step": 6950 + }, + { + "epoch": 4.742759795570699, + "grad_norm": 0.9094559550285339, + "learning_rate": 0.0002, + "loss": 1.3228, + "step": 6960 + }, + { + "epoch": 4.7495741056218055, + "grad_norm": 0.9918473958969116, + "learning_rate": 0.0002, + "loss": 1.3474, + "step": 6970 + }, + { + "epoch": 4.756388415672913, + "grad_norm": 0.9999690651893616, + "learning_rate": 0.0002, + "loss": 1.3352, + "step": 6980 + }, + { + "epoch": 4.76320272572402, + "grad_norm": 1.0453810691833496, + "learning_rate": 0.0002, + "loss": 1.3579, + "step": 6990 + }, + { + "epoch": 4.770017035775128, + "grad_norm": 1.0167806148529053, + "learning_rate": 0.0002, + "loss": 1.294, + "step": 7000 + }, + { + "epoch": 4.776831345826235, + "grad_norm": 0.8133894801139832, + "learning_rate": 0.0002, + "loss": 1.3247, + "step": 7010 + }, + { + "epoch": 4.783645655877343, + "grad_norm": 0.8000897765159607, + "learning_rate": 0.0002, + "loss": 1.2577, + "step": 7020 + }, + { + "epoch": 4.7904599659284495, + "grad_norm": 0.992080569267273, + "learning_rate": 0.0002, + "loss": 1.2802, + "step": 7030 + }, + { + "epoch": 4.797274275979557, + "grad_norm": 0.9824522137641907, + "learning_rate": 0.0002, + "loss": 1.3269, + "step": 7040 + }, + { + "epoch": 4.804088586030664, + "grad_norm": 0.9808870553970337, + "learning_rate": 0.0002, + "loss": 1.279, + "step": 7050 + }, + { + "epoch": 4.810902896081772, + "grad_norm": 0.9679701924324036, + "learning_rate": 0.0002, + "loss": 1.3342, + "step": 7060 + }, + { + "epoch": 4.817717206132879, + "grad_norm": 0.9895215034484863, + "learning_rate": 0.0002, + "loss": 1.2711, + "step": 7070 + }, + { + "epoch": 4.824531516183987, + "grad_norm": 1.052246332168579, + "learning_rate": 0.0002, + "loss": 1.3008, + "step": 7080 + }, + { + "epoch": 4.8313458262350935, + "grad_norm": 0.9243564605712891, + "learning_rate": 0.0002, + "loss": 1.2874, + "step": 7090 + }, + { + "epoch": 4.838160136286201, + "grad_norm": 0.9545369744300842, + "learning_rate": 0.0002, + "loss": 1.2835, + "step": 7100 + }, + { + "epoch": 4.844974446337308, + "grad_norm": 0.9655884504318237, + "learning_rate": 0.0002, + "loss": 1.31, + "step": 7110 + }, + { + "epoch": 4.851788756388416, + "grad_norm": 0.9708049893379211, + "learning_rate": 0.0002, + "loss": 1.2862, + "step": 7120 + }, + { + "epoch": 4.858603066439523, + "grad_norm": 1.0064880847930908, + "learning_rate": 0.0002, + "loss": 1.3425, + "step": 7130 + }, + { + "epoch": 4.8654173764906306, + "grad_norm": 0.939943790435791, + "learning_rate": 0.0002, + "loss": 1.2899, + "step": 7140 + }, + { + "epoch": 4.872231686541737, + "grad_norm": 1.0750784873962402, + "learning_rate": 0.0002, + "loss": 1.2887, + "step": 7150 + }, + { + "epoch": 4.879045996592845, + "grad_norm": 0.9708989262580872, + "learning_rate": 0.0002, + "loss": 1.3367, + "step": 7160 + }, + { + "epoch": 4.885860306643952, + "grad_norm": 1.0228253602981567, + "learning_rate": 0.0002, + "loss": 1.2797, + "step": 7170 + }, + { + "epoch": 4.89267461669506, + "grad_norm": 0.8963132500648499, + "learning_rate": 0.0002, + "loss": 1.2695, + "step": 7180 + }, + { + "epoch": 4.899488926746167, + "grad_norm": 0.9198015928268433, + "learning_rate": 0.0002, + "loss": 1.3473, + "step": 7190 + }, + { + "epoch": 4.9063032367972745, + "grad_norm": 1.099906086921692, + "learning_rate": 0.0002, + "loss": 1.2541, + "step": 7200 + }, + { + "epoch": 4.913117546848381, + "grad_norm": 1.0624815225601196, + "learning_rate": 0.0002, + "loss": 1.3188, + "step": 7210 + }, + { + "epoch": 4.919931856899489, + "grad_norm": 0.9688444137573242, + "learning_rate": 0.0002, + "loss": 1.3026, + "step": 7220 + }, + { + "epoch": 4.926746166950596, + "grad_norm": 0.867011547088623, + "learning_rate": 0.0002, + "loss": 1.3379, + "step": 7230 + }, + { + "epoch": 4.933560477001704, + "grad_norm": 0.9600282311439514, + "learning_rate": 0.0002, + "loss": 1.289, + "step": 7240 + }, + { + "epoch": 4.940374787052811, + "grad_norm": 0.8979372978210449, + "learning_rate": 0.0002, + "loss": 1.2751, + "step": 7250 + }, + { + "epoch": 4.9471890971039185, + "grad_norm": 0.951474130153656, + "learning_rate": 0.0002, + "loss": 1.3426, + "step": 7260 + }, + { + "epoch": 4.954003407155025, + "grad_norm": 0.824851393699646, + "learning_rate": 0.0002, + "loss": 1.2726, + "step": 7270 + }, + { + "epoch": 4.960817717206133, + "grad_norm": 1.2926591634750366, + "learning_rate": 0.0002, + "loss": 1.2679, + "step": 7280 + }, + { + "epoch": 4.96763202725724, + "grad_norm": 1.1057835817337036, + "learning_rate": 0.0002, + "loss": 1.2974, + "step": 7290 + }, + { + "epoch": 4.974446337308348, + "grad_norm": 0.9814816117286682, + "learning_rate": 0.0002, + "loss": 1.2275, + "step": 7300 + }, + { + "epoch": 4.981260647359455, + "grad_norm": 1.0251333713531494, + "learning_rate": 0.0002, + "loss": 1.3001, + "step": 7310 + }, + { + "epoch": 4.9880749574105625, + "grad_norm": 0.9748668074607849, + "learning_rate": 0.0002, + "loss": 1.3113, + "step": 7320 + }, + { + "epoch": 4.994889267461669, + "grad_norm": 0.8552228808403015, + "learning_rate": 0.0002, + "loss": 1.3595, + "step": 7330 + }, + { + "epoch": 4.999659284497445, + "eval_loss": 2.03971004486084, + "eval_runtime": 67.4144, + "eval_samples_per_second": 7.521, + "eval_steps_per_second": 0.949, + "step": 7337 + } + ], + "logging_steps": 10, + "max_steps": 11736, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.76881483546624e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-7337/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-7337/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2f734531f14829705714a5a2703d90c81f36eeb0 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-7337/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2af5d486d0b370637d5a3999ab7668f94f5d53b6497f018967564aef5e8ad133 +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-8805/README.md b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-8805/README.md new file mode 100644 index 0000000000000000000000000000000000000000..503a34a03e25483aa99213835fd87bfc8289a3fe --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-8805/README.md @@ -0,0 +1,202 @@ +--- +base_model: google/gemma-2-9b-it +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.1 \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-8805/adapter_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-8805/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e4441e0c1b29f69c16b26c142944e440b8076 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-8805/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2-9b-it", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 64, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-8805/adapter_model.safetensors b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-8805/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..55a91a5afcd9f4b98bd96c23deac9c3cb27192d3 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-8805/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2191d62f4123382572a1af669b6ff7a70733200fe9e44438b6613f6a41c2f3e5 +size 143153376 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-8805/optimizer.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-8805/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..1efe99885f4f0cfd16019be73893f44203bfff8b --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-8805/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b47aae77da53d79dd136c8aee169e91d077c27fae9709181c1ee860a1c5c4b96 +size 72886650 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-8805/rng_state.pth b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-8805/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..5050247cfb6c84f7dd505638fb3bfb4e186f985a --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-8805/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c520b6072bd6c9974d8331ad51443edf5104ce4c11e907237311db340abdb05 +size 14244 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-8805/scheduler.pt b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-8805/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d90b9384c80b0ce7b6bde80da4cb04e18fbae951 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-8805/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:159cc1f6584b2180db71ca843a348de7e356f81898a81f5aa7c26d16e8bba099 +size 1064 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-8805/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-8805/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-8805/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-8805/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-8805/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-8805/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-8805/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-8805/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-8805/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-8805/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-8805/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-8805/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-8805/trainer_state.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-8805/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ea52e833e518f0e57c0a013025b048d41119f4e1 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-8805/trainer_state.json @@ -0,0 +1,6241 @@ +{ + "best_metric": 1.8034634590148926, + "best_model_checkpoint": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467", + "epoch": 6.0, + "eval_steps": 10, + "global_step": 8805, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0068143100511073255, + "grad_norm": 0.635574460029602, + "learning_rate": 0.0002, + "loss": 3.0988, + "step": 10 + }, + { + "epoch": 0.013628620102214651, + "grad_norm": 1.0401769876480103, + "learning_rate": 0.0002, + "loss": 2.5594, + "step": 20 + }, + { + "epoch": 0.020442930153321975, + "grad_norm": 0.4514131247997284, + "learning_rate": 0.0002, + "loss": 2.3587, + "step": 30 + }, + { + "epoch": 0.027257240204429302, + "grad_norm": 0.6303355693817139, + "learning_rate": 0.0002, + "loss": 2.121, + "step": 40 + }, + { + "epoch": 0.034071550255536626, + "grad_norm": 0.4648270606994629, + "learning_rate": 0.0002, + "loss": 1.9833, + "step": 50 + }, + { + "epoch": 0.04088586030664395, + "grad_norm": 0.42953479290008545, + "learning_rate": 0.0002, + "loss": 1.9384, + "step": 60 + }, + { + "epoch": 0.04770017035775128, + "grad_norm": 0.433614581823349, + "learning_rate": 0.0002, + "loss": 1.9202, + "step": 70 + }, + { + "epoch": 0.054514480408858604, + "grad_norm": 0.45318254828453064, + "learning_rate": 0.0002, + "loss": 1.911, + "step": 80 + }, + { + "epoch": 0.06132879045996593, + "grad_norm": 0.4023568630218506, + "learning_rate": 0.0002, + "loss": 1.8895, + "step": 90 + }, + { + "epoch": 0.06814310051107325, + "grad_norm": 0.43260207772254944, + "learning_rate": 0.0002, + "loss": 1.9257, + "step": 100 + }, + { + "epoch": 0.07495741056218058, + "grad_norm": 0.43389809131622314, + "learning_rate": 0.0002, + "loss": 1.9178, + "step": 110 + }, + { + "epoch": 0.0817717206132879, + "grad_norm": 0.39307987689971924, + "learning_rate": 0.0002, + "loss": 1.9071, + "step": 120 + }, + { + "epoch": 0.08858603066439523, + "grad_norm": 0.7703037261962891, + "learning_rate": 0.0002, + "loss": 1.9482, + "step": 130 + }, + { + "epoch": 0.09540034071550256, + "grad_norm": 0.38547563552856445, + "learning_rate": 0.0002, + "loss": 1.8394, + "step": 140 + }, + { + "epoch": 0.10221465076660988, + "grad_norm": 0.37948688864707947, + "learning_rate": 0.0002, + "loss": 1.9037, + "step": 150 + }, + { + "epoch": 0.10902896081771721, + "grad_norm": 0.33022379875183105, + "learning_rate": 0.0002, + "loss": 1.8664, + "step": 160 + }, + { + "epoch": 0.11584327086882454, + "grad_norm": 0.33703792095184326, + "learning_rate": 0.0002, + "loss": 1.8659, + "step": 170 + }, + { + "epoch": 0.12265758091993186, + "grad_norm": 0.3259912431240082, + "learning_rate": 0.0002, + "loss": 1.8389, + "step": 180 + }, + { + "epoch": 0.12947189097103917, + "grad_norm": 0.3593858778476715, + "learning_rate": 0.0002, + "loss": 1.8424, + "step": 190 + }, + { + "epoch": 0.1362862010221465, + "grad_norm": 0.3401614725589752, + "learning_rate": 0.0002, + "loss": 1.8457, + "step": 200 + }, + { + "epoch": 0.14310051107325383, + "grad_norm": 0.3892078697681427, + "learning_rate": 0.0002, + "loss": 1.7721, + "step": 210 + }, + { + "epoch": 0.14991482112436116, + "grad_norm": 0.315374493598938, + "learning_rate": 0.0002, + "loss": 1.8351, + "step": 220 + }, + { + "epoch": 0.1567291311754685, + "grad_norm": 0.3679497539997101, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 230 + }, + { + "epoch": 0.1635434412265758, + "grad_norm": 0.336730033159256, + "learning_rate": 0.0002, + "loss": 1.816, + "step": 240 + }, + { + "epoch": 0.17035775127768313, + "grad_norm": 0.36472755670547485, + "learning_rate": 0.0002, + "loss": 1.7849, + "step": 250 + }, + { + "epoch": 0.17717206132879046, + "grad_norm": 0.34864023327827454, + "learning_rate": 0.0002, + "loss": 1.7766, + "step": 260 + }, + { + "epoch": 0.1839863713798978, + "grad_norm": 0.3350819945335388, + "learning_rate": 0.0002, + "loss": 1.896, + "step": 270 + }, + { + "epoch": 0.19080068143100512, + "grad_norm": 0.3844246566295624, + "learning_rate": 0.0002, + "loss": 1.8742, + "step": 280 + }, + { + "epoch": 0.19761499148211242, + "grad_norm": 0.38413065671920776, + "learning_rate": 0.0002, + "loss": 1.8531, + "step": 290 + }, + { + "epoch": 0.20442930153321975, + "grad_norm": 0.4080047607421875, + "learning_rate": 0.0002, + "loss": 1.8415, + "step": 300 + }, + { + "epoch": 0.21124361158432708, + "grad_norm": 0.3546800911426544, + "learning_rate": 0.0002, + "loss": 1.9009, + "step": 310 + }, + { + "epoch": 0.21805792163543442, + "grad_norm": 0.38577890396118164, + "learning_rate": 0.0002, + "loss": 1.8092, + "step": 320 + }, + { + "epoch": 0.22487223168654175, + "grad_norm": 0.38979098200798035, + "learning_rate": 0.0002, + "loss": 1.7113, + "step": 330 + }, + { + "epoch": 0.23168654173764908, + "grad_norm": 0.35105520486831665, + "learning_rate": 0.0002, + "loss": 1.8162, + "step": 340 + }, + { + "epoch": 0.23850085178875638, + "grad_norm": 0.31671223044395447, + "learning_rate": 0.0002, + "loss": 1.8408, + "step": 350 + }, + { + "epoch": 0.2453151618398637, + "grad_norm": 0.33034196496009827, + "learning_rate": 0.0002, + "loss": 1.8014, + "step": 360 + }, + { + "epoch": 0.252129471890971, + "grad_norm": 0.2990533709526062, + "learning_rate": 0.0002, + "loss": 1.7132, + "step": 370 + }, + { + "epoch": 0.25894378194207834, + "grad_norm": 0.362208753824234, + "learning_rate": 0.0002, + "loss": 1.8612, + "step": 380 + }, + { + "epoch": 0.2657580919931857, + "grad_norm": 0.3269096612930298, + "learning_rate": 0.0002, + "loss": 1.8001, + "step": 390 + }, + { + "epoch": 0.272572402044293, + "grad_norm": 0.30555954575538635, + "learning_rate": 0.0002, + "loss": 1.786, + "step": 400 + }, + { + "epoch": 0.27938671209540034, + "grad_norm": 0.332933247089386, + "learning_rate": 0.0002, + "loss": 1.8018, + "step": 410 + }, + { + "epoch": 0.28620102214650767, + "grad_norm": 0.39454060792922974, + "learning_rate": 0.0002, + "loss": 1.8157, + "step": 420 + }, + { + "epoch": 0.293015332197615, + "grad_norm": 0.34589633345603943, + "learning_rate": 0.0002, + "loss": 1.7862, + "step": 430 + }, + { + "epoch": 0.29982964224872233, + "grad_norm": 0.3747332990169525, + "learning_rate": 0.0002, + "loss": 1.7612, + "step": 440 + }, + { + "epoch": 0.30664395229982966, + "grad_norm": 0.34825369715690613, + "learning_rate": 0.0002, + "loss": 1.8476, + "step": 450 + }, + { + "epoch": 0.313458262350937, + "grad_norm": 0.32906976342201233, + "learning_rate": 0.0002, + "loss": 1.775, + "step": 460 + }, + { + "epoch": 0.3202725724020443, + "grad_norm": 0.33108609914779663, + "learning_rate": 0.0002, + "loss": 1.7764, + "step": 470 + }, + { + "epoch": 0.3270868824531516, + "grad_norm": 0.3170463442802429, + "learning_rate": 0.0002, + "loss": 1.8012, + "step": 480 + }, + { + "epoch": 0.3339011925042589, + "grad_norm": 0.30792081356048584, + "learning_rate": 0.0002, + "loss": 1.8264, + "step": 490 + }, + { + "epoch": 0.34071550255536626, + "grad_norm": 0.31772997975349426, + "learning_rate": 0.0002, + "loss": 1.7361, + "step": 500 + }, + { + "epoch": 0.3475298126064736, + "grad_norm": 0.32714012265205383, + "learning_rate": 0.0002, + "loss": 1.8147, + "step": 510 + }, + { + "epoch": 0.3543441226575809, + "grad_norm": 0.3021100163459778, + "learning_rate": 0.0002, + "loss": 1.8332, + "step": 520 + }, + { + "epoch": 0.36115843270868825, + "grad_norm": 0.6045835018157959, + "learning_rate": 0.0002, + "loss": 1.7337, + "step": 530 + }, + { + "epoch": 0.3679727427597956, + "grad_norm": 0.3003896474838257, + "learning_rate": 0.0002, + "loss": 1.8661, + "step": 540 + }, + { + "epoch": 0.3747870528109029, + "grad_norm": 0.3678470551967621, + "learning_rate": 0.0002, + "loss": 1.8359, + "step": 550 + }, + { + "epoch": 0.38160136286201024, + "grad_norm": 0.35787615180015564, + "learning_rate": 0.0002, + "loss": 1.7512, + "step": 560 + }, + { + "epoch": 0.38841567291311757, + "grad_norm": 0.31882143020629883, + "learning_rate": 0.0002, + "loss": 1.8048, + "step": 570 + }, + { + "epoch": 0.39522998296422485, + "grad_norm": 0.3186313509941101, + "learning_rate": 0.0002, + "loss": 1.8108, + "step": 580 + }, + { + "epoch": 0.4020442930153322, + "grad_norm": 0.41443702578544617, + "learning_rate": 0.0002, + "loss": 1.8012, + "step": 590 + }, + { + "epoch": 0.4088586030664395, + "grad_norm": 0.28773069381713867, + "learning_rate": 0.0002, + "loss": 1.7794, + "step": 600 + }, + { + "epoch": 0.41567291311754684, + "grad_norm": 0.35743263363838196, + "learning_rate": 0.0002, + "loss": 1.8231, + "step": 610 + }, + { + "epoch": 0.42248722316865417, + "grad_norm": 0.3360286355018616, + "learning_rate": 0.0002, + "loss": 1.7531, + "step": 620 + }, + { + "epoch": 0.4293015332197615, + "grad_norm": 0.32838866114616394, + "learning_rate": 0.0002, + "loss": 1.7654, + "step": 630 + }, + { + "epoch": 0.43611584327086883, + "grad_norm": 0.2994388937950134, + "learning_rate": 0.0002, + "loss": 1.8176, + "step": 640 + }, + { + "epoch": 0.44293015332197616, + "grad_norm": 0.3306307792663574, + "learning_rate": 0.0002, + "loss": 1.7241, + "step": 650 + }, + { + "epoch": 0.4497444633730835, + "grad_norm": 0.3129560351371765, + "learning_rate": 0.0002, + "loss": 1.8201, + "step": 660 + }, + { + "epoch": 0.4565587734241908, + "grad_norm": 0.3244289457798004, + "learning_rate": 0.0002, + "loss": 1.803, + "step": 670 + }, + { + "epoch": 0.46337308347529815, + "grad_norm": 0.3196892738342285, + "learning_rate": 0.0002, + "loss": 1.8399, + "step": 680 + }, + { + "epoch": 0.47018739352640543, + "grad_norm": 0.3135230243206024, + "learning_rate": 0.0002, + "loss": 1.8291, + "step": 690 + }, + { + "epoch": 0.47700170357751276, + "grad_norm": 0.28677991032600403, + "learning_rate": 0.0002, + "loss": 1.7423, + "step": 700 + }, + { + "epoch": 0.4838160136286201, + "grad_norm": 0.3074065148830414, + "learning_rate": 0.0002, + "loss": 1.6982, + "step": 710 + }, + { + "epoch": 0.4906303236797274, + "grad_norm": 0.3354290723800659, + "learning_rate": 0.0002, + "loss": 1.8078, + "step": 720 + }, + { + "epoch": 0.49744463373083475, + "grad_norm": 0.324370801448822, + "learning_rate": 0.0002, + "loss": 1.7333, + "step": 730 + }, + { + "epoch": 0.504258943781942, + "grad_norm": 0.29496142268180847, + "learning_rate": 0.0002, + "loss": 1.8506, + "step": 740 + }, + { + "epoch": 0.5110732538330494, + "grad_norm": 0.30694130063056946, + "learning_rate": 0.0002, + "loss": 1.7761, + "step": 750 + }, + { + "epoch": 0.5178875638841567, + "grad_norm": 0.36168408393859863, + "learning_rate": 0.0002, + "loss": 1.7854, + "step": 760 + }, + { + "epoch": 0.524701873935264, + "grad_norm": 0.2930343449115753, + "learning_rate": 0.0002, + "loss": 1.7149, + "step": 770 + }, + { + "epoch": 0.5315161839863713, + "grad_norm": 0.3023432493209839, + "learning_rate": 0.0002, + "loss": 1.7924, + "step": 780 + }, + { + "epoch": 0.5383304940374787, + "grad_norm": 0.3272720277309418, + "learning_rate": 0.0002, + "loss": 1.8467, + "step": 790 + }, + { + "epoch": 0.545144804088586, + "grad_norm": 0.2907974421977997, + "learning_rate": 0.0002, + "loss": 1.8639, + "step": 800 + }, + { + "epoch": 0.5519591141396933, + "grad_norm": 0.32267168164253235, + "learning_rate": 0.0002, + "loss": 1.9018, + "step": 810 + }, + { + "epoch": 0.5587734241908007, + "grad_norm": 0.32059940695762634, + "learning_rate": 0.0002, + "loss": 1.8311, + "step": 820 + }, + { + "epoch": 0.565587734241908, + "grad_norm": 0.30951258540153503, + "learning_rate": 0.0002, + "loss": 1.7234, + "step": 830 + }, + { + "epoch": 0.5724020442930153, + "grad_norm": 0.33976122736930847, + "learning_rate": 0.0002, + "loss": 1.8063, + "step": 840 + }, + { + "epoch": 0.5792163543441227, + "grad_norm": 0.3195820450782776, + "learning_rate": 0.0002, + "loss": 1.7021, + "step": 850 + }, + { + "epoch": 0.58603066439523, + "grad_norm": 0.2828562557697296, + "learning_rate": 0.0002, + "loss": 1.8073, + "step": 860 + }, + { + "epoch": 0.5928449744463373, + "grad_norm": 0.29591670632362366, + "learning_rate": 0.0002, + "loss": 1.695, + "step": 870 + }, + { + "epoch": 0.5996592844974447, + "grad_norm": 0.3086104393005371, + "learning_rate": 0.0002, + "loss": 1.8109, + "step": 880 + }, + { + "epoch": 0.606473594548552, + "grad_norm": 0.3592929542064667, + "learning_rate": 0.0002, + "loss": 1.8207, + "step": 890 + }, + { + "epoch": 0.6132879045996593, + "grad_norm": 0.2830186188220978, + "learning_rate": 0.0002, + "loss": 1.7662, + "step": 900 + }, + { + "epoch": 0.6201022146507666, + "grad_norm": 0.3128598630428314, + "learning_rate": 0.0002, + "loss": 1.8344, + "step": 910 + }, + { + "epoch": 0.626916524701874, + "grad_norm": 0.31957563757896423, + "learning_rate": 0.0002, + "loss": 1.7953, + "step": 920 + }, + { + "epoch": 0.6337308347529813, + "grad_norm": 0.30994319915771484, + "learning_rate": 0.0002, + "loss": 1.7578, + "step": 930 + }, + { + "epoch": 0.6405451448040886, + "grad_norm": 0.3352845013141632, + "learning_rate": 0.0002, + "loss": 1.8494, + "step": 940 + }, + { + "epoch": 0.6473594548551959, + "grad_norm": 0.2960077226161957, + "learning_rate": 0.0002, + "loss": 1.7054, + "step": 950 + }, + { + "epoch": 0.6541737649063032, + "grad_norm": 0.32675081491470337, + "learning_rate": 0.0002, + "loss": 1.7209, + "step": 960 + }, + { + "epoch": 0.6609880749574105, + "grad_norm": 0.30042028427124023, + "learning_rate": 0.0002, + "loss": 1.7776, + "step": 970 + }, + { + "epoch": 0.6678023850085179, + "grad_norm": 0.3288673758506775, + "learning_rate": 0.0002, + "loss": 1.7597, + "step": 980 + }, + { + "epoch": 0.6746166950596252, + "grad_norm": 0.323215126991272, + "learning_rate": 0.0002, + "loss": 1.7962, + "step": 990 + }, + { + "epoch": 0.6814310051107325, + "grad_norm": 0.30041399598121643, + "learning_rate": 0.0002, + "loss": 1.821, + "step": 1000 + }, + { + "epoch": 0.6882453151618398, + "grad_norm": 0.3076179623603821, + "learning_rate": 0.0002, + "loss": 1.7716, + "step": 1010 + }, + { + "epoch": 0.6950596252129472, + "grad_norm": 0.2941909730434418, + "learning_rate": 0.0002, + "loss": 1.7827, + "step": 1020 + }, + { + "epoch": 0.7018739352640545, + "grad_norm": 0.32220420241355896, + "learning_rate": 0.0002, + "loss": 1.7964, + "step": 1030 + }, + { + "epoch": 0.7086882453151618, + "grad_norm": 0.2989702820777893, + "learning_rate": 0.0002, + "loss": 1.7101, + "step": 1040 + }, + { + "epoch": 0.7155025553662692, + "grad_norm": 0.3049640357494354, + "learning_rate": 0.0002, + "loss": 1.8749, + "step": 1050 + }, + { + "epoch": 0.7223168654173765, + "grad_norm": 0.3183284103870392, + "learning_rate": 0.0002, + "loss": 1.782, + "step": 1060 + }, + { + "epoch": 0.7291311754684838, + "grad_norm": 0.3070095181465149, + "learning_rate": 0.0002, + "loss": 1.785, + "step": 1070 + }, + { + "epoch": 0.7359454855195912, + "grad_norm": 0.33263063430786133, + "learning_rate": 0.0002, + "loss": 1.7832, + "step": 1080 + }, + { + "epoch": 0.7427597955706985, + "grad_norm": 0.28774312138557434, + "learning_rate": 0.0002, + "loss": 1.7627, + "step": 1090 + }, + { + "epoch": 0.7495741056218058, + "grad_norm": 0.29304224252700806, + "learning_rate": 0.0002, + "loss": 1.7343, + "step": 1100 + }, + { + "epoch": 0.7563884156729132, + "grad_norm": 0.27529507875442505, + "learning_rate": 0.0002, + "loss": 1.8082, + "step": 1110 + }, + { + "epoch": 0.7632027257240205, + "grad_norm": 0.32319945096969604, + "learning_rate": 0.0002, + "loss": 1.8071, + "step": 1120 + }, + { + "epoch": 0.7700170357751278, + "grad_norm": 0.33777597546577454, + "learning_rate": 0.0002, + "loss": 1.6998, + "step": 1130 + }, + { + "epoch": 0.7768313458262351, + "grad_norm": 0.29847201704978943, + "learning_rate": 0.0002, + "loss": 1.8488, + "step": 1140 + }, + { + "epoch": 0.7836456558773425, + "grad_norm": 0.31370633840560913, + "learning_rate": 0.0002, + "loss": 1.7014, + "step": 1150 + }, + { + "epoch": 0.7904599659284497, + "grad_norm": 0.31633856892585754, + "learning_rate": 0.0002, + "loss": 1.8175, + "step": 1160 + }, + { + "epoch": 0.797274275979557, + "grad_norm": 0.33849263191223145, + "learning_rate": 0.0002, + "loss": 1.7637, + "step": 1170 + }, + { + "epoch": 0.8040885860306644, + "grad_norm": 0.3306333124637604, + "learning_rate": 0.0002, + "loss": 1.8264, + "step": 1180 + }, + { + "epoch": 0.8109028960817717, + "grad_norm": 0.3249678313732147, + "learning_rate": 0.0002, + "loss": 1.777, + "step": 1190 + }, + { + "epoch": 0.817717206132879, + "grad_norm": 0.3252817690372467, + "learning_rate": 0.0002, + "loss": 1.7631, + "step": 1200 + }, + { + "epoch": 0.8245315161839863, + "grad_norm": 0.31772559881210327, + "learning_rate": 0.0002, + "loss": 1.7877, + "step": 1210 + }, + { + "epoch": 0.8313458262350937, + "grad_norm": 0.2803371846675873, + "learning_rate": 0.0002, + "loss": 1.8068, + "step": 1220 + }, + { + "epoch": 0.838160136286201, + "grad_norm": 0.26700571179389954, + "learning_rate": 0.0002, + "loss": 1.738, + "step": 1230 + }, + { + "epoch": 0.8449744463373083, + "grad_norm": 0.3060092031955719, + "learning_rate": 0.0002, + "loss": 1.8301, + "step": 1240 + }, + { + "epoch": 0.8517887563884157, + "grad_norm": 0.28831684589385986, + "learning_rate": 0.0002, + "loss": 1.7993, + "step": 1250 + }, + { + "epoch": 0.858603066439523, + "grad_norm": 0.30708742141723633, + "learning_rate": 0.0002, + "loss": 1.6909, + "step": 1260 + }, + { + "epoch": 0.8654173764906303, + "grad_norm": 0.2915987968444824, + "learning_rate": 0.0002, + "loss": 1.8506, + "step": 1270 + }, + { + "epoch": 0.8722316865417377, + "grad_norm": 0.2893589735031128, + "learning_rate": 0.0002, + "loss": 1.7536, + "step": 1280 + }, + { + "epoch": 0.879045996592845, + "grad_norm": 0.29545632004737854, + "learning_rate": 0.0002, + "loss": 1.7437, + "step": 1290 + }, + { + "epoch": 0.8858603066439523, + "grad_norm": 0.3354771137237549, + "learning_rate": 0.0002, + "loss": 1.859, + "step": 1300 + }, + { + "epoch": 0.8926746166950597, + "grad_norm": 0.37715399265289307, + "learning_rate": 0.0002, + "loss": 1.7644, + "step": 1310 + }, + { + "epoch": 0.899488926746167, + "grad_norm": 0.28847193717956543, + "learning_rate": 0.0002, + "loss": 1.7731, + "step": 1320 + }, + { + "epoch": 0.9063032367972743, + "grad_norm": 0.2780889868736267, + "learning_rate": 0.0002, + "loss": 1.9077, + "step": 1330 + }, + { + "epoch": 0.9131175468483816, + "grad_norm": 0.2714342176914215, + "learning_rate": 0.0002, + "loss": 1.7861, + "step": 1340 + }, + { + "epoch": 0.919931856899489, + "grad_norm": 0.2950133979320526, + "learning_rate": 0.0002, + "loss": 1.8, + "step": 1350 + }, + { + "epoch": 0.9267461669505963, + "grad_norm": 0.29097145795822144, + "learning_rate": 0.0002, + "loss": 1.7368, + "step": 1360 + }, + { + "epoch": 0.9335604770017035, + "grad_norm": 0.32540133595466614, + "learning_rate": 0.0002, + "loss": 1.7864, + "step": 1370 + }, + { + "epoch": 0.9403747870528109, + "grad_norm": 0.3076636493206024, + "learning_rate": 0.0002, + "loss": 1.7571, + "step": 1380 + }, + { + "epoch": 0.9471890971039182, + "grad_norm": 0.2962130308151245, + "learning_rate": 0.0002, + "loss": 1.8312, + "step": 1390 + }, + { + "epoch": 0.9540034071550255, + "grad_norm": 0.30086860060691833, + "learning_rate": 0.0002, + "loss": 1.7858, + "step": 1400 + }, + { + "epoch": 0.9608177172061328, + "grad_norm": 0.28634947538375854, + "learning_rate": 0.0002, + "loss": 1.7991, + "step": 1410 + }, + { + "epoch": 0.9676320272572402, + "grad_norm": 0.35314416885375977, + "learning_rate": 0.0002, + "loss": 1.8385, + "step": 1420 + }, + { + "epoch": 0.9744463373083475, + "grad_norm": 0.2939317524433136, + "learning_rate": 0.0002, + "loss": 1.8054, + "step": 1430 + }, + { + "epoch": 0.9812606473594548, + "grad_norm": 0.3010196089744568, + "learning_rate": 0.0002, + "loss": 1.7582, + "step": 1440 + }, + { + "epoch": 0.9880749574105622, + "grad_norm": 0.30816152691841125, + "learning_rate": 0.0002, + "loss": 1.758, + "step": 1450 + }, + { + "epoch": 0.9948892674616695, + "grad_norm": 0.28152793645858765, + "learning_rate": 0.0002, + "loss": 1.8159, + "step": 1460 + }, + { + "epoch": 0.9996592844974447, + "eval_loss": 1.8034634590148926, + "eval_runtime": 53.6017, + "eval_samples_per_second": 9.459, + "eval_steps_per_second": 1.194, + "step": 1467 + }, + { + "epoch": 1.0017035775127767, + "grad_norm": 0.29246416687965393, + "learning_rate": 0.0002, + "loss": 1.7072, + "step": 1470 + }, + { + "epoch": 1.008517887563884, + "grad_norm": 0.2668602168560028, + "learning_rate": 0.0002, + "loss": 1.8159, + "step": 1480 + }, + { + "epoch": 1.0153321976149914, + "grad_norm": 0.2998567819595337, + "learning_rate": 0.0002, + "loss": 1.6868, + "step": 1490 + }, + { + "epoch": 1.0221465076660987, + "grad_norm": 0.3284934461116791, + "learning_rate": 0.0002, + "loss": 1.7331, + "step": 1500 + }, + { + "epoch": 1.028960817717206, + "grad_norm": 0.3275827169418335, + "learning_rate": 0.0002, + "loss": 1.689, + "step": 1510 + }, + { + "epoch": 1.0357751277683134, + "grad_norm": 0.3382718563079834, + "learning_rate": 0.0002, + "loss": 1.7092, + "step": 1520 + }, + { + "epoch": 1.0425894378194207, + "grad_norm": 0.36737215518951416, + "learning_rate": 0.0002, + "loss": 1.7215, + "step": 1530 + }, + { + "epoch": 1.049403747870528, + "grad_norm": 0.3442603647708893, + "learning_rate": 0.0002, + "loss": 1.6993, + "step": 1540 + }, + { + "epoch": 1.0562180579216354, + "grad_norm": 0.3323381245136261, + "learning_rate": 0.0002, + "loss": 1.6996, + "step": 1550 + }, + { + "epoch": 1.0630323679727427, + "grad_norm": 0.341227650642395, + "learning_rate": 0.0002, + "loss": 1.7444, + "step": 1560 + }, + { + "epoch": 1.06984667802385, + "grad_norm": 0.3361579477787018, + "learning_rate": 0.0002, + "loss": 1.7419, + "step": 1570 + }, + { + "epoch": 1.0766609880749574, + "grad_norm": 0.3556230962276459, + "learning_rate": 0.0002, + "loss": 1.7337, + "step": 1580 + }, + { + "epoch": 1.0834752981260647, + "grad_norm": 0.27130424976348877, + "learning_rate": 0.0002, + "loss": 1.6978, + "step": 1590 + }, + { + "epoch": 1.090289608177172, + "grad_norm": 0.29366323351860046, + "learning_rate": 0.0002, + "loss": 1.6087, + "step": 1600 + }, + { + "epoch": 1.0971039182282794, + "grad_norm": 0.3581245541572571, + "learning_rate": 0.0002, + "loss": 1.6721, + "step": 1610 + }, + { + "epoch": 1.1039182282793867, + "grad_norm": 0.3021670877933502, + "learning_rate": 0.0002, + "loss": 1.7639, + "step": 1620 + }, + { + "epoch": 1.110732538330494, + "grad_norm": 0.3145572543144226, + "learning_rate": 0.0002, + "loss": 1.7314, + "step": 1630 + }, + { + "epoch": 1.1175468483816013, + "grad_norm": 0.35362154245376587, + "learning_rate": 0.0002, + "loss": 1.7408, + "step": 1640 + }, + { + "epoch": 1.1243611584327087, + "grad_norm": 0.5413113236427307, + "learning_rate": 0.0002, + "loss": 1.7071, + "step": 1650 + }, + { + "epoch": 1.131175468483816, + "grad_norm": 0.6858654022216797, + "learning_rate": 0.0002, + "loss": 1.7615, + "step": 1660 + }, + { + "epoch": 1.1379897785349233, + "grad_norm": 0.3781903386116028, + "learning_rate": 0.0002, + "loss": 1.7029, + "step": 1670 + }, + { + "epoch": 1.1448040885860307, + "grad_norm": 0.404864102602005, + "learning_rate": 0.0002, + "loss": 1.659, + "step": 1680 + }, + { + "epoch": 1.151618398637138, + "grad_norm": 0.3595100939273834, + "learning_rate": 0.0002, + "loss": 1.7212, + "step": 1690 + }, + { + "epoch": 1.1584327086882453, + "grad_norm": 0.33682283759117126, + "learning_rate": 0.0002, + "loss": 1.7023, + "step": 1700 + }, + { + "epoch": 1.1652470187393527, + "grad_norm": 0.3877373933792114, + "learning_rate": 0.0002, + "loss": 1.7336, + "step": 1710 + }, + { + "epoch": 1.17206132879046, + "grad_norm": 0.34606897830963135, + "learning_rate": 0.0002, + "loss": 1.7676, + "step": 1720 + }, + { + "epoch": 1.1788756388415673, + "grad_norm": 0.3122918903827667, + "learning_rate": 0.0002, + "loss": 1.6889, + "step": 1730 + }, + { + "epoch": 1.1856899488926746, + "grad_norm": 0.34081900119781494, + "learning_rate": 0.0002, + "loss": 1.6585, + "step": 1740 + }, + { + "epoch": 1.192504258943782, + "grad_norm": 0.3418807089328766, + "learning_rate": 0.0002, + "loss": 1.7794, + "step": 1750 + }, + { + "epoch": 1.1993185689948893, + "grad_norm": 0.3495500981807709, + "learning_rate": 0.0002, + "loss": 1.7773, + "step": 1760 + }, + { + "epoch": 1.2061328790459966, + "grad_norm": 0.557288408279419, + "learning_rate": 0.0002, + "loss": 1.6578, + "step": 1770 + }, + { + "epoch": 1.212947189097104, + "grad_norm": 0.3193778693675995, + "learning_rate": 0.0002, + "loss": 1.7488, + "step": 1780 + }, + { + "epoch": 1.2197614991482113, + "grad_norm": 0.3306216895580292, + "learning_rate": 0.0002, + "loss": 1.6815, + "step": 1790 + }, + { + "epoch": 1.2265758091993186, + "grad_norm": 0.37998732924461365, + "learning_rate": 0.0002, + "loss": 1.6772, + "step": 1800 + }, + { + "epoch": 1.233390119250426, + "grad_norm": 0.3255669176578522, + "learning_rate": 0.0002, + "loss": 1.7094, + "step": 1810 + }, + { + "epoch": 1.2402044293015333, + "grad_norm": 0.3741287291049957, + "learning_rate": 0.0002, + "loss": 1.7096, + "step": 1820 + }, + { + "epoch": 1.2470187393526406, + "grad_norm": 0.36727291345596313, + "learning_rate": 0.0002, + "loss": 1.7317, + "step": 1830 + }, + { + "epoch": 1.253833049403748, + "grad_norm": 0.3479527235031128, + "learning_rate": 0.0002, + "loss": 1.7418, + "step": 1840 + }, + { + "epoch": 1.2606473594548553, + "grad_norm": 0.3472636640071869, + "learning_rate": 0.0002, + "loss": 1.7062, + "step": 1850 + }, + { + "epoch": 1.2674616695059626, + "grad_norm": 0.3702869415283203, + "learning_rate": 0.0002, + "loss": 1.6263, + "step": 1860 + }, + { + "epoch": 1.27427597955707, + "grad_norm": 0.3934040069580078, + "learning_rate": 0.0002, + "loss": 1.6222, + "step": 1870 + }, + { + "epoch": 1.2810902896081773, + "grad_norm": 0.46887534856796265, + "learning_rate": 0.0002, + "loss": 1.7616, + "step": 1880 + }, + { + "epoch": 1.2879045996592846, + "grad_norm": 0.3191998600959778, + "learning_rate": 0.0002, + "loss": 1.6071, + "step": 1890 + }, + { + "epoch": 1.294718909710392, + "grad_norm": 0.34032225608825684, + "learning_rate": 0.0002, + "loss": 1.7001, + "step": 1900 + }, + { + "epoch": 1.3015332197614993, + "grad_norm": 0.33453696966171265, + "learning_rate": 0.0002, + "loss": 1.8316, + "step": 1910 + }, + { + "epoch": 1.3083475298126066, + "grad_norm": 0.3451494872570038, + "learning_rate": 0.0002, + "loss": 1.6567, + "step": 1920 + }, + { + "epoch": 1.315161839863714, + "grad_norm": 0.36203092336654663, + "learning_rate": 0.0002, + "loss": 1.708, + "step": 1930 + }, + { + "epoch": 1.321976149914821, + "grad_norm": 0.43794456124305725, + "learning_rate": 0.0002, + "loss": 1.7095, + "step": 1940 + }, + { + "epoch": 1.3287904599659284, + "grad_norm": 0.3630591034889221, + "learning_rate": 0.0002, + "loss": 1.7264, + "step": 1950 + }, + { + "epoch": 1.3356047700170357, + "grad_norm": 0.36951911449432373, + "learning_rate": 0.0002, + "loss": 1.6529, + "step": 1960 + }, + { + "epoch": 1.342419080068143, + "grad_norm": 0.4001159965991974, + "learning_rate": 0.0002, + "loss": 1.651, + "step": 1970 + }, + { + "epoch": 1.3492333901192504, + "grad_norm": 0.3820836544036865, + "learning_rate": 0.0002, + "loss": 1.677, + "step": 1980 + }, + { + "epoch": 1.3560477001703577, + "grad_norm": 0.3705870807170868, + "learning_rate": 0.0002, + "loss": 1.583, + "step": 1990 + }, + { + "epoch": 1.362862010221465, + "grad_norm": 0.3557972013950348, + "learning_rate": 0.0002, + "loss": 1.6207, + "step": 2000 + }, + { + "epoch": 1.3696763202725724, + "grad_norm": 0.38546398282051086, + "learning_rate": 0.0002, + "loss": 1.7656, + "step": 2010 + }, + { + "epoch": 1.3764906303236797, + "grad_norm": 0.3908020853996277, + "learning_rate": 0.0002, + "loss": 1.6881, + "step": 2020 + }, + { + "epoch": 1.383304940374787, + "grad_norm": 0.3822040855884552, + "learning_rate": 0.0002, + "loss": 1.727, + "step": 2030 + }, + { + "epoch": 1.3901192504258943, + "grad_norm": 0.425327867269516, + "learning_rate": 0.0002, + "loss": 1.7923, + "step": 2040 + }, + { + "epoch": 1.3969335604770017, + "grad_norm": 0.3436259329319, + "learning_rate": 0.0002, + "loss": 1.7032, + "step": 2050 + }, + { + "epoch": 1.403747870528109, + "grad_norm": 0.33124062418937683, + "learning_rate": 0.0002, + "loss": 1.6398, + "step": 2060 + }, + { + "epoch": 1.4105621805792163, + "grad_norm": 0.3662424683570862, + "learning_rate": 0.0002, + "loss": 1.6815, + "step": 2070 + }, + { + "epoch": 1.4173764906303237, + "grad_norm": 0.3720635175704956, + "learning_rate": 0.0002, + "loss": 1.7258, + "step": 2080 + }, + { + "epoch": 1.424190800681431, + "grad_norm": 0.3361680805683136, + "learning_rate": 0.0002, + "loss": 1.7186, + "step": 2090 + }, + { + "epoch": 1.4310051107325383, + "grad_norm": 0.32751724123954773, + "learning_rate": 0.0002, + "loss": 1.7606, + "step": 2100 + }, + { + "epoch": 1.4378194207836457, + "grad_norm": 0.34333378076553345, + "learning_rate": 0.0002, + "loss": 1.7051, + "step": 2110 + }, + { + "epoch": 1.444633730834753, + "grad_norm": 0.37777671217918396, + "learning_rate": 0.0002, + "loss": 1.6979, + "step": 2120 + }, + { + "epoch": 1.4514480408858603, + "grad_norm": 0.37126365303993225, + "learning_rate": 0.0002, + "loss": 1.7085, + "step": 2130 + }, + { + "epoch": 1.4582623509369677, + "grad_norm": 0.3602267801761627, + "learning_rate": 0.0002, + "loss": 1.721, + "step": 2140 + }, + { + "epoch": 1.465076660988075, + "grad_norm": 0.3287110924720764, + "learning_rate": 0.0002, + "loss": 1.8148, + "step": 2150 + }, + { + "epoch": 1.4718909710391823, + "grad_norm": 0.3562135100364685, + "learning_rate": 0.0002, + "loss": 1.6966, + "step": 2160 + }, + { + "epoch": 1.4787052810902896, + "grad_norm": 0.38292962312698364, + "learning_rate": 0.0002, + "loss": 1.713, + "step": 2170 + }, + { + "epoch": 1.485519591141397, + "grad_norm": 0.38220319151878357, + "learning_rate": 0.0002, + "loss": 1.7036, + "step": 2180 + }, + { + "epoch": 1.4923339011925043, + "grad_norm": 0.3570062220096588, + "learning_rate": 0.0002, + "loss": 1.7297, + "step": 2190 + }, + { + "epoch": 1.4991482112436116, + "grad_norm": 0.363146036863327, + "learning_rate": 0.0002, + "loss": 1.5652, + "step": 2200 + }, + { + "epoch": 1.5059625212947187, + "grad_norm": 0.37393274903297424, + "learning_rate": 0.0002, + "loss": 1.6716, + "step": 2210 + }, + { + "epoch": 1.512776831345826, + "grad_norm": 0.3628501892089844, + "learning_rate": 0.0002, + "loss": 1.6839, + "step": 2220 + }, + { + "epoch": 1.5195911413969334, + "grad_norm": 0.33430740237236023, + "learning_rate": 0.0002, + "loss": 1.6627, + "step": 2230 + }, + { + "epoch": 1.5264054514480407, + "grad_norm": 0.35363978147506714, + "learning_rate": 0.0002, + "loss": 1.7495, + "step": 2240 + }, + { + "epoch": 1.533219761499148, + "grad_norm": 0.37220337986946106, + "learning_rate": 0.0002, + "loss": 1.6364, + "step": 2250 + }, + { + "epoch": 1.5400340715502554, + "grad_norm": 0.35020262002944946, + "learning_rate": 0.0002, + "loss": 1.7773, + "step": 2260 + }, + { + "epoch": 1.5468483816013627, + "grad_norm": 0.35274937748908997, + "learning_rate": 0.0002, + "loss": 1.7453, + "step": 2270 + }, + { + "epoch": 1.55366269165247, + "grad_norm": 0.3797738552093506, + "learning_rate": 0.0002, + "loss": 1.7162, + "step": 2280 + }, + { + "epoch": 1.5604770017035774, + "grad_norm": 0.4160412847995758, + "learning_rate": 0.0002, + "loss": 1.6197, + "step": 2290 + }, + { + "epoch": 1.5672913117546847, + "grad_norm": 0.38779592514038086, + "learning_rate": 0.0002, + "loss": 1.7101, + "step": 2300 + }, + { + "epoch": 1.574105621805792, + "grad_norm": 0.39171287417411804, + "learning_rate": 0.0002, + "loss": 1.7254, + "step": 2310 + }, + { + "epoch": 1.5809199318568994, + "grad_norm": 0.3606826663017273, + "learning_rate": 0.0002, + "loss": 1.7087, + "step": 2320 + }, + { + "epoch": 1.5877342419080067, + "grad_norm": 0.3745017647743225, + "learning_rate": 0.0002, + "loss": 1.7269, + "step": 2330 + }, + { + "epoch": 1.594548551959114, + "grad_norm": 0.34933462738990784, + "learning_rate": 0.0002, + "loss": 1.7068, + "step": 2340 + }, + { + "epoch": 1.6013628620102214, + "grad_norm": 0.37268444895744324, + "learning_rate": 0.0002, + "loss": 1.7073, + "step": 2350 + }, + { + "epoch": 1.6081771720613287, + "grad_norm": 0.4603484869003296, + "learning_rate": 0.0002, + "loss": 1.728, + "step": 2360 + }, + { + "epoch": 1.614991482112436, + "grad_norm": 0.35689088702201843, + "learning_rate": 0.0002, + "loss": 1.7621, + "step": 2370 + }, + { + "epoch": 1.6218057921635434, + "grad_norm": 0.3392031490802765, + "learning_rate": 0.0002, + "loss": 1.6989, + "step": 2380 + }, + { + "epoch": 1.6286201022146507, + "grad_norm": 0.394653856754303, + "learning_rate": 0.0002, + "loss": 1.7077, + "step": 2390 + }, + { + "epoch": 1.635434412265758, + "grad_norm": 0.33972012996673584, + "learning_rate": 0.0002, + "loss": 1.7448, + "step": 2400 + }, + { + "epoch": 1.6422487223168654, + "grad_norm": 0.3854375183582306, + "learning_rate": 0.0002, + "loss": 1.7681, + "step": 2410 + }, + { + "epoch": 1.6490630323679727, + "grad_norm": 0.36143961548805237, + "learning_rate": 0.0002, + "loss": 1.7102, + "step": 2420 + }, + { + "epoch": 1.65587734241908, + "grad_norm": 0.35816189646720886, + "learning_rate": 0.0002, + "loss": 1.7432, + "step": 2430 + }, + { + "epoch": 1.6626916524701874, + "grad_norm": 0.36298736929893494, + "learning_rate": 0.0002, + "loss": 1.6407, + "step": 2440 + }, + { + "epoch": 1.6695059625212947, + "grad_norm": 0.36756929755210876, + "learning_rate": 0.0002, + "loss": 1.723, + "step": 2450 + }, + { + "epoch": 1.676320272572402, + "grad_norm": 0.35969603061676025, + "learning_rate": 0.0002, + "loss": 1.6824, + "step": 2460 + }, + { + "epoch": 1.6831345826235093, + "grad_norm": 0.38449376821517944, + "learning_rate": 0.0002, + "loss": 1.7014, + "step": 2470 + }, + { + "epoch": 1.6899488926746167, + "grad_norm": 0.44511452317237854, + "learning_rate": 0.0002, + "loss": 1.7261, + "step": 2480 + }, + { + "epoch": 1.696763202725724, + "grad_norm": 0.3831416368484497, + "learning_rate": 0.0002, + "loss": 1.7397, + "step": 2490 + }, + { + "epoch": 1.7035775127768313, + "grad_norm": 0.3795325756072998, + "learning_rate": 0.0002, + "loss": 1.7046, + "step": 2500 + }, + { + "epoch": 1.7103918228279387, + "grad_norm": 0.34978193044662476, + "learning_rate": 0.0002, + "loss": 1.6993, + "step": 2510 + }, + { + "epoch": 1.717206132879046, + "grad_norm": 0.35923877358436584, + "learning_rate": 0.0002, + "loss": 1.8307, + "step": 2520 + }, + { + "epoch": 1.7240204429301533, + "grad_norm": 0.352999746799469, + "learning_rate": 0.0002, + "loss": 1.6253, + "step": 2530 + }, + { + "epoch": 1.7308347529812607, + "grad_norm": 0.43673479557037354, + "learning_rate": 0.0002, + "loss": 1.6953, + "step": 2540 + }, + { + "epoch": 1.737649063032368, + "grad_norm": 0.4153687357902527, + "learning_rate": 0.0002, + "loss": 1.7079, + "step": 2550 + }, + { + "epoch": 1.7444633730834753, + "grad_norm": 0.35541167855262756, + "learning_rate": 0.0002, + "loss": 1.6714, + "step": 2560 + }, + { + "epoch": 1.7512776831345827, + "grad_norm": 0.3288775086402893, + "learning_rate": 0.0002, + "loss": 1.696, + "step": 2570 + }, + { + "epoch": 1.75809199318569, + "grad_norm": 0.3991123139858246, + "learning_rate": 0.0002, + "loss": 1.7486, + "step": 2580 + }, + { + "epoch": 1.7649063032367973, + "grad_norm": 0.39967241883277893, + "learning_rate": 0.0002, + "loss": 1.645, + "step": 2590 + }, + { + "epoch": 1.7717206132879046, + "grad_norm": 0.41104283928871155, + "learning_rate": 0.0002, + "loss": 1.6855, + "step": 2600 + }, + { + "epoch": 1.778534923339012, + "grad_norm": 0.44885286688804626, + "learning_rate": 0.0002, + "loss": 1.6993, + "step": 2610 + }, + { + "epoch": 1.7853492333901193, + "grad_norm": 0.38384467363357544, + "learning_rate": 0.0002, + "loss": 1.7224, + "step": 2620 + }, + { + "epoch": 1.7921635434412266, + "grad_norm": 0.35020917654037476, + "learning_rate": 0.0002, + "loss": 1.7213, + "step": 2630 + }, + { + "epoch": 1.798977853492334, + "grad_norm": 0.3360341489315033, + "learning_rate": 0.0002, + "loss": 1.6706, + "step": 2640 + }, + { + "epoch": 1.8057921635434413, + "grad_norm": 0.38875144720077515, + "learning_rate": 0.0002, + "loss": 1.7037, + "step": 2650 + }, + { + "epoch": 1.8126064735945486, + "grad_norm": 0.34876883029937744, + "learning_rate": 0.0002, + "loss": 1.693, + "step": 2660 + }, + { + "epoch": 1.819420783645656, + "grad_norm": 0.419979989528656, + "learning_rate": 0.0002, + "loss": 1.7743, + "step": 2670 + }, + { + "epoch": 1.8262350936967633, + "grad_norm": 0.3648919463157654, + "learning_rate": 0.0002, + "loss": 1.7089, + "step": 2680 + }, + { + "epoch": 1.8330494037478706, + "grad_norm": 0.3485383987426758, + "learning_rate": 0.0002, + "loss": 1.7828, + "step": 2690 + }, + { + "epoch": 1.839863713798978, + "grad_norm": 0.3647468686103821, + "learning_rate": 0.0002, + "loss": 1.705, + "step": 2700 + }, + { + "epoch": 1.8466780238500853, + "grad_norm": 0.37003210186958313, + "learning_rate": 0.0002, + "loss": 1.7318, + "step": 2710 + }, + { + "epoch": 1.8534923339011926, + "grad_norm": 0.37031617760658264, + "learning_rate": 0.0002, + "loss": 1.6647, + "step": 2720 + }, + { + "epoch": 1.8603066439523, + "grad_norm": 0.3438796103000641, + "learning_rate": 0.0002, + "loss": 1.69, + "step": 2730 + }, + { + "epoch": 1.8671209540034073, + "grad_norm": 0.41574627161026, + "learning_rate": 0.0002, + "loss": 1.5995, + "step": 2740 + }, + { + "epoch": 1.8739352640545146, + "grad_norm": 0.35049930214881897, + "learning_rate": 0.0002, + "loss": 1.6877, + "step": 2750 + }, + { + "epoch": 1.880749574105622, + "grad_norm": 0.3943989872932434, + "learning_rate": 0.0002, + "loss": 1.7048, + "step": 2760 + }, + { + "epoch": 1.8875638841567293, + "grad_norm": 0.3384978175163269, + "learning_rate": 0.0002, + "loss": 1.7047, + "step": 2770 + }, + { + "epoch": 1.8943781942078366, + "grad_norm": 0.3501328229904175, + "learning_rate": 0.0002, + "loss": 1.7848, + "step": 2780 + }, + { + "epoch": 1.901192504258944, + "grad_norm": 0.37484532594680786, + "learning_rate": 0.0002, + "loss": 1.6986, + "step": 2790 + }, + { + "epoch": 1.9080068143100513, + "grad_norm": 0.34497788548469543, + "learning_rate": 0.0002, + "loss": 1.6867, + "step": 2800 + }, + { + "epoch": 1.9148211243611586, + "grad_norm": 0.3530851900577545, + "learning_rate": 0.0002, + "loss": 1.6513, + "step": 2810 + }, + { + "epoch": 1.921635434412266, + "grad_norm": 0.3879254162311554, + "learning_rate": 0.0002, + "loss": 1.6369, + "step": 2820 + }, + { + "epoch": 1.9284497444633732, + "grad_norm": 0.3885590136051178, + "learning_rate": 0.0002, + "loss": 1.6786, + "step": 2830 + }, + { + "epoch": 1.9352640545144804, + "grad_norm": 0.3868715465068817, + "learning_rate": 0.0002, + "loss": 1.8049, + "step": 2840 + }, + { + "epoch": 1.9420783645655877, + "grad_norm": 0.4152422249317169, + "learning_rate": 0.0002, + "loss": 1.7099, + "step": 2850 + }, + { + "epoch": 1.948892674616695, + "grad_norm": 0.401714563369751, + "learning_rate": 0.0002, + "loss": 1.6696, + "step": 2860 + }, + { + "epoch": 1.9557069846678024, + "grad_norm": 0.34825265407562256, + "learning_rate": 0.0002, + "loss": 1.7182, + "step": 2870 + }, + { + "epoch": 1.9625212947189097, + "grad_norm": 0.3620675504207611, + "learning_rate": 0.0002, + "loss": 1.6612, + "step": 2880 + }, + { + "epoch": 1.969335604770017, + "grad_norm": 0.3977806866168976, + "learning_rate": 0.0002, + "loss": 1.7451, + "step": 2890 + }, + { + "epoch": 1.9761499148211243, + "grad_norm": 0.3687497079372406, + "learning_rate": 0.0002, + "loss": 1.6514, + "step": 2900 + }, + { + "epoch": 1.9829642248722317, + "grad_norm": 0.408640056848526, + "learning_rate": 0.0002, + "loss": 1.7712, + "step": 2910 + }, + { + "epoch": 1.989778534923339, + "grad_norm": 0.34510108828544617, + "learning_rate": 0.0002, + "loss": 1.695, + "step": 2920 + }, + { + "epoch": 1.9965928449744463, + "grad_norm": 0.3596334755420685, + "learning_rate": 0.0002, + "loss": 1.662, + "step": 2930 + }, + { + "epoch": 2.0, + "eval_loss": 1.8056600093841553, + "eval_runtime": 60.7049, + "eval_samples_per_second": 8.352, + "eval_steps_per_second": 1.054, + "step": 2935 + }, + { + "epoch": 2.0034071550255534, + "grad_norm": 0.3460802137851715, + "learning_rate": 0.0002, + "loss": 1.6918, + "step": 2940 + }, + { + "epoch": 2.0102214650766608, + "grad_norm": 0.4038620591163635, + "learning_rate": 0.0002, + "loss": 1.6036, + "step": 2950 + }, + { + "epoch": 2.017035775127768, + "grad_norm": 0.3950219750404358, + "learning_rate": 0.0002, + "loss": 1.5386, + "step": 2960 + }, + { + "epoch": 2.0238500851788754, + "grad_norm": 0.519116997718811, + "learning_rate": 0.0002, + "loss": 1.5649, + "step": 2970 + }, + { + "epoch": 2.0306643952299828, + "grad_norm": 0.4097684919834137, + "learning_rate": 0.0002, + "loss": 1.5397, + "step": 2980 + }, + { + "epoch": 2.03747870528109, + "grad_norm": 0.4153544306755066, + "learning_rate": 0.0002, + "loss": 1.6044, + "step": 2990 + }, + { + "epoch": 2.0442930153321974, + "grad_norm": 0.4351160526275635, + "learning_rate": 0.0002, + "loss": 1.6006, + "step": 3000 + }, + { + "epoch": 2.0511073253833048, + "grad_norm": 0.42036259174346924, + "learning_rate": 0.0002, + "loss": 1.5091, + "step": 3010 + }, + { + "epoch": 2.057921635434412, + "grad_norm": 0.4433218836784363, + "learning_rate": 0.0002, + "loss": 1.5686, + "step": 3020 + }, + { + "epoch": 2.0647359454855194, + "grad_norm": 0.46511581540107727, + "learning_rate": 0.0002, + "loss": 1.5478, + "step": 3030 + }, + { + "epoch": 2.0715502555366268, + "grad_norm": 0.4567560851573944, + "learning_rate": 0.0002, + "loss": 1.5554, + "step": 3040 + }, + { + "epoch": 2.078364565587734, + "grad_norm": 0.45671048760414124, + "learning_rate": 0.0002, + "loss": 1.5561, + "step": 3050 + }, + { + "epoch": 2.0851788756388414, + "grad_norm": 0.4598552882671356, + "learning_rate": 0.0002, + "loss": 1.606, + "step": 3060 + }, + { + "epoch": 2.0919931856899487, + "grad_norm": 0.4582861661911011, + "learning_rate": 0.0002, + "loss": 1.6357, + "step": 3070 + }, + { + "epoch": 2.098807495741056, + "grad_norm": 0.4366969168186188, + "learning_rate": 0.0002, + "loss": 1.5853, + "step": 3080 + }, + { + "epoch": 2.1056218057921634, + "grad_norm": 0.495917409658432, + "learning_rate": 0.0002, + "loss": 1.5574, + "step": 3090 + }, + { + "epoch": 2.1124361158432707, + "grad_norm": 1.6846044063568115, + "learning_rate": 0.0002, + "loss": 1.5798, + "step": 3100 + }, + { + "epoch": 2.119250425894378, + "grad_norm": 0.4765092134475708, + "learning_rate": 0.0002, + "loss": 1.5877, + "step": 3110 + }, + { + "epoch": 2.1260647359454854, + "grad_norm": 0.45029810070991516, + "learning_rate": 0.0002, + "loss": 1.6106, + "step": 3120 + }, + { + "epoch": 2.1328790459965927, + "grad_norm": 0.5706973075866699, + "learning_rate": 0.0002, + "loss": 1.5707, + "step": 3130 + }, + { + "epoch": 2.1396933560477, + "grad_norm": 0.4606274366378784, + "learning_rate": 0.0002, + "loss": 1.589, + "step": 3140 + }, + { + "epoch": 2.1465076660988074, + "grad_norm": 2.199115753173828, + "learning_rate": 0.0002, + "loss": 1.564, + "step": 3150 + }, + { + "epoch": 2.1533219761499147, + "grad_norm": 0.6231027245521545, + "learning_rate": 0.0002, + "loss": 1.5808, + "step": 3160 + }, + { + "epoch": 2.160136286201022, + "grad_norm": 0.46918219327926636, + "learning_rate": 0.0002, + "loss": 1.5397, + "step": 3170 + }, + { + "epoch": 2.1669505962521294, + "grad_norm": 0.5006393194198608, + "learning_rate": 0.0002, + "loss": 1.6082, + "step": 3180 + }, + { + "epoch": 2.1737649063032367, + "grad_norm": 0.4745093286037445, + "learning_rate": 0.0002, + "loss": 1.591, + "step": 3190 + }, + { + "epoch": 2.180579216354344, + "grad_norm": 0.511576771736145, + "learning_rate": 0.0002, + "loss": 1.6459, + "step": 3200 + }, + { + "epoch": 2.1873935264054514, + "grad_norm": 0.38622918725013733, + "learning_rate": 0.0002, + "loss": 1.6019, + "step": 3210 + }, + { + "epoch": 2.1942078364565587, + "grad_norm": 0.4425644278526306, + "learning_rate": 0.0002, + "loss": 1.6187, + "step": 3220 + }, + { + "epoch": 2.201022146507666, + "grad_norm": 0.45643091201782227, + "learning_rate": 0.0002, + "loss": 1.6114, + "step": 3230 + }, + { + "epoch": 2.2078364565587734, + "grad_norm": 0.4975406229496002, + "learning_rate": 0.0002, + "loss": 1.6224, + "step": 3240 + }, + { + "epoch": 2.2146507666098807, + "grad_norm": 0.4673331081867218, + "learning_rate": 0.0002, + "loss": 1.6654, + "step": 3250 + }, + { + "epoch": 2.221465076660988, + "grad_norm": 0.5081731081008911, + "learning_rate": 0.0002, + "loss": 1.6155, + "step": 3260 + }, + { + "epoch": 2.2282793867120954, + "grad_norm": 0.4790806770324707, + "learning_rate": 0.0002, + "loss": 1.53, + "step": 3270 + }, + { + "epoch": 2.2350936967632027, + "grad_norm": 0.5184140801429749, + "learning_rate": 0.0002, + "loss": 1.6362, + "step": 3280 + }, + { + "epoch": 2.24190800681431, + "grad_norm": 0.5159083604812622, + "learning_rate": 0.0002, + "loss": 1.5646, + "step": 3290 + }, + { + "epoch": 2.2487223168654173, + "grad_norm": 0.4876042604446411, + "learning_rate": 0.0002, + "loss": 1.6577, + "step": 3300 + }, + { + "epoch": 2.2555366269165247, + "grad_norm": 0.5454957485198975, + "learning_rate": 0.0002, + "loss": 1.5718, + "step": 3310 + }, + { + "epoch": 2.262350936967632, + "grad_norm": 0.49866822361946106, + "learning_rate": 0.0002, + "loss": 1.5885, + "step": 3320 + }, + { + "epoch": 2.2691652470187393, + "grad_norm": 0.42674365639686584, + "learning_rate": 0.0002, + "loss": 1.5923, + "step": 3330 + }, + { + "epoch": 2.2759795570698467, + "grad_norm": 0.5202316641807556, + "learning_rate": 0.0002, + "loss": 1.5588, + "step": 3340 + }, + { + "epoch": 2.282793867120954, + "grad_norm": 0.4849465489387512, + "learning_rate": 0.0002, + "loss": 1.6032, + "step": 3350 + }, + { + "epoch": 2.2896081771720613, + "grad_norm": 0.47202569246292114, + "learning_rate": 0.0002, + "loss": 1.6853, + "step": 3360 + }, + { + "epoch": 2.2964224872231687, + "grad_norm": 0.5311620235443115, + "learning_rate": 0.0002, + "loss": 1.6164, + "step": 3370 + }, + { + "epoch": 2.303236797274276, + "grad_norm": 0.49011409282684326, + "learning_rate": 0.0002, + "loss": 1.6004, + "step": 3380 + }, + { + "epoch": 2.3100511073253833, + "grad_norm": 0.4789247512817383, + "learning_rate": 0.0002, + "loss": 1.5484, + "step": 3390 + }, + { + "epoch": 2.3168654173764907, + "grad_norm": 0.46646103262901306, + "learning_rate": 0.0002, + "loss": 1.5866, + "step": 3400 + }, + { + "epoch": 2.323679727427598, + "grad_norm": 0.5552441477775574, + "learning_rate": 0.0002, + "loss": 1.5308, + "step": 3410 + }, + { + "epoch": 2.3304940374787053, + "grad_norm": 0.4530351758003235, + "learning_rate": 0.0002, + "loss": 1.5761, + "step": 3420 + }, + { + "epoch": 2.3373083475298126, + "grad_norm": 0.4806232750415802, + "learning_rate": 0.0002, + "loss": 1.5919, + "step": 3430 + }, + { + "epoch": 2.34412265758092, + "grad_norm": 0.5998363494873047, + "learning_rate": 0.0002, + "loss": 1.5569, + "step": 3440 + }, + { + "epoch": 2.3509369676320273, + "grad_norm": 0.4918554425239563, + "learning_rate": 0.0002, + "loss": 1.513, + "step": 3450 + }, + { + "epoch": 2.3577512776831346, + "grad_norm": 0.5359559655189514, + "learning_rate": 0.0002, + "loss": 1.6323, + "step": 3460 + }, + { + "epoch": 2.364565587734242, + "grad_norm": 0.5053277611732483, + "learning_rate": 0.0002, + "loss": 1.5973, + "step": 3470 + }, + { + "epoch": 2.3713798977853493, + "grad_norm": 0.5058915019035339, + "learning_rate": 0.0002, + "loss": 1.5673, + "step": 3480 + }, + { + "epoch": 2.3781942078364566, + "grad_norm": 0.5314899682998657, + "learning_rate": 0.0002, + "loss": 1.5434, + "step": 3490 + }, + { + "epoch": 2.385008517887564, + "grad_norm": 0.48035913705825806, + "learning_rate": 0.0002, + "loss": 1.5203, + "step": 3500 + }, + { + "epoch": 2.3918228279386713, + "grad_norm": 0.45864903926849365, + "learning_rate": 0.0002, + "loss": 1.5936, + "step": 3510 + }, + { + "epoch": 2.3986371379897786, + "grad_norm": 0.4553050398826599, + "learning_rate": 0.0002, + "loss": 1.6285, + "step": 3520 + }, + { + "epoch": 2.405451448040886, + "grad_norm": 0.4483442008495331, + "learning_rate": 0.0002, + "loss": 1.5444, + "step": 3530 + }, + { + "epoch": 2.4122657580919933, + "grad_norm": 0.5043742060661316, + "learning_rate": 0.0002, + "loss": 1.587, + "step": 3540 + }, + { + "epoch": 2.4190800681431006, + "grad_norm": 0.44807168841362, + "learning_rate": 0.0002, + "loss": 1.5692, + "step": 3550 + }, + { + "epoch": 2.425894378194208, + "grad_norm": 0.5065137147903442, + "learning_rate": 0.0002, + "loss": 1.6306, + "step": 3560 + }, + { + "epoch": 2.4327086882453153, + "grad_norm": 0.5186443328857422, + "learning_rate": 0.0002, + "loss": 1.5842, + "step": 3570 + }, + { + "epoch": 2.4395229982964226, + "grad_norm": 0.49743232131004333, + "learning_rate": 0.0002, + "loss": 1.5956, + "step": 3580 + }, + { + "epoch": 2.44633730834753, + "grad_norm": 0.524450421333313, + "learning_rate": 0.0002, + "loss": 1.6021, + "step": 3590 + }, + { + "epoch": 2.4531516183986373, + "grad_norm": 0.5053797364234924, + "learning_rate": 0.0002, + "loss": 1.6283, + "step": 3600 + }, + { + "epoch": 2.4599659284497446, + "grad_norm": 0.5223091840744019, + "learning_rate": 0.0002, + "loss": 1.6335, + "step": 3610 + }, + { + "epoch": 2.466780238500852, + "grad_norm": 0.4763810932636261, + "learning_rate": 0.0002, + "loss": 1.6315, + "step": 3620 + }, + { + "epoch": 2.4735945485519593, + "grad_norm": 0.5097282528877258, + "learning_rate": 0.0002, + "loss": 1.5623, + "step": 3630 + }, + { + "epoch": 2.4804088586030666, + "grad_norm": 0.5831942558288574, + "learning_rate": 0.0002, + "loss": 1.605, + "step": 3640 + }, + { + "epoch": 2.487223168654174, + "grad_norm": 0.47573572397232056, + "learning_rate": 0.0002, + "loss": 1.6074, + "step": 3650 + }, + { + "epoch": 2.4940374787052813, + "grad_norm": 0.49602726101875305, + "learning_rate": 0.0002, + "loss": 1.6411, + "step": 3660 + }, + { + "epoch": 2.500851788756388, + "grad_norm": 0.5468524694442749, + "learning_rate": 0.0002, + "loss": 1.571, + "step": 3670 + }, + { + "epoch": 2.507666098807496, + "grad_norm": 0.45899373292922974, + "learning_rate": 0.0002, + "loss": 1.5782, + "step": 3680 + }, + { + "epoch": 2.514480408858603, + "grad_norm": 0.5031567215919495, + "learning_rate": 0.0002, + "loss": 1.5114, + "step": 3690 + }, + { + "epoch": 2.5212947189097106, + "grad_norm": 0.5224900841712952, + "learning_rate": 0.0002, + "loss": 1.538, + "step": 3700 + }, + { + "epoch": 2.5281090289608175, + "grad_norm": 0.504769504070282, + "learning_rate": 0.0002, + "loss": 1.5269, + "step": 3710 + }, + { + "epoch": 2.5349233390119252, + "grad_norm": 0.6120529770851135, + "learning_rate": 0.0002, + "loss": 1.5141, + "step": 3720 + }, + { + "epoch": 2.541737649063032, + "grad_norm": 0.47930678725242615, + "learning_rate": 0.0002, + "loss": 1.5666, + "step": 3730 + }, + { + "epoch": 2.54855195911414, + "grad_norm": 0.5039092302322388, + "learning_rate": 0.0002, + "loss": 1.6156, + "step": 3740 + }, + { + "epoch": 2.555366269165247, + "grad_norm": 0.49758994579315186, + "learning_rate": 0.0002, + "loss": 1.5585, + "step": 3750 + }, + { + "epoch": 2.5621805792163546, + "grad_norm": 0.44739171862602234, + "learning_rate": 0.0002, + "loss": 1.5351, + "step": 3760 + }, + { + "epoch": 2.5689948892674614, + "grad_norm": 0.47056373953819275, + "learning_rate": 0.0002, + "loss": 1.5099, + "step": 3770 + }, + { + "epoch": 2.575809199318569, + "grad_norm": 0.5077595114707947, + "learning_rate": 0.0002, + "loss": 1.5524, + "step": 3780 + }, + { + "epoch": 2.582623509369676, + "grad_norm": 0.4981902837753296, + "learning_rate": 0.0002, + "loss": 1.5524, + "step": 3790 + }, + { + "epoch": 2.589437819420784, + "grad_norm": 0.5736238360404968, + "learning_rate": 0.0002, + "loss": 1.5381, + "step": 3800 + }, + { + "epoch": 2.5962521294718908, + "grad_norm": 0.4898384213447571, + "learning_rate": 0.0002, + "loss": 1.67, + "step": 3810 + }, + { + "epoch": 2.6030664395229985, + "grad_norm": 0.4611325263977051, + "learning_rate": 0.0002, + "loss": 1.5411, + "step": 3820 + }, + { + "epoch": 2.6098807495741054, + "grad_norm": 0.5285341739654541, + "learning_rate": 0.0002, + "loss": 1.5662, + "step": 3830 + }, + { + "epoch": 2.616695059625213, + "grad_norm": 0.5679430961608887, + "learning_rate": 0.0002, + "loss": 1.5875, + "step": 3840 + }, + { + "epoch": 2.62350936967632, + "grad_norm": 0.48532548546791077, + "learning_rate": 0.0002, + "loss": 1.5544, + "step": 3850 + }, + { + "epoch": 2.630323679727428, + "grad_norm": 0.45506492257118225, + "learning_rate": 0.0002, + "loss": 1.579, + "step": 3860 + }, + { + "epoch": 2.6371379897785348, + "grad_norm": 0.6552556753158569, + "learning_rate": 0.0002, + "loss": 1.5775, + "step": 3870 + }, + { + "epoch": 2.643952299829642, + "grad_norm": 0.537874698638916, + "learning_rate": 0.0002, + "loss": 1.608, + "step": 3880 + }, + { + "epoch": 2.6507666098807494, + "grad_norm": 0.46102389693260193, + "learning_rate": 0.0002, + "loss": 1.5653, + "step": 3890 + }, + { + "epoch": 2.6575809199318567, + "grad_norm": 0.45531195402145386, + "learning_rate": 0.0002, + "loss": 1.5293, + "step": 3900 + }, + { + "epoch": 2.664395229982964, + "grad_norm": 0.5327293872833252, + "learning_rate": 0.0002, + "loss": 1.5492, + "step": 3910 + }, + { + "epoch": 2.6712095400340714, + "grad_norm": 0.4968956410884857, + "learning_rate": 0.0002, + "loss": 1.5921, + "step": 3920 + }, + { + "epoch": 2.6780238500851787, + "grad_norm": 0.4790082275867462, + "learning_rate": 0.0002, + "loss": 1.5823, + "step": 3930 + }, + { + "epoch": 2.684838160136286, + "grad_norm": 0.5392967462539673, + "learning_rate": 0.0002, + "loss": 1.615, + "step": 3940 + }, + { + "epoch": 2.6916524701873934, + "grad_norm": 0.5076649785041809, + "learning_rate": 0.0002, + "loss": 1.6218, + "step": 3950 + }, + { + "epoch": 2.6984667802385007, + "grad_norm": 0.5628064274787903, + "learning_rate": 0.0002, + "loss": 1.6478, + "step": 3960 + }, + { + "epoch": 2.705281090289608, + "grad_norm": 0.5012659430503845, + "learning_rate": 0.0002, + "loss": 1.5417, + "step": 3970 + }, + { + "epoch": 2.7120954003407154, + "grad_norm": 0.4947647452354431, + "learning_rate": 0.0002, + "loss": 1.5339, + "step": 3980 + }, + { + "epoch": 2.7189097103918227, + "grad_norm": 0.4890969693660736, + "learning_rate": 0.0002, + "loss": 1.5724, + "step": 3990 + }, + { + "epoch": 2.72572402044293, + "grad_norm": 0.4471694231033325, + "learning_rate": 0.0002, + "loss": 1.5746, + "step": 4000 + }, + { + "epoch": 2.7325383304940374, + "grad_norm": 0.5116439461708069, + "learning_rate": 0.0002, + "loss": 1.6669, + "step": 4010 + }, + { + "epoch": 2.7393526405451447, + "grad_norm": 0.5720411539077759, + "learning_rate": 0.0002, + "loss": 1.584, + "step": 4020 + }, + { + "epoch": 2.746166950596252, + "grad_norm": 0.5529406070709229, + "learning_rate": 0.0002, + "loss": 1.6151, + "step": 4030 + }, + { + "epoch": 2.7529812606473594, + "grad_norm": 0.5229396820068359, + "learning_rate": 0.0002, + "loss": 1.6296, + "step": 4040 + }, + { + "epoch": 2.7597955706984667, + "grad_norm": 0.5270276069641113, + "learning_rate": 0.0002, + "loss": 1.5363, + "step": 4050 + }, + { + "epoch": 2.766609880749574, + "grad_norm": 0.48413026332855225, + "learning_rate": 0.0002, + "loss": 1.6305, + "step": 4060 + }, + { + "epoch": 2.7734241908006814, + "grad_norm": 0.5145403742790222, + "learning_rate": 0.0002, + "loss": 1.5488, + "step": 4070 + }, + { + "epoch": 2.7802385008517887, + "grad_norm": 0.48626071214675903, + "learning_rate": 0.0002, + "loss": 1.6023, + "step": 4080 + }, + { + "epoch": 2.787052810902896, + "grad_norm": 0.5018984079360962, + "learning_rate": 0.0002, + "loss": 1.6082, + "step": 4090 + }, + { + "epoch": 2.7938671209540034, + "grad_norm": 0.4881938695907593, + "learning_rate": 0.0002, + "loss": 1.7166, + "step": 4100 + }, + { + "epoch": 2.8006814310051107, + "grad_norm": 0.5151546001434326, + "learning_rate": 0.0002, + "loss": 1.515, + "step": 4110 + }, + { + "epoch": 2.807495741056218, + "grad_norm": 0.5109850764274597, + "learning_rate": 0.0002, + "loss": 1.6069, + "step": 4120 + }, + { + "epoch": 2.8143100511073254, + "grad_norm": 0.5109251141548157, + "learning_rate": 0.0002, + "loss": 1.6153, + "step": 4130 + }, + { + "epoch": 2.8211243611584327, + "grad_norm": 0.5025496482849121, + "learning_rate": 0.0002, + "loss": 1.6365, + "step": 4140 + }, + { + "epoch": 2.82793867120954, + "grad_norm": 0.49027004837989807, + "learning_rate": 0.0002, + "loss": 1.6292, + "step": 4150 + }, + { + "epoch": 2.8347529812606473, + "grad_norm": 0.4957362413406372, + "learning_rate": 0.0002, + "loss": 1.5591, + "step": 4160 + }, + { + "epoch": 2.8415672913117547, + "grad_norm": 0.5159927606582642, + "learning_rate": 0.0002, + "loss": 1.6759, + "step": 4170 + }, + { + "epoch": 2.848381601362862, + "grad_norm": 0.6040670871734619, + "learning_rate": 0.0002, + "loss": 1.577, + "step": 4180 + }, + { + "epoch": 2.8551959114139693, + "grad_norm": 0.5489953756332397, + "learning_rate": 0.0002, + "loss": 1.5295, + "step": 4190 + }, + { + "epoch": 2.8620102214650767, + "grad_norm": 0.5416634678840637, + "learning_rate": 0.0002, + "loss": 1.5909, + "step": 4200 + }, + { + "epoch": 2.868824531516184, + "grad_norm": 0.5278245210647583, + "learning_rate": 0.0002, + "loss": 1.6014, + "step": 4210 + }, + { + "epoch": 2.8756388415672913, + "grad_norm": 0.43382319808006287, + "learning_rate": 0.0002, + "loss": 1.4848, + "step": 4220 + }, + { + "epoch": 2.8824531516183987, + "grad_norm": 0.4724387228488922, + "learning_rate": 0.0002, + "loss": 1.6092, + "step": 4230 + }, + { + "epoch": 2.889267461669506, + "grad_norm": 0.49824780225753784, + "learning_rate": 0.0002, + "loss": 1.5748, + "step": 4240 + }, + { + "epoch": 2.8960817717206133, + "grad_norm": 0.5360262989997864, + "learning_rate": 0.0002, + "loss": 1.6163, + "step": 4250 + }, + { + "epoch": 2.9028960817717206, + "grad_norm": 0.49090322852134705, + "learning_rate": 0.0002, + "loss": 1.5934, + "step": 4260 + }, + { + "epoch": 2.909710391822828, + "grad_norm": 0.5613328218460083, + "learning_rate": 0.0002, + "loss": 1.624, + "step": 4270 + }, + { + "epoch": 2.9165247018739353, + "grad_norm": 0.4611356258392334, + "learning_rate": 0.0002, + "loss": 1.5713, + "step": 4280 + }, + { + "epoch": 2.9233390119250426, + "grad_norm": 0.550897479057312, + "learning_rate": 0.0002, + "loss": 1.5457, + "step": 4290 + }, + { + "epoch": 2.93015332197615, + "grad_norm": 0.5089612603187561, + "learning_rate": 0.0002, + "loss": 1.6225, + "step": 4300 + }, + { + "epoch": 2.9369676320272573, + "grad_norm": 0.5210904479026794, + "learning_rate": 0.0002, + "loss": 1.5897, + "step": 4310 + }, + { + "epoch": 2.9437819420783646, + "grad_norm": 0.5506424903869629, + "learning_rate": 0.0002, + "loss": 1.6198, + "step": 4320 + }, + { + "epoch": 2.950596252129472, + "grad_norm": 0.5118561387062073, + "learning_rate": 0.0002, + "loss": 1.6395, + "step": 4330 + }, + { + "epoch": 2.9574105621805793, + "grad_norm": 0.5034464597702026, + "learning_rate": 0.0002, + "loss": 1.704, + "step": 4340 + }, + { + "epoch": 2.9642248722316866, + "grad_norm": 0.5019990801811218, + "learning_rate": 0.0002, + "loss": 1.6314, + "step": 4350 + }, + { + "epoch": 2.971039182282794, + "grad_norm": 0.5423325300216675, + "learning_rate": 0.0002, + "loss": 1.6161, + "step": 4360 + }, + { + "epoch": 2.9778534923339013, + "grad_norm": 0.5287469625473022, + "learning_rate": 0.0002, + "loss": 1.6144, + "step": 4370 + }, + { + "epoch": 2.9846678023850086, + "grad_norm": 0.5206913352012634, + "learning_rate": 0.0002, + "loss": 1.5227, + "step": 4380 + }, + { + "epoch": 2.991482112436116, + "grad_norm": 0.5407394170761108, + "learning_rate": 0.0002, + "loss": 1.6026, + "step": 4390 + }, + { + "epoch": 2.9982964224872233, + "grad_norm": 0.5244600176811218, + "learning_rate": 0.0002, + "loss": 1.5908, + "step": 4400 + }, + { + "epoch": 2.9996592844974446, + "eval_loss": 1.8412635326385498, + "eval_runtime": 65.5583, + "eval_samples_per_second": 7.734, + "eval_steps_per_second": 0.976, + "step": 4402 + }, + { + "epoch": 3.0051107325383306, + "grad_norm": 0.5172150731086731, + "learning_rate": 0.0002, + "loss": 1.5157, + "step": 4410 + }, + { + "epoch": 3.011925042589438, + "grad_norm": 0.6882525086402893, + "learning_rate": 0.0002, + "loss": 1.398, + "step": 4420 + }, + { + "epoch": 3.0187393526405453, + "grad_norm": 0.6435003280639648, + "learning_rate": 0.0002, + "loss": 1.3884, + "step": 4430 + }, + { + "epoch": 3.0255536626916526, + "grad_norm": 0.7126057147979736, + "learning_rate": 0.0002, + "loss": 1.4493, + "step": 4440 + }, + { + "epoch": 3.03236797274276, + "grad_norm": 0.6634385585784912, + "learning_rate": 0.0002, + "loss": 1.4397, + "step": 4450 + }, + { + "epoch": 3.0391822827938673, + "grad_norm": 0.6468435525894165, + "learning_rate": 0.0002, + "loss": 1.3674, + "step": 4460 + }, + { + "epoch": 3.0459965928449746, + "grad_norm": 0.5690478086471558, + "learning_rate": 0.0002, + "loss": 1.4045, + "step": 4470 + }, + { + "epoch": 3.052810902896082, + "grad_norm": 0.7323708534240723, + "learning_rate": 0.0002, + "loss": 1.3742, + "step": 4480 + }, + { + "epoch": 3.0596252129471893, + "grad_norm": 0.6989302039146423, + "learning_rate": 0.0002, + "loss": 1.3281, + "step": 4490 + }, + { + "epoch": 3.0664395229982966, + "grad_norm": 0.6704450845718384, + "learning_rate": 0.0002, + "loss": 1.379, + "step": 4500 + }, + { + "epoch": 3.073253833049404, + "grad_norm": 0.769137442111969, + "learning_rate": 0.0002, + "loss": 1.4028, + "step": 4510 + }, + { + "epoch": 3.0800681431005112, + "grad_norm": 0.6556448936462402, + "learning_rate": 0.0002, + "loss": 1.4295, + "step": 4520 + }, + { + "epoch": 3.0868824531516186, + "grad_norm": 0.7143950462341309, + "learning_rate": 0.0002, + "loss": 1.2763, + "step": 4530 + }, + { + "epoch": 3.093696763202726, + "grad_norm": 0.7060510516166687, + "learning_rate": 0.0002, + "loss": 1.4806, + "step": 4540 + }, + { + "epoch": 3.1005110732538332, + "grad_norm": 0.6637526750564575, + "learning_rate": 0.0002, + "loss": 1.4097, + "step": 4550 + }, + { + "epoch": 3.1073253833049406, + "grad_norm": 0.822989284992218, + "learning_rate": 0.0002, + "loss": 1.4752, + "step": 4560 + }, + { + "epoch": 3.114139693356048, + "grad_norm": 0.5542152523994446, + "learning_rate": 0.0002, + "loss": 1.4994, + "step": 4570 + }, + { + "epoch": 3.1209540034071552, + "grad_norm": 0.7780306935310364, + "learning_rate": 0.0002, + "loss": 1.4306, + "step": 4580 + }, + { + "epoch": 3.1277683134582626, + "grad_norm": 0.7372637987136841, + "learning_rate": 0.0002, + "loss": 1.3909, + "step": 4590 + }, + { + "epoch": 3.1345826235093694, + "grad_norm": 0.6730087995529175, + "learning_rate": 0.0002, + "loss": 1.3989, + "step": 4600 + }, + { + "epoch": 3.1413969335604772, + "grad_norm": 0.6687398552894592, + "learning_rate": 0.0002, + "loss": 1.3591, + "step": 4610 + }, + { + "epoch": 3.148211243611584, + "grad_norm": 0.7645083665847778, + "learning_rate": 0.0002, + "loss": 1.436, + "step": 4620 + }, + { + "epoch": 3.155025553662692, + "grad_norm": 0.6770380139350891, + "learning_rate": 0.0002, + "loss": 1.3681, + "step": 4630 + }, + { + "epoch": 3.1618398637137988, + "grad_norm": 0.7200576663017273, + "learning_rate": 0.0002, + "loss": 1.405, + "step": 4640 + }, + { + "epoch": 3.168654173764906, + "grad_norm": 0.6663638949394226, + "learning_rate": 0.0002, + "loss": 1.3752, + "step": 4650 + }, + { + "epoch": 3.1754684838160134, + "grad_norm": 0.6602960228919983, + "learning_rate": 0.0002, + "loss": 1.4099, + "step": 4660 + }, + { + "epoch": 3.1822827938671208, + "grad_norm": 0.7838228344917297, + "learning_rate": 0.0002, + "loss": 1.4003, + "step": 4670 + }, + { + "epoch": 3.189097103918228, + "grad_norm": 0.7559184432029724, + "learning_rate": 0.0002, + "loss": 1.3853, + "step": 4680 + }, + { + "epoch": 3.1959114139693354, + "grad_norm": 0.6609814167022705, + "learning_rate": 0.0002, + "loss": 1.4516, + "step": 4690 + }, + { + "epoch": 3.2027257240204428, + "grad_norm": 0.8470419645309448, + "learning_rate": 0.0002, + "loss": 1.4464, + "step": 4700 + }, + { + "epoch": 3.20954003407155, + "grad_norm": 0.7282822728157043, + "learning_rate": 0.0002, + "loss": 1.428, + "step": 4710 + }, + { + "epoch": 3.2163543441226574, + "grad_norm": 0.6722773313522339, + "learning_rate": 0.0002, + "loss": 1.5261, + "step": 4720 + }, + { + "epoch": 3.2231686541737647, + "grad_norm": 0.7630265355110168, + "learning_rate": 0.0002, + "loss": 1.3809, + "step": 4730 + }, + { + "epoch": 3.229982964224872, + "grad_norm": 0.7102773785591125, + "learning_rate": 0.0002, + "loss": 1.42, + "step": 4740 + }, + { + "epoch": 3.2367972742759794, + "grad_norm": 0.7778299450874329, + "learning_rate": 0.0002, + "loss": 1.3529, + "step": 4750 + }, + { + "epoch": 3.2436115843270867, + "grad_norm": 0.7189921736717224, + "learning_rate": 0.0002, + "loss": 1.4715, + "step": 4760 + }, + { + "epoch": 3.250425894378194, + "grad_norm": 0.7708092331886292, + "learning_rate": 0.0002, + "loss": 1.4328, + "step": 4770 + }, + { + "epoch": 3.2572402044293014, + "grad_norm": 0.7208452224731445, + "learning_rate": 0.0002, + "loss": 1.3855, + "step": 4780 + }, + { + "epoch": 3.2640545144804087, + "grad_norm": 0.7220432758331299, + "learning_rate": 0.0002, + "loss": 1.3206, + "step": 4790 + }, + { + "epoch": 3.270868824531516, + "grad_norm": 0.7064954042434692, + "learning_rate": 0.0002, + "loss": 1.463, + "step": 4800 + }, + { + "epoch": 3.2776831345826234, + "grad_norm": 0.6618382334709167, + "learning_rate": 0.0002, + "loss": 1.4236, + "step": 4810 + }, + { + "epoch": 3.2844974446337307, + "grad_norm": 0.6854256391525269, + "learning_rate": 0.0002, + "loss": 1.3878, + "step": 4820 + }, + { + "epoch": 3.291311754684838, + "grad_norm": 0.6036319136619568, + "learning_rate": 0.0002, + "loss": 1.4236, + "step": 4830 + }, + { + "epoch": 3.2981260647359454, + "grad_norm": 0.714678943157196, + "learning_rate": 0.0002, + "loss": 1.4796, + "step": 4840 + }, + { + "epoch": 3.3049403747870527, + "grad_norm": 0.7218600511550903, + "learning_rate": 0.0002, + "loss": 1.4273, + "step": 4850 + }, + { + "epoch": 3.31175468483816, + "grad_norm": 0.7243074774742126, + "learning_rate": 0.0002, + "loss": 1.3915, + "step": 4860 + }, + { + "epoch": 3.3185689948892674, + "grad_norm": 0.7058630585670471, + "learning_rate": 0.0002, + "loss": 1.4088, + "step": 4870 + }, + { + "epoch": 3.3253833049403747, + "grad_norm": 0.7091076970100403, + "learning_rate": 0.0002, + "loss": 1.3837, + "step": 4880 + }, + { + "epoch": 3.332197614991482, + "grad_norm": 0.7375147342681885, + "learning_rate": 0.0002, + "loss": 1.4745, + "step": 4890 + }, + { + "epoch": 3.3390119250425894, + "grad_norm": 0.9426755309104919, + "learning_rate": 0.0002, + "loss": 1.4826, + "step": 4900 + }, + { + "epoch": 3.3458262350936967, + "grad_norm": 0.6508213877677917, + "learning_rate": 0.0002, + "loss": 1.369, + "step": 4910 + }, + { + "epoch": 3.352640545144804, + "grad_norm": 0.6945043206214905, + "learning_rate": 0.0002, + "loss": 1.3839, + "step": 4920 + }, + { + "epoch": 3.3594548551959114, + "grad_norm": 0.6335888504981995, + "learning_rate": 0.0002, + "loss": 1.3571, + "step": 4930 + }, + { + "epoch": 3.3662691652470187, + "grad_norm": 0.6947107911109924, + "learning_rate": 0.0002, + "loss": 1.4391, + "step": 4940 + }, + { + "epoch": 3.373083475298126, + "grad_norm": 0.8204733729362488, + "learning_rate": 0.0002, + "loss": 1.3885, + "step": 4950 + }, + { + "epoch": 3.3798977853492334, + "grad_norm": 0.7212244868278503, + "learning_rate": 0.0002, + "loss": 1.4886, + "step": 4960 + }, + { + "epoch": 3.3867120954003407, + "grad_norm": 0.6053042411804199, + "learning_rate": 0.0002, + "loss": 1.4581, + "step": 4970 + }, + { + "epoch": 3.393526405451448, + "grad_norm": 0.7820029854774475, + "learning_rate": 0.0002, + "loss": 1.3863, + "step": 4980 + }, + { + "epoch": 3.4003407155025553, + "grad_norm": 0.6866770386695862, + "learning_rate": 0.0002, + "loss": 1.4326, + "step": 4990 + }, + { + "epoch": 3.4071550255536627, + "grad_norm": 0.6652463674545288, + "learning_rate": 0.0002, + "loss": 1.4287, + "step": 5000 + }, + { + "epoch": 3.41396933560477, + "grad_norm": 1.1209032535552979, + "learning_rate": 0.0002, + "loss": 1.3667, + "step": 5010 + }, + { + "epoch": 3.4207836456558773, + "grad_norm": 0.8390814661979675, + "learning_rate": 0.0002, + "loss": 1.4461, + "step": 5020 + }, + { + "epoch": 3.4275979557069847, + "grad_norm": 0.7541858553886414, + "learning_rate": 0.0002, + "loss": 1.4556, + "step": 5030 + }, + { + "epoch": 3.434412265758092, + "grad_norm": 0.6902772784233093, + "learning_rate": 0.0002, + "loss": 1.4245, + "step": 5040 + }, + { + "epoch": 3.4412265758091993, + "grad_norm": 0.7070329785346985, + "learning_rate": 0.0002, + "loss": 1.3953, + "step": 5050 + }, + { + "epoch": 3.4480408858603067, + "grad_norm": 0.8075643181800842, + "learning_rate": 0.0002, + "loss": 1.3903, + "step": 5060 + }, + { + "epoch": 3.454855195911414, + "grad_norm": 0.7133861780166626, + "learning_rate": 0.0002, + "loss": 1.3929, + "step": 5070 + }, + { + "epoch": 3.4616695059625213, + "grad_norm": 0.6631823182106018, + "learning_rate": 0.0002, + "loss": 1.4632, + "step": 5080 + }, + { + "epoch": 3.4684838160136287, + "grad_norm": 0.673870325088501, + "learning_rate": 0.0002, + "loss": 1.4162, + "step": 5090 + }, + { + "epoch": 3.475298126064736, + "grad_norm": 0.6438634395599365, + "learning_rate": 0.0002, + "loss": 1.4247, + "step": 5100 + }, + { + "epoch": 3.4821124361158433, + "grad_norm": 0.7560495734214783, + "learning_rate": 0.0002, + "loss": 1.4421, + "step": 5110 + }, + { + "epoch": 3.4889267461669506, + "grad_norm": 0.6877814531326294, + "learning_rate": 0.0002, + "loss": 1.4125, + "step": 5120 + }, + { + "epoch": 3.495741056218058, + "grad_norm": 0.7031328678131104, + "learning_rate": 0.0002, + "loss": 1.4308, + "step": 5130 + }, + { + "epoch": 3.5025553662691653, + "grad_norm": 0.6797195672988892, + "learning_rate": 0.0002, + "loss": 1.3705, + "step": 5140 + }, + { + "epoch": 3.5093696763202726, + "grad_norm": 0.6766413450241089, + "learning_rate": 0.0002, + "loss": 1.4687, + "step": 5150 + }, + { + "epoch": 3.51618398637138, + "grad_norm": 0.666656494140625, + "learning_rate": 0.0002, + "loss": 1.4194, + "step": 5160 + }, + { + "epoch": 3.5229982964224873, + "grad_norm": 0.74996417760849, + "learning_rate": 0.0002, + "loss": 1.469, + "step": 5170 + }, + { + "epoch": 3.5298126064735946, + "grad_norm": 0.7370911836624146, + "learning_rate": 0.0002, + "loss": 1.4848, + "step": 5180 + }, + { + "epoch": 3.536626916524702, + "grad_norm": 0.9063456654548645, + "learning_rate": 0.0002, + "loss": 1.4523, + "step": 5190 + }, + { + "epoch": 3.5434412265758093, + "grad_norm": 0.6861422657966614, + "learning_rate": 0.0002, + "loss": 1.4726, + "step": 5200 + }, + { + "epoch": 3.5502555366269166, + "grad_norm": 0.7104039788246155, + "learning_rate": 0.0002, + "loss": 1.4803, + "step": 5210 + }, + { + "epoch": 3.557069846678024, + "grad_norm": 0.6578653454780579, + "learning_rate": 0.0002, + "loss": 1.4313, + "step": 5220 + }, + { + "epoch": 3.5638841567291313, + "grad_norm": 0.7336562275886536, + "learning_rate": 0.0002, + "loss": 1.4596, + "step": 5230 + }, + { + "epoch": 3.5706984667802386, + "grad_norm": 0.7163010835647583, + "learning_rate": 0.0002, + "loss": 1.4591, + "step": 5240 + }, + { + "epoch": 3.577512776831346, + "grad_norm": 0.8112391233444214, + "learning_rate": 0.0002, + "loss": 1.3814, + "step": 5250 + }, + { + "epoch": 3.5843270868824533, + "grad_norm": 0.7260391116142273, + "learning_rate": 0.0002, + "loss": 1.4249, + "step": 5260 + }, + { + "epoch": 3.5911413969335606, + "grad_norm": 0.7038731575012207, + "learning_rate": 0.0002, + "loss": 1.4249, + "step": 5270 + }, + { + "epoch": 3.597955706984668, + "grad_norm": 0.7864376902580261, + "learning_rate": 0.0002, + "loss": 1.4172, + "step": 5280 + }, + { + "epoch": 3.6047700170357753, + "grad_norm": 0.6968383193016052, + "learning_rate": 0.0002, + "loss": 1.4637, + "step": 5290 + }, + { + "epoch": 3.6115843270868826, + "grad_norm": 0.6726206541061401, + "learning_rate": 0.0002, + "loss": 1.5269, + "step": 5300 + }, + { + "epoch": 3.61839863713799, + "grad_norm": 0.6716854572296143, + "learning_rate": 0.0002, + "loss": 1.4199, + "step": 5310 + }, + { + "epoch": 3.6252129471890973, + "grad_norm": 0.7229742407798767, + "learning_rate": 0.0002, + "loss": 1.4686, + "step": 5320 + }, + { + "epoch": 3.6320272572402046, + "grad_norm": 0.7338683009147644, + "learning_rate": 0.0002, + "loss": 1.4441, + "step": 5330 + }, + { + "epoch": 3.638841567291312, + "grad_norm": 0.771672785282135, + "learning_rate": 0.0002, + "loss": 1.4116, + "step": 5340 + }, + { + "epoch": 3.645655877342419, + "grad_norm": 0.7024078369140625, + "learning_rate": 0.0002, + "loss": 1.4007, + "step": 5350 + }, + { + "epoch": 3.6524701873935266, + "grad_norm": 0.6847538352012634, + "learning_rate": 0.0002, + "loss": 1.4996, + "step": 5360 + }, + { + "epoch": 3.6592844974446335, + "grad_norm": 0.71802818775177, + "learning_rate": 0.0002, + "loss": 1.4111, + "step": 5370 + }, + { + "epoch": 3.6660988074957412, + "grad_norm": 0.78530353307724, + "learning_rate": 0.0002, + "loss": 1.4224, + "step": 5380 + }, + { + "epoch": 3.672913117546848, + "grad_norm": 0.7262226939201355, + "learning_rate": 0.0002, + "loss": 1.4582, + "step": 5390 + }, + { + "epoch": 3.679727427597956, + "grad_norm": 0.7608316540718079, + "learning_rate": 0.0002, + "loss": 1.4704, + "step": 5400 + }, + { + "epoch": 3.686541737649063, + "grad_norm": 0.6994926333427429, + "learning_rate": 0.0002, + "loss": 1.3742, + "step": 5410 + }, + { + "epoch": 3.6933560477001706, + "grad_norm": 0.7888479828834534, + "learning_rate": 0.0002, + "loss": 1.4738, + "step": 5420 + }, + { + "epoch": 3.7001703577512775, + "grad_norm": 0.7053858041763306, + "learning_rate": 0.0002, + "loss": 1.4213, + "step": 5430 + }, + { + "epoch": 3.7069846678023852, + "grad_norm": 0.7063165903091431, + "learning_rate": 0.0002, + "loss": 1.4988, + "step": 5440 + }, + { + "epoch": 3.713798977853492, + "grad_norm": 0.6603744626045227, + "learning_rate": 0.0002, + "loss": 1.4386, + "step": 5450 + }, + { + "epoch": 3.7206132879046, + "grad_norm": 0.7043602466583252, + "learning_rate": 0.0002, + "loss": 1.4695, + "step": 5460 + }, + { + "epoch": 3.7274275979557068, + "grad_norm": 0.7026081681251526, + "learning_rate": 0.0002, + "loss": 1.5051, + "step": 5470 + }, + { + "epoch": 3.7342419080068145, + "grad_norm": 0.7200090289115906, + "learning_rate": 0.0002, + "loss": 1.5613, + "step": 5480 + }, + { + "epoch": 3.7410562180579214, + "grad_norm": 0.7170904278755188, + "learning_rate": 0.0002, + "loss": 1.4182, + "step": 5490 + }, + { + "epoch": 3.747870528109029, + "grad_norm": 0.7489104866981506, + "learning_rate": 0.0002, + "loss": 1.4344, + "step": 5500 + }, + { + "epoch": 3.754684838160136, + "grad_norm": 0.6540989875793457, + "learning_rate": 0.0002, + "loss": 1.4911, + "step": 5510 + }, + { + "epoch": 3.761499148211244, + "grad_norm": 0.6654048562049866, + "learning_rate": 0.0002, + "loss": 1.4955, + "step": 5520 + }, + { + "epoch": 3.7683134582623508, + "grad_norm": 0.6577395796775818, + "learning_rate": 0.0002, + "loss": 1.4487, + "step": 5530 + }, + { + "epoch": 3.7751277683134585, + "grad_norm": 0.7762192487716675, + "learning_rate": 0.0002, + "loss": 1.4283, + "step": 5540 + }, + { + "epoch": 3.7819420783645654, + "grad_norm": 0.6336314678192139, + "learning_rate": 0.0002, + "loss": 1.4727, + "step": 5550 + }, + { + "epoch": 3.7887563884156727, + "grad_norm": 0.7098057866096497, + "learning_rate": 0.0002, + "loss": 1.4588, + "step": 5560 + }, + { + "epoch": 3.79557069846678, + "grad_norm": 0.7379715442657471, + "learning_rate": 0.0002, + "loss": 1.4679, + "step": 5570 + }, + { + "epoch": 3.8023850085178874, + "grad_norm": 0.6726924777030945, + "learning_rate": 0.0002, + "loss": 1.4633, + "step": 5580 + }, + { + "epoch": 3.8091993185689947, + "grad_norm": 1.1212009191513062, + "learning_rate": 0.0002, + "loss": 1.4751, + "step": 5590 + }, + { + "epoch": 3.816013628620102, + "grad_norm": 0.6503795981407166, + "learning_rate": 0.0002, + "loss": 1.4503, + "step": 5600 + }, + { + "epoch": 3.8228279386712094, + "grad_norm": 0.7041325569152832, + "learning_rate": 0.0002, + "loss": 1.4754, + "step": 5610 + }, + { + "epoch": 3.8296422487223167, + "grad_norm": 0.7962933778762817, + "learning_rate": 0.0002, + "loss": 1.4199, + "step": 5620 + }, + { + "epoch": 3.836456558773424, + "grad_norm": 0.6613591909408569, + "learning_rate": 0.0002, + "loss": 1.4672, + "step": 5630 + }, + { + "epoch": 3.8432708688245314, + "grad_norm": 0.7293516397476196, + "learning_rate": 0.0002, + "loss": 1.5688, + "step": 5640 + }, + { + "epoch": 3.8500851788756387, + "grad_norm": 0.7388607859611511, + "learning_rate": 0.0002, + "loss": 1.4149, + "step": 5650 + }, + { + "epoch": 3.856899488926746, + "grad_norm": 0.6440677642822266, + "learning_rate": 0.0002, + "loss": 1.4743, + "step": 5660 + }, + { + "epoch": 3.8637137989778534, + "grad_norm": 0.7729013562202454, + "learning_rate": 0.0002, + "loss": 1.5082, + "step": 5670 + }, + { + "epoch": 3.8705281090289607, + "grad_norm": 0.6696794033050537, + "learning_rate": 0.0002, + "loss": 1.4608, + "step": 5680 + }, + { + "epoch": 3.877342419080068, + "grad_norm": 0.7151781320571899, + "learning_rate": 0.0002, + "loss": 1.472, + "step": 5690 + }, + { + "epoch": 3.8841567291311754, + "grad_norm": 0.6736966371536255, + "learning_rate": 0.0002, + "loss": 1.4923, + "step": 5700 + }, + { + "epoch": 3.8909710391822827, + "grad_norm": 0.7444243431091309, + "learning_rate": 0.0002, + "loss": 1.4453, + "step": 5710 + }, + { + "epoch": 3.89778534923339, + "grad_norm": 0.6701464653015137, + "learning_rate": 0.0002, + "loss": 1.4562, + "step": 5720 + }, + { + "epoch": 3.9045996592844974, + "grad_norm": 0.7231952548027039, + "learning_rate": 0.0002, + "loss": 1.4478, + "step": 5730 + }, + { + "epoch": 3.9114139693356047, + "grad_norm": 0.831954300403595, + "learning_rate": 0.0002, + "loss": 1.4539, + "step": 5740 + }, + { + "epoch": 3.918228279386712, + "grad_norm": 0.7697733640670776, + "learning_rate": 0.0002, + "loss": 1.5122, + "step": 5750 + }, + { + "epoch": 3.9250425894378194, + "grad_norm": 0.6964395046234131, + "learning_rate": 0.0002, + "loss": 1.4552, + "step": 5760 + }, + { + "epoch": 3.9318568994889267, + "grad_norm": 0.6942925453186035, + "learning_rate": 0.0002, + "loss": 1.4688, + "step": 5770 + }, + { + "epoch": 3.938671209540034, + "grad_norm": 0.6491202712059021, + "learning_rate": 0.0002, + "loss": 1.4668, + "step": 5780 + }, + { + "epoch": 3.9454855195911414, + "grad_norm": 0.7004382610321045, + "learning_rate": 0.0002, + "loss": 1.4404, + "step": 5790 + }, + { + "epoch": 3.9522998296422487, + "grad_norm": 0.7337747812271118, + "learning_rate": 0.0002, + "loss": 1.5022, + "step": 5800 + }, + { + "epoch": 3.959114139693356, + "grad_norm": 0.6923640966415405, + "learning_rate": 0.0002, + "loss": 1.5314, + "step": 5810 + }, + { + "epoch": 3.9659284497444633, + "grad_norm": 0.6815266609191895, + "learning_rate": 0.0002, + "loss": 1.4811, + "step": 5820 + }, + { + "epoch": 3.9727427597955707, + "grad_norm": 0.6755654811859131, + "learning_rate": 0.0002, + "loss": 1.437, + "step": 5830 + }, + { + "epoch": 3.979557069846678, + "grad_norm": 0.6912487149238586, + "learning_rate": 0.0002, + "loss": 1.4277, + "step": 5840 + }, + { + "epoch": 3.9863713798977853, + "grad_norm": 0.6948044896125793, + "learning_rate": 0.0002, + "loss": 1.4654, + "step": 5850 + }, + { + "epoch": 3.9931856899488927, + "grad_norm": 0.6735455989837646, + "learning_rate": 0.0002, + "loss": 1.4779, + "step": 5860 + }, + { + "epoch": 4.0, + "grad_norm": 0.7005048990249634, + "learning_rate": 0.0002, + "loss": 1.5102, + "step": 5870 + }, + { + "epoch": 4.0, + "eval_loss": 1.923058032989502, + "eval_runtime": 58.9903, + "eval_samples_per_second": 8.595, + "eval_steps_per_second": 1.085, + "step": 5870 + }, + { + "epoch": 4.006814310051107, + "grad_norm": 0.809018075466156, + "learning_rate": 0.0002, + "loss": 1.2417, + "step": 5880 + }, + { + "epoch": 4.013628620102215, + "grad_norm": 0.9499403238296509, + "learning_rate": 0.0002, + "loss": 1.2874, + "step": 5890 + }, + { + "epoch": 4.0204429301533215, + "grad_norm": 0.7944574356079102, + "learning_rate": 0.0002, + "loss": 1.2245, + "step": 5900 + }, + { + "epoch": 4.027257240204429, + "grad_norm": 0.9501046538352966, + "learning_rate": 0.0002, + "loss": 1.2751, + "step": 5910 + }, + { + "epoch": 4.034071550255536, + "grad_norm": 0.8247923254966736, + "learning_rate": 0.0002, + "loss": 1.2706, + "step": 5920 + }, + { + "epoch": 4.040885860306644, + "grad_norm": 0.9358038902282715, + "learning_rate": 0.0002, + "loss": 1.2762, + "step": 5930 + }, + { + "epoch": 4.047700170357751, + "grad_norm": 1.0102452039718628, + "learning_rate": 0.0002, + "loss": 1.2953, + "step": 5940 + }, + { + "epoch": 4.054514480408859, + "grad_norm": 1.0248252153396606, + "learning_rate": 0.0002, + "loss": 1.216, + "step": 5950 + }, + { + "epoch": 4.0613287904599655, + "grad_norm": 1.0438553094863892, + "learning_rate": 0.0002, + "loss": 1.2115, + "step": 5960 + }, + { + "epoch": 4.068143100511073, + "grad_norm": 0.7964957356452942, + "learning_rate": 0.0002, + "loss": 1.2516, + "step": 5970 + }, + { + "epoch": 4.07495741056218, + "grad_norm": 0.9757015109062195, + "learning_rate": 0.0002, + "loss": 1.1555, + "step": 5980 + }, + { + "epoch": 4.081771720613288, + "grad_norm": 0.9157161116600037, + "learning_rate": 0.0002, + "loss": 1.2243, + "step": 5990 + }, + { + "epoch": 4.088586030664395, + "grad_norm": 0.9372851848602295, + "learning_rate": 0.0002, + "loss": 1.2481, + "step": 6000 + }, + { + "epoch": 4.095400340715503, + "grad_norm": 1.240779995918274, + "learning_rate": 0.0002, + "loss": 1.2091, + "step": 6010 + }, + { + "epoch": 4.1022146507666095, + "grad_norm": 0.8394840359687805, + "learning_rate": 0.0002, + "loss": 1.1727, + "step": 6020 + }, + { + "epoch": 4.109028960817717, + "grad_norm": 1.1081455945968628, + "learning_rate": 0.0002, + "loss": 1.2926, + "step": 6030 + }, + { + "epoch": 4.115843270868824, + "grad_norm": 0.9227745532989502, + "learning_rate": 0.0002, + "loss": 1.2417, + "step": 6040 + }, + { + "epoch": 4.122657580919932, + "grad_norm": 0.8487664461135864, + "learning_rate": 0.0002, + "loss": 1.1994, + "step": 6050 + }, + { + "epoch": 4.129471890971039, + "grad_norm": 0.9643339514732361, + "learning_rate": 0.0002, + "loss": 1.2378, + "step": 6060 + }, + { + "epoch": 4.136286201022147, + "grad_norm": 1.0296099185943604, + "learning_rate": 0.0002, + "loss": 1.2254, + "step": 6070 + }, + { + "epoch": 4.1431005110732535, + "grad_norm": 0.9534215927124023, + "learning_rate": 0.0002, + "loss": 1.2419, + "step": 6080 + }, + { + "epoch": 4.149914821124361, + "grad_norm": 0.9647086262702942, + "learning_rate": 0.0002, + "loss": 1.1849, + "step": 6090 + }, + { + "epoch": 4.156729131175468, + "grad_norm": 1.084836721420288, + "learning_rate": 0.0002, + "loss": 1.2713, + "step": 6100 + }, + { + "epoch": 4.163543441226576, + "grad_norm": 0.9315235614776611, + "learning_rate": 0.0002, + "loss": 1.1788, + "step": 6110 + }, + { + "epoch": 4.170357751277683, + "grad_norm": 0.9541679620742798, + "learning_rate": 0.0002, + "loss": 1.17, + "step": 6120 + }, + { + "epoch": 4.177172061328791, + "grad_norm": 0.9792100191116333, + "learning_rate": 0.0002, + "loss": 1.1407, + "step": 6130 + }, + { + "epoch": 4.1839863713798975, + "grad_norm": 1.065783143043518, + "learning_rate": 0.0002, + "loss": 1.2069, + "step": 6140 + }, + { + "epoch": 4.190800681431005, + "grad_norm": 1.036161184310913, + "learning_rate": 0.0002, + "loss": 1.2512, + "step": 6150 + }, + { + "epoch": 4.197614991482112, + "grad_norm": 0.8979679942131042, + "learning_rate": 0.0002, + "loss": 1.2371, + "step": 6160 + }, + { + "epoch": 4.20442930153322, + "grad_norm": 0.7584333419799805, + "learning_rate": 0.0002, + "loss": 1.2212, + "step": 6170 + }, + { + "epoch": 4.211243611584327, + "grad_norm": 1.1970131397247314, + "learning_rate": 0.0002, + "loss": 1.2128, + "step": 6180 + }, + { + "epoch": 4.218057921635435, + "grad_norm": 2.6447298526763916, + "learning_rate": 0.0002, + "loss": 1.1982, + "step": 6190 + }, + { + "epoch": 4.2248722316865415, + "grad_norm": 0.9357487559318542, + "learning_rate": 0.0002, + "loss": 1.2465, + "step": 6200 + }, + { + "epoch": 4.231686541737649, + "grad_norm": 0.9141183495521545, + "learning_rate": 0.0002, + "loss": 1.2963, + "step": 6210 + }, + { + "epoch": 4.238500851788756, + "grad_norm": 1.0606296062469482, + "learning_rate": 0.0002, + "loss": 1.1959, + "step": 6220 + }, + { + "epoch": 4.245315161839864, + "grad_norm": 0.9999088048934937, + "learning_rate": 0.0002, + "loss": 1.2629, + "step": 6230 + }, + { + "epoch": 4.252129471890971, + "grad_norm": 0.9469764232635498, + "learning_rate": 0.0002, + "loss": 1.1471, + "step": 6240 + }, + { + "epoch": 4.258943781942079, + "grad_norm": 1.1508198976516724, + "learning_rate": 0.0002, + "loss": 1.223, + "step": 6250 + }, + { + "epoch": 4.2657580919931855, + "grad_norm": 1.2576130628585815, + "learning_rate": 0.0002, + "loss": 1.2677, + "step": 6260 + }, + { + "epoch": 4.272572402044293, + "grad_norm": 0.9435968399047852, + "learning_rate": 0.0002, + "loss": 1.2216, + "step": 6270 + }, + { + "epoch": 4.2793867120954, + "grad_norm": 0.9290348887443542, + "learning_rate": 0.0002, + "loss": 1.2788, + "step": 6280 + }, + { + "epoch": 4.286201022146508, + "grad_norm": 0.9973701238632202, + "learning_rate": 0.0002, + "loss": 1.2631, + "step": 6290 + }, + { + "epoch": 4.293015332197615, + "grad_norm": 1.012855887413025, + "learning_rate": 0.0002, + "loss": 1.2276, + "step": 6300 + }, + { + "epoch": 4.2998296422487225, + "grad_norm": 0.8371705412864685, + "learning_rate": 0.0002, + "loss": 1.2115, + "step": 6310 + }, + { + "epoch": 4.306643952299829, + "grad_norm": 1.0867925882339478, + "learning_rate": 0.0002, + "loss": 1.2423, + "step": 6320 + }, + { + "epoch": 4.313458262350937, + "grad_norm": 0.9763767123222351, + "learning_rate": 0.0002, + "loss": 1.2262, + "step": 6330 + }, + { + "epoch": 4.320272572402044, + "grad_norm": 1.1844252347946167, + "learning_rate": 0.0002, + "loss": 1.2557, + "step": 6340 + }, + { + "epoch": 4.327086882453152, + "grad_norm": 0.8292830586433411, + "learning_rate": 0.0002, + "loss": 1.2635, + "step": 6350 + }, + { + "epoch": 4.333901192504259, + "grad_norm": 0.9351436495780945, + "learning_rate": 0.0002, + "loss": 1.262, + "step": 6360 + }, + { + "epoch": 4.3407155025553665, + "grad_norm": 1.0425835847854614, + "learning_rate": 0.0002, + "loss": 1.2678, + "step": 6370 + }, + { + "epoch": 4.347529812606473, + "grad_norm": 0.8894261121749878, + "learning_rate": 0.0002, + "loss": 1.2476, + "step": 6380 + }, + { + "epoch": 4.354344122657581, + "grad_norm": 0.9663366079330444, + "learning_rate": 0.0002, + "loss": 1.2965, + "step": 6390 + }, + { + "epoch": 4.361158432708688, + "grad_norm": 0.8915578126907349, + "learning_rate": 0.0002, + "loss": 1.2529, + "step": 6400 + }, + { + "epoch": 4.367972742759796, + "grad_norm": 1.0393000841140747, + "learning_rate": 0.0002, + "loss": 1.2573, + "step": 6410 + }, + { + "epoch": 4.374787052810903, + "grad_norm": 0.917398989200592, + "learning_rate": 0.0002, + "loss": 1.2254, + "step": 6420 + }, + { + "epoch": 4.3816013628620105, + "grad_norm": 1.0496646165847778, + "learning_rate": 0.0002, + "loss": 1.3115, + "step": 6430 + }, + { + "epoch": 4.388415672913117, + "grad_norm": 0.9349859356880188, + "learning_rate": 0.0002, + "loss": 1.2607, + "step": 6440 + }, + { + "epoch": 4.395229982964225, + "grad_norm": 1.0981004238128662, + "learning_rate": 0.0002, + "loss": 1.3414, + "step": 6450 + }, + { + "epoch": 4.402044293015332, + "grad_norm": 0.9794871807098389, + "learning_rate": 0.0002, + "loss": 1.2391, + "step": 6460 + }, + { + "epoch": 4.40885860306644, + "grad_norm": 0.9321421384811401, + "learning_rate": 0.0002, + "loss": 1.208, + "step": 6470 + }, + { + "epoch": 4.415672913117547, + "grad_norm": 0.9158342480659485, + "learning_rate": 0.0002, + "loss": 1.3398, + "step": 6480 + }, + { + "epoch": 4.4224872231686545, + "grad_norm": 0.9462087750434875, + "learning_rate": 0.0002, + "loss": 1.1832, + "step": 6490 + }, + { + "epoch": 4.429301533219761, + "grad_norm": 0.9740175604820251, + "learning_rate": 0.0002, + "loss": 1.2366, + "step": 6500 + }, + { + "epoch": 4.436115843270869, + "grad_norm": 0.8477463126182556, + "learning_rate": 0.0002, + "loss": 1.3074, + "step": 6510 + }, + { + "epoch": 4.442930153321976, + "grad_norm": 1.0296647548675537, + "learning_rate": 0.0002, + "loss": 1.2719, + "step": 6520 + }, + { + "epoch": 4.449744463373084, + "grad_norm": 0.9437751173973083, + "learning_rate": 0.0002, + "loss": 1.2647, + "step": 6530 + }, + { + "epoch": 4.456558773424191, + "grad_norm": 1.011192798614502, + "learning_rate": 0.0002, + "loss": 1.2043, + "step": 6540 + }, + { + "epoch": 4.4633730834752985, + "grad_norm": 0.8836222290992737, + "learning_rate": 0.0002, + "loss": 1.3673, + "step": 6550 + }, + { + "epoch": 4.470187393526405, + "grad_norm": 1.2799941301345825, + "learning_rate": 0.0002, + "loss": 1.3028, + "step": 6560 + }, + { + "epoch": 4.477001703577513, + "grad_norm": 0.925910472869873, + "learning_rate": 0.0002, + "loss": 1.2789, + "step": 6570 + }, + { + "epoch": 4.48381601362862, + "grad_norm": 0.957401692867279, + "learning_rate": 0.0002, + "loss": 1.2723, + "step": 6580 + }, + { + "epoch": 4.490630323679728, + "grad_norm": 1.0789544582366943, + "learning_rate": 0.0002, + "loss": 1.242, + "step": 6590 + }, + { + "epoch": 4.497444633730835, + "grad_norm": 0.8874586820602417, + "learning_rate": 0.0002, + "loss": 1.2553, + "step": 6600 + }, + { + "epoch": 4.504258943781942, + "grad_norm": 0.9394784569740295, + "learning_rate": 0.0002, + "loss": 1.2779, + "step": 6610 + }, + { + "epoch": 4.511073253833049, + "grad_norm": 1.029640793800354, + "learning_rate": 0.0002, + "loss": 1.2744, + "step": 6620 + }, + { + "epoch": 4.517887563884157, + "grad_norm": 0.9510841965675354, + "learning_rate": 0.0002, + "loss": 1.2634, + "step": 6630 + }, + { + "epoch": 4.524701873935264, + "grad_norm": 0.9992963671684265, + "learning_rate": 0.0002, + "loss": 1.2562, + "step": 6640 + }, + { + "epoch": 4.531516183986371, + "grad_norm": 0.9312878847122192, + "learning_rate": 0.0002, + "loss": 1.2942, + "step": 6650 + }, + { + "epoch": 4.538330494037479, + "grad_norm": 0.9406482577323914, + "learning_rate": 0.0002, + "loss": 1.2572, + "step": 6660 + }, + { + "epoch": 4.5451448040885865, + "grad_norm": 1.1058286428451538, + "learning_rate": 0.0002, + "loss": 1.2283, + "step": 6670 + }, + { + "epoch": 4.551959114139693, + "grad_norm": 0.9389635920524597, + "learning_rate": 0.0002, + "loss": 1.2391, + "step": 6680 + }, + { + "epoch": 4.5587734241908, + "grad_norm": 1.0356028079986572, + "learning_rate": 0.0002, + "loss": 1.2696, + "step": 6690 + }, + { + "epoch": 4.565587734241908, + "grad_norm": 0.9370909929275513, + "learning_rate": 0.0002, + "loss": 1.2935, + "step": 6700 + }, + { + "epoch": 4.572402044293016, + "grad_norm": 0.9917567372322083, + "learning_rate": 0.0002, + "loss": 1.2914, + "step": 6710 + }, + { + "epoch": 4.579216354344123, + "grad_norm": 0.9065384864807129, + "learning_rate": 0.0002, + "loss": 1.3318, + "step": 6720 + }, + { + "epoch": 4.5860306643952296, + "grad_norm": 1.3347833156585693, + "learning_rate": 0.0002, + "loss": 1.2909, + "step": 6730 + }, + { + "epoch": 4.592844974446337, + "grad_norm": 0.910632312297821, + "learning_rate": 0.0002, + "loss": 1.3322, + "step": 6740 + }, + { + "epoch": 4.599659284497445, + "grad_norm": 0.8874805569648743, + "learning_rate": 0.0002, + "loss": 1.2584, + "step": 6750 + }, + { + "epoch": 4.606473594548552, + "grad_norm": 0.9355664253234863, + "learning_rate": 0.0002, + "loss": 1.3173, + "step": 6760 + }, + { + "epoch": 4.613287904599659, + "grad_norm": 0.9360204339027405, + "learning_rate": 0.0002, + "loss": 1.3515, + "step": 6770 + }, + { + "epoch": 4.620102214650767, + "grad_norm": 0.9931750893592834, + "learning_rate": 0.0002, + "loss": 1.2326, + "step": 6780 + }, + { + "epoch": 4.626916524701874, + "grad_norm": 0.9195131063461304, + "learning_rate": 0.0002, + "loss": 1.2677, + "step": 6790 + }, + { + "epoch": 4.633730834752981, + "grad_norm": 0.9448373913764954, + "learning_rate": 0.0002, + "loss": 1.3417, + "step": 6800 + }, + { + "epoch": 4.640545144804088, + "grad_norm": 1.162890911102295, + "learning_rate": 0.0002, + "loss": 1.2658, + "step": 6810 + }, + { + "epoch": 4.647359454855196, + "grad_norm": 0.9739466905593872, + "learning_rate": 0.0002, + "loss": 1.2841, + "step": 6820 + }, + { + "epoch": 4.654173764906303, + "grad_norm": 0.9462909698486328, + "learning_rate": 0.0002, + "loss": 1.3068, + "step": 6830 + }, + { + "epoch": 4.660988074957411, + "grad_norm": 1.042639970779419, + "learning_rate": 0.0002, + "loss": 1.284, + "step": 6840 + }, + { + "epoch": 4.6678023850085175, + "grad_norm": 0.8910539150238037, + "learning_rate": 0.0002, + "loss": 1.3337, + "step": 6850 + }, + { + "epoch": 4.674616695059625, + "grad_norm": 1.0806447267532349, + "learning_rate": 0.0002, + "loss": 1.3025, + "step": 6860 + }, + { + "epoch": 4.681431005110732, + "grad_norm": 1.0054864883422852, + "learning_rate": 0.0002, + "loss": 1.2258, + "step": 6870 + }, + { + "epoch": 4.68824531516184, + "grad_norm": 0.7774158120155334, + "learning_rate": 0.0002, + "loss": 1.3261, + "step": 6880 + }, + { + "epoch": 4.695059625212947, + "grad_norm": 0.9729512333869934, + "learning_rate": 0.0002, + "loss": 1.2545, + "step": 6890 + }, + { + "epoch": 4.701873935264055, + "grad_norm": 1.2025411128997803, + "learning_rate": 0.0002, + "loss": 1.3251, + "step": 6900 + }, + { + "epoch": 4.7086882453151615, + "grad_norm": 1.1654069423675537, + "learning_rate": 0.0002, + "loss": 1.3418, + "step": 6910 + }, + { + "epoch": 4.715502555366269, + "grad_norm": 1.1501442193984985, + "learning_rate": 0.0002, + "loss": 1.3091, + "step": 6920 + }, + { + "epoch": 4.722316865417376, + "grad_norm": 1.1083979606628418, + "learning_rate": 0.0002, + "loss": 1.2627, + "step": 6930 + }, + { + "epoch": 4.729131175468484, + "grad_norm": 0.9431378841400146, + "learning_rate": 0.0002, + "loss": 1.2836, + "step": 6940 + }, + { + "epoch": 4.735945485519591, + "grad_norm": 0.9722502827644348, + "learning_rate": 0.0002, + "loss": 1.3381, + "step": 6950 + }, + { + "epoch": 4.742759795570699, + "grad_norm": 0.9094559550285339, + "learning_rate": 0.0002, + "loss": 1.3228, + "step": 6960 + }, + { + "epoch": 4.7495741056218055, + "grad_norm": 0.9918473958969116, + "learning_rate": 0.0002, + "loss": 1.3474, + "step": 6970 + }, + { + "epoch": 4.756388415672913, + "grad_norm": 0.9999690651893616, + "learning_rate": 0.0002, + "loss": 1.3352, + "step": 6980 + }, + { + "epoch": 4.76320272572402, + "grad_norm": 1.0453810691833496, + "learning_rate": 0.0002, + "loss": 1.3579, + "step": 6990 + }, + { + "epoch": 4.770017035775128, + "grad_norm": 1.0167806148529053, + "learning_rate": 0.0002, + "loss": 1.294, + "step": 7000 + }, + { + "epoch": 4.776831345826235, + "grad_norm": 0.8133894801139832, + "learning_rate": 0.0002, + "loss": 1.3247, + "step": 7010 + }, + { + "epoch": 4.783645655877343, + "grad_norm": 0.8000897765159607, + "learning_rate": 0.0002, + "loss": 1.2577, + "step": 7020 + }, + { + "epoch": 4.7904599659284495, + "grad_norm": 0.992080569267273, + "learning_rate": 0.0002, + "loss": 1.2802, + "step": 7030 + }, + { + "epoch": 4.797274275979557, + "grad_norm": 0.9824522137641907, + "learning_rate": 0.0002, + "loss": 1.3269, + "step": 7040 + }, + { + "epoch": 4.804088586030664, + "grad_norm": 0.9808870553970337, + "learning_rate": 0.0002, + "loss": 1.279, + "step": 7050 + }, + { + "epoch": 4.810902896081772, + "grad_norm": 0.9679701924324036, + "learning_rate": 0.0002, + "loss": 1.3342, + "step": 7060 + }, + { + "epoch": 4.817717206132879, + "grad_norm": 0.9895215034484863, + "learning_rate": 0.0002, + "loss": 1.2711, + "step": 7070 + }, + { + "epoch": 4.824531516183987, + "grad_norm": 1.052246332168579, + "learning_rate": 0.0002, + "loss": 1.3008, + "step": 7080 + }, + { + "epoch": 4.8313458262350935, + "grad_norm": 0.9243564605712891, + "learning_rate": 0.0002, + "loss": 1.2874, + "step": 7090 + }, + { + "epoch": 4.838160136286201, + "grad_norm": 0.9545369744300842, + "learning_rate": 0.0002, + "loss": 1.2835, + "step": 7100 + }, + { + "epoch": 4.844974446337308, + "grad_norm": 0.9655884504318237, + "learning_rate": 0.0002, + "loss": 1.31, + "step": 7110 + }, + { + "epoch": 4.851788756388416, + "grad_norm": 0.9708049893379211, + "learning_rate": 0.0002, + "loss": 1.2862, + "step": 7120 + }, + { + "epoch": 4.858603066439523, + "grad_norm": 1.0064880847930908, + "learning_rate": 0.0002, + "loss": 1.3425, + "step": 7130 + }, + { + "epoch": 4.8654173764906306, + "grad_norm": 0.939943790435791, + "learning_rate": 0.0002, + "loss": 1.2899, + "step": 7140 + }, + { + "epoch": 4.872231686541737, + "grad_norm": 1.0750784873962402, + "learning_rate": 0.0002, + "loss": 1.2887, + "step": 7150 + }, + { + "epoch": 4.879045996592845, + "grad_norm": 0.9708989262580872, + "learning_rate": 0.0002, + "loss": 1.3367, + "step": 7160 + }, + { + "epoch": 4.885860306643952, + "grad_norm": 1.0228253602981567, + "learning_rate": 0.0002, + "loss": 1.2797, + "step": 7170 + }, + { + "epoch": 4.89267461669506, + "grad_norm": 0.8963132500648499, + "learning_rate": 0.0002, + "loss": 1.2695, + "step": 7180 + }, + { + "epoch": 4.899488926746167, + "grad_norm": 0.9198015928268433, + "learning_rate": 0.0002, + "loss": 1.3473, + "step": 7190 + }, + { + "epoch": 4.9063032367972745, + "grad_norm": 1.099906086921692, + "learning_rate": 0.0002, + "loss": 1.2541, + "step": 7200 + }, + { + "epoch": 4.913117546848381, + "grad_norm": 1.0624815225601196, + "learning_rate": 0.0002, + "loss": 1.3188, + "step": 7210 + }, + { + "epoch": 4.919931856899489, + "grad_norm": 0.9688444137573242, + "learning_rate": 0.0002, + "loss": 1.3026, + "step": 7220 + }, + { + "epoch": 4.926746166950596, + "grad_norm": 0.867011547088623, + "learning_rate": 0.0002, + "loss": 1.3379, + "step": 7230 + }, + { + "epoch": 4.933560477001704, + "grad_norm": 0.9600282311439514, + "learning_rate": 0.0002, + "loss": 1.289, + "step": 7240 + }, + { + "epoch": 4.940374787052811, + "grad_norm": 0.8979372978210449, + "learning_rate": 0.0002, + "loss": 1.2751, + "step": 7250 + }, + { + "epoch": 4.9471890971039185, + "grad_norm": 0.951474130153656, + "learning_rate": 0.0002, + "loss": 1.3426, + "step": 7260 + }, + { + "epoch": 4.954003407155025, + "grad_norm": 0.824851393699646, + "learning_rate": 0.0002, + "loss": 1.2726, + "step": 7270 + }, + { + "epoch": 4.960817717206133, + "grad_norm": 1.2926591634750366, + "learning_rate": 0.0002, + "loss": 1.2679, + "step": 7280 + }, + { + "epoch": 4.96763202725724, + "grad_norm": 1.1057835817337036, + "learning_rate": 0.0002, + "loss": 1.2974, + "step": 7290 + }, + { + "epoch": 4.974446337308348, + "grad_norm": 0.9814816117286682, + "learning_rate": 0.0002, + "loss": 1.2275, + "step": 7300 + }, + { + "epoch": 4.981260647359455, + "grad_norm": 1.0251333713531494, + "learning_rate": 0.0002, + "loss": 1.3001, + "step": 7310 + }, + { + "epoch": 4.9880749574105625, + "grad_norm": 0.9748668074607849, + "learning_rate": 0.0002, + "loss": 1.3113, + "step": 7320 + }, + { + "epoch": 4.994889267461669, + "grad_norm": 0.8552228808403015, + "learning_rate": 0.0002, + "loss": 1.3595, + "step": 7330 + }, + { + "epoch": 4.999659284497445, + "eval_loss": 2.03971004486084, + "eval_runtime": 67.4144, + "eval_samples_per_second": 7.521, + "eval_steps_per_second": 0.949, + "step": 7337 + }, + { + "epoch": 5.001703577512777, + "grad_norm": 0.8210785388946533, + "learning_rate": 0.0002, + "loss": 1.2464, + "step": 7340 + }, + { + "epoch": 5.008517887563884, + "grad_norm": 1.2577511072158813, + "learning_rate": 0.0002, + "loss": 1.0356, + "step": 7350 + }, + { + "epoch": 5.015332197614992, + "grad_norm": 1.280604362487793, + "learning_rate": 0.0002, + "loss": 0.9944, + "step": 7360 + }, + { + "epoch": 5.022146507666099, + "grad_norm": 1.3985474109649658, + "learning_rate": 0.0002, + "loss": 1.0858, + "step": 7370 + }, + { + "epoch": 5.0289608177172065, + "grad_norm": 1.1621310710906982, + "learning_rate": 0.0002, + "loss": 1.0122, + "step": 7380 + }, + { + "epoch": 5.035775127768313, + "grad_norm": 1.3278541564941406, + "learning_rate": 0.0002, + "loss": 1.05, + "step": 7390 + }, + { + "epoch": 5.042589437819421, + "grad_norm": 1.1166491508483887, + "learning_rate": 0.0002, + "loss": 1.0237, + "step": 7400 + }, + { + "epoch": 5.049403747870528, + "grad_norm": 1.8087667226791382, + "learning_rate": 0.0002, + "loss": 1.0397, + "step": 7410 + }, + { + "epoch": 5.056218057921636, + "grad_norm": 1.1517921686172485, + "learning_rate": 0.0002, + "loss": 1.0191, + "step": 7420 + }, + { + "epoch": 5.063032367972743, + "grad_norm": 1.2875889539718628, + "learning_rate": 0.0002, + "loss": 1.025, + "step": 7430 + }, + { + "epoch": 5.0698466780238505, + "grad_norm": 1.199702262878418, + "learning_rate": 0.0002, + "loss": 1.043, + "step": 7440 + }, + { + "epoch": 5.076660988074957, + "grad_norm": 1.2912452220916748, + "learning_rate": 0.0002, + "loss": 1.0176, + "step": 7450 + }, + { + "epoch": 5.083475298126065, + "grad_norm": 1.1446452140808105, + "learning_rate": 0.0002, + "loss": 1.0042, + "step": 7460 + }, + { + "epoch": 5.090289608177172, + "grad_norm": 1.3625746965408325, + "learning_rate": 0.0002, + "loss": 1.047, + "step": 7470 + }, + { + "epoch": 5.09710391822828, + "grad_norm": 1.2116546630859375, + "learning_rate": 0.0002, + "loss": 1.052, + "step": 7480 + }, + { + "epoch": 5.103918228279387, + "grad_norm": 1.3896098136901855, + "learning_rate": 0.0002, + "loss": 1.1041, + "step": 7490 + }, + { + "epoch": 5.1107325383304945, + "grad_norm": 1.6265277862548828, + "learning_rate": 0.0002, + "loss": 1.0668, + "step": 7500 + }, + { + "epoch": 5.117546848381601, + "grad_norm": 1.1468392610549927, + "learning_rate": 0.0002, + "loss": 1.028, + "step": 7510 + }, + { + "epoch": 5.124361158432709, + "grad_norm": 1.2649329900741577, + "learning_rate": 0.0002, + "loss": 0.9915, + "step": 7520 + }, + { + "epoch": 5.131175468483816, + "grad_norm": 1.1866015195846558, + "learning_rate": 0.0002, + "loss": 1.0251, + "step": 7530 + }, + { + "epoch": 5.137989778534923, + "grad_norm": 1.1517255306243896, + "learning_rate": 0.0002, + "loss": 1.0626, + "step": 7540 + }, + { + "epoch": 5.144804088586031, + "grad_norm": 1.3475146293640137, + "learning_rate": 0.0002, + "loss": 1.0303, + "step": 7550 + }, + { + "epoch": 5.151618398637138, + "grad_norm": 1.1167018413543701, + "learning_rate": 0.0002, + "loss": 1.0456, + "step": 7560 + }, + { + "epoch": 5.158432708688245, + "grad_norm": 1.209572434425354, + "learning_rate": 0.0002, + "loss": 1.04, + "step": 7570 + }, + { + "epoch": 5.165247018739352, + "grad_norm": 1.3578280210494995, + "learning_rate": 0.0002, + "loss": 1.0533, + "step": 7580 + }, + { + "epoch": 5.17206132879046, + "grad_norm": 1.2447012662887573, + "learning_rate": 0.0002, + "loss": 1.0958, + "step": 7590 + }, + { + "epoch": 5.178875638841567, + "grad_norm": 1.3715848922729492, + "learning_rate": 0.0002, + "loss": 1.0521, + "step": 7600 + }, + { + "epoch": 5.185689948892675, + "grad_norm": 1.435860276222229, + "learning_rate": 0.0002, + "loss": 1.0556, + "step": 7610 + }, + { + "epoch": 5.1925042589437815, + "grad_norm": 1.4093858003616333, + "learning_rate": 0.0002, + "loss": 1.0504, + "step": 7620 + }, + { + "epoch": 5.199318568994889, + "grad_norm": 1.1747535467147827, + "learning_rate": 0.0002, + "loss": 1.083, + "step": 7630 + }, + { + "epoch": 5.206132879045996, + "grad_norm": 1.4704833030700684, + "learning_rate": 0.0002, + "loss": 1.048, + "step": 7640 + }, + { + "epoch": 5.212947189097104, + "grad_norm": 1.2270972728729248, + "learning_rate": 0.0002, + "loss": 0.9991, + "step": 7650 + }, + { + "epoch": 5.219761499148211, + "grad_norm": 1.2215691804885864, + "learning_rate": 0.0002, + "loss": 1.0738, + "step": 7660 + }, + { + "epoch": 5.226575809199319, + "grad_norm": 1.3641486167907715, + "learning_rate": 0.0002, + "loss": 1.0628, + "step": 7670 + }, + { + "epoch": 5.2333901192504255, + "grad_norm": 1.3532041311264038, + "learning_rate": 0.0002, + "loss": 1.1066, + "step": 7680 + }, + { + "epoch": 5.240204429301533, + "grad_norm": 1.2243095636367798, + "learning_rate": 0.0002, + "loss": 1.0209, + "step": 7690 + }, + { + "epoch": 5.24701873935264, + "grad_norm": 1.3644746541976929, + "learning_rate": 0.0002, + "loss": 1.0503, + "step": 7700 + }, + { + "epoch": 5.253833049403748, + "grad_norm": 1.18478262424469, + "learning_rate": 0.0002, + "loss": 1.0406, + "step": 7710 + }, + { + "epoch": 5.260647359454855, + "grad_norm": 1.2146114110946655, + "learning_rate": 0.0002, + "loss": 1.1023, + "step": 7720 + }, + { + "epoch": 5.267461669505963, + "grad_norm": 1.233984112739563, + "learning_rate": 0.0002, + "loss": 1.1528, + "step": 7730 + }, + { + "epoch": 5.2742759795570695, + "grad_norm": 1.3709665536880493, + "learning_rate": 0.0002, + "loss": 1.0681, + "step": 7740 + }, + { + "epoch": 5.281090289608177, + "grad_norm": 1.36055326461792, + "learning_rate": 0.0002, + "loss": 1.0195, + "step": 7750 + }, + { + "epoch": 5.287904599659284, + "grad_norm": 1.6232351064682007, + "learning_rate": 0.0002, + "loss": 1.0447, + "step": 7760 + }, + { + "epoch": 5.294718909710392, + "grad_norm": 1.3359960317611694, + "learning_rate": 0.0002, + "loss": 1.0627, + "step": 7770 + }, + { + "epoch": 5.301533219761499, + "grad_norm": 1.3815656900405884, + "learning_rate": 0.0002, + "loss": 1.1082, + "step": 7780 + }, + { + "epoch": 5.308347529812607, + "grad_norm": 1.1392076015472412, + "learning_rate": 0.0002, + "loss": 1.0891, + "step": 7790 + }, + { + "epoch": 5.3151618398637135, + "grad_norm": 1.3006905317306519, + "learning_rate": 0.0002, + "loss": 1.0364, + "step": 7800 + }, + { + "epoch": 5.321976149914821, + "grad_norm": 1.503645896911621, + "learning_rate": 0.0002, + "loss": 1.1005, + "step": 7810 + }, + { + "epoch": 5.328790459965928, + "grad_norm": 1.141939640045166, + "learning_rate": 0.0002, + "loss": 1.0075, + "step": 7820 + }, + { + "epoch": 5.335604770017036, + "grad_norm": 1.4654004573822021, + "learning_rate": 0.0002, + "loss": 1.0284, + "step": 7830 + }, + { + "epoch": 5.342419080068143, + "grad_norm": 1.4195219278335571, + "learning_rate": 0.0002, + "loss": 1.1185, + "step": 7840 + }, + { + "epoch": 5.349233390119251, + "grad_norm": 1.2354168891906738, + "learning_rate": 0.0002, + "loss": 1.0535, + "step": 7850 + }, + { + "epoch": 5.3560477001703575, + "grad_norm": 1.529862880706787, + "learning_rate": 0.0002, + "loss": 1.0923, + "step": 7860 + }, + { + "epoch": 5.362862010221465, + "grad_norm": 1.364678978919983, + "learning_rate": 0.0002, + "loss": 1.1005, + "step": 7870 + }, + { + "epoch": 5.369676320272572, + "grad_norm": 1.1010444164276123, + "learning_rate": 0.0002, + "loss": 1.1084, + "step": 7880 + }, + { + "epoch": 5.37649063032368, + "grad_norm": 1.1949712038040161, + "learning_rate": 0.0002, + "loss": 1.1225, + "step": 7890 + }, + { + "epoch": 5.383304940374787, + "grad_norm": 1.485922932624817, + "learning_rate": 0.0002, + "loss": 1.058, + "step": 7900 + }, + { + "epoch": 5.390119250425895, + "grad_norm": 1.0844227075576782, + "learning_rate": 0.0002, + "loss": 0.9894, + "step": 7910 + }, + { + "epoch": 5.3969335604770015, + "grad_norm": 1.3784468173980713, + "learning_rate": 0.0002, + "loss": 1.0418, + "step": 7920 + }, + { + "epoch": 5.403747870528109, + "grad_norm": 1.4771490097045898, + "learning_rate": 0.0002, + "loss": 1.0542, + "step": 7930 + }, + { + "epoch": 5.410562180579216, + "grad_norm": 1.2460103034973145, + "learning_rate": 0.0002, + "loss": 1.1265, + "step": 7940 + }, + { + "epoch": 5.417376490630324, + "grad_norm": 1.3047645092010498, + "learning_rate": 0.0002, + "loss": 1.096, + "step": 7950 + }, + { + "epoch": 5.424190800681431, + "grad_norm": 1.1396620273590088, + "learning_rate": 0.0002, + "loss": 1.0956, + "step": 7960 + }, + { + "epoch": 5.4310051107325386, + "grad_norm": 1.4193450212478638, + "learning_rate": 0.0002, + "loss": 1.0685, + "step": 7970 + }, + { + "epoch": 5.437819420783645, + "grad_norm": 1.2085850238800049, + "learning_rate": 0.0002, + "loss": 1.1347, + "step": 7980 + }, + { + "epoch": 5.444633730834753, + "grad_norm": 1.2721607685089111, + "learning_rate": 0.0002, + "loss": 1.0277, + "step": 7990 + }, + { + "epoch": 5.45144804088586, + "grad_norm": 1.4134020805358887, + "learning_rate": 0.0002, + "loss": 1.1316, + "step": 8000 + }, + { + "epoch": 5.458262350936968, + "grad_norm": 1.4283325672149658, + "learning_rate": 0.0002, + "loss": 1.0576, + "step": 8010 + }, + { + "epoch": 5.465076660988075, + "grad_norm": 1.3127079010009766, + "learning_rate": 0.0002, + "loss": 1.0505, + "step": 8020 + }, + { + "epoch": 5.4718909710391825, + "grad_norm": 1.2924352884292603, + "learning_rate": 0.0002, + "loss": 1.0812, + "step": 8030 + }, + { + "epoch": 5.478705281090289, + "grad_norm": 1.8000653982162476, + "learning_rate": 0.0002, + "loss": 1.1178, + "step": 8040 + }, + { + "epoch": 5.485519591141397, + "grad_norm": 1.1538785696029663, + "learning_rate": 0.0002, + "loss": 1.1205, + "step": 8050 + }, + { + "epoch": 5.492333901192504, + "grad_norm": 1.1173290014266968, + "learning_rate": 0.0002, + "loss": 1.1015, + "step": 8060 + }, + { + "epoch": 5.499148211243612, + "grad_norm": 1.1501243114471436, + "learning_rate": 0.0002, + "loss": 1.1597, + "step": 8070 + }, + { + "epoch": 5.505962521294719, + "grad_norm": 1.1335760354995728, + "learning_rate": 0.0002, + "loss": 1.1465, + "step": 8080 + }, + { + "epoch": 5.5127768313458265, + "grad_norm": 1.565274953842163, + "learning_rate": 0.0002, + "loss": 1.1005, + "step": 8090 + }, + { + "epoch": 5.519591141396933, + "grad_norm": 1.3415014743804932, + "learning_rate": 0.0002, + "loss": 1.1085, + "step": 8100 + }, + { + "epoch": 5.526405451448041, + "grad_norm": 1.2377240657806396, + "learning_rate": 0.0002, + "loss": 1.1166, + "step": 8110 + }, + { + "epoch": 5.533219761499148, + "grad_norm": 1.3333637714385986, + "learning_rate": 0.0002, + "loss": 1.0766, + "step": 8120 + }, + { + "epoch": 5.540034071550256, + "grad_norm": 1.2620662450790405, + "learning_rate": 0.0002, + "loss": 1.1515, + "step": 8130 + }, + { + "epoch": 5.546848381601363, + "grad_norm": 1.2806652784347534, + "learning_rate": 0.0002, + "loss": 1.0839, + "step": 8140 + }, + { + "epoch": 5.5536626916524705, + "grad_norm": 1.2057335376739502, + "learning_rate": 0.0002, + "loss": 1.1221, + "step": 8150 + }, + { + "epoch": 5.560477001703577, + "grad_norm": 1.411726951599121, + "learning_rate": 0.0002, + "loss": 1.1292, + "step": 8160 + }, + { + "epoch": 5.567291311754685, + "grad_norm": 1.381104588508606, + "learning_rate": 0.0002, + "loss": 1.0887, + "step": 8170 + }, + { + "epoch": 5.574105621805792, + "grad_norm": 1.3449294567108154, + "learning_rate": 0.0002, + "loss": 1.1317, + "step": 8180 + }, + { + "epoch": 5.5809199318569, + "grad_norm": 1.2791016101837158, + "learning_rate": 0.0002, + "loss": 1.1392, + "step": 8190 + }, + { + "epoch": 5.587734241908007, + "grad_norm": 1.276891827583313, + "learning_rate": 0.0002, + "loss": 1.0972, + "step": 8200 + }, + { + "epoch": 5.5945485519591145, + "grad_norm": 1.3951541185379028, + "learning_rate": 0.0002, + "loss": 1.1001, + "step": 8210 + }, + { + "epoch": 5.601362862010221, + "grad_norm": 1.4167890548706055, + "learning_rate": 0.0002, + "loss": 1.0993, + "step": 8220 + }, + { + "epoch": 5.608177172061329, + "grad_norm": 1.4388375282287598, + "learning_rate": 0.0002, + "loss": 1.0826, + "step": 8230 + }, + { + "epoch": 5.614991482112436, + "grad_norm": 1.210157036781311, + "learning_rate": 0.0002, + "loss": 1.1941, + "step": 8240 + }, + { + "epoch": 5.621805792163544, + "grad_norm": 1.0557862520217896, + "learning_rate": 0.0002, + "loss": 1.0833, + "step": 8250 + }, + { + "epoch": 5.628620102214651, + "grad_norm": 1.2913990020751953, + "learning_rate": 0.0002, + "loss": 1.1197, + "step": 8260 + }, + { + "epoch": 5.6354344122657585, + "grad_norm": 1.2204737663269043, + "learning_rate": 0.0002, + "loss": 1.0346, + "step": 8270 + }, + { + "epoch": 5.642248722316865, + "grad_norm": 1.57016921043396, + "learning_rate": 0.0002, + "loss": 1.1429, + "step": 8280 + }, + { + "epoch": 5.649063032367973, + "grad_norm": 1.0117967128753662, + "learning_rate": 0.0002, + "loss": 1.0988, + "step": 8290 + }, + { + "epoch": 5.65587734241908, + "grad_norm": 1.3195525407791138, + "learning_rate": 0.0002, + "loss": 1.0786, + "step": 8300 + }, + { + "epoch": 5.662691652470187, + "grad_norm": 1.2566497325897217, + "learning_rate": 0.0002, + "loss": 1.0618, + "step": 8310 + }, + { + "epoch": 5.669505962521295, + "grad_norm": 1.1446818113327026, + "learning_rate": 0.0002, + "loss": 1.1635, + "step": 8320 + }, + { + "epoch": 5.6763202725724025, + "grad_norm": 1.2928680181503296, + "learning_rate": 0.0002, + "loss": 1.2201, + "step": 8330 + }, + { + "epoch": 5.683134582623509, + "grad_norm": 1.2823996543884277, + "learning_rate": 0.0002, + "loss": 1.1488, + "step": 8340 + }, + { + "epoch": 5.689948892674616, + "grad_norm": 1.1523874998092651, + "learning_rate": 0.0002, + "loss": 1.0686, + "step": 8350 + }, + { + "epoch": 5.696763202725724, + "grad_norm": 1.0819287300109863, + "learning_rate": 0.0002, + "loss": 1.0938, + "step": 8360 + }, + { + "epoch": 5.703577512776832, + "grad_norm": 1.2384417057037354, + "learning_rate": 0.0002, + "loss": 1.167, + "step": 8370 + }, + { + "epoch": 5.710391822827939, + "grad_norm": 1.1733224391937256, + "learning_rate": 0.0002, + "loss": 1.1136, + "step": 8380 + }, + { + "epoch": 5.7172061328790456, + "grad_norm": 1.3173418045043945, + "learning_rate": 0.0002, + "loss": 1.1041, + "step": 8390 + }, + { + "epoch": 5.724020442930153, + "grad_norm": 1.285880446434021, + "learning_rate": 0.0002, + "loss": 1.1014, + "step": 8400 + }, + { + "epoch": 5.730834752981261, + "grad_norm": 1.1404874324798584, + "learning_rate": 0.0002, + "loss": 1.1161, + "step": 8410 + }, + { + "epoch": 5.737649063032368, + "grad_norm": 1.2432540655136108, + "learning_rate": 0.0002, + "loss": 1.192, + "step": 8420 + }, + { + "epoch": 5.744463373083475, + "grad_norm": 1.2432233095169067, + "learning_rate": 0.0002, + "loss": 1.1702, + "step": 8430 + }, + { + "epoch": 5.751277683134583, + "grad_norm": 1.154496669769287, + "learning_rate": 0.0002, + "loss": 1.1357, + "step": 8440 + }, + { + "epoch": 5.75809199318569, + "grad_norm": 1.3301030397415161, + "learning_rate": 0.0002, + "loss": 1.1706, + "step": 8450 + }, + { + "epoch": 5.764906303236797, + "grad_norm": 1.243760347366333, + "learning_rate": 0.0002, + "loss": 1.2052, + "step": 8460 + }, + { + "epoch": 5.771720613287904, + "grad_norm": 1.4083361625671387, + "learning_rate": 0.0002, + "loss": 1.1035, + "step": 8470 + }, + { + "epoch": 5.778534923339012, + "grad_norm": 1.5662120580673218, + "learning_rate": 0.0002, + "loss": 1.1362, + "step": 8480 + }, + { + "epoch": 5.78534923339012, + "grad_norm": 1.2111139297485352, + "learning_rate": 0.0002, + "loss": 1.1578, + "step": 8490 + }, + { + "epoch": 5.792163543441227, + "grad_norm": 1.2776305675506592, + "learning_rate": 0.0002, + "loss": 1.1333, + "step": 8500 + }, + { + "epoch": 5.7989778534923335, + "grad_norm": 1.1777727603912354, + "learning_rate": 0.0002, + "loss": 1.1439, + "step": 8510 + }, + { + "epoch": 5.805792163543441, + "grad_norm": 1.1696112155914307, + "learning_rate": 0.0002, + "loss": 1.0859, + "step": 8520 + }, + { + "epoch": 5.812606473594548, + "grad_norm": 1.137397289276123, + "learning_rate": 0.0002, + "loss": 1.162, + "step": 8530 + }, + { + "epoch": 5.819420783645656, + "grad_norm": 1.3182098865509033, + "learning_rate": 0.0002, + "loss": 1.2099, + "step": 8540 + }, + { + "epoch": 5.826235093696763, + "grad_norm": 1.359756588935852, + "learning_rate": 0.0002, + "loss": 1.1427, + "step": 8550 + }, + { + "epoch": 5.833049403747871, + "grad_norm": 1.4118162393569946, + "learning_rate": 0.0002, + "loss": 1.1714, + "step": 8560 + }, + { + "epoch": 5.8398637137989775, + "grad_norm": 1.1899290084838867, + "learning_rate": 0.0002, + "loss": 1.1758, + "step": 8570 + }, + { + "epoch": 5.846678023850085, + "grad_norm": 1.1764532327651978, + "learning_rate": 0.0002, + "loss": 1.1511, + "step": 8580 + }, + { + "epoch": 5.853492333901192, + "grad_norm": 1.33274245262146, + "learning_rate": 0.0002, + "loss": 1.1633, + "step": 8590 + }, + { + "epoch": 5.8603066439523, + "grad_norm": 1.2571861743927002, + "learning_rate": 0.0002, + "loss": 1.1092, + "step": 8600 + }, + { + "epoch": 5.867120954003407, + "grad_norm": 1.3523616790771484, + "learning_rate": 0.0002, + "loss": 1.1137, + "step": 8610 + }, + { + "epoch": 5.873935264054515, + "grad_norm": 1.3556902408599854, + "learning_rate": 0.0002, + "loss": 1.2442, + "step": 8620 + }, + { + "epoch": 5.8807495741056215, + "grad_norm": 1.2864879369735718, + "learning_rate": 0.0002, + "loss": 1.0967, + "step": 8630 + }, + { + "epoch": 5.887563884156729, + "grad_norm": 1.2872768640518188, + "learning_rate": 0.0002, + "loss": 1.1491, + "step": 8640 + }, + { + "epoch": 5.894378194207836, + "grad_norm": 1.1446053981781006, + "learning_rate": 0.0002, + "loss": 1.1003, + "step": 8650 + }, + { + "epoch": 5.901192504258944, + "grad_norm": 1.292615532875061, + "learning_rate": 0.0002, + "loss": 1.1095, + "step": 8660 + }, + { + "epoch": 5.908006814310051, + "grad_norm": 1.190891981124878, + "learning_rate": 0.0002, + "loss": 1.2009, + "step": 8670 + }, + { + "epoch": 5.914821124361159, + "grad_norm": 1.330273985862732, + "learning_rate": 0.0002, + "loss": 1.1386, + "step": 8680 + }, + { + "epoch": 5.9216354344122655, + "grad_norm": 1.41121244430542, + "learning_rate": 0.0002, + "loss": 1.1874, + "step": 8690 + }, + { + "epoch": 5.928449744463373, + "grad_norm": 1.1360729932785034, + "learning_rate": 0.0002, + "loss": 1.1573, + "step": 8700 + }, + { + "epoch": 5.93526405451448, + "grad_norm": 1.2220772504806519, + "learning_rate": 0.0002, + "loss": 1.115, + "step": 8710 + }, + { + "epoch": 5.942078364565588, + "grad_norm": 1.1077110767364502, + "learning_rate": 0.0002, + "loss": 1.1696, + "step": 8720 + }, + { + "epoch": 5.948892674616695, + "grad_norm": 1.3632500171661377, + "learning_rate": 0.0002, + "loss": 1.1443, + "step": 8730 + }, + { + "epoch": 5.955706984667803, + "grad_norm": 1.4695830345153809, + "learning_rate": 0.0002, + "loss": 1.1474, + "step": 8740 + }, + { + "epoch": 5.9625212947189095, + "grad_norm": 1.217741847038269, + "learning_rate": 0.0002, + "loss": 1.1825, + "step": 8750 + }, + { + "epoch": 5.969335604770017, + "grad_norm": 1.0386874675750732, + "learning_rate": 0.0002, + "loss": 1.1495, + "step": 8760 + }, + { + "epoch": 5.976149914821124, + "grad_norm": 1.2067872285842896, + "learning_rate": 0.0002, + "loss": 1.1146, + "step": 8770 + }, + { + "epoch": 5.982964224872232, + "grad_norm": 1.3842018842697144, + "learning_rate": 0.0002, + "loss": 1.1987, + "step": 8780 + }, + { + "epoch": 5.989778534923339, + "grad_norm": 1.4584033489227295, + "learning_rate": 0.0002, + "loss": 1.2147, + "step": 8790 + }, + { + "epoch": 5.996592844974447, + "grad_norm": 1.1912888288497925, + "learning_rate": 0.0002, + "loss": 1.2078, + "step": 8800 + }, + { + "epoch": 6.0, + "eval_loss": 2.261807441711426, + "eval_runtime": 68.1125, + "eval_samples_per_second": 7.444, + "eval_steps_per_second": 0.94, + "step": 8805 + } + ], + "logging_steps": 10, + "max_steps": 11736, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.522577802559488e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-8805/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-8805/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2f734531f14829705714a5a2703d90c81f36eeb0 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-8805/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2af5d486d0b370637d5a3999ab7668f94f5d53b6497f018967564aef5e8ad133 +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/special_tokens_map.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..0acb52c84d6ea33178bee426ec6706bfba8ba637 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/special_tokens_map.json @@ -0,0 +1,28 @@ +{ + "additional_special_tokens": [ + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/tokenizer.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..af0eac5c0056f83b8f3fcdb79165f8847111c305 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f289bc05132635a8bc7aca7aa21255efd5e18f3710f43e3cdb96bcd41be4922 +size 17525357 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/tokenizer.model b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/tokenizer_config.json b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1aa249f4dc9f84e87ad8983458e7800ae5bf5454 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/tokenizer_config.json @@ -0,0 +1,2013 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "4": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "5": { + "content": "<2mass>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "6": { + "content": "[@BOS@]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "7": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "8": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "9": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "10": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "11": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "12": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "13": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "14": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "15": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "16": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "17": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "18": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "19": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "20": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "21": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "22": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "23": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "24": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "25": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "26": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "27": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "28": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "29": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "30": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "31": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "32": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "33": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "34": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "35": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "36": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "37": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "38": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "39": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "40": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "41": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "42": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "43": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "44": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "45": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "46": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "47": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "48": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "49": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "50": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "51": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "52": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "53": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "54": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "55": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "56": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "57": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "58": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "59": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "60": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "61": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "62": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "63": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "64": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "65": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "66": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "67": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "68": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "69": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "70": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "71": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "72": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "73": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "74": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "75": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "76": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "77": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "78": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "79": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "80": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "81": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "82": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "83": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "84": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "85": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "86": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "87": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "88": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "89": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "90": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "91": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "92": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "93": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "94": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "95": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "96": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "97": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "98": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "99": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "100": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "101": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "102": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "103": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "104": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "105": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "106": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "107": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "108": { + "content": "\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "109": { + "content": "\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "110": { + "content": "\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "111": { + "content": "\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "112": { + "content": "\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "113": { + "content": "\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "114": { + "content": "\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "115": { + "content": "\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "116": { + "content": "\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "117": { + "content": "\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "118": { + "content": "\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "119": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "120": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "121": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "122": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "123": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "124": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "125": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "126": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "127": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "128": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "129": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "130": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "131": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "132": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "133": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "134": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "135": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "136": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "137": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "138": { + "content": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "139": { + "content": "▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "140": { + "content": "▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "141": { + "content": "▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "142": { + "content": "▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "143": { + "content": "▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "144": { + "content": "▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "145": { + "content": "▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "146": { + "content": "▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "147": { + "content": "▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "148": { + "content": "▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "149": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "150": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "152": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "153": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "154": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "155": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "156": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "157": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "158": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "159": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "160": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "161": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "162": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "163": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "164": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "165": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "166": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "167": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "168": { + "content": "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "169": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "170": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "172": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "173": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "174": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "175": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "171": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "176": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "177": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "178": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "179": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "180": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "181": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "182": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "183": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "184": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "185": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "186": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "187": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "188": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "189": { + "content": "

", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "190": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "191": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "192": { + "content": "
", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "193": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "194": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "195": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "196": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "197": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "198": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "199": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "200": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "201": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "202": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "203": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "204": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "205": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "206": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "207": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "208": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "209": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "210": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "211": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "212": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "213": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "214": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "215": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "216": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255968": { + "content": "[toxicity=0]", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255969": { + "content": "\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255970": { + "content": "\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255971": { + "content": "\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255972": { + "content": "\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255973": { + "content": "\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255974": { + "content": "\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255975": { + "content": "\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255976": { + "content": "\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255977": { + "content": "\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255978": { + "content": "\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255979": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255980": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255981": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255982": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255983": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255984": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255985": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255986": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255987": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255988": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255989": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255990": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255991": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255992": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255993": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255994": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255995": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255996": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255997": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255998": { + "content": "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "255999": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "additional_special_tokens": [ + "", + "" + ], + "bos_token": "", + "chat_template": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '' + role + '\n' + message['content'] | trim + '\n' }}{% endfor %}{% if add_generation_prompt %}{{'model\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/training_args.bin b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..2f734531f14829705714a5a2703d90c81f36eeb0 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2af5d486d0b370637d5a3999ab7668f94f5d53b6497f018967564aef5e8ad133 +size 5560 diff --git a/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/training_log.jsonl b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/training_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..75c7b288444e164c2b0702358ed8395c47b09407 --- /dev/null +++ b/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/training_log.jsonl @@ -0,0 +1,12 @@ +{"epoch": 0.9996592844974447, "step": 1467, "epoch_duration": 6187.546434402466, "total_accumulated_duration": 6187.546434402466, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 11696.9921875}, "avg_memory_reserved": {"GPU_0": 12758.0}, "peak_memory_reserved": {"GPU_0": 12758.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0976, "grad_norm": 0.6619049310684204, "learning_rate": 0.0002, "epoch": 0.0068143100511073255, "step": 10}, {"loss": 2.547, "grad_norm": 0.5631795525550842, "learning_rate": 0.0002, "epoch": 0.013628620102214651, "step": 20}, {"loss": 2.3582, "grad_norm": 0.4544294476509094, "learning_rate": 0.0002, "epoch": 0.020442930153321975, "step": 30}, {"loss": 2.115, "grad_norm": 0.660383403301239, "learning_rate": 0.0002, "epoch": 0.027257240204429302, "step": 40}, {"loss": 1.9784, "grad_norm": 0.49150362610816956, "learning_rate": 0.0002, "epoch": 0.034071550255536626, "step": 50}, {"loss": 1.9318, "grad_norm": 0.4280940890312195, "learning_rate": 0.0002, "epoch": 0.04088586030664395, "step": 60}, {"loss": 1.9202, "grad_norm": 0.4371754229068756, "learning_rate": 0.0002, "epoch": 0.04770017035775128, "step": 70}, {"loss": 1.9117, "grad_norm": 0.4179750680923462, "learning_rate": 0.0002, "epoch": 0.054514480408858604, "step": 80}, {"loss": 1.8893, "grad_norm": 0.46386831998825073, "learning_rate": 0.0002, "epoch": 0.06132879045996593, "step": 90}, {"loss": 1.9265, "grad_norm": 0.3902638554573059, "learning_rate": 0.0002, "epoch": 0.06814310051107325, "step": 100}, {"loss": 1.9176, "grad_norm": 0.39120614528656006, "learning_rate": 0.0002, "epoch": 0.07495741056218058, "step": 110}, {"loss": 1.9093, "grad_norm": 0.383449912071228, "learning_rate": 0.0002, "epoch": 0.0817717206132879, "step": 120}, {"loss": 1.9433, "grad_norm": 0.38034161925315857, "learning_rate": 0.0002, "epoch": 0.08858603066439523, "step": 130}, {"loss": 1.8378, "grad_norm": 0.34730958938598633, "learning_rate": 0.0002, "epoch": 0.09540034071550256, "step": 140}, {"loss": 1.9029, "grad_norm": 0.37230566143989563, "learning_rate": 0.0002, "epoch": 0.10221465076660988, "step": 150}, {"loss": 1.8657, "grad_norm": 0.32333120703697205, "learning_rate": 0.0002, "epoch": 0.10902896081771721, "step": 160}, {"loss": 1.8653, "grad_norm": 0.32818782329559326, "learning_rate": 0.0002, "epoch": 0.11584327086882454, "step": 170}, {"loss": 1.8337, "grad_norm": 0.3234839141368866, "learning_rate": 0.0002, "epoch": 0.12265758091993186, "step": 180}, {"loss": 1.8452, "grad_norm": 0.383037269115448, "learning_rate": 0.0002, "epoch": 0.12947189097103917, "step": 190}, {"loss": 1.841, "grad_norm": 0.34148409962654114, "learning_rate": 0.0002, "epoch": 0.1362862010221465, "step": 200}, {"loss": 1.77, "grad_norm": 0.39243417978286743, "learning_rate": 0.0002, "epoch": 0.14310051107325383, "step": 210}, {"loss": 1.8364, "grad_norm": 0.34821203351020813, "learning_rate": 0.0002, "epoch": 0.14991482112436116, "step": 220}, {"loss": 1.8296, "grad_norm": 0.3619382381439209, "learning_rate": 0.0002, "epoch": 0.1567291311754685, "step": 230}, {"loss": 1.8143, "grad_norm": 0.33573633432388306, "learning_rate": 0.0002, "epoch": 0.1635434412265758, "step": 240}, {"loss": 1.785, "grad_norm": 0.32658854126930237, "learning_rate": 0.0002, "epoch": 0.17035775127768313, "step": 250}, {"loss": 1.778, "grad_norm": 0.3388872742652893, "learning_rate": 0.0002, "epoch": 0.17717206132879046, "step": 260}, {"loss": 1.8927, "grad_norm": 0.32334640622138977, "learning_rate": 0.0002, "epoch": 0.1839863713798978, "step": 270}, {"loss": 1.8704, "grad_norm": 0.3492778539657593, "learning_rate": 0.0002, "epoch": 0.19080068143100512, "step": 280}, {"loss": 1.8485, "grad_norm": 0.33015841245651245, "learning_rate": 0.0002, "epoch": 0.19761499148211242, "step": 290}, {"loss": 1.8403, "grad_norm": 0.367026150226593, "learning_rate": 0.0002, "epoch": 0.20442930153321975, "step": 300}, {"loss": 1.9042, "grad_norm": 0.34462663531303406, "learning_rate": 0.0002, "epoch": 0.21124361158432708, "step": 310}, {"loss": 1.8073, "grad_norm": 0.3164310157299042, "learning_rate": 0.0002, "epoch": 0.21805792163543442, "step": 320}, {"loss": 1.7067, "grad_norm": 0.35300973057746887, "learning_rate": 0.0002, "epoch": 0.22487223168654175, "step": 330}, {"loss": 1.8213, "grad_norm": 0.9696137309074402, "learning_rate": 0.0002, "epoch": 0.23168654173764908, "step": 340}, {"loss": 1.8443, "grad_norm": 0.3685234487056732, "learning_rate": 0.0002, "epoch": 0.23850085178875638, "step": 350}, {"loss": 1.803, "grad_norm": 0.337001234292984, "learning_rate": 0.0002, "epoch": 0.2453151618398637, "step": 360}, {"loss": 1.715, "grad_norm": 0.30413344502449036, "learning_rate": 0.0002, "epoch": 0.252129471890971, "step": 370}, {"loss": 1.8557, "grad_norm": 0.3503521978855133, "learning_rate": 0.0002, "epoch": 0.25894378194207834, "step": 380}, {"loss": 1.8005, "grad_norm": 0.33427876234054565, "learning_rate": 0.0002, "epoch": 0.2657580919931857, "step": 390}, {"loss": 1.7888, "grad_norm": 0.3562292456626892, "learning_rate": 0.0002, "epoch": 0.272572402044293, "step": 400}, {"loss": 1.8036, "grad_norm": 0.31621435284614563, "learning_rate": 0.0002, "epoch": 0.27938671209540034, "step": 410}, {"loss": 1.8125, "grad_norm": 0.2902769446372986, "learning_rate": 0.0002, "epoch": 0.28620102214650767, "step": 420}, {"loss": 1.7847, "grad_norm": 0.34336286783218384, "learning_rate": 0.0002, "epoch": 0.293015332197615, "step": 430}, {"loss": 1.7589, "grad_norm": 0.35475897789001465, "learning_rate": 0.0002, "epoch": 0.29982964224872233, "step": 440}, {"loss": 1.8453, "grad_norm": 0.3561566472053528, "learning_rate": 0.0002, "epoch": 0.30664395229982966, "step": 450}, {"loss": 1.773, "grad_norm": 0.333740234375, "learning_rate": 0.0002, "epoch": 0.313458262350937, "step": 460}, {"loss": 1.7766, "grad_norm": 0.3247889280319214, "learning_rate": 0.0002, "epoch": 0.3202725724020443, "step": 470}, {"loss": 1.797, "grad_norm": 0.2977445423603058, "learning_rate": 0.0002, "epoch": 0.3270868824531516, "step": 480}, {"loss": 1.8276, "grad_norm": 0.29708001017570496, "learning_rate": 0.0002, "epoch": 0.3339011925042589, "step": 490}, {"loss": 1.7362, "grad_norm": 0.32514530420303345, "learning_rate": 0.0002, "epoch": 0.34071550255536626, "step": 500}, {"loss": 1.817, "grad_norm": 0.35129407048225403, "learning_rate": 0.0002, "epoch": 0.3475298126064736, "step": 510}, {"loss": 1.8329, "grad_norm": 0.2874673902988434, "learning_rate": 0.0002, "epoch": 0.3543441226575809, "step": 520}, {"loss": 1.7326, "grad_norm": 0.29031169414520264, "learning_rate": 0.0002, "epoch": 0.36115843270868825, "step": 530}, {"loss": 1.8622, "grad_norm": 0.30145788192749023, "learning_rate": 0.0002, "epoch": 0.3679727427597956, "step": 540}, {"loss": 1.8333, "grad_norm": 0.28803712129592896, "learning_rate": 0.0002, "epoch": 0.3747870528109029, "step": 550}, {"loss": 1.7488, "grad_norm": 0.3346766531467438, "learning_rate": 0.0002, "epoch": 0.38160136286201024, "step": 560}, {"loss": 1.8045, "grad_norm": 0.2939014136791229, "learning_rate": 0.0002, "epoch": 0.38841567291311757, "step": 570}, {"loss": 1.8099, "grad_norm": 0.3137381672859192, "learning_rate": 0.0002, "epoch": 0.39522998296422485, "step": 580}, {"loss": 1.7981, "grad_norm": 0.3626657724380493, "learning_rate": 0.0002, "epoch": 0.4020442930153322, "step": 590}, {"loss": 1.7781, "grad_norm": 0.28681913018226624, "learning_rate": 0.0002, "epoch": 0.4088586030664395, "step": 600}, {"loss": 1.8191, "grad_norm": 0.31776726245880127, "learning_rate": 0.0002, "epoch": 0.41567291311754684, "step": 610}, {"loss": 1.753, "grad_norm": 0.3336711823940277, "learning_rate": 0.0002, "epoch": 0.42248722316865417, "step": 620}, {"loss": 1.7662, "grad_norm": 0.34150728583335876, "learning_rate": 0.0002, "epoch": 0.4293015332197615, "step": 630}, {"loss": 1.8155, "grad_norm": 0.298698365688324, "learning_rate": 0.0002, "epoch": 0.43611584327086883, "step": 640}, {"loss": 1.7251, "grad_norm": 0.32022398710250854, "learning_rate": 0.0002, "epoch": 0.44293015332197616, "step": 650}, {"loss": 1.8182, "grad_norm": 0.30933332443237305, "learning_rate": 0.0002, "epoch": 0.4497444633730835, "step": 660}, {"loss": 1.7995, "grad_norm": 0.3054162263870239, "learning_rate": 0.0002, "epoch": 0.4565587734241908, "step": 670}, {"loss": 1.8425, "grad_norm": 0.2973117232322693, "learning_rate": 0.0002, "epoch": 0.46337308347529815, "step": 680}, {"loss": 1.8258, "grad_norm": 0.31043437123298645, "learning_rate": 0.0002, "epoch": 0.47018739352640543, "step": 690}, {"loss": 1.7412, "grad_norm": 0.2790590524673462, "learning_rate": 0.0002, "epoch": 0.47700170357751276, "step": 700}, {"loss": 1.6941, "grad_norm": 0.30055496096611023, "learning_rate": 0.0002, "epoch": 0.4838160136286201, "step": 710}, {"loss": 1.8052, "grad_norm": 0.3318493962287903, "learning_rate": 0.0002, "epoch": 0.4906303236797274, "step": 720}, {"loss": 1.7364, "grad_norm": 0.3010044991970062, "learning_rate": 0.0002, "epoch": 0.49744463373083475, "step": 730}, {"loss": 1.8516, "grad_norm": 0.2797680199146271, "learning_rate": 0.0002, "epoch": 0.504258943781942, "step": 740}, {"loss": 1.7728, "grad_norm": 0.300462543964386, "learning_rate": 0.0002, "epoch": 0.5110732538330494, "step": 750}, {"loss": 1.78, "grad_norm": 0.35925522446632385, "learning_rate": 0.0002, "epoch": 0.5178875638841567, "step": 760}, {"loss": 1.7131, "grad_norm": 0.3014671504497528, "learning_rate": 0.0002, "epoch": 0.524701873935264, "step": 770}, {"loss": 1.7887, "grad_norm": 0.3070279359817505, "learning_rate": 0.0002, "epoch": 0.5315161839863713, "step": 780}, {"loss": 1.8425, "grad_norm": 0.3145890533924103, "learning_rate": 0.0002, "epoch": 0.5383304940374787, "step": 790}, {"loss": 1.8633, "grad_norm": 0.27921077609062195, "learning_rate": 0.0002, "epoch": 0.545144804088586, "step": 800}, {"loss": 1.9057, "grad_norm": 0.322721928358078, "learning_rate": 0.0002, "epoch": 0.5519591141396933, "step": 810}, {"loss": 1.8313, "grad_norm": 0.3108958899974823, "learning_rate": 0.0002, "epoch": 0.5587734241908007, "step": 820}, {"loss": 1.7177, "grad_norm": 0.31835779547691345, "learning_rate": 0.0002, "epoch": 0.565587734241908, "step": 830}, {"loss": 1.8012, "grad_norm": 0.31493470072746277, "learning_rate": 0.0002, "epoch": 0.5724020442930153, "step": 840}, {"loss": 1.6978, "grad_norm": 0.327980637550354, "learning_rate": 0.0002, "epoch": 0.5792163543441227, "step": 850}, {"loss": 1.8043, "grad_norm": 0.2927771508693695, "learning_rate": 0.0002, "epoch": 0.58603066439523, "step": 860}, {"loss": 1.6905, "grad_norm": 0.2817339301109314, "learning_rate": 0.0002, "epoch": 0.5928449744463373, "step": 870}, {"loss": 1.8067, "grad_norm": 0.2886543869972229, "learning_rate": 0.0002, "epoch": 0.5996592844974447, "step": 880}, {"loss": 1.8204, "grad_norm": 0.5515122413635254, "learning_rate": 0.0002, "epoch": 0.606473594548552, "step": 890}, {"loss": 1.7675, "grad_norm": 0.2941148579120636, "learning_rate": 0.0002, "epoch": 0.6132879045996593, "step": 900}, {"loss": 1.8354, "grad_norm": 0.32348746061325073, "learning_rate": 0.0002, "epoch": 0.6201022146507666, "step": 910}, {"loss": 1.7943, "grad_norm": 0.32600894570350647, "learning_rate": 0.0002, "epoch": 0.626916524701874, "step": 920}, {"loss": 1.7613, "grad_norm": 0.3188323378562927, "learning_rate": 0.0002, "epoch": 0.6337308347529813, "step": 930}, {"loss": 1.8504, "grad_norm": 0.34776967763900757, "learning_rate": 0.0002, "epoch": 0.6405451448040886, "step": 940}, {"loss": 1.7072, "grad_norm": 0.2902178466320038, "learning_rate": 0.0002, "epoch": 0.6473594548551959, "step": 950}, {"loss": 1.7168, "grad_norm": 0.3203663229942322, "learning_rate": 0.0002, "epoch": 0.6541737649063032, "step": 960}, {"loss": 1.776, "grad_norm": 0.29058200120925903, "learning_rate": 0.0002, "epoch": 0.6609880749574105, "step": 970}, {"loss": 1.7572, "grad_norm": 0.3275909125804901, "learning_rate": 0.0002, "epoch": 0.6678023850085179, "step": 980}, {"loss": 1.7944, "grad_norm": 0.3337591886520386, "learning_rate": 0.0002, "epoch": 0.6746166950596252, "step": 990}, {"loss": 1.8193, "grad_norm": 0.288392037153244, "learning_rate": 0.0002, "epoch": 0.6814310051107325, "step": 1000}, {"loss": 1.7702, "grad_norm": 0.3153143525123596, "learning_rate": 0.0002, "epoch": 0.6882453151618398, "step": 1010}, {"loss": 1.7805, "grad_norm": 0.28699997067451477, "learning_rate": 0.0002, "epoch": 0.6950596252129472, "step": 1020}, {"loss": 1.7942, "grad_norm": 0.31469282507896423, "learning_rate": 0.0002, "epoch": 0.7018739352640545, "step": 1030}, {"loss": 1.7094, "grad_norm": 0.30225491523742676, "learning_rate": 0.0002, "epoch": 0.7086882453151618, "step": 1040}, {"loss": 1.8735, "grad_norm": 0.30643412470817566, "learning_rate": 0.0002, "epoch": 0.7155025553662692, "step": 1050}, {"loss": 1.7806, "grad_norm": 0.29897069931030273, "learning_rate": 0.0002, "epoch": 0.7223168654173765, "step": 1060}, {"loss": 1.7828, "grad_norm": 0.3009231388568878, "learning_rate": 0.0002, "epoch": 0.7291311754684838, "step": 1070}, {"loss": 1.7809, "grad_norm": 0.3102840185165405, "learning_rate": 0.0002, "epoch": 0.7359454855195912, "step": 1080}, {"loss": 1.7663, "grad_norm": 0.3419271409511566, "learning_rate": 0.0002, "epoch": 0.7427597955706985, "step": 1090}, {"loss": 1.7315, "grad_norm": 0.27658945322036743, "learning_rate": 0.0002, "epoch": 0.7495741056218058, "step": 1100}, {"loss": 1.8064, "grad_norm": 0.2777837812900543, "learning_rate": 0.0002, "epoch": 0.7563884156729132, "step": 1110}, {"loss": 1.8082, "grad_norm": 0.30963730812072754, "learning_rate": 0.0002, "epoch": 0.7632027257240205, "step": 1120}, {"loss": 1.6977, "grad_norm": 0.33374249935150146, "learning_rate": 0.0002, "epoch": 0.7700170357751278, "step": 1130}, {"loss": 1.849, "grad_norm": 0.30809515714645386, "learning_rate": 0.0002, "epoch": 0.7768313458262351, "step": 1140}, {"loss": 1.7024, "grad_norm": 0.3199043273925781, "learning_rate": 0.0002, "epoch": 0.7836456558773425, "step": 1150}, {"loss": 1.8154, "grad_norm": 0.3205869793891907, "learning_rate": 0.0002, "epoch": 0.7904599659284497, "step": 1160}, {"loss": 1.7643, "grad_norm": 0.3073376417160034, "learning_rate": 0.0002, "epoch": 0.797274275979557, "step": 1170}, {"loss": 1.8251, "grad_norm": 0.3235151469707489, "learning_rate": 0.0002, "epoch": 0.8040885860306644, "step": 1180}, {"loss": 1.7758, "grad_norm": 0.31971070170402527, "learning_rate": 0.0002, "epoch": 0.8109028960817717, "step": 1190}, {"loss": 1.7608, "grad_norm": 0.37459835410118103, "learning_rate": 0.0002, "epoch": 0.817717206132879, "step": 1200}, {"loss": 1.7905, "grad_norm": 0.2987082302570343, "learning_rate": 0.0002, "epoch": 0.8245315161839863, "step": 1210}, {"loss": 1.8076, "grad_norm": 0.2725985646247864, "learning_rate": 0.0002, "epoch": 0.8313458262350937, "step": 1220}, {"loss": 1.7372, "grad_norm": 0.31776314973831177, "learning_rate": 0.0002, "epoch": 0.838160136286201, "step": 1230}, {"loss": 1.8302, "grad_norm": 0.31189337372779846, "learning_rate": 0.0002, "epoch": 0.8449744463373083, "step": 1240}, {"loss": 1.7988, "grad_norm": 0.2912134528160095, "learning_rate": 0.0002, "epoch": 0.8517887563884157, "step": 1250}, {"loss": 1.6884, "grad_norm": 0.3015559911727905, "learning_rate": 0.0002, "epoch": 0.858603066439523, "step": 1260}, {"loss": 1.8476, "grad_norm": 0.3126254677772522, "learning_rate": 0.0002, "epoch": 0.8654173764906303, "step": 1270}, {"loss": 1.7493, "grad_norm": 0.2871003746986389, "learning_rate": 0.0002, "epoch": 0.8722316865417377, "step": 1280}, {"loss": 1.7439, "grad_norm": 0.3037576377391815, "learning_rate": 0.0002, "epoch": 0.879045996592845, "step": 1290}, {"loss": 1.8596, "grad_norm": 0.3130686581134796, "learning_rate": 0.0002, "epoch": 0.8858603066439523, "step": 1300}, {"loss": 1.7669, "grad_norm": 0.31773796677589417, "learning_rate": 0.0002, "epoch": 0.8926746166950597, "step": 1310}, {"loss": 1.7714, "grad_norm": 0.2927514314651489, "learning_rate": 0.0002, "epoch": 0.899488926746167, "step": 1320}, {"loss": 1.9055, "grad_norm": 0.2849563658237457, "learning_rate": 0.0002, "epoch": 0.9063032367972743, "step": 1330}, {"loss": 1.7869, "grad_norm": 0.2721436321735382, "learning_rate": 0.0002, "epoch": 0.9131175468483816, "step": 1340}, {"loss": 1.7977, "grad_norm": 0.2880842089653015, "learning_rate": 0.0002, "epoch": 0.919931856899489, "step": 1350}, {"loss": 1.7368, "grad_norm": 0.2919510304927826, "learning_rate": 0.0002, "epoch": 0.9267461669505963, "step": 1360}, {"loss": 1.7829, "grad_norm": 0.31398487091064453, "learning_rate": 0.0002, "epoch": 0.9335604770017035, "step": 1370}, {"loss": 1.7531, "grad_norm": 0.3119431436061859, "learning_rate": 0.0002, "epoch": 0.9403747870528109, "step": 1380}, {"loss": 1.8298, "grad_norm": 0.2999113202095032, "learning_rate": 0.0002, "epoch": 0.9471890971039182, "step": 1390}, {"loss": 1.7851, "grad_norm": 0.2981210947036743, "learning_rate": 0.0002, "epoch": 0.9540034071550255, "step": 1400}, {"loss": 1.7964, "grad_norm": 0.28395724296569824, "learning_rate": 0.0002, "epoch": 0.9608177172061328, "step": 1410}, {"loss": 1.8355, "grad_norm": 0.36125198006629944, "learning_rate": 0.0002, "epoch": 0.9676320272572402, "step": 1420}, {"loss": 1.8076, "grad_norm": 0.28394874930381775, "learning_rate": 0.0002, "epoch": 0.9744463373083475, "step": 1430}, {"loss": 1.7583, "grad_norm": 0.3032495975494385, "learning_rate": 0.0002, "epoch": 0.9812606473594548, "step": 1440}, {"loss": 1.7549, "grad_norm": 0.3179830014705658, "learning_rate": 0.0002, "epoch": 0.9880749574105622, "step": 1450}, {"loss": 1.8135, "grad_norm": 0.2817234992980957, "learning_rate": 0.0002, "epoch": 0.9948892674616695, "step": 1460}]} +{"epoch": 0.9996592844974447, "step": 1467, "epoch_duration": 5085.766824245453, "total_accumulated_duration": 5085.766824245453, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 11696.9921875}, "avg_memory_reserved": {"GPU_0": 12758.0}, "peak_memory_reserved": {"GPU_0": 12758.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0956, "grad_norm": 0.4890078604221344, "learning_rate": 0.0002, "epoch": 0.0068143100511073255, "step": 10}, {"loss": 2.5382, "grad_norm": 0.560669481754303, "learning_rate": 0.0002, "epoch": 0.013628620102214651, "step": 20}, {"loss": 2.3479, "grad_norm": 0.4488241970539093, "learning_rate": 0.0002, "epoch": 0.020442930153321975, "step": 30}, {"loss": 2.0976, "grad_norm": 0.5320836305618286, "learning_rate": 0.0002, "epoch": 0.027257240204429302, "step": 40}, {"loss": 1.9701, "grad_norm": 0.4483726918697357, "learning_rate": 0.0002, "epoch": 0.034071550255536626, "step": 50}, {"loss": 1.9309, "grad_norm": 0.46413180232048035, "learning_rate": 0.0002, "epoch": 0.04088586030664395, "step": 60}, {"loss": 1.918, "grad_norm": 0.409530371427536, "learning_rate": 0.0002, "epoch": 0.04770017035775128, "step": 70}, {"loss": 1.9094, "grad_norm": 0.4179694652557373, "learning_rate": 0.0002, "epoch": 0.054514480408858604, "step": 80}, {"loss": 1.8902, "grad_norm": 0.453759104013443, "learning_rate": 0.0002, "epoch": 0.06132879045996593, "step": 90}, {"loss": 1.9253, "grad_norm": 0.40485697984695435, "learning_rate": 0.0002, "epoch": 0.06814310051107325, "step": 100}, {"loss": 1.918, "grad_norm": 0.39115458726882935, "learning_rate": 0.0002, "epoch": 0.07495741056218058, "step": 110}, {"loss": 1.9043, "grad_norm": 0.3934798240661621, "learning_rate": 0.0002, "epoch": 0.0817717206132879, "step": 120}, {"loss": 1.9465, "grad_norm": 0.3925539553165436, "learning_rate": 0.0002, "epoch": 0.08858603066439523, "step": 130}, {"loss": 1.8401, "grad_norm": 0.3578416705131531, "learning_rate": 0.0002, "epoch": 0.09540034071550256, "step": 140}, {"loss": 1.9013, "grad_norm": 0.38667625188827515, "learning_rate": 0.0002, "epoch": 0.10221465076660988, "step": 150}, {"loss": 1.8672, "grad_norm": 0.3250713348388672, "learning_rate": 0.0002, "epoch": 0.10902896081771721, "step": 160}, {"loss": 1.8669, "grad_norm": 0.3284672200679779, "learning_rate": 0.0002, "epoch": 0.11584327086882454, "step": 170}, {"loss": 1.8319, "grad_norm": 0.334200918674469, "learning_rate": 0.0002, "epoch": 0.12265758091993186, "step": 180}, {"loss": 1.8415, "grad_norm": 0.3846002221107483, "learning_rate": 0.0002, "epoch": 0.12947189097103917, "step": 190}, {"loss": 1.8445, "grad_norm": 0.38023582100868225, "learning_rate": 0.0002, "epoch": 0.1362862010221465, "step": 200}, {"loss": 1.7714, "grad_norm": 0.42780670523643494, "learning_rate": 0.0002, "epoch": 0.14310051107325383, "step": 210}, {"loss": 1.8332, "grad_norm": 0.33164316415786743, "learning_rate": 0.0002, "epoch": 0.14991482112436116, "step": 220}, {"loss": 1.8319, "grad_norm": 0.35145387053489685, "learning_rate": 0.0002, "epoch": 0.1567291311754685, "step": 230}, {"loss": 1.8149, "grad_norm": 0.36590245366096497, "learning_rate": 0.0002, "epoch": 0.1635434412265758, "step": 240}, {"loss": 1.7886, "grad_norm": 0.34135767817497253, "learning_rate": 0.0002, "epoch": 0.17035775127768313, "step": 250}, {"loss": 1.7795, "grad_norm": 0.3528055250644684, "learning_rate": 0.0002, "epoch": 0.17717206132879046, "step": 260}, {"loss": 1.895, "grad_norm": 0.33867526054382324, "learning_rate": 0.0002, "epoch": 0.1839863713798978, "step": 270}, {"loss": 1.871, "grad_norm": 0.3495153486728668, "learning_rate": 0.0002, "epoch": 0.19080068143100512, "step": 280}, {"loss": 1.8508, "grad_norm": 0.4151289463043213, "learning_rate": 0.0002, "epoch": 0.19761499148211242, "step": 290}, {"loss": 1.8447, "grad_norm": 0.3759978711605072, "learning_rate": 0.0002, "epoch": 0.20442930153321975, "step": 300}, {"loss": 1.9011, "grad_norm": 0.5714281797409058, "learning_rate": 0.0002, "epoch": 0.21124361158432708, "step": 310}, {"loss": 1.809, "grad_norm": 0.3285363018512726, "learning_rate": 0.0002, "epoch": 0.21805792163543442, "step": 320}, {"loss": 1.7066, "grad_norm": 0.3517664074897766, "learning_rate": 0.0002, "epoch": 0.22487223168654175, "step": 330}, {"loss": 1.8194, "grad_norm": 0.38792750239372253, "learning_rate": 0.0002, "epoch": 0.23168654173764908, "step": 340}, {"loss": 1.8386, "grad_norm": 0.3300336003303528, "learning_rate": 0.0002, "epoch": 0.23850085178875638, "step": 350}, {"loss": 1.8009, "grad_norm": 0.3322029411792755, "learning_rate": 0.0002, "epoch": 0.2453151618398637, "step": 360}, {"loss": 1.7154, "grad_norm": 0.3096524477005005, "learning_rate": 0.0002, "epoch": 0.252129471890971, "step": 370}, {"loss": 1.8584, "grad_norm": 0.3605906665325165, "learning_rate": 0.0002, "epoch": 0.25894378194207834, "step": 380}, {"loss": 1.8006, "grad_norm": 0.3240072727203369, "learning_rate": 0.0002, "epoch": 0.2657580919931857, "step": 390}, {"loss": 1.7911, "grad_norm": 0.32536280155181885, "learning_rate": 0.0002, "epoch": 0.272572402044293, "step": 400}, {"loss": 1.8049, "grad_norm": 0.32110732793807983, "learning_rate": 0.0002, "epoch": 0.27938671209540034, "step": 410}, {"loss": 1.8137, "grad_norm": 0.2854306697845459, "learning_rate": 0.0002, "epoch": 0.28620102214650767, "step": 420}, {"loss": 1.7859, "grad_norm": 0.33118736743927, "learning_rate": 0.0002, "epoch": 0.293015332197615, "step": 430}, {"loss": 1.7624, "grad_norm": 0.44273754954338074, "learning_rate": 0.0002, "epoch": 0.29982964224872233, "step": 440}, {"loss": 1.85, "grad_norm": 0.3363669216632843, "learning_rate": 0.0002, "epoch": 0.30664395229982966, "step": 450}, {"loss": 1.7745, "grad_norm": 0.4057871103286743, "learning_rate": 0.0002, "epoch": 0.313458262350937, "step": 460}, {"loss": 1.7801, "grad_norm": 0.37123292684555054, "learning_rate": 0.0002, "epoch": 0.3202725724020443, "step": 470}, {"loss": 1.8029, "grad_norm": 0.3181988298892975, "learning_rate": 0.0002, "epoch": 0.3270868824531516, "step": 480}, {"loss": 1.8276, "grad_norm": 0.31384387612342834, "learning_rate": 0.0002, "epoch": 0.3339011925042589, "step": 490}, {"loss": 1.7388, "grad_norm": 0.31863880157470703, "learning_rate": 0.0002, "epoch": 0.34071550255536626, "step": 500}, {"loss": 1.819, "grad_norm": 0.337476909160614, "learning_rate": 0.0002, "epoch": 0.3475298126064736, "step": 510}, {"loss": 1.8345, "grad_norm": 0.3001832365989685, "learning_rate": 0.0002, "epoch": 0.3543441226575809, "step": 520}, {"loss": 1.733, "grad_norm": 0.32997000217437744, "learning_rate": 0.0002, "epoch": 0.36115843270868825, "step": 530}, {"loss": 1.8626, "grad_norm": 0.30522841215133667, "learning_rate": 0.0002, "epoch": 0.3679727427597956, "step": 540}, {"loss": 1.8379, "grad_norm": 0.3623015880584717, "learning_rate": 0.0002, "epoch": 0.3747870528109029, "step": 550}, {"loss": 1.7529, "grad_norm": 0.33002111315727234, "learning_rate": 0.0002, "epoch": 0.38160136286201024, "step": 560}, {"loss": 1.805, "grad_norm": 0.3013359606266022, "learning_rate": 0.0002, "epoch": 0.38841567291311757, "step": 570}, {"loss": 1.8125, "grad_norm": 0.3127090334892273, "learning_rate": 0.0002, "epoch": 0.39522998296422485, "step": 580}, {"loss": 1.7993, "grad_norm": 0.33546915650367737, "learning_rate": 0.0002, "epoch": 0.4020442930153322, "step": 590}, {"loss": 1.7809, "grad_norm": 0.28783854842185974, "learning_rate": 0.0002, "epoch": 0.4088586030664395, "step": 600}, {"loss": 1.8208, "grad_norm": 0.3139500617980957, "learning_rate": 0.0002, "epoch": 0.41567291311754684, "step": 610}, {"loss": 1.754, "grad_norm": 0.32865241169929504, "learning_rate": 0.0002, "epoch": 0.42248722316865417, "step": 620}, {"loss": 1.7674, "grad_norm": 0.3169761300086975, "learning_rate": 0.0002, "epoch": 0.4293015332197615, "step": 630}, {"loss": 1.8168, "grad_norm": 0.2998940944671631, "learning_rate": 0.0002, "epoch": 0.43611584327086883, "step": 640}, {"loss": 1.7248, "grad_norm": 0.3202044367790222, "learning_rate": 0.0002, "epoch": 0.44293015332197616, "step": 650}, {"loss": 1.8206, "grad_norm": 0.31355756521224976, "learning_rate": 0.0002, "epoch": 0.4497444633730835, "step": 660}, {"loss": 1.7997, "grad_norm": 0.3137073218822479, "learning_rate": 0.0002, "epoch": 0.4565587734241908, "step": 670}, {"loss": 1.8428, "grad_norm": 0.3120451867580414, "learning_rate": 0.0002, "epoch": 0.46337308347529815, "step": 680}, {"loss": 1.8291, "grad_norm": 0.31536173820495605, "learning_rate": 0.0002, "epoch": 0.47018739352640543, "step": 690}, {"loss": 1.7386, "grad_norm": 0.28827103972435, "learning_rate": 0.0002, "epoch": 0.47700170357751276, "step": 700}, {"loss": 1.6968, "grad_norm": 0.3561989963054657, "learning_rate": 0.0002, "epoch": 0.4838160136286201, "step": 710}, {"loss": 1.8093, "grad_norm": 0.3765421509742737, "learning_rate": 0.0002, "epoch": 0.4906303236797274, "step": 720}, {"loss": 1.7341, "grad_norm": 0.3090366721153259, "learning_rate": 0.0002, "epoch": 0.49744463373083475, "step": 730}, {"loss": 1.8524, "grad_norm": 0.28166860342025757, "learning_rate": 0.0002, "epoch": 0.504258943781942, "step": 740}, {"loss": 1.7755, "grad_norm": 0.3202856183052063, "learning_rate": 0.0002, "epoch": 0.5110732538330494, "step": 750}, {"loss": 1.7856, "grad_norm": 0.37479111552238464, "learning_rate": 0.0002, "epoch": 0.5178875638841567, "step": 760}, {"loss": 1.7167, "grad_norm": 0.3002205193042755, "learning_rate": 0.0002, "epoch": 0.524701873935264, "step": 770}, {"loss": 1.7907, "grad_norm": 0.3010348081588745, "learning_rate": 0.0002, "epoch": 0.5315161839863713, "step": 780}, {"loss": 1.8472, "grad_norm": 0.32340362668037415, "learning_rate": 0.0002, "epoch": 0.5383304940374787, "step": 790}, {"loss": 1.8647, "grad_norm": 0.29102805256843567, "learning_rate": 0.0002, "epoch": 0.545144804088586, "step": 800}, {"loss": 1.9026, "grad_norm": 0.3151300251483917, "learning_rate": 0.0002, "epoch": 0.5519591141396933, "step": 810}, {"loss": 1.8314, "grad_norm": 0.32248660922050476, "learning_rate": 0.0002, "epoch": 0.5587734241908007, "step": 820}, {"loss": 1.722, "grad_norm": 0.3732532262802124, "learning_rate": 0.0002, "epoch": 0.565587734241908, "step": 830}, {"loss": 1.8073, "grad_norm": 0.34599027037620544, "learning_rate": 0.0002, "epoch": 0.5724020442930153, "step": 840}, {"loss": 1.7, "grad_norm": 0.3217315673828125, "learning_rate": 0.0002, "epoch": 0.5792163543441227, "step": 850}, {"loss": 1.8076, "grad_norm": 0.28454235196113586, "learning_rate": 0.0002, "epoch": 0.58603066439523, "step": 860}, {"loss": 1.6921, "grad_norm": 0.30069926381111145, "learning_rate": 0.0002, "epoch": 0.5928449744463373, "step": 870}, {"loss": 1.811, "grad_norm": 0.2921348810195923, "learning_rate": 0.0002, "epoch": 0.5996592844974447, "step": 880}, {"loss": 1.8208, "grad_norm": 0.35356172919273376, "learning_rate": 0.0002, "epoch": 0.606473594548552, "step": 890}, {"loss": 1.767, "grad_norm": 0.29252395033836365, "learning_rate": 0.0002, "epoch": 0.6132879045996593, "step": 900}, {"loss": 1.8387, "grad_norm": 0.31998178362846375, "learning_rate": 0.0002, "epoch": 0.6201022146507666, "step": 910}, {"loss": 1.7963, "grad_norm": 0.31121236085891724, "learning_rate": 0.0002, "epoch": 0.626916524701874, "step": 920}, {"loss": 1.76, "grad_norm": 0.3099833130836487, "learning_rate": 0.0002, "epoch": 0.6337308347529813, "step": 930}, {"loss": 1.8496, "grad_norm": 0.3317437171936035, "learning_rate": 0.0002, "epoch": 0.6405451448040886, "step": 940}, {"loss": 1.7053, "grad_norm": 0.2920125126838684, "learning_rate": 0.0002, "epoch": 0.6473594548551959, "step": 950}, {"loss": 1.7148, "grad_norm": 0.3215860426425934, "learning_rate": 0.0002, "epoch": 0.6541737649063032, "step": 960}, {"loss": 1.776, "grad_norm": 0.3287025988101959, "learning_rate": 0.0002, "epoch": 0.6609880749574105, "step": 970}, {"loss": 1.7575, "grad_norm": 0.34582677483558655, "learning_rate": 0.0002, "epoch": 0.6678023850085179, "step": 980}, {"loss": 1.7952, "grad_norm": 0.3392165005207062, "learning_rate": 0.0002, "epoch": 0.6746166950596252, "step": 990}, {"loss": 1.8195, "grad_norm": 0.2994081974029541, "learning_rate": 0.0002, "epoch": 0.6814310051107325, "step": 1000}, {"loss": 1.7705, "grad_norm": 0.3185518682003021, "learning_rate": 0.0002, "epoch": 0.6882453151618398, "step": 1010}, {"loss": 1.7852, "grad_norm": 0.2897794544696808, "learning_rate": 0.0002, "epoch": 0.6950596252129472, "step": 1020}, {"loss": 1.795, "grad_norm": 0.3000950813293457, "learning_rate": 0.0002, "epoch": 0.7018739352640545, "step": 1030}, {"loss": 1.7089, "grad_norm": 0.3021177649497986, "learning_rate": 0.0002, "epoch": 0.7086882453151618, "step": 1040}, {"loss": 1.8728, "grad_norm": 0.304629385471344, "learning_rate": 0.0002, "epoch": 0.7155025553662692, "step": 1050}, {"loss": 1.7822, "grad_norm": 0.29901665449142456, "learning_rate": 0.0002, "epoch": 0.7223168654173765, "step": 1060}, {"loss": 1.7842, "grad_norm": 0.3098410367965698, "learning_rate": 0.0002, "epoch": 0.7291311754684838, "step": 1070}, {"loss": 1.7835, "grad_norm": 0.293082594871521, "learning_rate": 0.0002, "epoch": 0.7359454855195912, "step": 1080}, {"loss": 1.7658, "grad_norm": 0.32809993624687195, "learning_rate": 0.0002, "epoch": 0.7427597955706985, "step": 1090}, {"loss": 1.7316, "grad_norm": 0.4396800994873047, "learning_rate": 0.0002, "epoch": 0.7495741056218058, "step": 1100}, {"loss": 1.8082, "grad_norm": 0.28425362706184387, "learning_rate": 0.0002, "epoch": 0.7563884156729132, "step": 1110}, {"loss": 1.8087, "grad_norm": 0.3192683160305023, "learning_rate": 0.0002, "epoch": 0.7632027257240205, "step": 1120}, {"loss": 1.6968, "grad_norm": 0.3428346514701843, "learning_rate": 0.0002, "epoch": 0.7700170357751278, "step": 1130}, {"loss": 1.8486, "grad_norm": 0.32198479771614075, "learning_rate": 0.0002, "epoch": 0.7768313458262351, "step": 1140}, {"loss": 1.7027, "grad_norm": 0.3131236433982849, "learning_rate": 0.0002, "epoch": 0.7836456558773425, "step": 1150}, {"loss": 1.8171, "grad_norm": 0.31295809149742126, "learning_rate": 0.0002, "epoch": 0.7904599659284497, "step": 1160}, {"loss": 1.7632, "grad_norm": 0.29603999853134155, "learning_rate": 0.0002, "epoch": 0.797274275979557, "step": 1170}, {"loss": 1.8266, "grad_norm": 0.31465768814086914, "learning_rate": 0.0002, "epoch": 0.8040885860306644, "step": 1180}, {"loss": 1.7766, "grad_norm": 0.3142889142036438, "learning_rate": 0.0002, "epoch": 0.8109028960817717, "step": 1190}, {"loss": 1.7581, "grad_norm": 0.3155096769332886, "learning_rate": 0.0002, "epoch": 0.817717206132879, "step": 1200}, {"loss": 1.7909, "grad_norm": 0.28239554166793823, "learning_rate": 0.0002, "epoch": 0.8245315161839863, "step": 1210}, {"loss": 1.8045, "grad_norm": 0.2708231508731842, "learning_rate": 0.0002, "epoch": 0.8313458262350937, "step": 1220}, {"loss": 1.739, "grad_norm": 0.27224382758140564, "learning_rate": 0.0002, "epoch": 0.838160136286201, "step": 1230}, {"loss": 1.8305, "grad_norm": 0.3035840690135956, "learning_rate": 0.0002, "epoch": 0.8449744463373083, "step": 1240}, {"loss": 1.7971, "grad_norm": 0.29008471965789795, "learning_rate": 0.0002, "epoch": 0.8517887563884157, "step": 1250}, {"loss": 1.6882, "grad_norm": 0.30559810996055603, "learning_rate": 0.0002, "epoch": 0.858603066439523, "step": 1260}, {"loss": 1.8492, "grad_norm": 0.3483515977859497, "learning_rate": 0.0002, "epoch": 0.8654173764906303, "step": 1270}, {"loss": 1.752, "grad_norm": 0.2996648848056793, "learning_rate": 0.0002, "epoch": 0.8722316865417377, "step": 1280}, {"loss": 1.7457, "grad_norm": 0.2933368682861328, "learning_rate": 0.0002, "epoch": 0.879045996592845, "step": 1290}, {"loss": 1.8596, "grad_norm": 0.3157258629798889, "learning_rate": 0.0002, "epoch": 0.8858603066439523, "step": 1300}, {"loss": 1.7625, "grad_norm": 0.3113366961479187, "learning_rate": 0.0002, "epoch": 0.8926746166950597, "step": 1310}, {"loss": 1.7719, "grad_norm": 0.2785132825374603, "learning_rate": 0.0002, "epoch": 0.899488926746167, "step": 1320}, {"loss": 1.9069, "grad_norm": 0.28709521889686584, "learning_rate": 0.0002, "epoch": 0.9063032367972743, "step": 1330}, {"loss": 1.7891, "grad_norm": 0.2823180854320526, "learning_rate": 0.0002, "epoch": 0.9131175468483816, "step": 1340}, {"loss": 1.7992, "grad_norm": 0.2845880389213562, "learning_rate": 0.0002, "epoch": 0.919931856899489, "step": 1350}, {"loss": 1.737, "grad_norm": 0.2904936671257019, "learning_rate": 0.0002, "epoch": 0.9267461669505963, "step": 1360}, {"loss": 1.7844, "grad_norm": 0.34917423129081726, "learning_rate": 0.0002, "epoch": 0.9335604770017035, "step": 1370}, {"loss": 1.7541, "grad_norm": 0.29743409156799316, "learning_rate": 0.0002, "epoch": 0.9403747870528109, "step": 1380}, {"loss": 1.8305, "grad_norm": 0.3027336001396179, "learning_rate": 0.0002, "epoch": 0.9471890971039182, "step": 1390}, {"loss": 1.785, "grad_norm": 0.30059128999710083, "learning_rate": 0.0002, "epoch": 0.9540034071550255, "step": 1400}, {"loss": 1.7969, "grad_norm": 0.28271496295928955, "learning_rate": 0.0002, "epoch": 0.9608177172061328, "step": 1410}, {"loss": 1.8343, "grad_norm": 0.3470972776412964, "learning_rate": 0.0002, "epoch": 0.9676320272572402, "step": 1420}, {"loss": 1.8034, "grad_norm": 0.33903709053993225, "learning_rate": 0.0002, "epoch": 0.9744463373083475, "step": 1430}, {"loss": 1.7603, "grad_norm": 0.2882603406906128, "learning_rate": 0.0002, "epoch": 0.9812606473594548, "step": 1440}, {"loss": 1.7558, "grad_norm": 0.3057056665420532, "learning_rate": 0.0002, "epoch": 0.9880749574105622, "step": 1450}, {"loss": 1.8119, "grad_norm": 0.27955082058906555, "learning_rate": 0.0002, "epoch": 0.9948892674616695, "step": 1460}]} +{"epoch": 0.9996592844974447, "step": 1467, "epoch_duration": 5102.333571195602, "total_accumulated_duration": 5102.333571195602, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 11696.9921875}, "avg_memory_reserved": {"GPU_0": 12758.0}, "peak_memory_reserved": {"GPU_0": 12758.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0978, "grad_norm": 0.5088208317756653, "learning_rate": 0.0002, "epoch": 0.0068143100511073255, "step": 10}, {"loss": 2.5496, "grad_norm": 1.0593037605285645, "learning_rate": 0.0002, "epoch": 0.013628620102214651, "step": 20}, {"loss": 2.3515, "grad_norm": 0.4729953110218048, "learning_rate": 0.0002, "epoch": 0.020442930153321975, "step": 30}, {"loss": 2.1102, "grad_norm": 0.5926728248596191, "learning_rate": 0.0002, "epoch": 0.027257240204429302, "step": 40}, {"loss": 1.9802, "grad_norm": 0.6847966909408569, "learning_rate": 0.0002, "epoch": 0.034071550255536626, "step": 50}, {"loss": 1.9376, "grad_norm": 0.5126395225524902, "learning_rate": 0.0002, "epoch": 0.04088586030664395, "step": 60}, {"loss": 1.9174, "grad_norm": 0.39128491282463074, "learning_rate": 0.0002, "epoch": 0.04770017035775128, "step": 70}, {"loss": 1.9144, "grad_norm": 0.5008781552314758, "learning_rate": 0.0002, "epoch": 0.054514480408858604, "step": 80}, {"loss": 1.8948, "grad_norm": 0.5142364501953125, "learning_rate": 0.0002, "epoch": 0.06132879045996593, "step": 90}, {"loss": 1.9261, "grad_norm": 0.5292380452156067, "learning_rate": 0.0002, "epoch": 0.06814310051107325, "step": 100}, {"loss": 1.9215, "grad_norm": 0.41132253408432007, "learning_rate": 0.0002, "epoch": 0.07495741056218058, "step": 110}, {"loss": 1.9064, "grad_norm": 0.37462374567985535, "learning_rate": 0.0002, "epoch": 0.0817717206132879, "step": 120}, {"loss": 1.9491, "grad_norm": 0.6477067470550537, "learning_rate": 0.0002, "epoch": 0.08858603066439523, "step": 130}, {"loss": 1.8404, "grad_norm": 0.33299797773361206, "learning_rate": 0.0002, "epoch": 0.09540034071550256, "step": 140}, {"loss": 1.9068, "grad_norm": 0.4089040458202362, "learning_rate": 0.0002, "epoch": 0.10221465076660988, "step": 150}, {"loss": 1.8706, "grad_norm": 0.3481667935848236, "learning_rate": 0.0002, "epoch": 0.10902896081771721, "step": 160}, {"loss": 1.8655, "grad_norm": 0.31345364451408386, "learning_rate": 0.0002, "epoch": 0.11584327086882454, "step": 170}, {"loss": 1.832, "grad_norm": 0.3320314288139343, "learning_rate": 0.0002, "epoch": 0.12265758091993186, "step": 180}, {"loss": 1.8396, "grad_norm": 0.3953656554222107, "learning_rate": 0.0002, "epoch": 0.12947189097103917, "step": 190}, {"loss": 1.848, "grad_norm": 0.34698614478111267, "learning_rate": 0.0002, "epoch": 0.1362862010221465, "step": 200}, {"loss": 1.7698, "grad_norm": 0.40951043367385864, "learning_rate": 0.0002, "epoch": 0.14310051107325383, "step": 210}, {"loss": 1.8329, "grad_norm": 0.3920935392379761, "learning_rate": 0.0002, "epoch": 0.14991482112436116, "step": 220}, {"loss": 1.8354, "grad_norm": 0.35603517293930054, "learning_rate": 0.0002, "epoch": 0.1567291311754685, "step": 230}, {"loss": 1.8148, "grad_norm": 0.33723801374435425, "learning_rate": 0.0002, "epoch": 0.1635434412265758, "step": 240}, {"loss": 1.7884, "grad_norm": 0.35914871096611023, "learning_rate": 0.0002, "epoch": 0.17035775127768313, "step": 250}, {"loss": 1.7795, "grad_norm": 0.33864158391952515, "learning_rate": 0.0002, "epoch": 0.17717206132879046, "step": 260}, {"loss": 1.8944, "grad_norm": 0.3345390260219574, "learning_rate": 0.0002, "epoch": 0.1839863713798978, "step": 270}, {"loss": 1.8747, "grad_norm": 0.3952043950557709, "learning_rate": 0.0002, "epoch": 0.19080068143100512, "step": 280}, {"loss": 1.856, "grad_norm": 0.4231615364551544, "learning_rate": 0.0002, "epoch": 0.19761499148211242, "step": 290}, {"loss": 1.844, "grad_norm": 0.38938260078430176, "learning_rate": 0.0002, "epoch": 0.20442930153321975, "step": 300}, {"loss": 1.9045, "grad_norm": 0.39781057834625244, "learning_rate": 0.0002, "epoch": 0.21124361158432708, "step": 310}, {"loss": 1.8109, "grad_norm": 0.33381906151771545, "learning_rate": 0.0002, "epoch": 0.21805792163543442, "step": 320}, {"loss": 1.7083, "grad_norm": 0.7406433820724487, "learning_rate": 0.0002, "epoch": 0.22487223168654175, "step": 330}, {"loss": 1.8202, "grad_norm": 0.3887518048286438, "learning_rate": 0.0002, "epoch": 0.23168654173764908, "step": 340}, {"loss": 1.8396, "grad_norm": 0.31778785586357117, "learning_rate": 0.0002, "epoch": 0.23850085178875638, "step": 350}, {"loss": 1.7994, "grad_norm": 0.3431912660598755, "learning_rate": 0.0002, "epoch": 0.2453151618398637, "step": 360}, {"loss": 1.7167, "grad_norm": 0.302903950214386, "learning_rate": 0.0002, "epoch": 0.252129471890971, "step": 370}, {"loss": 1.8598, "grad_norm": 0.36719897389411926, "learning_rate": 0.0002, "epoch": 0.25894378194207834, "step": 380}, {"loss": 1.7993, "grad_norm": 0.3140374720096588, "learning_rate": 0.0002, "epoch": 0.2657580919931857, "step": 390}, {"loss": 1.785, "grad_norm": 0.30111634731292725, "learning_rate": 0.0002, "epoch": 0.272572402044293, "step": 400}, {"loss": 1.8007, "grad_norm": 0.3212359547615051, "learning_rate": 0.0002, "epoch": 0.27938671209540034, "step": 410}, {"loss": 1.8133, "grad_norm": 0.31072258949279785, "learning_rate": 0.0002, "epoch": 0.28620102214650767, "step": 420}, {"loss": 1.7862, "grad_norm": 0.3531157672405243, "learning_rate": 0.0002, "epoch": 0.293015332197615, "step": 430}, {"loss": 1.7611, "grad_norm": 0.3677731454372406, "learning_rate": 0.0002, "epoch": 0.29982964224872233, "step": 440}, {"loss": 1.8494, "grad_norm": 0.3518213927745819, "learning_rate": 0.0002, "epoch": 0.30664395229982966, "step": 450}, {"loss": 1.7758, "grad_norm": 0.3363339900970459, "learning_rate": 0.0002, "epoch": 0.313458262350937, "step": 460}, {"loss": 1.7753, "grad_norm": 0.4648301601409912, "learning_rate": 0.0002, "epoch": 0.3202725724020443, "step": 470}, {"loss": 1.8023, "grad_norm": 0.31766363978385925, "learning_rate": 0.0002, "epoch": 0.3270868824531516, "step": 480}, {"loss": 1.8233, "grad_norm": 0.3012169599533081, "learning_rate": 0.0002, "epoch": 0.3339011925042589, "step": 490}, {"loss": 1.7406, "grad_norm": 0.4041035771369934, "learning_rate": 0.0002, "epoch": 0.34071550255536626, "step": 500}, {"loss": 1.8188, "grad_norm": 0.34090086817741394, "learning_rate": 0.0002, "epoch": 0.3475298126064736, "step": 510}, {"loss": 1.8361, "grad_norm": 0.3133261501789093, "learning_rate": 0.0002, "epoch": 0.3543441226575809, "step": 520}, {"loss": 1.7353, "grad_norm": 0.30976736545562744, "learning_rate": 0.0002, "epoch": 0.36115843270868825, "step": 530}, {"loss": 1.8623, "grad_norm": 0.3080470561981201, "learning_rate": 0.0002, "epoch": 0.3679727427597956, "step": 540}, {"loss": 1.8375, "grad_norm": 0.2941526770591736, "learning_rate": 0.0002, "epoch": 0.3747870528109029, "step": 550}, {"loss": 1.7503, "grad_norm": 0.34511008858680725, "learning_rate": 0.0002, "epoch": 0.38160136286201024, "step": 560}, {"loss": 1.8067, "grad_norm": 0.29871416091918945, "learning_rate": 0.0002, "epoch": 0.38841567291311757, "step": 570}, {"loss": 1.8104, "grad_norm": 0.3266373574733734, "learning_rate": 0.0002, "epoch": 0.39522998296422485, "step": 580}, {"loss": 1.7967, "grad_norm": 0.32749661803245544, "learning_rate": 0.0002, "epoch": 0.4020442930153322, "step": 590}, {"loss": 1.7817, "grad_norm": 0.2935587167739868, "learning_rate": 0.0002, "epoch": 0.4088586030664395, "step": 600}, {"loss": 1.8221, "grad_norm": 0.31525930762290955, "learning_rate": 0.0002, "epoch": 0.41567291311754684, "step": 610}, {"loss": 1.7519, "grad_norm": 0.32136717438697815, "learning_rate": 0.0002, "epoch": 0.42248722316865417, "step": 620}, {"loss": 1.7685, "grad_norm": 0.3483738601207733, "learning_rate": 0.0002, "epoch": 0.4293015332197615, "step": 630}, {"loss": 1.8169, "grad_norm": 0.3116815984249115, "learning_rate": 0.0002, "epoch": 0.43611584327086883, "step": 640}, {"loss": 1.7238, "grad_norm": 0.32137081027030945, "learning_rate": 0.0002, "epoch": 0.44293015332197616, "step": 650}, {"loss": 1.8189, "grad_norm": 0.3489629626274109, "learning_rate": 0.0002, "epoch": 0.4497444633730835, "step": 660}, {"loss": 1.801, "grad_norm": 0.3354140520095825, "learning_rate": 0.0002, "epoch": 0.4565587734241908, "step": 670}, {"loss": 1.8406, "grad_norm": 0.3436032831668854, "learning_rate": 0.0002, "epoch": 0.46337308347529815, "step": 680}, {"loss": 1.8281, "grad_norm": 0.3081585168838501, "learning_rate": 0.0002, "epoch": 0.47018739352640543, "step": 690}, {"loss": 1.7397, "grad_norm": 0.29491159319877625, "learning_rate": 0.0002, "epoch": 0.47700170357751276, "step": 700}, {"loss": 1.697, "grad_norm": 0.29597049951553345, "learning_rate": 0.0002, "epoch": 0.4838160136286201, "step": 710}, {"loss": 1.8072, "grad_norm": 0.33674708008766174, "learning_rate": 0.0002, "epoch": 0.4906303236797274, "step": 720}, {"loss": 1.734, "grad_norm": 0.31072619557380676, "learning_rate": 0.0002, "epoch": 0.49744463373083475, "step": 730}, {"loss": 1.85, "grad_norm": 0.29212507605552673, "learning_rate": 0.0002, "epoch": 0.504258943781942, "step": 740}, {"loss": 1.7757, "grad_norm": 0.30673232674598694, "learning_rate": 0.0002, "epoch": 0.5110732538330494, "step": 750}, {"loss": 1.7832, "grad_norm": 0.39193248748779297, "learning_rate": 0.0002, "epoch": 0.5178875638841567, "step": 760}, {"loss": 1.7173, "grad_norm": 0.3359121084213257, "learning_rate": 0.0002, "epoch": 0.524701873935264, "step": 770}, {"loss": 1.789, "grad_norm": 0.30146411061286926, "learning_rate": 0.0002, "epoch": 0.5315161839863713, "step": 780}, {"loss": 1.8406, "grad_norm": 0.3188167214393616, "learning_rate": 0.0002, "epoch": 0.5383304940374787, "step": 790}, {"loss": 1.8643, "grad_norm": 0.2894856333732605, "learning_rate": 0.0002, "epoch": 0.545144804088586, "step": 800}, {"loss": 1.9025, "grad_norm": 0.3204955756664276, "learning_rate": 0.0002, "epoch": 0.5519591141396933, "step": 810}, {"loss": 1.8323, "grad_norm": 0.31357795000076294, "learning_rate": 0.0002, "epoch": 0.5587734241908007, "step": 820}, {"loss": 1.7202, "grad_norm": 0.2930838465690613, "learning_rate": 0.0002, "epoch": 0.565587734241908, "step": 830}, {"loss": 1.8043, "grad_norm": 0.36771127581596375, "learning_rate": 0.0002, "epoch": 0.5724020442930153, "step": 840}, {"loss": 1.6992, "grad_norm": 0.32667481899261475, "learning_rate": 0.0002, "epoch": 0.5792163543441227, "step": 850}, {"loss": 1.8047, "grad_norm": 0.47640711069107056, "learning_rate": 0.0002, "epoch": 0.58603066439523, "step": 860}, {"loss": 1.6871, "grad_norm": 0.2909122407436371, "learning_rate": 0.0002, "epoch": 0.5928449744463373, "step": 870}, {"loss": 1.8104, "grad_norm": 0.3152811527252197, "learning_rate": 0.0002, "epoch": 0.5996592844974447, "step": 880}, {"loss": 1.8208, "grad_norm": 0.3857297897338867, "learning_rate": 0.0002, "epoch": 0.606473594548552, "step": 890}, {"loss": 1.7666, "grad_norm": 0.2939709424972534, "learning_rate": 0.0002, "epoch": 0.6132879045996593, "step": 900}, {"loss": 1.8342, "grad_norm": 0.3089073598384857, "learning_rate": 0.0002, "epoch": 0.6201022146507666, "step": 910}, {"loss": 1.7962, "grad_norm": 0.3309785723686218, "learning_rate": 0.0002, "epoch": 0.626916524701874, "step": 920}, {"loss": 1.7592, "grad_norm": 0.3254700303077698, "learning_rate": 0.0002, "epoch": 0.6337308347529813, "step": 930}, {"loss": 1.8489, "grad_norm": 0.32593998312950134, "learning_rate": 0.0002, "epoch": 0.6405451448040886, "step": 940}, {"loss": 1.7075, "grad_norm": 0.2889043390750885, "learning_rate": 0.0002, "epoch": 0.6473594548551959, "step": 950}, {"loss": 1.7205, "grad_norm": 0.3143400549888611, "learning_rate": 0.0002, "epoch": 0.6541737649063032, "step": 960}, {"loss": 1.7773, "grad_norm": 0.2884869873523712, "learning_rate": 0.0002, "epoch": 0.6609880749574105, "step": 970}, {"loss": 1.7568, "grad_norm": 0.3458554744720459, "learning_rate": 0.0002, "epoch": 0.6678023850085179, "step": 980}, {"loss": 1.7932, "grad_norm": 0.3239525556564331, "learning_rate": 0.0002, "epoch": 0.6746166950596252, "step": 990}, {"loss": 1.8216, "grad_norm": 0.29648423194885254, "learning_rate": 0.0002, "epoch": 0.6814310051107325, "step": 1000}, {"loss": 1.7692, "grad_norm": 0.296655535697937, "learning_rate": 0.0002, "epoch": 0.6882453151618398, "step": 1010}, {"loss": 1.7818, "grad_norm": 0.2894240915775299, "learning_rate": 0.0002, "epoch": 0.6950596252129472, "step": 1020}, {"loss": 1.7965, "grad_norm": 0.3165854811668396, "learning_rate": 0.0002, "epoch": 0.7018739352640545, "step": 1030}, {"loss": 1.7108, "grad_norm": 0.31260034441947937, "learning_rate": 0.0002, "epoch": 0.7086882453151618, "step": 1040}, {"loss": 1.8761, "grad_norm": 0.2844091057777405, "learning_rate": 0.0002, "epoch": 0.7155025553662692, "step": 1050}, {"loss": 1.781, "grad_norm": 0.30020004510879517, "learning_rate": 0.0002, "epoch": 0.7223168654173765, "step": 1060}, {"loss": 1.7851, "grad_norm": 0.29758691787719727, "learning_rate": 0.0002, "epoch": 0.7291311754684838, "step": 1070}, {"loss": 1.7807, "grad_norm": 0.40287351608276367, "learning_rate": 0.0002, "epoch": 0.7359454855195912, "step": 1080}, {"loss": 1.7628, "grad_norm": 0.2822872996330261, "learning_rate": 0.0002, "epoch": 0.7427597955706985, "step": 1090}, {"loss": 1.7356, "grad_norm": 0.2732164263725281, "learning_rate": 0.0002, "epoch": 0.7495741056218058, "step": 1100}, {"loss": 1.8061, "grad_norm": 0.2798553705215454, "learning_rate": 0.0002, "epoch": 0.7563884156729132, "step": 1110}, {"loss": 1.8058, "grad_norm": 0.32549113035202026, "learning_rate": 0.0002, "epoch": 0.7632027257240205, "step": 1120}, {"loss": 1.6998, "grad_norm": 0.3060782551765442, "learning_rate": 0.0002, "epoch": 0.7700170357751278, "step": 1130}, {"loss": 1.8464, "grad_norm": 0.30375203490257263, "learning_rate": 0.0002, "epoch": 0.7768313458262351, "step": 1140}, {"loss": 1.7029, "grad_norm": 0.31432613730430603, "learning_rate": 0.0002, "epoch": 0.7836456558773425, "step": 1150}, {"loss": 1.8174, "grad_norm": 0.31449654698371887, "learning_rate": 0.0002, "epoch": 0.7904599659284497, "step": 1160}, {"loss": 1.762, "grad_norm": 0.2884395122528076, "learning_rate": 0.0002, "epoch": 0.797274275979557, "step": 1170}, {"loss": 1.8256, "grad_norm": 0.3159497082233429, "learning_rate": 0.0002, "epoch": 0.8040885860306644, "step": 1180}, {"loss": 1.7775, "grad_norm": 0.3125587999820709, "learning_rate": 0.0002, "epoch": 0.8109028960817717, "step": 1190}, {"loss": 1.7591, "grad_norm": 0.318744957447052, "learning_rate": 0.0002, "epoch": 0.817717206132879, "step": 1200}, {"loss": 1.7898, "grad_norm": 0.35875993967056274, "learning_rate": 0.0002, "epoch": 0.8245315161839863, "step": 1210}, {"loss": 1.8066, "grad_norm": 0.2865098714828491, "learning_rate": 0.0002, "epoch": 0.8313458262350937, "step": 1220}, {"loss": 1.7363, "grad_norm": 0.2804172933101654, "learning_rate": 0.0002, "epoch": 0.838160136286201, "step": 1230}, {"loss": 1.8287, "grad_norm": 0.29850155115127563, "learning_rate": 0.0002, "epoch": 0.8449744463373083, "step": 1240}, {"loss": 1.7974, "grad_norm": 0.29156357049942017, "learning_rate": 0.0002, "epoch": 0.8517887563884157, "step": 1250}, {"loss": 1.688, "grad_norm": 0.3176652193069458, "learning_rate": 0.0002, "epoch": 0.858603066439523, "step": 1260}, {"loss": 1.8492, "grad_norm": 0.3148926794528961, "learning_rate": 0.0002, "epoch": 0.8654173764906303, "step": 1270}, {"loss": 1.7514, "grad_norm": 0.28457680344581604, "learning_rate": 0.0002, "epoch": 0.8722316865417377, "step": 1280}, {"loss": 1.7455, "grad_norm": 0.30599263310432434, "learning_rate": 0.0002, "epoch": 0.879045996592845, "step": 1290}, {"loss": 1.8599, "grad_norm": 0.3326919972896576, "learning_rate": 0.0002, "epoch": 0.8858603066439523, "step": 1300}, {"loss": 1.7648, "grad_norm": 0.32320621609687805, "learning_rate": 0.0002, "epoch": 0.8926746166950597, "step": 1310}, {"loss": 1.7746, "grad_norm": 0.2892012596130371, "learning_rate": 0.0002, "epoch": 0.899488926746167, "step": 1320}, {"loss": 1.905, "grad_norm": 0.28488948941230774, "learning_rate": 0.0002, "epoch": 0.9063032367972743, "step": 1330}, {"loss": 1.7901, "grad_norm": 0.27067679166793823, "learning_rate": 0.0002, "epoch": 0.9131175468483816, "step": 1340}, {"loss": 1.8018, "grad_norm": 0.29527920484542847, "learning_rate": 0.0002, "epoch": 0.919931856899489, "step": 1350}, {"loss": 1.7395, "grad_norm": 0.5234482884407043, "learning_rate": 0.0002, "epoch": 0.9267461669505963, "step": 1360}, {"loss": 1.7849, "grad_norm": 0.3627465069293976, "learning_rate": 0.0002, "epoch": 0.9335604770017035, "step": 1370}, {"loss": 1.7571, "grad_norm": 0.304675430059433, "learning_rate": 0.0002, "epoch": 0.9403747870528109, "step": 1380}, {"loss": 1.8314, "grad_norm": 0.2998062074184418, "learning_rate": 0.0002, "epoch": 0.9471890971039182, "step": 1390}, {"loss": 1.7863, "grad_norm": 0.29927361011505127, "learning_rate": 0.0002, "epoch": 0.9540034071550255, "step": 1400}, {"loss": 1.7969, "grad_norm": 0.2941257953643799, "learning_rate": 0.0002, "epoch": 0.9608177172061328, "step": 1410}, {"loss": 1.8353, "grad_norm": 0.567848265171051, "learning_rate": 0.0002, "epoch": 0.9676320272572402, "step": 1420}, {"loss": 1.805, "grad_norm": 0.3168632388114929, "learning_rate": 0.0002, "epoch": 0.9744463373083475, "step": 1430}, {"loss": 1.7597, "grad_norm": 0.2995985448360443, "learning_rate": 0.0002, "epoch": 0.9812606473594548, "step": 1440}, {"loss": 1.7584, "grad_norm": 0.31000128388404846, "learning_rate": 0.0002, "epoch": 0.9880749574105622, "step": 1450}, {"loss": 1.8125, "grad_norm": 0.27469682693481445, "learning_rate": 0.0002, "epoch": 0.9948892674616695, "step": 1460}]} +{"epoch": 0.9996592844974447, "step": 1467, "epoch_duration": 5952.160561800003, "total_accumulated_duration": 5952.160561800003, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 11696.9921875}, "avg_memory_reserved": {"GPU_0": 12758.0}, "peak_memory_reserved": {"GPU_0": 12758.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0987, "grad_norm": 0.498365193605423, "learning_rate": 0.0002, "epoch": 0.0068143100511073255, "step": 10}, {"loss": 2.5429, "grad_norm": 0.6220734119415283, "learning_rate": 0.0002, "epoch": 0.013628620102214651, "step": 20}, {"loss": 2.3589, "grad_norm": 0.4439898729324341, "learning_rate": 0.0002, "epoch": 0.020442930153321975, "step": 30}, {"loss": 2.1224, "grad_norm": 0.6975229978561401, "learning_rate": 0.0002, "epoch": 0.027257240204429302, "step": 40}, {"loss": 1.982, "grad_norm": 0.5258626341819763, "learning_rate": 0.0002, "epoch": 0.034071550255536626, "step": 50}, {"loss": 1.9314, "grad_norm": 0.43059441447257996, "learning_rate": 0.0002, "epoch": 0.04088586030664395, "step": 60}, {"loss": 1.9169, "grad_norm": 0.4406642019748688, "learning_rate": 0.0002, "epoch": 0.04770017035775128, "step": 70}, {"loss": 1.9103, "grad_norm": 0.421134889125824, "learning_rate": 0.0002, "epoch": 0.054514480408858604, "step": 80}, {"loss": 1.8863, "grad_norm": 0.4264920651912689, "learning_rate": 0.0002, "epoch": 0.06132879045996593, "step": 90}, {"loss": 1.9253, "grad_norm": 0.4626292288303375, "learning_rate": 0.0002, "epoch": 0.06814310051107325, "step": 100}, {"loss": 1.9163, "grad_norm": 0.40281006693840027, "learning_rate": 0.0002, "epoch": 0.07495741056218058, "step": 110}, {"loss": 1.9046, "grad_norm": 0.3965360224246979, "learning_rate": 0.0002, "epoch": 0.0817717206132879, "step": 120}, {"loss": 1.9446, "grad_norm": 0.40374159812927246, "learning_rate": 0.0002, "epoch": 0.08858603066439523, "step": 130}, {"loss": 1.8404, "grad_norm": 0.3667079210281372, "learning_rate": 0.0002, "epoch": 0.09540034071550256, "step": 140}, {"loss": 1.9009, "grad_norm": 0.3610645830631256, "learning_rate": 0.0002, "epoch": 0.10221465076660988, "step": 150}, {"loss": 1.8675, "grad_norm": 0.3471229076385498, "learning_rate": 0.0002, "epoch": 0.10902896081771721, "step": 160}, {"loss": 1.8644, "grad_norm": 0.3271232843399048, "learning_rate": 0.0002, "epoch": 0.11584327086882454, "step": 170}, {"loss": 1.8334, "grad_norm": 0.32523733377456665, "learning_rate": 0.0002, "epoch": 0.12265758091993186, "step": 180}, {"loss": 1.8408, "grad_norm": 0.380400687456131, "learning_rate": 0.0002, "epoch": 0.12947189097103917, "step": 190}, {"loss": 1.8442, "grad_norm": 0.3395659625530243, "learning_rate": 0.0002, "epoch": 0.1362862010221465, "step": 200}, {"loss": 1.767, "grad_norm": 0.3838060200214386, "learning_rate": 0.0002, "epoch": 0.14310051107325383, "step": 210}, {"loss": 1.832, "grad_norm": 0.3244315981864929, "learning_rate": 0.0002, "epoch": 0.14991482112436116, "step": 220}, {"loss": 1.8289, "grad_norm": 0.4522663652896881, "learning_rate": 0.0002, "epoch": 0.1567291311754685, "step": 230}, {"loss": 1.8126, "grad_norm": 0.33541423082351685, "learning_rate": 0.0002, "epoch": 0.1635434412265758, "step": 240}, {"loss": 1.7865, "grad_norm": 0.34351587295532227, "learning_rate": 0.0002, "epoch": 0.17035775127768313, "step": 250}, {"loss": 1.7783, "grad_norm": 0.34302911162376404, "learning_rate": 0.0002, "epoch": 0.17717206132879046, "step": 260}, {"loss": 1.8884, "grad_norm": 0.3259182572364807, "learning_rate": 0.0002, "epoch": 0.1839863713798978, "step": 270}, {"loss": 1.8704, "grad_norm": 0.36186060309410095, "learning_rate": 0.0002, "epoch": 0.19080068143100512, "step": 280}, {"loss": 1.8491, "grad_norm": 0.35534435510635376, "learning_rate": 0.0002, "epoch": 0.19761499148211242, "step": 290}, {"loss": 1.8394, "grad_norm": 0.4033444821834564, "learning_rate": 0.0002, "epoch": 0.20442930153321975, "step": 300}, {"loss": 1.9012, "grad_norm": 0.3625958561897278, "learning_rate": 0.0002, "epoch": 0.21124361158432708, "step": 310}, {"loss": 1.811, "grad_norm": 0.32784244418144226, "learning_rate": 0.0002, "epoch": 0.21805792163543442, "step": 320}, {"loss": 1.7068, "grad_norm": 0.3513999283313751, "learning_rate": 0.0002, "epoch": 0.22487223168654175, "step": 330}, {"loss": 1.8163, "grad_norm": 0.3713851273059845, "learning_rate": 0.0002, "epoch": 0.23168654173764908, "step": 340}, {"loss": 1.8401, "grad_norm": 0.31737443804740906, "learning_rate": 0.0002, "epoch": 0.23850085178875638, "step": 350}, {"loss": 1.7996, "grad_norm": 0.34314948320388794, "learning_rate": 0.0002, "epoch": 0.2453151618398637, "step": 360}, {"loss": 1.7154, "grad_norm": 0.3009597361087799, "learning_rate": 0.0002, "epoch": 0.252129471890971, "step": 370}, {"loss": 1.8601, "grad_norm": 0.3803360164165497, "learning_rate": 0.0002, "epoch": 0.25894378194207834, "step": 380}, {"loss": 1.8052, "grad_norm": 0.36017531156539917, "learning_rate": 0.0002, "epoch": 0.2657580919931857, "step": 390}, {"loss": 1.7855, "grad_norm": 0.31508535146713257, "learning_rate": 0.0002, "epoch": 0.272572402044293, "step": 400}, {"loss": 1.8034, "grad_norm": 0.31448641419410706, "learning_rate": 0.0002, "epoch": 0.27938671209540034, "step": 410}, {"loss": 1.8127, "grad_norm": 0.2856496572494507, "learning_rate": 0.0002, "epoch": 0.28620102214650767, "step": 420}, {"loss": 1.786, "grad_norm": 0.33829060196876526, "learning_rate": 0.0002, "epoch": 0.293015332197615, "step": 430}, {"loss": 1.7604, "grad_norm": 0.37043651938438416, "learning_rate": 0.0002, "epoch": 0.29982964224872233, "step": 440}, {"loss": 1.8456, "grad_norm": 0.3388267159461975, "learning_rate": 0.0002, "epoch": 0.30664395229982966, "step": 450}, {"loss": 1.7723, "grad_norm": 0.33093902468681335, "learning_rate": 0.0002, "epoch": 0.313458262350937, "step": 460}, {"loss": 1.7771, "grad_norm": 0.38566166162490845, "learning_rate": 0.0002, "epoch": 0.3202725724020443, "step": 470}, {"loss": 1.7974, "grad_norm": 0.31166040897369385, "learning_rate": 0.0002, "epoch": 0.3270868824531516, "step": 480}, {"loss": 1.8266, "grad_norm": 0.3021579682826996, "learning_rate": 0.0002, "epoch": 0.3339011925042589, "step": 490}, {"loss": 1.7372, "grad_norm": 0.3234574794769287, "learning_rate": 0.0002, "epoch": 0.34071550255536626, "step": 500}, {"loss": 1.8181, "grad_norm": 0.3360481560230255, "learning_rate": 0.0002, "epoch": 0.3475298126064736, "step": 510}, {"loss": 1.8317, "grad_norm": 0.3021681606769562, "learning_rate": 0.0002, "epoch": 0.3543441226575809, "step": 520}, {"loss": 1.7368, "grad_norm": 0.3115723133087158, "learning_rate": 0.0002, "epoch": 0.36115843270868825, "step": 530}, {"loss": 1.8637, "grad_norm": 0.36090970039367676, "learning_rate": 0.0002, "epoch": 0.3679727427597956, "step": 540}, {"loss": 1.8366, "grad_norm": 0.3562108874320984, "learning_rate": 0.0002, "epoch": 0.3747870528109029, "step": 550}, {"loss": 1.749, "grad_norm": 0.32650601863861084, "learning_rate": 0.0002, "epoch": 0.38160136286201024, "step": 560}, {"loss": 1.805, "grad_norm": 0.3092857003211975, "learning_rate": 0.0002, "epoch": 0.38841567291311757, "step": 570}, {"loss": 1.8108, "grad_norm": 0.3257334530353546, "learning_rate": 0.0002, "epoch": 0.39522998296422485, "step": 580}, {"loss": 1.7986, "grad_norm": 0.3228786587715149, "learning_rate": 0.0002, "epoch": 0.4020442930153322, "step": 590}, {"loss": 1.779, "grad_norm": 0.29040178656578064, "learning_rate": 0.0002, "epoch": 0.4088586030664395, "step": 600}, {"loss": 1.8197, "grad_norm": 0.31186041235923767, "learning_rate": 0.0002, "epoch": 0.41567291311754684, "step": 610}, {"loss": 1.7513, "grad_norm": 0.3320131301879883, "learning_rate": 0.0002, "epoch": 0.42248722316865417, "step": 620}, {"loss": 1.7667, "grad_norm": 0.329073041677475, "learning_rate": 0.0002, "epoch": 0.4293015332197615, "step": 630}, {"loss": 1.815, "grad_norm": 0.29447662830352783, "learning_rate": 0.0002, "epoch": 0.43611584327086883, "step": 640}, {"loss": 1.7234, "grad_norm": 0.31414613127708435, "learning_rate": 0.0002, "epoch": 0.44293015332197616, "step": 650}, {"loss": 1.8233, "grad_norm": 0.31708189845085144, "learning_rate": 0.0002, "epoch": 0.4497444633730835, "step": 660}, {"loss": 1.8014, "grad_norm": 0.3106320798397064, "learning_rate": 0.0002, "epoch": 0.4565587734241908, "step": 670}, {"loss": 1.8396, "grad_norm": 0.31112468242645264, "learning_rate": 0.0002, "epoch": 0.46337308347529815, "step": 680}, {"loss": 1.8286, "grad_norm": 0.31917497515678406, "learning_rate": 0.0002, "epoch": 0.47018739352640543, "step": 690}, {"loss": 1.742, "grad_norm": 0.27992361783981323, "learning_rate": 0.0002, "epoch": 0.47700170357751276, "step": 700}, {"loss": 1.695, "grad_norm": 0.30341657996177673, "learning_rate": 0.0002, "epoch": 0.4838160136286201, "step": 710}, {"loss": 1.8061, "grad_norm": 0.3544738292694092, "learning_rate": 0.0002, "epoch": 0.4906303236797274, "step": 720}, {"loss": 1.7353, "grad_norm": 0.3036152124404907, "learning_rate": 0.0002, "epoch": 0.49744463373083475, "step": 730}, {"loss": 1.8512, "grad_norm": 0.28593453764915466, "learning_rate": 0.0002, "epoch": 0.504258943781942, "step": 740}, {"loss": 1.7743, "grad_norm": 0.3141646385192871, "learning_rate": 0.0002, "epoch": 0.5110732538330494, "step": 750}, {"loss": 1.7817, "grad_norm": 0.354331910610199, "learning_rate": 0.0002, "epoch": 0.5178875638841567, "step": 760}, {"loss": 1.7129, "grad_norm": 0.29167047142982483, "learning_rate": 0.0002, "epoch": 0.524701873935264, "step": 770}, {"loss": 1.7899, "grad_norm": 0.3061729371547699, "learning_rate": 0.0002, "epoch": 0.5315161839863713, "step": 780}, {"loss": 1.8433, "grad_norm": 0.3108988404273987, "learning_rate": 0.0002, "epoch": 0.5383304940374787, "step": 790}, {"loss": 1.8625, "grad_norm": 0.3126199543476105, "learning_rate": 0.0002, "epoch": 0.545144804088586, "step": 800}, {"loss": 1.905, "grad_norm": 0.32045695185661316, "learning_rate": 0.0002, "epoch": 0.5519591141396933, "step": 810}, {"loss": 1.8324, "grad_norm": 0.31588202714920044, "learning_rate": 0.0002, "epoch": 0.5587734241908007, "step": 820}, {"loss": 1.7183, "grad_norm": 0.2906382977962494, "learning_rate": 0.0002, "epoch": 0.565587734241908, "step": 830}, {"loss": 1.8043, "grad_norm": 0.31454139947891235, "learning_rate": 0.0002, "epoch": 0.5724020442930153, "step": 840}, {"loss": 1.699, "grad_norm": 0.3258282244205475, "learning_rate": 0.0002, "epoch": 0.5792163543441227, "step": 850}, {"loss": 1.8034, "grad_norm": 0.2872781455516815, "learning_rate": 0.0002, "epoch": 0.58603066439523, "step": 860}, {"loss": 1.6896, "grad_norm": 0.29874536395072937, "learning_rate": 0.0002, "epoch": 0.5928449744463373, "step": 870}, {"loss": 1.81, "grad_norm": 0.29631322622299194, "learning_rate": 0.0002, "epoch": 0.5996592844974447, "step": 880}, {"loss": 1.8214, "grad_norm": 0.34305325150489807, "learning_rate": 0.0002, "epoch": 0.606473594548552, "step": 890}, {"loss": 1.7669, "grad_norm": 0.2909330427646637, "learning_rate": 0.0002, "epoch": 0.6132879045996593, "step": 900}, {"loss": 1.8345, "grad_norm": 0.3237459063529968, "learning_rate": 0.0002, "epoch": 0.6201022146507666, "step": 910}, {"loss": 1.7959, "grad_norm": 0.3253563642501831, "learning_rate": 0.0002, "epoch": 0.626916524701874, "step": 920}, {"loss": 1.7597, "grad_norm": 0.3089054524898529, "learning_rate": 0.0002, "epoch": 0.6337308347529813, "step": 930}, {"loss": 1.85, "grad_norm": 0.3492709994316101, "learning_rate": 0.0002, "epoch": 0.6405451448040886, "step": 940}, {"loss": 1.707, "grad_norm": 0.29194191098213196, "learning_rate": 0.0002, "epoch": 0.6473594548551959, "step": 950}, {"loss": 1.715, "grad_norm": 0.322201132774353, "learning_rate": 0.0002, "epoch": 0.6541737649063032, "step": 960}, {"loss": 1.7784, "grad_norm": 0.29320329427719116, "learning_rate": 0.0002, "epoch": 0.6609880749574105, "step": 970}, {"loss": 1.7581, "grad_norm": 0.3330402970314026, "learning_rate": 0.0002, "epoch": 0.6678023850085179, "step": 980}, {"loss": 1.7923, "grad_norm": 0.32067734003067017, "learning_rate": 0.0002, "epoch": 0.6746166950596252, "step": 990}, {"loss": 1.8199, "grad_norm": 0.30349814891815186, "learning_rate": 0.0002, "epoch": 0.6814310051107325, "step": 1000}, {"loss": 1.7709, "grad_norm": 0.2959327697753906, "learning_rate": 0.0002, "epoch": 0.6882453151618398, "step": 1010}, {"loss": 1.7824, "grad_norm": 0.2884613275527954, "learning_rate": 0.0002, "epoch": 0.6950596252129472, "step": 1020}, {"loss": 1.7942, "grad_norm": 0.3494454622268677, "learning_rate": 0.0002, "epoch": 0.7018739352640545, "step": 1030}, {"loss": 1.7104, "grad_norm": 0.2978321313858032, "learning_rate": 0.0002, "epoch": 0.7086882453151618, "step": 1040}, {"loss": 1.8721, "grad_norm": 0.30021217465400696, "learning_rate": 0.0002, "epoch": 0.7155025553662692, "step": 1050}, {"loss": 1.7811, "grad_norm": 0.3258594572544098, "learning_rate": 0.0002, "epoch": 0.7223168654173765, "step": 1060}, {"loss": 1.7836, "grad_norm": 0.2974754571914673, "learning_rate": 0.0002, "epoch": 0.7291311754684838, "step": 1070}, {"loss": 1.7812, "grad_norm": 0.2970089018344879, "learning_rate": 0.0002, "epoch": 0.7359454855195912, "step": 1080}, {"loss": 1.767, "grad_norm": 0.3410273492336273, "learning_rate": 0.0002, "epoch": 0.7427597955706985, "step": 1090}, {"loss": 1.7331, "grad_norm": 0.27737298607826233, "learning_rate": 0.0002, "epoch": 0.7495741056218058, "step": 1100}, {"loss": 1.806, "grad_norm": 0.2745804190635681, "learning_rate": 0.0002, "epoch": 0.7563884156729132, "step": 1110}, {"loss": 1.8067, "grad_norm": 0.31683582067489624, "learning_rate": 0.0002, "epoch": 0.7632027257240205, "step": 1120}, {"loss": 1.6998, "grad_norm": 0.3282010853290558, "learning_rate": 0.0002, "epoch": 0.7700170357751278, "step": 1130}, {"loss": 1.8469, "grad_norm": 0.29620859026908875, "learning_rate": 0.0002, "epoch": 0.7768313458262351, "step": 1140}, {"loss": 1.7014, "grad_norm": 0.2987517714500427, "learning_rate": 0.0002, "epoch": 0.7836456558773425, "step": 1150}, {"loss": 1.8152, "grad_norm": 0.30824369192123413, "learning_rate": 0.0002, "epoch": 0.7904599659284497, "step": 1160}, {"loss": 1.7628, "grad_norm": 0.2937208116054535, "learning_rate": 0.0002, "epoch": 0.797274275979557, "step": 1170}, {"loss": 1.8246, "grad_norm": 0.31872445344924927, "learning_rate": 0.0002, "epoch": 0.8040885860306644, "step": 1180}, {"loss": 1.7757, "grad_norm": 0.3155718147754669, "learning_rate": 0.0002, "epoch": 0.8109028960817717, "step": 1190}, {"loss": 1.7588, "grad_norm": 0.3172800540924072, "learning_rate": 0.0002, "epoch": 0.817717206132879, "step": 1200}, {"loss": 1.7906, "grad_norm": 0.2964530885219574, "learning_rate": 0.0002, "epoch": 0.8245315161839863, "step": 1210}, {"loss": 1.8038, "grad_norm": 0.27863815426826477, "learning_rate": 0.0002, "epoch": 0.8313458262350937, "step": 1220}, {"loss": 1.7378, "grad_norm": 0.3413368761539459, "learning_rate": 0.0002, "epoch": 0.838160136286201, "step": 1230}, {"loss": 1.8283, "grad_norm": 0.31256571412086487, "learning_rate": 0.0002, "epoch": 0.8449744463373083, "step": 1240}, {"loss": 1.7992, "grad_norm": 0.2880518436431885, "learning_rate": 0.0002, "epoch": 0.8517887563884157, "step": 1250}, {"loss": 1.6853, "grad_norm": 0.3013632893562317, "learning_rate": 0.0002, "epoch": 0.858603066439523, "step": 1260}, {"loss": 1.85, "grad_norm": 0.2975374162197113, "learning_rate": 0.0002, "epoch": 0.8654173764906303, "step": 1270}, {"loss": 1.752, "grad_norm": 0.28066179156303406, "learning_rate": 0.0002, "epoch": 0.8722316865417377, "step": 1280}, {"loss": 1.7444, "grad_norm": 0.2936013340950012, "learning_rate": 0.0002, "epoch": 0.879045996592845, "step": 1290}, {"loss": 1.8569, "grad_norm": 0.3259948194026947, "learning_rate": 0.0002, "epoch": 0.8858603066439523, "step": 1300}, {"loss": 1.7641, "grad_norm": 0.32330527901649475, "learning_rate": 0.0002, "epoch": 0.8926746166950597, "step": 1310}, {"loss": 1.771, "grad_norm": 0.2923520803451538, "learning_rate": 0.0002, "epoch": 0.899488926746167, "step": 1320}, {"loss": 1.9084, "grad_norm": 0.29356926679611206, "learning_rate": 0.0002, "epoch": 0.9063032367972743, "step": 1330}, {"loss": 1.7867, "grad_norm": 0.26950427889823914, "learning_rate": 0.0002, "epoch": 0.9131175468483816, "step": 1340}, {"loss": 1.8001, "grad_norm": 0.2880370020866394, "learning_rate": 0.0002, "epoch": 0.919931856899489, "step": 1350}, {"loss": 1.736, "grad_norm": 0.2875469923019409, "learning_rate": 0.0002, "epoch": 0.9267461669505963, "step": 1360}, {"loss": 1.7843, "grad_norm": 0.32968875765800476, "learning_rate": 0.0002, "epoch": 0.9335604770017035, "step": 1370}, {"loss": 1.7543, "grad_norm": 0.30070847272872925, "learning_rate": 0.0002, "epoch": 0.9403747870528109, "step": 1380}, {"loss": 1.8313, "grad_norm": 0.29682332277297974, "learning_rate": 0.0002, "epoch": 0.9471890971039182, "step": 1390}, {"loss": 1.7846, "grad_norm": 0.3033919036388397, "learning_rate": 0.0002, "epoch": 0.9540034071550255, "step": 1400}, {"loss": 1.7929, "grad_norm": 0.2910512685775757, "learning_rate": 0.0002, "epoch": 0.9608177172061328, "step": 1410}, {"loss": 1.837, "grad_norm": 0.34930622577667236, "learning_rate": 0.0002, "epoch": 0.9676320272572402, "step": 1420}, {"loss": 1.8057, "grad_norm": 0.28808438777923584, "learning_rate": 0.0002, "epoch": 0.9744463373083475, "step": 1430}, {"loss": 1.7563, "grad_norm": 0.3028844892978668, "learning_rate": 0.0002, "epoch": 0.9812606473594548, "step": 1440}, {"loss": 1.7546, "grad_norm": 0.3123234212398529, "learning_rate": 0.0002, "epoch": 0.9880749574105622, "step": 1450}, {"loss": 1.8141, "grad_norm": 0.28183823823928833, "learning_rate": 0.0002, "epoch": 0.9948892674616695, "step": 1460}]} +{"epoch": 0.9996592844974447, "step": 1467, "epoch_duration": 2186.659736633301, "total_accumulated_duration": 2186.659736633301, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 11696.9921875}, "avg_memory_reserved": {"GPU_0": 12758.0}, "peak_memory_reserved": {"GPU_0": 12758.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "N/A", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0988, "grad_norm": 0.635574460029602, "learning_rate": 0.0002, "epoch": 0.0068143100511073255, "step": 10}, {"loss": 2.5594, "grad_norm": 1.0401769876480103, "learning_rate": 0.0002, "epoch": 0.013628620102214651, "step": 20}, {"loss": 2.3587, "grad_norm": 0.4514131247997284, "learning_rate": 0.0002, "epoch": 0.020442930153321975, "step": 30}, {"loss": 2.121, "grad_norm": 0.6303355693817139, "learning_rate": 0.0002, "epoch": 0.027257240204429302, "step": 40}, {"loss": 1.9833, "grad_norm": 0.4648270606994629, "learning_rate": 0.0002, "epoch": 0.034071550255536626, "step": 50}, {"loss": 1.9384, "grad_norm": 0.42953479290008545, "learning_rate": 0.0002, "epoch": 0.04088586030664395, "step": 60}, {"loss": 1.9202, "grad_norm": 0.433614581823349, "learning_rate": 0.0002, "epoch": 0.04770017035775128, "step": 70}, {"loss": 1.911, "grad_norm": 0.45318254828453064, "learning_rate": 0.0002, "epoch": 0.054514480408858604, "step": 80}, {"loss": 1.8895, "grad_norm": 0.4023568630218506, "learning_rate": 0.0002, "epoch": 0.06132879045996593, "step": 90}, {"loss": 1.9257, "grad_norm": 0.43260207772254944, "learning_rate": 0.0002, "epoch": 0.06814310051107325, "step": 100}, {"loss": 1.9178, "grad_norm": 0.43389809131622314, "learning_rate": 0.0002, "epoch": 0.07495741056218058, "step": 110}, {"loss": 1.9071, "grad_norm": 0.39307987689971924, "learning_rate": 0.0002, "epoch": 0.0817717206132879, "step": 120}, {"loss": 1.9482, "grad_norm": 0.7703037261962891, "learning_rate": 0.0002, "epoch": 0.08858603066439523, "step": 130}, {"loss": 1.8394, "grad_norm": 0.38547563552856445, "learning_rate": 0.0002, "epoch": 0.09540034071550256, "step": 140}, {"loss": 1.9037, "grad_norm": 0.37948688864707947, "learning_rate": 0.0002, "epoch": 0.10221465076660988, "step": 150}, {"loss": 1.8664, "grad_norm": 0.33022379875183105, "learning_rate": 0.0002, "epoch": 0.10902896081771721, "step": 160}, {"loss": 1.8659, "grad_norm": 0.33703792095184326, "learning_rate": 0.0002, "epoch": 0.11584327086882454, "step": 170}, {"loss": 1.8389, "grad_norm": 0.3259912431240082, "learning_rate": 0.0002, "epoch": 0.12265758091993186, "step": 180}, {"loss": 1.8424, "grad_norm": 0.3593858778476715, "learning_rate": 0.0002, "epoch": 0.12947189097103917, "step": 190}, {"loss": 1.8457, "grad_norm": 0.3401614725589752, "learning_rate": 0.0002, "epoch": 0.1362862010221465, "step": 200}, {"loss": 1.7721, "grad_norm": 0.3892078697681427, "learning_rate": 0.0002, "epoch": 0.14310051107325383, "step": 210}, {"loss": 1.8351, "grad_norm": 0.315374493598938, "learning_rate": 0.0002, "epoch": 0.14991482112436116, "step": 220}, {"loss": 1.8307, "grad_norm": 0.3679497539997101, "learning_rate": 0.0002, "epoch": 0.1567291311754685, "step": 230}, {"loss": 1.816, "grad_norm": 0.336730033159256, "learning_rate": 0.0002, "epoch": 0.1635434412265758, "step": 240}, {"loss": 1.7849, "grad_norm": 0.36472755670547485, "learning_rate": 0.0002, "epoch": 0.17035775127768313, "step": 250}, {"loss": 1.7766, "grad_norm": 0.34864023327827454, "learning_rate": 0.0002, "epoch": 0.17717206132879046, "step": 260}, {"loss": 1.896, "grad_norm": 0.3350819945335388, "learning_rate": 0.0002, "epoch": 0.1839863713798978, "step": 270}, {"loss": 1.8742, "grad_norm": 0.3844246566295624, "learning_rate": 0.0002, "epoch": 0.19080068143100512, "step": 280}, {"loss": 1.8531, "grad_norm": 0.38413065671920776, "learning_rate": 0.0002, "epoch": 0.19761499148211242, "step": 290}, {"loss": 1.8415, "grad_norm": 0.4080047607421875, "learning_rate": 0.0002, "epoch": 0.20442930153321975, "step": 300}, {"loss": 1.9009, "grad_norm": 0.3546800911426544, "learning_rate": 0.0002, "epoch": 0.21124361158432708, "step": 310}, {"loss": 1.8092, "grad_norm": 0.38577890396118164, "learning_rate": 0.0002, "epoch": 0.21805792163543442, "step": 320}, {"loss": 1.7113, "grad_norm": 0.38979098200798035, "learning_rate": 0.0002, "epoch": 0.22487223168654175, "step": 330}, {"loss": 1.8162, "grad_norm": 0.35105520486831665, "learning_rate": 0.0002, "epoch": 0.23168654173764908, "step": 340}, {"loss": 1.8408, "grad_norm": 0.31671223044395447, "learning_rate": 0.0002, "epoch": 0.23850085178875638, "step": 350}, {"loss": 1.8014, "grad_norm": 0.33034196496009827, "learning_rate": 0.0002, "epoch": 0.2453151618398637, "step": 360}, {"loss": 1.7132, "grad_norm": 0.2990533709526062, "learning_rate": 0.0002, "epoch": 0.252129471890971, "step": 370}, {"loss": 1.8612, "grad_norm": 0.362208753824234, "learning_rate": 0.0002, "epoch": 0.25894378194207834, "step": 380}, {"loss": 1.8001, "grad_norm": 0.3269096612930298, "learning_rate": 0.0002, "epoch": 0.2657580919931857, "step": 390}, {"loss": 1.786, "grad_norm": 0.30555954575538635, "learning_rate": 0.0002, "epoch": 0.272572402044293, "step": 400}, {"loss": 1.8018, "grad_norm": 0.332933247089386, "learning_rate": 0.0002, "epoch": 0.27938671209540034, "step": 410}, {"loss": 1.8157, "grad_norm": 0.39454060792922974, "learning_rate": 0.0002, "epoch": 0.28620102214650767, "step": 420}, {"loss": 1.7862, "grad_norm": 0.34589633345603943, "learning_rate": 0.0002, "epoch": 0.293015332197615, "step": 430}, {"loss": 1.7612, "grad_norm": 0.3747332990169525, "learning_rate": 0.0002, "epoch": 0.29982964224872233, "step": 440}, {"loss": 1.8476, "grad_norm": 0.34825369715690613, "learning_rate": 0.0002, "epoch": 0.30664395229982966, "step": 450}, {"loss": 1.775, "grad_norm": 0.32906976342201233, "learning_rate": 0.0002, "epoch": 0.313458262350937, "step": 460}, {"loss": 1.7764, "grad_norm": 0.33108609914779663, "learning_rate": 0.0002, "epoch": 0.3202725724020443, "step": 470}, {"loss": 1.8012, "grad_norm": 0.3170463442802429, "learning_rate": 0.0002, "epoch": 0.3270868824531516, "step": 480}, {"loss": 1.8264, "grad_norm": 0.30792081356048584, "learning_rate": 0.0002, "epoch": 0.3339011925042589, "step": 490}, {"loss": 1.7361, "grad_norm": 0.31772997975349426, "learning_rate": 0.0002, "epoch": 0.34071550255536626, "step": 500}, {"loss": 1.8147, "grad_norm": 0.32714012265205383, "learning_rate": 0.0002, "epoch": 0.3475298126064736, "step": 510}, {"loss": 1.8332, "grad_norm": 0.3021100163459778, "learning_rate": 0.0002, "epoch": 0.3543441226575809, "step": 520}, {"loss": 1.7337, "grad_norm": 0.6045835018157959, "learning_rate": 0.0002, "epoch": 0.36115843270868825, "step": 530}, {"loss": 1.8661, "grad_norm": 0.3003896474838257, "learning_rate": 0.0002, "epoch": 0.3679727427597956, "step": 540}, {"loss": 1.8359, "grad_norm": 0.3678470551967621, "learning_rate": 0.0002, "epoch": 0.3747870528109029, "step": 550}, {"loss": 1.7512, "grad_norm": 0.35787615180015564, "learning_rate": 0.0002, "epoch": 0.38160136286201024, "step": 560}, {"loss": 1.8048, "grad_norm": 0.31882143020629883, "learning_rate": 0.0002, "epoch": 0.38841567291311757, "step": 570}, {"loss": 1.8108, "grad_norm": 0.3186313509941101, "learning_rate": 0.0002, "epoch": 0.39522998296422485, "step": 580}, {"loss": 1.8012, "grad_norm": 0.41443702578544617, "learning_rate": 0.0002, "epoch": 0.4020442930153322, "step": 590}, {"loss": 1.7794, "grad_norm": 0.28773069381713867, "learning_rate": 0.0002, "epoch": 0.4088586030664395, "step": 600}, {"loss": 1.8231, "grad_norm": 0.35743263363838196, "learning_rate": 0.0002, "epoch": 0.41567291311754684, "step": 610}, {"loss": 1.7531, "grad_norm": 0.3360286355018616, "learning_rate": 0.0002, "epoch": 0.42248722316865417, "step": 620}, {"loss": 1.7654, "grad_norm": 0.32838866114616394, "learning_rate": 0.0002, "epoch": 0.4293015332197615, "step": 630}, {"loss": 1.8176, "grad_norm": 0.2994388937950134, "learning_rate": 0.0002, "epoch": 0.43611584327086883, "step": 640}, {"loss": 1.7241, "grad_norm": 0.3306307792663574, "learning_rate": 0.0002, "epoch": 0.44293015332197616, "step": 650}, {"loss": 1.8201, "grad_norm": 0.3129560351371765, "learning_rate": 0.0002, "epoch": 0.4497444633730835, "step": 660}, {"loss": 1.803, "grad_norm": 0.3244289457798004, "learning_rate": 0.0002, "epoch": 0.4565587734241908, "step": 670}, {"loss": 1.8399, "grad_norm": 0.3196892738342285, "learning_rate": 0.0002, "epoch": 0.46337308347529815, "step": 680}, {"loss": 1.8291, "grad_norm": 0.3135230243206024, "learning_rate": 0.0002, "epoch": 0.47018739352640543, "step": 690}, {"loss": 1.7423, "grad_norm": 0.28677991032600403, "learning_rate": 0.0002, "epoch": 0.47700170357751276, "step": 700}, {"loss": 1.6982, "grad_norm": 0.3074065148830414, "learning_rate": 0.0002, "epoch": 0.4838160136286201, "step": 710}, {"loss": 1.8078, "grad_norm": 0.3354290723800659, "learning_rate": 0.0002, "epoch": 0.4906303236797274, "step": 720}, {"loss": 1.7333, "grad_norm": 0.324370801448822, "learning_rate": 0.0002, "epoch": 0.49744463373083475, "step": 730}, {"loss": 1.8506, "grad_norm": 0.29496142268180847, "learning_rate": 0.0002, "epoch": 0.504258943781942, "step": 740}, {"loss": 1.7761, "grad_norm": 0.30694130063056946, "learning_rate": 0.0002, "epoch": 0.5110732538330494, "step": 750}, {"loss": 1.7854, "grad_norm": 0.36168408393859863, "learning_rate": 0.0002, "epoch": 0.5178875638841567, "step": 760}, {"loss": 1.7149, "grad_norm": 0.2930343449115753, "learning_rate": 0.0002, "epoch": 0.524701873935264, "step": 770}, {"loss": 1.7924, "grad_norm": 0.3023432493209839, "learning_rate": 0.0002, "epoch": 0.5315161839863713, "step": 780}, {"loss": 1.8467, "grad_norm": 0.3272720277309418, "learning_rate": 0.0002, "epoch": 0.5383304940374787, "step": 790}, {"loss": 1.8639, "grad_norm": 0.2907974421977997, "learning_rate": 0.0002, "epoch": 0.545144804088586, "step": 800}, {"loss": 1.9018, "grad_norm": 0.32267168164253235, "learning_rate": 0.0002, "epoch": 0.5519591141396933, "step": 810}, {"loss": 1.8311, "grad_norm": 0.32059940695762634, "learning_rate": 0.0002, "epoch": 0.5587734241908007, "step": 820}, {"loss": 1.7234, "grad_norm": 0.30951258540153503, "learning_rate": 0.0002, "epoch": 0.565587734241908, "step": 830}, {"loss": 1.8063, "grad_norm": 0.33976122736930847, "learning_rate": 0.0002, "epoch": 0.5724020442930153, "step": 840}, {"loss": 1.7021, "grad_norm": 0.3195820450782776, "learning_rate": 0.0002, "epoch": 0.5792163543441227, "step": 850}, {"loss": 1.8073, "grad_norm": 0.2828562557697296, "learning_rate": 0.0002, "epoch": 0.58603066439523, "step": 860}, {"loss": 1.695, "grad_norm": 0.29591670632362366, "learning_rate": 0.0002, "epoch": 0.5928449744463373, "step": 870}, {"loss": 1.8109, "grad_norm": 0.3086104393005371, "learning_rate": 0.0002, "epoch": 0.5996592844974447, "step": 880}, {"loss": 1.8207, "grad_norm": 0.3592929542064667, "learning_rate": 0.0002, "epoch": 0.606473594548552, "step": 890}, {"loss": 1.7662, "grad_norm": 0.2830186188220978, "learning_rate": 0.0002, "epoch": 0.6132879045996593, "step": 900}, {"loss": 1.8344, "grad_norm": 0.3128598630428314, "learning_rate": 0.0002, "epoch": 0.6201022146507666, "step": 910}, {"loss": 1.7953, "grad_norm": 0.31957563757896423, "learning_rate": 0.0002, "epoch": 0.626916524701874, "step": 920}, {"loss": 1.7578, "grad_norm": 0.30994319915771484, "learning_rate": 0.0002, "epoch": 0.6337308347529813, "step": 930}, {"loss": 1.8494, "grad_norm": 0.3352845013141632, "learning_rate": 0.0002, "epoch": 0.6405451448040886, "step": 940}, {"loss": 1.7054, "grad_norm": 0.2960077226161957, "learning_rate": 0.0002, "epoch": 0.6473594548551959, "step": 950}, {"loss": 1.7209, "grad_norm": 0.32675081491470337, "learning_rate": 0.0002, "epoch": 0.6541737649063032, "step": 960}, {"loss": 1.7776, "grad_norm": 0.30042028427124023, "learning_rate": 0.0002, "epoch": 0.6609880749574105, "step": 970}, {"loss": 1.7597, "grad_norm": 0.3288673758506775, "learning_rate": 0.0002, "epoch": 0.6678023850085179, "step": 980}, {"loss": 1.7962, "grad_norm": 0.323215126991272, "learning_rate": 0.0002, "epoch": 0.6746166950596252, "step": 990}, {"loss": 1.821, "grad_norm": 0.30041399598121643, "learning_rate": 0.0002, "epoch": 0.6814310051107325, "step": 1000}, {"loss": 1.7716, "grad_norm": 0.3076179623603821, "learning_rate": 0.0002, "epoch": 0.6882453151618398, "step": 1010}, {"loss": 1.7827, "grad_norm": 0.2941909730434418, "learning_rate": 0.0002, "epoch": 0.6950596252129472, "step": 1020}, {"loss": 1.7964, "grad_norm": 0.32220420241355896, "learning_rate": 0.0002, "epoch": 0.7018739352640545, "step": 1030}, {"loss": 1.7101, "grad_norm": 0.2989702820777893, "learning_rate": 0.0002, "epoch": 0.7086882453151618, "step": 1040}, {"loss": 1.8749, "grad_norm": 0.3049640357494354, "learning_rate": 0.0002, "epoch": 0.7155025553662692, "step": 1050}, {"loss": 1.782, "grad_norm": 0.3183284103870392, "learning_rate": 0.0002, "epoch": 0.7223168654173765, "step": 1060}, {"loss": 1.785, "grad_norm": 0.3070095181465149, "learning_rate": 0.0002, "epoch": 0.7291311754684838, "step": 1070}, {"loss": 1.7832, "grad_norm": 0.33263063430786133, "learning_rate": 0.0002, "epoch": 0.7359454855195912, "step": 1080}, {"loss": 1.7627, "grad_norm": 0.28774312138557434, "learning_rate": 0.0002, "epoch": 0.7427597955706985, "step": 1090}, {"loss": 1.7343, "grad_norm": 0.29304224252700806, "learning_rate": 0.0002, "epoch": 0.7495741056218058, "step": 1100}, {"loss": 1.8082, "grad_norm": 0.27529507875442505, "learning_rate": 0.0002, "epoch": 0.7563884156729132, "step": 1110}, {"loss": 1.8071, "grad_norm": 0.32319945096969604, "learning_rate": 0.0002, "epoch": 0.7632027257240205, "step": 1120}, {"loss": 1.6998, "grad_norm": 0.33777597546577454, "learning_rate": 0.0002, "epoch": 0.7700170357751278, "step": 1130}, {"loss": 1.8488, "grad_norm": 0.29847201704978943, "learning_rate": 0.0002, "epoch": 0.7768313458262351, "step": 1140}, {"loss": 1.7014, "grad_norm": 0.31370633840560913, "learning_rate": 0.0002, "epoch": 0.7836456558773425, "step": 1150}, {"loss": 1.8175, "grad_norm": 0.31633856892585754, "learning_rate": 0.0002, "epoch": 0.7904599659284497, "step": 1160}, {"loss": 1.7637, "grad_norm": 0.33849263191223145, "learning_rate": 0.0002, "epoch": 0.797274275979557, "step": 1170}, {"loss": 1.8264, "grad_norm": 0.3306333124637604, "learning_rate": 0.0002, "epoch": 0.8040885860306644, "step": 1180}, {"loss": 1.777, "grad_norm": 0.3249678313732147, "learning_rate": 0.0002, "epoch": 0.8109028960817717, "step": 1190}, {"loss": 1.7631, "grad_norm": 0.3252817690372467, "learning_rate": 0.0002, "epoch": 0.817717206132879, "step": 1200}, {"loss": 1.7877, "grad_norm": 0.31772559881210327, "learning_rate": 0.0002, "epoch": 0.8245315161839863, "step": 1210}, {"loss": 1.8068, "grad_norm": 0.2803371846675873, "learning_rate": 0.0002, "epoch": 0.8313458262350937, "step": 1220}, {"loss": 1.738, "grad_norm": 0.26700571179389954, "learning_rate": 0.0002, "epoch": 0.838160136286201, "step": 1230}, {"loss": 1.8301, "grad_norm": 0.3060092031955719, "learning_rate": 0.0002, "epoch": 0.8449744463373083, "step": 1240}, {"loss": 1.7993, "grad_norm": 0.28831684589385986, "learning_rate": 0.0002, "epoch": 0.8517887563884157, "step": 1250}, {"loss": 1.6909, "grad_norm": 0.30708742141723633, "learning_rate": 0.0002, "epoch": 0.858603066439523, "step": 1260}, {"loss": 1.8506, "grad_norm": 0.2915987968444824, "learning_rate": 0.0002, "epoch": 0.8654173764906303, "step": 1270}, {"loss": 1.7536, "grad_norm": 0.2893589735031128, "learning_rate": 0.0002, "epoch": 0.8722316865417377, "step": 1280}, {"loss": 1.7437, "grad_norm": 0.29545632004737854, "learning_rate": 0.0002, "epoch": 0.879045996592845, "step": 1290}, {"loss": 1.859, "grad_norm": 0.3354771137237549, "learning_rate": 0.0002, "epoch": 0.8858603066439523, "step": 1300}, {"loss": 1.7644, "grad_norm": 0.37715399265289307, "learning_rate": 0.0002, "epoch": 0.8926746166950597, "step": 1310}, {"loss": 1.7731, "grad_norm": 0.28847193717956543, "learning_rate": 0.0002, "epoch": 0.899488926746167, "step": 1320}, {"loss": 1.9077, "grad_norm": 0.2780889868736267, "learning_rate": 0.0002, "epoch": 0.9063032367972743, "step": 1330}, {"loss": 1.7861, "grad_norm": 0.2714342176914215, "learning_rate": 0.0002, "epoch": 0.9131175468483816, "step": 1340}, {"loss": 1.8, "grad_norm": 0.2950133979320526, "learning_rate": 0.0002, "epoch": 0.919931856899489, "step": 1350}, {"loss": 1.7368, "grad_norm": 0.29097145795822144, "learning_rate": 0.0002, "epoch": 0.9267461669505963, "step": 1360}, {"loss": 1.7864, "grad_norm": 0.32540133595466614, "learning_rate": 0.0002, "epoch": 0.9335604770017035, "step": 1370}, {"loss": 1.7571, "grad_norm": 0.3076636493206024, "learning_rate": 0.0002, "epoch": 0.9403747870528109, "step": 1380}, {"loss": 1.8312, "grad_norm": 0.2962130308151245, "learning_rate": 0.0002, "epoch": 0.9471890971039182, "step": 1390}, {"loss": 1.7858, "grad_norm": 0.30086860060691833, "learning_rate": 0.0002, "epoch": 0.9540034071550255, "step": 1400}, {"loss": 1.7991, "grad_norm": 0.28634947538375854, "learning_rate": 0.0002, "epoch": 0.9608177172061328, "step": 1410}, {"loss": 1.8385, "grad_norm": 0.35314416885375977, "learning_rate": 0.0002, "epoch": 0.9676320272572402, "step": 1420}, {"loss": 1.8054, "grad_norm": 0.2939317524433136, "learning_rate": 0.0002, "epoch": 0.9744463373083475, "step": 1430}, {"loss": 1.7582, "grad_norm": 0.3010196089744568, "learning_rate": 0.0002, "epoch": 0.9812606473594548, "step": 1440}, {"loss": 1.758, "grad_norm": 0.30816152691841125, "learning_rate": 0.0002, "epoch": 0.9880749574105622, "step": 1450}, {"loss": 1.8159, "grad_norm": 0.28152793645858765, "learning_rate": 0.0002, "epoch": 0.9948892674616695, "step": 1460}]} +{"epoch": 2.0, "step": 2935, "epoch_duration": 2323.3215596675873, "total_accumulated_duration": 4509.981296300888, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7751.47119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 28746.0}, "peak_memory_reserved": {"GPU_0": 28746.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0988, "grad_norm": 0.635574460029602, "learning_rate": 0.0002, "epoch": 0.0068143100511073255, "step": 10}, {"loss": 2.5594, "grad_norm": 1.0401769876480103, "learning_rate": 0.0002, "epoch": 0.013628620102214651, "step": 20}, {"loss": 2.3587, "grad_norm": 0.4514131247997284, "learning_rate": 0.0002, "epoch": 0.020442930153321975, "step": 30}, {"loss": 2.121, "grad_norm": 0.6303355693817139, "learning_rate": 0.0002, "epoch": 0.027257240204429302, "step": 40}, {"loss": 1.9833, "grad_norm": 0.4648270606994629, "learning_rate": 0.0002, "epoch": 0.034071550255536626, "step": 50}, {"loss": 1.9384, "grad_norm": 0.42953479290008545, "learning_rate": 0.0002, "epoch": 0.04088586030664395, "step": 60}, {"loss": 1.9202, "grad_norm": 0.433614581823349, "learning_rate": 0.0002, "epoch": 0.04770017035775128, "step": 70}, {"loss": 1.911, "grad_norm": 0.45318254828453064, "learning_rate": 0.0002, "epoch": 0.054514480408858604, "step": 80}, {"loss": 1.8895, "grad_norm": 0.4023568630218506, "learning_rate": 0.0002, "epoch": 0.06132879045996593, "step": 90}, {"loss": 1.9257, "grad_norm": 0.43260207772254944, "learning_rate": 0.0002, "epoch": 0.06814310051107325, "step": 100}, {"loss": 1.9178, "grad_norm": 0.43389809131622314, "learning_rate": 0.0002, "epoch": 0.07495741056218058, "step": 110}, {"loss": 1.9071, "grad_norm": 0.39307987689971924, "learning_rate": 0.0002, "epoch": 0.0817717206132879, "step": 120}, {"loss": 1.9482, "grad_norm": 0.7703037261962891, "learning_rate": 0.0002, "epoch": 0.08858603066439523, "step": 130}, {"loss": 1.8394, "grad_norm": 0.38547563552856445, "learning_rate": 0.0002, "epoch": 0.09540034071550256, "step": 140}, {"loss": 1.9037, "grad_norm": 0.37948688864707947, "learning_rate": 0.0002, "epoch": 0.10221465076660988, "step": 150}, {"loss": 1.8664, "grad_norm": 0.33022379875183105, "learning_rate": 0.0002, "epoch": 0.10902896081771721, "step": 160}, {"loss": 1.8659, "grad_norm": 0.33703792095184326, "learning_rate": 0.0002, "epoch": 0.11584327086882454, "step": 170}, {"loss": 1.8389, "grad_norm": 0.3259912431240082, "learning_rate": 0.0002, "epoch": 0.12265758091993186, "step": 180}, {"loss": 1.8424, "grad_norm": 0.3593858778476715, "learning_rate": 0.0002, "epoch": 0.12947189097103917, "step": 190}, {"loss": 1.8457, "grad_norm": 0.3401614725589752, "learning_rate": 0.0002, "epoch": 0.1362862010221465, "step": 200}, {"loss": 1.7721, "grad_norm": 0.3892078697681427, "learning_rate": 0.0002, "epoch": 0.14310051107325383, "step": 210}, {"loss": 1.8351, "grad_norm": 0.315374493598938, "learning_rate": 0.0002, "epoch": 0.14991482112436116, "step": 220}, {"loss": 1.8307, "grad_norm": 0.3679497539997101, "learning_rate": 0.0002, "epoch": 0.1567291311754685, "step": 230}, {"loss": 1.816, "grad_norm": 0.336730033159256, "learning_rate": 0.0002, "epoch": 0.1635434412265758, "step": 240}, {"loss": 1.7849, "grad_norm": 0.36472755670547485, "learning_rate": 0.0002, "epoch": 0.17035775127768313, "step": 250}, {"loss": 1.7766, "grad_norm": 0.34864023327827454, "learning_rate": 0.0002, "epoch": 0.17717206132879046, "step": 260}, {"loss": 1.896, "grad_norm": 0.3350819945335388, "learning_rate": 0.0002, "epoch": 0.1839863713798978, "step": 270}, {"loss": 1.8742, "grad_norm": 0.3844246566295624, "learning_rate": 0.0002, "epoch": 0.19080068143100512, "step": 280}, {"loss": 1.8531, "grad_norm": 0.38413065671920776, "learning_rate": 0.0002, "epoch": 0.19761499148211242, "step": 290}, {"loss": 1.8415, "grad_norm": 0.4080047607421875, "learning_rate": 0.0002, "epoch": 0.20442930153321975, "step": 300}, {"loss": 1.9009, "grad_norm": 0.3546800911426544, "learning_rate": 0.0002, "epoch": 0.21124361158432708, "step": 310}, {"loss": 1.8092, "grad_norm": 0.38577890396118164, "learning_rate": 0.0002, "epoch": 0.21805792163543442, "step": 320}, {"loss": 1.7113, "grad_norm": 0.38979098200798035, "learning_rate": 0.0002, "epoch": 0.22487223168654175, "step": 330}, {"loss": 1.8162, "grad_norm": 0.35105520486831665, "learning_rate": 0.0002, "epoch": 0.23168654173764908, "step": 340}, {"loss": 1.8408, "grad_norm": 0.31671223044395447, "learning_rate": 0.0002, "epoch": 0.23850085178875638, "step": 350}, {"loss": 1.8014, "grad_norm": 0.33034196496009827, "learning_rate": 0.0002, "epoch": 0.2453151618398637, "step": 360}, {"loss": 1.7132, "grad_norm": 0.2990533709526062, "learning_rate": 0.0002, "epoch": 0.252129471890971, "step": 370}, {"loss": 1.8612, "grad_norm": 0.362208753824234, "learning_rate": 0.0002, "epoch": 0.25894378194207834, "step": 380}, {"loss": 1.8001, "grad_norm": 0.3269096612930298, "learning_rate": 0.0002, "epoch": 0.2657580919931857, "step": 390}, {"loss": 1.786, "grad_norm": 0.30555954575538635, "learning_rate": 0.0002, "epoch": 0.272572402044293, "step": 400}, {"loss": 1.8018, "grad_norm": 0.332933247089386, "learning_rate": 0.0002, "epoch": 0.27938671209540034, "step": 410}, {"loss": 1.8157, "grad_norm": 0.39454060792922974, "learning_rate": 0.0002, "epoch": 0.28620102214650767, "step": 420}, {"loss": 1.7862, "grad_norm": 0.34589633345603943, "learning_rate": 0.0002, "epoch": 0.293015332197615, "step": 430}, {"loss": 1.7612, "grad_norm": 0.3747332990169525, "learning_rate": 0.0002, "epoch": 0.29982964224872233, "step": 440}, {"loss": 1.8476, "grad_norm": 0.34825369715690613, "learning_rate": 0.0002, "epoch": 0.30664395229982966, "step": 450}, {"loss": 1.775, "grad_norm": 0.32906976342201233, "learning_rate": 0.0002, "epoch": 0.313458262350937, "step": 460}, {"loss": 1.7764, "grad_norm": 0.33108609914779663, "learning_rate": 0.0002, "epoch": 0.3202725724020443, "step": 470}, {"loss": 1.8012, "grad_norm": 0.3170463442802429, "learning_rate": 0.0002, "epoch": 0.3270868824531516, "step": 480}, {"loss": 1.8264, "grad_norm": 0.30792081356048584, "learning_rate": 0.0002, "epoch": 0.3339011925042589, "step": 490}, {"loss": 1.7361, "grad_norm": 0.31772997975349426, "learning_rate": 0.0002, "epoch": 0.34071550255536626, "step": 500}, {"loss": 1.8147, "grad_norm": 0.32714012265205383, "learning_rate": 0.0002, "epoch": 0.3475298126064736, "step": 510}, {"loss": 1.8332, "grad_norm": 0.3021100163459778, "learning_rate": 0.0002, "epoch": 0.3543441226575809, "step": 520}, {"loss": 1.7337, "grad_norm": 0.6045835018157959, "learning_rate": 0.0002, "epoch": 0.36115843270868825, "step": 530}, {"loss": 1.8661, "grad_norm": 0.3003896474838257, "learning_rate": 0.0002, "epoch": 0.3679727427597956, "step": 540}, {"loss": 1.8359, "grad_norm": 0.3678470551967621, "learning_rate": 0.0002, "epoch": 0.3747870528109029, "step": 550}, {"loss": 1.7512, "grad_norm": 0.35787615180015564, "learning_rate": 0.0002, "epoch": 0.38160136286201024, "step": 560}, {"loss": 1.8048, "grad_norm": 0.31882143020629883, "learning_rate": 0.0002, "epoch": 0.38841567291311757, "step": 570}, {"loss": 1.8108, "grad_norm": 0.3186313509941101, "learning_rate": 0.0002, "epoch": 0.39522998296422485, "step": 580}, {"loss": 1.8012, "grad_norm": 0.41443702578544617, "learning_rate": 0.0002, "epoch": 0.4020442930153322, "step": 590}, {"loss": 1.7794, "grad_norm": 0.28773069381713867, "learning_rate": 0.0002, "epoch": 0.4088586030664395, "step": 600}, {"loss": 1.8231, "grad_norm": 0.35743263363838196, "learning_rate": 0.0002, "epoch": 0.41567291311754684, "step": 610}, {"loss": 1.7531, "grad_norm": 0.3360286355018616, "learning_rate": 0.0002, "epoch": 0.42248722316865417, "step": 620}, {"loss": 1.7654, "grad_norm": 0.32838866114616394, "learning_rate": 0.0002, "epoch": 0.4293015332197615, "step": 630}, {"loss": 1.8176, "grad_norm": 0.2994388937950134, "learning_rate": 0.0002, "epoch": 0.43611584327086883, "step": 640}, {"loss": 1.7241, "grad_norm": 0.3306307792663574, "learning_rate": 0.0002, "epoch": 0.44293015332197616, "step": 650}, {"loss": 1.8201, "grad_norm": 0.3129560351371765, "learning_rate": 0.0002, "epoch": 0.4497444633730835, "step": 660}, {"loss": 1.803, "grad_norm": 0.3244289457798004, "learning_rate": 0.0002, "epoch": 0.4565587734241908, "step": 670}, {"loss": 1.8399, "grad_norm": 0.3196892738342285, "learning_rate": 0.0002, "epoch": 0.46337308347529815, "step": 680}, {"loss": 1.8291, "grad_norm": 0.3135230243206024, "learning_rate": 0.0002, "epoch": 0.47018739352640543, "step": 690}, {"loss": 1.7423, "grad_norm": 0.28677991032600403, "learning_rate": 0.0002, "epoch": 0.47700170357751276, "step": 700}, {"loss": 1.6982, "grad_norm": 0.3074065148830414, "learning_rate": 0.0002, "epoch": 0.4838160136286201, "step": 710}, {"loss": 1.8078, "grad_norm": 0.3354290723800659, "learning_rate": 0.0002, "epoch": 0.4906303236797274, "step": 720}, {"loss": 1.7333, "grad_norm": 0.324370801448822, "learning_rate": 0.0002, "epoch": 0.49744463373083475, "step": 730}, {"loss": 1.8506, "grad_norm": 0.29496142268180847, "learning_rate": 0.0002, "epoch": 0.504258943781942, "step": 740}, {"loss": 1.7761, "grad_norm": 0.30694130063056946, "learning_rate": 0.0002, "epoch": 0.5110732538330494, "step": 750}, {"loss": 1.7854, "grad_norm": 0.36168408393859863, "learning_rate": 0.0002, "epoch": 0.5178875638841567, "step": 760}, {"loss": 1.7149, "grad_norm": 0.2930343449115753, "learning_rate": 0.0002, "epoch": 0.524701873935264, "step": 770}, {"loss": 1.7924, "grad_norm": 0.3023432493209839, "learning_rate": 0.0002, "epoch": 0.5315161839863713, "step": 780}, {"loss": 1.8467, "grad_norm": 0.3272720277309418, "learning_rate": 0.0002, "epoch": 0.5383304940374787, "step": 790}, {"loss": 1.8639, "grad_norm": 0.2907974421977997, "learning_rate": 0.0002, "epoch": 0.545144804088586, "step": 800}, {"loss": 1.9018, "grad_norm": 0.32267168164253235, "learning_rate": 0.0002, "epoch": 0.5519591141396933, "step": 810}, {"loss": 1.8311, "grad_norm": 0.32059940695762634, "learning_rate": 0.0002, "epoch": 0.5587734241908007, "step": 820}, {"loss": 1.7234, "grad_norm": 0.30951258540153503, "learning_rate": 0.0002, "epoch": 0.565587734241908, "step": 830}, {"loss": 1.8063, "grad_norm": 0.33976122736930847, "learning_rate": 0.0002, "epoch": 0.5724020442930153, "step": 840}, {"loss": 1.7021, "grad_norm": 0.3195820450782776, "learning_rate": 0.0002, "epoch": 0.5792163543441227, "step": 850}, {"loss": 1.8073, "grad_norm": 0.2828562557697296, "learning_rate": 0.0002, "epoch": 0.58603066439523, "step": 860}, {"loss": 1.695, "grad_norm": 0.29591670632362366, "learning_rate": 0.0002, "epoch": 0.5928449744463373, "step": 870}, {"loss": 1.8109, "grad_norm": 0.3086104393005371, "learning_rate": 0.0002, "epoch": 0.5996592844974447, "step": 880}, {"loss": 1.8207, "grad_norm": 0.3592929542064667, "learning_rate": 0.0002, "epoch": 0.606473594548552, "step": 890}, {"loss": 1.7662, "grad_norm": 0.2830186188220978, "learning_rate": 0.0002, "epoch": 0.6132879045996593, "step": 900}, {"loss": 1.8344, "grad_norm": 0.3128598630428314, "learning_rate": 0.0002, "epoch": 0.6201022146507666, "step": 910}, {"loss": 1.7953, "grad_norm": 0.31957563757896423, "learning_rate": 0.0002, "epoch": 0.626916524701874, "step": 920}, {"loss": 1.7578, "grad_norm": 0.30994319915771484, "learning_rate": 0.0002, "epoch": 0.6337308347529813, "step": 930}, {"loss": 1.8494, "grad_norm": 0.3352845013141632, "learning_rate": 0.0002, "epoch": 0.6405451448040886, "step": 940}, {"loss": 1.7054, "grad_norm": 0.2960077226161957, "learning_rate": 0.0002, "epoch": 0.6473594548551959, "step": 950}, {"loss": 1.7209, "grad_norm": 0.32675081491470337, "learning_rate": 0.0002, "epoch": 0.6541737649063032, "step": 960}, {"loss": 1.7776, "grad_norm": 0.30042028427124023, "learning_rate": 0.0002, "epoch": 0.6609880749574105, "step": 970}, {"loss": 1.7597, "grad_norm": 0.3288673758506775, "learning_rate": 0.0002, "epoch": 0.6678023850085179, "step": 980}, {"loss": 1.7962, "grad_norm": 0.323215126991272, "learning_rate": 0.0002, "epoch": 0.6746166950596252, "step": 990}, {"loss": 1.821, "grad_norm": 0.30041399598121643, "learning_rate": 0.0002, "epoch": 0.6814310051107325, "step": 1000}, {"loss": 1.7716, "grad_norm": 0.3076179623603821, "learning_rate": 0.0002, "epoch": 0.6882453151618398, "step": 1010}, {"loss": 1.7827, "grad_norm": 0.2941909730434418, "learning_rate": 0.0002, "epoch": 0.6950596252129472, "step": 1020}, {"loss": 1.7964, "grad_norm": 0.32220420241355896, "learning_rate": 0.0002, "epoch": 0.7018739352640545, "step": 1030}, {"loss": 1.7101, "grad_norm": 0.2989702820777893, "learning_rate": 0.0002, "epoch": 0.7086882453151618, "step": 1040}, {"loss": 1.8749, "grad_norm": 0.3049640357494354, "learning_rate": 0.0002, "epoch": 0.7155025553662692, "step": 1050}, {"loss": 1.782, "grad_norm": 0.3183284103870392, "learning_rate": 0.0002, "epoch": 0.7223168654173765, "step": 1060}, {"loss": 1.785, "grad_norm": 0.3070095181465149, "learning_rate": 0.0002, "epoch": 0.7291311754684838, "step": 1070}, {"loss": 1.7832, "grad_norm": 0.33263063430786133, "learning_rate": 0.0002, "epoch": 0.7359454855195912, "step": 1080}, {"loss": 1.7627, "grad_norm": 0.28774312138557434, "learning_rate": 0.0002, "epoch": 0.7427597955706985, "step": 1090}, {"loss": 1.7343, "grad_norm": 0.29304224252700806, "learning_rate": 0.0002, "epoch": 0.7495741056218058, "step": 1100}, {"loss": 1.8082, "grad_norm": 0.27529507875442505, "learning_rate": 0.0002, "epoch": 0.7563884156729132, "step": 1110}, {"loss": 1.8071, "grad_norm": 0.32319945096969604, "learning_rate": 0.0002, "epoch": 0.7632027257240205, "step": 1120}, {"loss": 1.6998, "grad_norm": 0.33777597546577454, "learning_rate": 0.0002, "epoch": 0.7700170357751278, "step": 1130}, {"loss": 1.8488, "grad_norm": 0.29847201704978943, "learning_rate": 0.0002, "epoch": 0.7768313458262351, "step": 1140}, {"loss": 1.7014, "grad_norm": 0.31370633840560913, "learning_rate": 0.0002, "epoch": 0.7836456558773425, "step": 1150}, {"loss": 1.8175, "grad_norm": 0.31633856892585754, "learning_rate": 0.0002, "epoch": 0.7904599659284497, "step": 1160}, {"loss": 1.7637, "grad_norm": 0.33849263191223145, "learning_rate": 0.0002, "epoch": 0.797274275979557, "step": 1170}, {"loss": 1.8264, "grad_norm": 0.3306333124637604, "learning_rate": 0.0002, "epoch": 0.8040885860306644, "step": 1180}, {"loss": 1.777, "grad_norm": 0.3249678313732147, "learning_rate": 0.0002, "epoch": 0.8109028960817717, "step": 1190}, {"loss": 1.7631, "grad_norm": 0.3252817690372467, "learning_rate": 0.0002, "epoch": 0.817717206132879, "step": 1200}, {"loss": 1.7877, "grad_norm": 0.31772559881210327, "learning_rate": 0.0002, "epoch": 0.8245315161839863, "step": 1210}, {"loss": 1.8068, "grad_norm": 0.2803371846675873, "learning_rate": 0.0002, "epoch": 0.8313458262350937, "step": 1220}, {"loss": 1.738, "grad_norm": 0.26700571179389954, "learning_rate": 0.0002, "epoch": 0.838160136286201, "step": 1230}, {"loss": 1.8301, "grad_norm": 0.3060092031955719, "learning_rate": 0.0002, "epoch": 0.8449744463373083, "step": 1240}, {"loss": 1.7993, "grad_norm": 0.28831684589385986, "learning_rate": 0.0002, "epoch": 0.8517887563884157, "step": 1250}, {"loss": 1.6909, "grad_norm": 0.30708742141723633, "learning_rate": 0.0002, "epoch": 0.858603066439523, "step": 1260}, {"loss": 1.8506, "grad_norm": 0.2915987968444824, "learning_rate": 0.0002, "epoch": 0.8654173764906303, "step": 1270}, {"loss": 1.7536, "grad_norm": 0.2893589735031128, "learning_rate": 0.0002, "epoch": 0.8722316865417377, "step": 1280}, {"loss": 1.7437, "grad_norm": 0.29545632004737854, "learning_rate": 0.0002, "epoch": 0.879045996592845, "step": 1290}, {"loss": 1.859, "grad_norm": 0.3354771137237549, "learning_rate": 0.0002, "epoch": 0.8858603066439523, "step": 1300}, {"loss": 1.7644, "grad_norm": 0.37715399265289307, "learning_rate": 0.0002, "epoch": 0.8926746166950597, "step": 1310}, {"loss": 1.7731, "grad_norm": 0.28847193717956543, "learning_rate": 0.0002, "epoch": 0.899488926746167, "step": 1320}, {"loss": 1.9077, "grad_norm": 0.2780889868736267, "learning_rate": 0.0002, "epoch": 0.9063032367972743, "step": 1330}, {"loss": 1.7861, "grad_norm": 0.2714342176914215, "learning_rate": 0.0002, "epoch": 0.9131175468483816, "step": 1340}, {"loss": 1.8, "grad_norm": 0.2950133979320526, "learning_rate": 0.0002, "epoch": 0.919931856899489, "step": 1350}, {"loss": 1.7368, "grad_norm": 0.29097145795822144, "learning_rate": 0.0002, "epoch": 0.9267461669505963, "step": 1360}, {"loss": 1.7864, "grad_norm": 0.32540133595466614, "learning_rate": 0.0002, "epoch": 0.9335604770017035, "step": 1370}, {"loss": 1.7571, "grad_norm": 0.3076636493206024, "learning_rate": 0.0002, "epoch": 0.9403747870528109, "step": 1380}, {"loss": 1.8312, "grad_norm": 0.2962130308151245, "learning_rate": 0.0002, "epoch": 0.9471890971039182, "step": 1390}, {"loss": 1.7858, "grad_norm": 0.30086860060691833, "learning_rate": 0.0002, "epoch": 0.9540034071550255, "step": 1400}, {"loss": 1.7991, "grad_norm": 0.28634947538375854, "learning_rate": 0.0002, "epoch": 0.9608177172061328, "step": 1410}, {"loss": 1.8385, "grad_norm": 0.35314416885375977, "learning_rate": 0.0002, "epoch": 0.9676320272572402, "step": 1420}, {"loss": 1.8054, "grad_norm": 0.2939317524433136, "learning_rate": 0.0002, "epoch": 0.9744463373083475, "step": 1430}, {"loss": 1.7582, "grad_norm": 0.3010196089744568, "learning_rate": 0.0002, "epoch": 0.9812606473594548, "step": 1440}, {"loss": 1.758, "grad_norm": 0.30816152691841125, "learning_rate": 0.0002, "epoch": 0.9880749574105622, "step": 1450}, {"loss": 1.8159, "grad_norm": 0.28152793645858765, "learning_rate": 0.0002, "epoch": 0.9948892674616695, "step": 1460}, {"eval_loss": 1.8034634590148926, "eval_runtime": 53.6017, "eval_samples_per_second": 9.459, "eval_steps_per_second": 1.194, "epoch": 0.9996592844974447, "step": 1467}, {"loss": 1.7072, "grad_norm": 0.29246416687965393, "learning_rate": 0.0002, "epoch": 1.0017035775127767, "step": 1470}, {"loss": 1.8159, "grad_norm": 0.2668602168560028, "learning_rate": 0.0002, "epoch": 1.008517887563884, "step": 1480}, {"loss": 1.6868, "grad_norm": 0.2998567819595337, "learning_rate": 0.0002, "epoch": 1.0153321976149914, "step": 1490}, {"loss": 1.7331, "grad_norm": 0.3284934461116791, "learning_rate": 0.0002, "epoch": 1.0221465076660987, "step": 1500}, {"loss": 1.689, "grad_norm": 0.3275827169418335, "learning_rate": 0.0002, "epoch": 1.028960817717206, "step": 1510}, {"loss": 1.7092, "grad_norm": 0.3382718563079834, "learning_rate": 0.0002, "epoch": 1.0357751277683134, "step": 1520}, {"loss": 1.7215, "grad_norm": 0.36737215518951416, "learning_rate": 0.0002, "epoch": 1.0425894378194207, "step": 1530}, {"loss": 1.6993, "grad_norm": 0.3442603647708893, "learning_rate": 0.0002, "epoch": 1.049403747870528, "step": 1540}, {"loss": 1.6996, "grad_norm": 0.3323381245136261, "learning_rate": 0.0002, "epoch": 1.0562180579216354, "step": 1550}, {"loss": 1.7444, "grad_norm": 0.341227650642395, "learning_rate": 0.0002, "epoch": 1.0630323679727427, "step": 1560}, {"loss": 1.7419, "grad_norm": 0.3361579477787018, "learning_rate": 0.0002, "epoch": 1.06984667802385, "step": 1570}, {"loss": 1.7337, "grad_norm": 0.3556230962276459, "learning_rate": 0.0002, "epoch": 1.0766609880749574, "step": 1580}, {"loss": 1.6978, "grad_norm": 0.27130424976348877, "learning_rate": 0.0002, "epoch": 1.0834752981260647, "step": 1590}, {"loss": 1.6087, "grad_norm": 0.29366323351860046, "learning_rate": 0.0002, "epoch": 1.090289608177172, "step": 1600}, {"loss": 1.6721, "grad_norm": 0.3581245541572571, "learning_rate": 0.0002, "epoch": 1.0971039182282794, "step": 1610}, {"loss": 1.7639, "grad_norm": 0.3021670877933502, "learning_rate": 0.0002, "epoch": 1.1039182282793867, "step": 1620}, {"loss": 1.7314, "grad_norm": 0.3145572543144226, "learning_rate": 0.0002, "epoch": 1.110732538330494, "step": 1630}, {"loss": 1.7408, "grad_norm": 0.35362154245376587, "learning_rate": 0.0002, "epoch": 1.1175468483816013, "step": 1640}, {"loss": 1.7071, "grad_norm": 0.5413113236427307, "learning_rate": 0.0002, "epoch": 1.1243611584327087, "step": 1650}, {"loss": 1.7615, "grad_norm": 0.6858654022216797, "learning_rate": 0.0002, "epoch": 1.131175468483816, "step": 1660}, {"loss": 1.7029, "grad_norm": 0.3781903386116028, "learning_rate": 0.0002, "epoch": 1.1379897785349233, "step": 1670}, {"loss": 1.659, "grad_norm": 0.404864102602005, "learning_rate": 0.0002, "epoch": 1.1448040885860307, "step": 1680}, {"loss": 1.7212, "grad_norm": 0.3595100939273834, "learning_rate": 0.0002, "epoch": 1.151618398637138, "step": 1690}, {"loss": 1.7023, "grad_norm": 0.33682283759117126, "learning_rate": 0.0002, "epoch": 1.1584327086882453, "step": 1700}, {"loss": 1.7336, "grad_norm": 0.3877373933792114, "learning_rate": 0.0002, "epoch": 1.1652470187393527, "step": 1710}, {"loss": 1.7676, "grad_norm": 0.34606897830963135, "learning_rate": 0.0002, "epoch": 1.17206132879046, "step": 1720}, {"loss": 1.6889, "grad_norm": 0.3122918903827667, "learning_rate": 0.0002, "epoch": 1.1788756388415673, "step": 1730}, {"loss": 1.6585, "grad_norm": 0.34081900119781494, "learning_rate": 0.0002, "epoch": 1.1856899488926746, "step": 1740}, {"loss": 1.7794, "grad_norm": 0.3418807089328766, "learning_rate": 0.0002, "epoch": 1.192504258943782, "step": 1750}, {"loss": 1.7773, "grad_norm": 0.3495500981807709, "learning_rate": 0.0002, "epoch": 1.1993185689948893, "step": 1760}, {"loss": 1.6578, "grad_norm": 0.557288408279419, "learning_rate": 0.0002, "epoch": 1.2061328790459966, "step": 1770}, {"loss": 1.7488, "grad_norm": 0.3193778693675995, "learning_rate": 0.0002, "epoch": 1.212947189097104, "step": 1780}, {"loss": 1.6815, "grad_norm": 0.3306216895580292, "learning_rate": 0.0002, "epoch": 1.2197614991482113, "step": 1790}, {"loss": 1.6772, "grad_norm": 0.37998732924461365, "learning_rate": 0.0002, "epoch": 1.2265758091993186, "step": 1800}, {"loss": 1.7094, "grad_norm": 0.3255669176578522, "learning_rate": 0.0002, "epoch": 1.233390119250426, "step": 1810}, {"loss": 1.7096, "grad_norm": 0.3741287291049957, "learning_rate": 0.0002, "epoch": 1.2402044293015333, "step": 1820}, {"loss": 1.7317, "grad_norm": 0.36727291345596313, "learning_rate": 0.0002, "epoch": 1.2470187393526406, "step": 1830}, {"loss": 1.7418, "grad_norm": 0.3479527235031128, "learning_rate": 0.0002, "epoch": 1.253833049403748, "step": 1840}, {"loss": 1.7062, "grad_norm": 0.3472636640071869, "learning_rate": 0.0002, "epoch": 1.2606473594548553, "step": 1850}, {"loss": 1.6263, "grad_norm": 0.3702869415283203, "learning_rate": 0.0002, "epoch": 1.2674616695059626, "step": 1860}, {"loss": 1.6222, "grad_norm": 0.3934040069580078, "learning_rate": 0.0002, "epoch": 1.27427597955707, "step": 1870}, {"loss": 1.7616, "grad_norm": 0.46887534856796265, "learning_rate": 0.0002, "epoch": 1.2810902896081773, "step": 1880}, {"loss": 1.6071, "grad_norm": 0.3191998600959778, "learning_rate": 0.0002, "epoch": 1.2879045996592846, "step": 1890}, {"loss": 1.7001, "grad_norm": 0.34032225608825684, "learning_rate": 0.0002, "epoch": 1.294718909710392, "step": 1900}, {"loss": 1.8316, "grad_norm": 0.33453696966171265, "learning_rate": 0.0002, "epoch": 1.3015332197614993, "step": 1910}, {"loss": 1.6567, "grad_norm": 0.3451494872570038, "learning_rate": 0.0002, "epoch": 1.3083475298126066, "step": 1920}, {"loss": 1.708, "grad_norm": 0.36203092336654663, "learning_rate": 0.0002, "epoch": 1.315161839863714, "step": 1930}, {"loss": 1.7095, "grad_norm": 0.43794456124305725, "learning_rate": 0.0002, "epoch": 1.321976149914821, "step": 1940}, {"loss": 1.7264, "grad_norm": 0.3630591034889221, "learning_rate": 0.0002, "epoch": 1.3287904599659284, "step": 1950}, {"loss": 1.6529, "grad_norm": 0.36951911449432373, "learning_rate": 0.0002, "epoch": 1.3356047700170357, "step": 1960}, {"loss": 1.651, "grad_norm": 0.4001159965991974, "learning_rate": 0.0002, "epoch": 1.342419080068143, "step": 1970}, {"loss": 1.677, "grad_norm": 0.3820836544036865, "learning_rate": 0.0002, "epoch": 1.3492333901192504, "step": 1980}, {"loss": 1.583, "grad_norm": 0.3705870807170868, "learning_rate": 0.0002, "epoch": 1.3560477001703577, "step": 1990}, {"loss": 1.6207, "grad_norm": 0.3557972013950348, "learning_rate": 0.0002, "epoch": 1.362862010221465, "step": 2000}, {"loss": 1.7656, "grad_norm": 0.38546398282051086, "learning_rate": 0.0002, "epoch": 1.3696763202725724, "step": 2010}, {"loss": 1.6881, "grad_norm": 0.3908020853996277, "learning_rate": 0.0002, "epoch": 1.3764906303236797, "step": 2020}, {"loss": 1.727, "grad_norm": 0.3822040855884552, "learning_rate": 0.0002, "epoch": 1.383304940374787, "step": 2030}, {"loss": 1.7923, "grad_norm": 0.425327867269516, "learning_rate": 0.0002, "epoch": 1.3901192504258943, "step": 2040}, {"loss": 1.7032, "grad_norm": 0.3436259329319, "learning_rate": 0.0002, "epoch": 1.3969335604770017, "step": 2050}, {"loss": 1.6398, "grad_norm": 0.33124062418937683, "learning_rate": 0.0002, "epoch": 1.403747870528109, "step": 2060}, {"loss": 1.6815, "grad_norm": 0.3662424683570862, "learning_rate": 0.0002, "epoch": 1.4105621805792163, "step": 2070}, {"loss": 1.7258, "grad_norm": 0.3720635175704956, "learning_rate": 0.0002, "epoch": 1.4173764906303237, "step": 2080}, {"loss": 1.7186, "grad_norm": 0.3361680805683136, "learning_rate": 0.0002, "epoch": 1.424190800681431, "step": 2090}, {"loss": 1.7606, "grad_norm": 0.32751724123954773, "learning_rate": 0.0002, "epoch": 1.4310051107325383, "step": 2100}, {"loss": 1.7051, "grad_norm": 0.34333378076553345, "learning_rate": 0.0002, "epoch": 1.4378194207836457, "step": 2110}, {"loss": 1.6979, "grad_norm": 0.37777671217918396, "learning_rate": 0.0002, "epoch": 1.444633730834753, "step": 2120}, {"loss": 1.7085, "grad_norm": 0.37126365303993225, "learning_rate": 0.0002, "epoch": 1.4514480408858603, "step": 2130}, {"loss": 1.721, "grad_norm": 0.3602267801761627, "learning_rate": 0.0002, "epoch": 1.4582623509369677, "step": 2140}, {"loss": 1.8148, "grad_norm": 0.3287110924720764, "learning_rate": 0.0002, "epoch": 1.465076660988075, "step": 2150}, {"loss": 1.6966, "grad_norm": 0.3562135100364685, "learning_rate": 0.0002, "epoch": 1.4718909710391823, "step": 2160}, {"loss": 1.713, "grad_norm": 0.38292962312698364, "learning_rate": 0.0002, "epoch": 1.4787052810902896, "step": 2170}, {"loss": 1.7036, "grad_norm": 0.38220319151878357, "learning_rate": 0.0002, "epoch": 1.485519591141397, "step": 2180}, {"loss": 1.7297, "grad_norm": 0.3570062220096588, "learning_rate": 0.0002, "epoch": 1.4923339011925043, "step": 2190}, {"loss": 1.5652, "grad_norm": 0.363146036863327, "learning_rate": 0.0002, "epoch": 1.4991482112436116, "step": 2200}, {"loss": 1.6716, "grad_norm": 0.37393274903297424, "learning_rate": 0.0002, "epoch": 1.5059625212947187, "step": 2210}, {"loss": 1.6839, "grad_norm": 0.3628501892089844, "learning_rate": 0.0002, "epoch": 1.512776831345826, "step": 2220}, {"loss": 1.6627, "grad_norm": 0.33430740237236023, "learning_rate": 0.0002, "epoch": 1.5195911413969334, "step": 2230}, {"loss": 1.7495, "grad_norm": 0.35363978147506714, "learning_rate": 0.0002, "epoch": 1.5264054514480407, "step": 2240}, {"loss": 1.6364, "grad_norm": 0.37220337986946106, "learning_rate": 0.0002, "epoch": 1.533219761499148, "step": 2250}, {"loss": 1.7773, "grad_norm": 0.35020262002944946, "learning_rate": 0.0002, "epoch": 1.5400340715502554, "step": 2260}, {"loss": 1.7453, "grad_norm": 0.35274937748908997, "learning_rate": 0.0002, "epoch": 1.5468483816013627, "step": 2270}, {"loss": 1.7162, "grad_norm": 0.3797738552093506, "learning_rate": 0.0002, "epoch": 1.55366269165247, "step": 2280}, {"loss": 1.6197, "grad_norm": 0.4160412847995758, "learning_rate": 0.0002, "epoch": 1.5604770017035774, "step": 2290}, {"loss": 1.7101, "grad_norm": 0.38779592514038086, "learning_rate": 0.0002, "epoch": 1.5672913117546847, "step": 2300}, {"loss": 1.7254, "grad_norm": 0.39171287417411804, "learning_rate": 0.0002, "epoch": 1.574105621805792, "step": 2310}, {"loss": 1.7087, "grad_norm": 0.3606826663017273, "learning_rate": 0.0002, "epoch": 1.5809199318568994, "step": 2320}, {"loss": 1.7269, "grad_norm": 0.3745017647743225, "learning_rate": 0.0002, "epoch": 1.5877342419080067, "step": 2330}, {"loss": 1.7068, "grad_norm": 0.34933462738990784, "learning_rate": 0.0002, "epoch": 1.594548551959114, "step": 2340}, {"loss": 1.7073, "grad_norm": 0.37268444895744324, "learning_rate": 0.0002, "epoch": 1.6013628620102214, "step": 2350}, {"loss": 1.728, "grad_norm": 0.4603484869003296, "learning_rate": 0.0002, "epoch": 1.6081771720613287, "step": 2360}, {"loss": 1.7621, "grad_norm": 0.35689088702201843, "learning_rate": 0.0002, "epoch": 1.614991482112436, "step": 2370}, {"loss": 1.6989, "grad_norm": 0.3392031490802765, "learning_rate": 0.0002, "epoch": 1.6218057921635434, "step": 2380}, {"loss": 1.7077, "grad_norm": 0.394653856754303, "learning_rate": 0.0002, "epoch": 1.6286201022146507, "step": 2390}, {"loss": 1.7448, "grad_norm": 0.33972012996673584, "learning_rate": 0.0002, "epoch": 1.635434412265758, "step": 2400}, {"loss": 1.7681, "grad_norm": 0.3854375183582306, "learning_rate": 0.0002, "epoch": 1.6422487223168654, "step": 2410}, {"loss": 1.7102, "grad_norm": 0.36143961548805237, "learning_rate": 0.0002, "epoch": 1.6490630323679727, "step": 2420}, {"loss": 1.7432, "grad_norm": 0.35816189646720886, "learning_rate": 0.0002, "epoch": 1.65587734241908, "step": 2430}, {"loss": 1.6407, "grad_norm": 0.36298736929893494, "learning_rate": 0.0002, "epoch": 1.6626916524701874, "step": 2440}, {"loss": 1.723, "grad_norm": 0.36756929755210876, "learning_rate": 0.0002, "epoch": 1.6695059625212947, "step": 2450}, {"loss": 1.6824, "grad_norm": 0.35969603061676025, "learning_rate": 0.0002, "epoch": 1.676320272572402, "step": 2460}, {"loss": 1.7014, "grad_norm": 0.38449376821517944, "learning_rate": 0.0002, "epoch": 1.6831345826235093, "step": 2470}, {"loss": 1.7261, "grad_norm": 0.44511452317237854, "learning_rate": 0.0002, "epoch": 1.6899488926746167, "step": 2480}, {"loss": 1.7397, "grad_norm": 0.3831416368484497, "learning_rate": 0.0002, "epoch": 1.696763202725724, "step": 2490}, {"loss": 1.7046, "grad_norm": 0.3795325756072998, "learning_rate": 0.0002, "epoch": 1.7035775127768313, "step": 2500}, {"loss": 1.6993, "grad_norm": 0.34978193044662476, "learning_rate": 0.0002, "epoch": 1.7103918228279387, "step": 2510}, {"loss": 1.8307, "grad_norm": 0.35923877358436584, "learning_rate": 0.0002, "epoch": 1.717206132879046, "step": 2520}, {"loss": 1.6253, "grad_norm": 0.352999746799469, "learning_rate": 0.0002, "epoch": 1.7240204429301533, "step": 2530}, {"loss": 1.6953, "grad_norm": 0.43673479557037354, "learning_rate": 0.0002, "epoch": 1.7308347529812607, "step": 2540}, {"loss": 1.7079, "grad_norm": 0.4153687357902527, "learning_rate": 0.0002, "epoch": 1.737649063032368, "step": 2550}, {"loss": 1.6714, "grad_norm": 0.35541167855262756, "learning_rate": 0.0002, "epoch": 1.7444633730834753, "step": 2560}, {"loss": 1.696, "grad_norm": 0.3288775086402893, "learning_rate": 0.0002, "epoch": 1.7512776831345827, "step": 2570}, {"loss": 1.7486, "grad_norm": 0.3991123139858246, "learning_rate": 0.0002, "epoch": 1.75809199318569, "step": 2580}, {"loss": 1.645, "grad_norm": 0.39967241883277893, "learning_rate": 0.0002, "epoch": 1.7649063032367973, "step": 2590}, {"loss": 1.6855, "grad_norm": 0.41104283928871155, "learning_rate": 0.0002, "epoch": 1.7717206132879046, "step": 2600}, {"loss": 1.6993, "grad_norm": 0.44885286688804626, "learning_rate": 0.0002, "epoch": 1.778534923339012, "step": 2610}, {"loss": 1.7224, "grad_norm": 0.38384467363357544, "learning_rate": 0.0002, "epoch": 1.7853492333901193, "step": 2620}, {"loss": 1.7213, "grad_norm": 0.35020917654037476, "learning_rate": 0.0002, "epoch": 1.7921635434412266, "step": 2630}, {"loss": 1.6706, "grad_norm": 0.3360341489315033, "learning_rate": 0.0002, "epoch": 1.798977853492334, "step": 2640}, {"loss": 1.7037, "grad_norm": 0.38875144720077515, "learning_rate": 0.0002, "epoch": 1.8057921635434413, "step": 2650}, {"loss": 1.693, "grad_norm": 0.34876883029937744, "learning_rate": 0.0002, "epoch": 1.8126064735945486, "step": 2660}, {"loss": 1.7743, "grad_norm": 0.419979989528656, "learning_rate": 0.0002, "epoch": 1.819420783645656, "step": 2670}, {"loss": 1.7089, "grad_norm": 0.3648919463157654, "learning_rate": 0.0002, "epoch": 1.8262350936967633, "step": 2680}, {"loss": 1.7828, "grad_norm": 0.3485383987426758, "learning_rate": 0.0002, "epoch": 1.8330494037478706, "step": 2690}, {"loss": 1.705, "grad_norm": 0.3647468686103821, "learning_rate": 0.0002, "epoch": 1.839863713798978, "step": 2700}, {"loss": 1.7318, "grad_norm": 0.37003210186958313, "learning_rate": 0.0002, "epoch": 1.8466780238500853, "step": 2710}, {"loss": 1.6647, "grad_norm": 0.37031617760658264, "learning_rate": 0.0002, "epoch": 1.8534923339011926, "step": 2720}, {"loss": 1.69, "grad_norm": 0.3438796103000641, "learning_rate": 0.0002, "epoch": 1.8603066439523, "step": 2730}, {"loss": 1.5995, "grad_norm": 0.41574627161026, "learning_rate": 0.0002, "epoch": 1.8671209540034073, "step": 2740}, {"loss": 1.6877, "grad_norm": 0.35049930214881897, "learning_rate": 0.0002, "epoch": 1.8739352640545146, "step": 2750}, {"loss": 1.7048, "grad_norm": 0.3943989872932434, "learning_rate": 0.0002, "epoch": 1.880749574105622, "step": 2760}, {"loss": 1.7047, "grad_norm": 0.3384978175163269, "learning_rate": 0.0002, "epoch": 1.8875638841567293, "step": 2770}, {"loss": 1.7848, "grad_norm": 0.3501328229904175, "learning_rate": 0.0002, "epoch": 1.8943781942078366, "step": 2780}, {"loss": 1.6986, "grad_norm": 0.37484532594680786, "learning_rate": 0.0002, "epoch": 1.901192504258944, "step": 2790}, {"loss": 1.6867, "grad_norm": 0.34497788548469543, "learning_rate": 0.0002, "epoch": 1.9080068143100513, "step": 2800}, {"loss": 1.6513, "grad_norm": 0.3530851900577545, "learning_rate": 0.0002, "epoch": 1.9148211243611586, "step": 2810}, {"loss": 1.6369, "grad_norm": 0.3879254162311554, "learning_rate": 0.0002, "epoch": 1.921635434412266, "step": 2820}, {"loss": 1.6786, "grad_norm": 0.3885590136051178, "learning_rate": 0.0002, "epoch": 1.9284497444633732, "step": 2830}, {"loss": 1.8049, "grad_norm": 0.3868715465068817, "learning_rate": 0.0002, "epoch": 1.9352640545144804, "step": 2840}, {"loss": 1.7099, "grad_norm": 0.4152422249317169, "learning_rate": 0.0002, "epoch": 1.9420783645655877, "step": 2850}, {"loss": 1.6696, "grad_norm": 0.401714563369751, "learning_rate": 0.0002, "epoch": 1.948892674616695, "step": 2860}, {"loss": 1.7182, "grad_norm": 0.34825265407562256, "learning_rate": 0.0002, "epoch": 1.9557069846678024, "step": 2870}, {"loss": 1.6612, "grad_norm": 0.3620675504207611, "learning_rate": 0.0002, "epoch": 1.9625212947189097, "step": 2880}, {"loss": 1.7451, "grad_norm": 0.3977806866168976, "learning_rate": 0.0002, "epoch": 1.969335604770017, "step": 2890}, {"loss": 1.6514, "grad_norm": 0.3687497079372406, "learning_rate": 0.0002, "epoch": 1.9761499148211243, "step": 2900}, {"loss": 1.7712, "grad_norm": 0.408640056848526, "learning_rate": 0.0002, "epoch": 1.9829642248722317, "step": 2910}, {"loss": 1.695, "grad_norm": 0.34510108828544617, "learning_rate": 0.0002, "epoch": 1.989778534923339, "step": 2920}, {"loss": 1.662, "grad_norm": 0.3596334755420685, "learning_rate": 0.0002, "epoch": 1.9965928449744463, "step": 2930}]} +{"epoch": 2.9996592844974446, "step": 4402, "epoch_duration": 2722.2362093925476, "total_accumulated_duration": 7232.217505693436, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 28746.0}, "peak_memory_reserved": {"GPU_0": 28746.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0988, "grad_norm": 0.635574460029602, "learning_rate": 0.0002, "epoch": 0.0068143100511073255, "step": 10}, {"loss": 2.5594, "grad_norm": 1.0401769876480103, "learning_rate": 0.0002, "epoch": 0.013628620102214651, "step": 20}, {"loss": 2.3587, "grad_norm": 0.4514131247997284, "learning_rate": 0.0002, "epoch": 0.020442930153321975, "step": 30}, {"loss": 2.121, "grad_norm": 0.6303355693817139, "learning_rate": 0.0002, "epoch": 0.027257240204429302, "step": 40}, {"loss": 1.9833, "grad_norm": 0.4648270606994629, "learning_rate": 0.0002, "epoch": 0.034071550255536626, "step": 50}, {"loss": 1.9384, "grad_norm": 0.42953479290008545, "learning_rate": 0.0002, "epoch": 0.04088586030664395, "step": 60}, {"loss": 1.9202, "grad_norm": 0.433614581823349, "learning_rate": 0.0002, "epoch": 0.04770017035775128, "step": 70}, {"loss": 1.911, "grad_norm": 0.45318254828453064, "learning_rate": 0.0002, "epoch": 0.054514480408858604, "step": 80}, {"loss": 1.8895, "grad_norm": 0.4023568630218506, "learning_rate": 0.0002, "epoch": 0.06132879045996593, "step": 90}, {"loss": 1.9257, "grad_norm": 0.43260207772254944, "learning_rate": 0.0002, "epoch": 0.06814310051107325, "step": 100}, {"loss": 1.9178, "grad_norm": 0.43389809131622314, "learning_rate": 0.0002, "epoch": 0.07495741056218058, "step": 110}, {"loss": 1.9071, "grad_norm": 0.39307987689971924, "learning_rate": 0.0002, "epoch": 0.0817717206132879, "step": 120}, {"loss": 1.9482, "grad_norm": 0.7703037261962891, "learning_rate": 0.0002, "epoch": 0.08858603066439523, "step": 130}, {"loss": 1.8394, "grad_norm": 0.38547563552856445, "learning_rate": 0.0002, "epoch": 0.09540034071550256, "step": 140}, {"loss": 1.9037, "grad_norm": 0.37948688864707947, "learning_rate": 0.0002, "epoch": 0.10221465076660988, "step": 150}, {"loss": 1.8664, "grad_norm": 0.33022379875183105, "learning_rate": 0.0002, "epoch": 0.10902896081771721, "step": 160}, {"loss": 1.8659, "grad_norm": 0.33703792095184326, "learning_rate": 0.0002, "epoch": 0.11584327086882454, "step": 170}, {"loss": 1.8389, "grad_norm": 0.3259912431240082, "learning_rate": 0.0002, "epoch": 0.12265758091993186, "step": 180}, {"loss": 1.8424, "grad_norm": 0.3593858778476715, "learning_rate": 0.0002, "epoch": 0.12947189097103917, "step": 190}, {"loss": 1.8457, "grad_norm": 0.3401614725589752, "learning_rate": 0.0002, "epoch": 0.1362862010221465, "step": 200}, {"loss": 1.7721, "grad_norm": 0.3892078697681427, "learning_rate": 0.0002, "epoch": 0.14310051107325383, "step": 210}, {"loss": 1.8351, "grad_norm": 0.315374493598938, "learning_rate": 0.0002, "epoch": 0.14991482112436116, "step": 220}, {"loss": 1.8307, "grad_norm": 0.3679497539997101, "learning_rate": 0.0002, "epoch": 0.1567291311754685, "step": 230}, {"loss": 1.816, "grad_norm": 0.336730033159256, "learning_rate": 0.0002, "epoch": 0.1635434412265758, "step": 240}, {"loss": 1.7849, "grad_norm": 0.36472755670547485, "learning_rate": 0.0002, "epoch": 0.17035775127768313, "step": 250}, {"loss": 1.7766, "grad_norm": 0.34864023327827454, "learning_rate": 0.0002, "epoch": 0.17717206132879046, "step": 260}, {"loss": 1.896, "grad_norm": 0.3350819945335388, "learning_rate": 0.0002, "epoch": 0.1839863713798978, "step": 270}, {"loss": 1.8742, "grad_norm": 0.3844246566295624, "learning_rate": 0.0002, "epoch": 0.19080068143100512, "step": 280}, {"loss": 1.8531, "grad_norm": 0.38413065671920776, "learning_rate": 0.0002, "epoch": 0.19761499148211242, "step": 290}, {"loss": 1.8415, "grad_norm": 0.4080047607421875, "learning_rate": 0.0002, "epoch": 0.20442930153321975, "step": 300}, {"loss": 1.9009, "grad_norm": 0.3546800911426544, "learning_rate": 0.0002, "epoch": 0.21124361158432708, "step": 310}, {"loss": 1.8092, "grad_norm": 0.38577890396118164, "learning_rate": 0.0002, "epoch": 0.21805792163543442, "step": 320}, {"loss": 1.7113, "grad_norm": 0.38979098200798035, "learning_rate": 0.0002, "epoch": 0.22487223168654175, "step": 330}, {"loss": 1.8162, "grad_norm": 0.35105520486831665, "learning_rate": 0.0002, "epoch": 0.23168654173764908, "step": 340}, {"loss": 1.8408, "grad_norm": 0.31671223044395447, "learning_rate": 0.0002, "epoch": 0.23850085178875638, "step": 350}, {"loss": 1.8014, "grad_norm": 0.33034196496009827, "learning_rate": 0.0002, "epoch": 0.2453151618398637, "step": 360}, {"loss": 1.7132, "grad_norm": 0.2990533709526062, "learning_rate": 0.0002, "epoch": 0.252129471890971, "step": 370}, {"loss": 1.8612, "grad_norm": 0.362208753824234, "learning_rate": 0.0002, "epoch": 0.25894378194207834, "step": 380}, {"loss": 1.8001, "grad_norm": 0.3269096612930298, "learning_rate": 0.0002, "epoch": 0.2657580919931857, "step": 390}, {"loss": 1.786, "grad_norm": 0.30555954575538635, "learning_rate": 0.0002, "epoch": 0.272572402044293, "step": 400}, {"loss": 1.8018, "grad_norm": 0.332933247089386, "learning_rate": 0.0002, "epoch": 0.27938671209540034, "step": 410}, {"loss": 1.8157, "grad_norm": 0.39454060792922974, "learning_rate": 0.0002, "epoch": 0.28620102214650767, "step": 420}, {"loss": 1.7862, "grad_norm": 0.34589633345603943, "learning_rate": 0.0002, "epoch": 0.293015332197615, "step": 430}, {"loss": 1.7612, "grad_norm": 0.3747332990169525, "learning_rate": 0.0002, "epoch": 0.29982964224872233, "step": 440}, {"loss": 1.8476, "grad_norm": 0.34825369715690613, "learning_rate": 0.0002, "epoch": 0.30664395229982966, "step": 450}, {"loss": 1.775, "grad_norm": 0.32906976342201233, "learning_rate": 0.0002, "epoch": 0.313458262350937, "step": 460}, {"loss": 1.7764, "grad_norm": 0.33108609914779663, "learning_rate": 0.0002, "epoch": 0.3202725724020443, "step": 470}, {"loss": 1.8012, "grad_norm": 0.3170463442802429, "learning_rate": 0.0002, "epoch": 0.3270868824531516, "step": 480}, {"loss": 1.8264, "grad_norm": 0.30792081356048584, "learning_rate": 0.0002, "epoch": 0.3339011925042589, "step": 490}, {"loss": 1.7361, "grad_norm": 0.31772997975349426, "learning_rate": 0.0002, "epoch": 0.34071550255536626, "step": 500}, {"loss": 1.8147, "grad_norm": 0.32714012265205383, "learning_rate": 0.0002, "epoch": 0.3475298126064736, "step": 510}, {"loss": 1.8332, "grad_norm": 0.3021100163459778, "learning_rate": 0.0002, "epoch": 0.3543441226575809, "step": 520}, {"loss": 1.7337, "grad_norm": 0.6045835018157959, "learning_rate": 0.0002, "epoch": 0.36115843270868825, "step": 530}, {"loss": 1.8661, "grad_norm": 0.3003896474838257, "learning_rate": 0.0002, "epoch": 0.3679727427597956, "step": 540}, {"loss": 1.8359, "grad_norm": 0.3678470551967621, "learning_rate": 0.0002, "epoch": 0.3747870528109029, "step": 550}, {"loss": 1.7512, "grad_norm": 0.35787615180015564, "learning_rate": 0.0002, "epoch": 0.38160136286201024, "step": 560}, {"loss": 1.8048, "grad_norm": 0.31882143020629883, "learning_rate": 0.0002, "epoch": 0.38841567291311757, "step": 570}, {"loss": 1.8108, "grad_norm": 0.3186313509941101, "learning_rate": 0.0002, "epoch": 0.39522998296422485, "step": 580}, {"loss": 1.8012, "grad_norm": 0.41443702578544617, "learning_rate": 0.0002, "epoch": 0.4020442930153322, "step": 590}, {"loss": 1.7794, "grad_norm": 0.28773069381713867, "learning_rate": 0.0002, "epoch": 0.4088586030664395, "step": 600}, {"loss": 1.8231, "grad_norm": 0.35743263363838196, "learning_rate": 0.0002, "epoch": 0.41567291311754684, "step": 610}, {"loss": 1.7531, "grad_norm": 0.3360286355018616, "learning_rate": 0.0002, "epoch": 0.42248722316865417, "step": 620}, {"loss": 1.7654, "grad_norm": 0.32838866114616394, "learning_rate": 0.0002, "epoch": 0.4293015332197615, "step": 630}, {"loss": 1.8176, "grad_norm": 0.2994388937950134, "learning_rate": 0.0002, "epoch": 0.43611584327086883, "step": 640}, {"loss": 1.7241, "grad_norm": 0.3306307792663574, "learning_rate": 0.0002, "epoch": 0.44293015332197616, "step": 650}, {"loss": 1.8201, "grad_norm": 0.3129560351371765, "learning_rate": 0.0002, "epoch": 0.4497444633730835, "step": 660}, {"loss": 1.803, "grad_norm": 0.3244289457798004, "learning_rate": 0.0002, "epoch": 0.4565587734241908, "step": 670}, {"loss": 1.8399, "grad_norm": 0.3196892738342285, "learning_rate": 0.0002, "epoch": 0.46337308347529815, "step": 680}, {"loss": 1.8291, "grad_norm": 0.3135230243206024, "learning_rate": 0.0002, "epoch": 0.47018739352640543, "step": 690}, {"loss": 1.7423, "grad_norm": 0.28677991032600403, "learning_rate": 0.0002, "epoch": 0.47700170357751276, "step": 700}, {"loss": 1.6982, "grad_norm": 0.3074065148830414, "learning_rate": 0.0002, "epoch": 0.4838160136286201, "step": 710}, {"loss": 1.8078, "grad_norm": 0.3354290723800659, "learning_rate": 0.0002, "epoch": 0.4906303236797274, "step": 720}, {"loss": 1.7333, "grad_norm": 0.324370801448822, "learning_rate": 0.0002, "epoch": 0.49744463373083475, "step": 730}, {"loss": 1.8506, "grad_norm": 0.29496142268180847, "learning_rate": 0.0002, "epoch": 0.504258943781942, "step": 740}, {"loss": 1.7761, "grad_norm": 0.30694130063056946, "learning_rate": 0.0002, "epoch": 0.5110732538330494, "step": 750}, {"loss": 1.7854, "grad_norm": 0.36168408393859863, "learning_rate": 0.0002, "epoch": 0.5178875638841567, "step": 760}, {"loss": 1.7149, "grad_norm": 0.2930343449115753, "learning_rate": 0.0002, "epoch": 0.524701873935264, "step": 770}, {"loss": 1.7924, "grad_norm": 0.3023432493209839, "learning_rate": 0.0002, "epoch": 0.5315161839863713, "step": 780}, {"loss": 1.8467, "grad_norm": 0.3272720277309418, "learning_rate": 0.0002, "epoch": 0.5383304940374787, "step": 790}, {"loss": 1.8639, "grad_norm": 0.2907974421977997, "learning_rate": 0.0002, "epoch": 0.545144804088586, "step": 800}, {"loss": 1.9018, "grad_norm": 0.32267168164253235, "learning_rate": 0.0002, "epoch": 0.5519591141396933, "step": 810}, {"loss": 1.8311, "grad_norm": 0.32059940695762634, "learning_rate": 0.0002, "epoch": 0.5587734241908007, "step": 820}, {"loss": 1.7234, "grad_norm": 0.30951258540153503, "learning_rate": 0.0002, "epoch": 0.565587734241908, "step": 830}, {"loss": 1.8063, "grad_norm": 0.33976122736930847, "learning_rate": 0.0002, "epoch": 0.5724020442930153, "step": 840}, {"loss": 1.7021, "grad_norm": 0.3195820450782776, "learning_rate": 0.0002, "epoch": 0.5792163543441227, "step": 850}, {"loss": 1.8073, "grad_norm": 0.2828562557697296, "learning_rate": 0.0002, "epoch": 0.58603066439523, "step": 860}, {"loss": 1.695, "grad_norm": 0.29591670632362366, "learning_rate": 0.0002, "epoch": 0.5928449744463373, "step": 870}, {"loss": 1.8109, "grad_norm": 0.3086104393005371, "learning_rate": 0.0002, "epoch": 0.5996592844974447, "step": 880}, {"loss": 1.8207, "grad_norm": 0.3592929542064667, "learning_rate": 0.0002, "epoch": 0.606473594548552, "step": 890}, {"loss": 1.7662, "grad_norm": 0.2830186188220978, "learning_rate": 0.0002, "epoch": 0.6132879045996593, "step": 900}, {"loss": 1.8344, "grad_norm": 0.3128598630428314, "learning_rate": 0.0002, "epoch": 0.6201022146507666, "step": 910}, {"loss": 1.7953, "grad_norm": 0.31957563757896423, "learning_rate": 0.0002, "epoch": 0.626916524701874, "step": 920}, {"loss": 1.7578, "grad_norm": 0.30994319915771484, "learning_rate": 0.0002, "epoch": 0.6337308347529813, "step": 930}, {"loss": 1.8494, "grad_norm": 0.3352845013141632, "learning_rate": 0.0002, "epoch": 0.6405451448040886, "step": 940}, {"loss": 1.7054, "grad_norm": 0.2960077226161957, "learning_rate": 0.0002, "epoch": 0.6473594548551959, "step": 950}, {"loss": 1.7209, "grad_norm": 0.32675081491470337, "learning_rate": 0.0002, "epoch": 0.6541737649063032, "step": 960}, {"loss": 1.7776, "grad_norm": 0.30042028427124023, "learning_rate": 0.0002, "epoch": 0.6609880749574105, "step": 970}, {"loss": 1.7597, "grad_norm": 0.3288673758506775, "learning_rate": 0.0002, "epoch": 0.6678023850085179, "step": 980}, {"loss": 1.7962, "grad_norm": 0.323215126991272, "learning_rate": 0.0002, "epoch": 0.6746166950596252, "step": 990}, {"loss": 1.821, "grad_norm": 0.30041399598121643, "learning_rate": 0.0002, "epoch": 0.6814310051107325, "step": 1000}, {"loss": 1.7716, "grad_norm": 0.3076179623603821, "learning_rate": 0.0002, "epoch": 0.6882453151618398, "step": 1010}, {"loss": 1.7827, "grad_norm": 0.2941909730434418, "learning_rate": 0.0002, "epoch": 0.6950596252129472, "step": 1020}, {"loss": 1.7964, "grad_norm": 0.32220420241355896, "learning_rate": 0.0002, "epoch": 0.7018739352640545, "step": 1030}, {"loss": 1.7101, "grad_norm": 0.2989702820777893, "learning_rate": 0.0002, "epoch": 0.7086882453151618, "step": 1040}, {"loss": 1.8749, "grad_norm": 0.3049640357494354, "learning_rate": 0.0002, "epoch": 0.7155025553662692, "step": 1050}, {"loss": 1.782, "grad_norm": 0.3183284103870392, "learning_rate": 0.0002, "epoch": 0.7223168654173765, "step": 1060}, {"loss": 1.785, "grad_norm": 0.3070095181465149, "learning_rate": 0.0002, "epoch": 0.7291311754684838, "step": 1070}, {"loss": 1.7832, "grad_norm": 0.33263063430786133, "learning_rate": 0.0002, "epoch": 0.7359454855195912, "step": 1080}, {"loss": 1.7627, "grad_norm": 0.28774312138557434, "learning_rate": 0.0002, "epoch": 0.7427597955706985, "step": 1090}, {"loss": 1.7343, "grad_norm": 0.29304224252700806, "learning_rate": 0.0002, "epoch": 0.7495741056218058, "step": 1100}, {"loss": 1.8082, "grad_norm": 0.27529507875442505, "learning_rate": 0.0002, "epoch": 0.7563884156729132, "step": 1110}, {"loss": 1.8071, "grad_norm": 0.32319945096969604, "learning_rate": 0.0002, "epoch": 0.7632027257240205, "step": 1120}, {"loss": 1.6998, "grad_norm": 0.33777597546577454, "learning_rate": 0.0002, "epoch": 0.7700170357751278, "step": 1130}, {"loss": 1.8488, "grad_norm": 0.29847201704978943, "learning_rate": 0.0002, "epoch": 0.7768313458262351, "step": 1140}, {"loss": 1.7014, "grad_norm": 0.31370633840560913, "learning_rate": 0.0002, "epoch": 0.7836456558773425, "step": 1150}, {"loss": 1.8175, "grad_norm": 0.31633856892585754, "learning_rate": 0.0002, "epoch": 0.7904599659284497, "step": 1160}, {"loss": 1.7637, "grad_norm": 0.33849263191223145, "learning_rate": 0.0002, "epoch": 0.797274275979557, "step": 1170}, {"loss": 1.8264, "grad_norm": 0.3306333124637604, "learning_rate": 0.0002, "epoch": 0.8040885860306644, "step": 1180}, {"loss": 1.777, "grad_norm": 0.3249678313732147, "learning_rate": 0.0002, "epoch": 0.8109028960817717, "step": 1190}, {"loss": 1.7631, "grad_norm": 0.3252817690372467, "learning_rate": 0.0002, "epoch": 0.817717206132879, "step": 1200}, {"loss": 1.7877, "grad_norm": 0.31772559881210327, "learning_rate": 0.0002, "epoch": 0.8245315161839863, "step": 1210}, {"loss": 1.8068, "grad_norm": 0.2803371846675873, "learning_rate": 0.0002, "epoch": 0.8313458262350937, "step": 1220}, {"loss": 1.738, "grad_norm": 0.26700571179389954, "learning_rate": 0.0002, "epoch": 0.838160136286201, "step": 1230}, {"loss": 1.8301, "grad_norm": 0.3060092031955719, "learning_rate": 0.0002, "epoch": 0.8449744463373083, "step": 1240}, {"loss": 1.7993, "grad_norm": 0.28831684589385986, "learning_rate": 0.0002, "epoch": 0.8517887563884157, "step": 1250}, {"loss": 1.6909, "grad_norm": 0.30708742141723633, "learning_rate": 0.0002, "epoch": 0.858603066439523, "step": 1260}, {"loss": 1.8506, "grad_norm": 0.2915987968444824, "learning_rate": 0.0002, "epoch": 0.8654173764906303, "step": 1270}, {"loss": 1.7536, "grad_norm": 0.2893589735031128, "learning_rate": 0.0002, "epoch": 0.8722316865417377, "step": 1280}, {"loss": 1.7437, "grad_norm": 0.29545632004737854, "learning_rate": 0.0002, "epoch": 0.879045996592845, "step": 1290}, {"loss": 1.859, "grad_norm": 0.3354771137237549, "learning_rate": 0.0002, "epoch": 0.8858603066439523, "step": 1300}, {"loss": 1.7644, "grad_norm": 0.37715399265289307, "learning_rate": 0.0002, "epoch": 0.8926746166950597, "step": 1310}, {"loss": 1.7731, "grad_norm": 0.28847193717956543, "learning_rate": 0.0002, "epoch": 0.899488926746167, "step": 1320}, {"loss": 1.9077, "grad_norm": 0.2780889868736267, "learning_rate": 0.0002, "epoch": 0.9063032367972743, "step": 1330}, {"loss": 1.7861, "grad_norm": 0.2714342176914215, "learning_rate": 0.0002, "epoch": 0.9131175468483816, "step": 1340}, {"loss": 1.8, "grad_norm": 0.2950133979320526, "learning_rate": 0.0002, "epoch": 0.919931856899489, "step": 1350}, {"loss": 1.7368, "grad_norm": 0.29097145795822144, "learning_rate": 0.0002, "epoch": 0.9267461669505963, "step": 1360}, {"loss": 1.7864, "grad_norm": 0.32540133595466614, "learning_rate": 0.0002, "epoch": 0.9335604770017035, "step": 1370}, {"loss": 1.7571, "grad_norm": 0.3076636493206024, "learning_rate": 0.0002, "epoch": 0.9403747870528109, "step": 1380}, {"loss": 1.8312, "grad_norm": 0.2962130308151245, "learning_rate": 0.0002, "epoch": 0.9471890971039182, "step": 1390}, {"loss": 1.7858, "grad_norm": 0.30086860060691833, "learning_rate": 0.0002, "epoch": 0.9540034071550255, "step": 1400}, {"loss": 1.7991, "grad_norm": 0.28634947538375854, "learning_rate": 0.0002, "epoch": 0.9608177172061328, "step": 1410}, {"loss": 1.8385, "grad_norm": 0.35314416885375977, "learning_rate": 0.0002, "epoch": 0.9676320272572402, "step": 1420}, {"loss": 1.8054, "grad_norm": 0.2939317524433136, "learning_rate": 0.0002, "epoch": 0.9744463373083475, "step": 1430}, {"loss": 1.7582, "grad_norm": 0.3010196089744568, "learning_rate": 0.0002, "epoch": 0.9812606473594548, "step": 1440}, {"loss": 1.758, "grad_norm": 0.30816152691841125, "learning_rate": 0.0002, "epoch": 0.9880749574105622, "step": 1450}, {"loss": 1.8159, "grad_norm": 0.28152793645858765, "learning_rate": 0.0002, "epoch": 0.9948892674616695, "step": 1460}, {"eval_loss": 1.8034634590148926, "eval_runtime": 53.6017, "eval_samples_per_second": 9.459, "eval_steps_per_second": 1.194, "epoch": 0.9996592844974447, "step": 1467}, {"loss": 1.7072, "grad_norm": 0.29246416687965393, "learning_rate": 0.0002, "epoch": 1.0017035775127767, "step": 1470}, {"loss": 1.8159, "grad_norm": 0.2668602168560028, "learning_rate": 0.0002, "epoch": 1.008517887563884, "step": 1480}, {"loss": 1.6868, "grad_norm": 0.2998567819595337, "learning_rate": 0.0002, "epoch": 1.0153321976149914, "step": 1490}, {"loss": 1.7331, "grad_norm": 0.3284934461116791, "learning_rate": 0.0002, "epoch": 1.0221465076660987, "step": 1500}, {"loss": 1.689, "grad_norm": 0.3275827169418335, "learning_rate": 0.0002, "epoch": 1.028960817717206, "step": 1510}, {"loss": 1.7092, "grad_norm": 0.3382718563079834, "learning_rate": 0.0002, "epoch": 1.0357751277683134, "step": 1520}, {"loss": 1.7215, "grad_norm": 0.36737215518951416, "learning_rate": 0.0002, "epoch": 1.0425894378194207, "step": 1530}, {"loss": 1.6993, "grad_norm": 0.3442603647708893, "learning_rate": 0.0002, "epoch": 1.049403747870528, "step": 1540}, {"loss": 1.6996, "grad_norm": 0.3323381245136261, "learning_rate": 0.0002, "epoch": 1.0562180579216354, "step": 1550}, {"loss": 1.7444, "grad_norm": 0.341227650642395, "learning_rate": 0.0002, "epoch": 1.0630323679727427, "step": 1560}, {"loss": 1.7419, "grad_norm": 0.3361579477787018, "learning_rate": 0.0002, "epoch": 1.06984667802385, "step": 1570}, {"loss": 1.7337, "grad_norm": 0.3556230962276459, "learning_rate": 0.0002, "epoch": 1.0766609880749574, "step": 1580}, {"loss": 1.6978, "grad_norm": 0.27130424976348877, "learning_rate": 0.0002, "epoch": 1.0834752981260647, "step": 1590}, {"loss": 1.6087, "grad_norm": 0.29366323351860046, "learning_rate": 0.0002, "epoch": 1.090289608177172, "step": 1600}, {"loss": 1.6721, "grad_norm": 0.3581245541572571, "learning_rate": 0.0002, "epoch": 1.0971039182282794, "step": 1610}, {"loss": 1.7639, "grad_norm": 0.3021670877933502, "learning_rate": 0.0002, "epoch": 1.1039182282793867, "step": 1620}, {"loss": 1.7314, "grad_norm": 0.3145572543144226, "learning_rate": 0.0002, "epoch": 1.110732538330494, "step": 1630}, {"loss": 1.7408, "grad_norm": 0.35362154245376587, "learning_rate": 0.0002, "epoch": 1.1175468483816013, "step": 1640}, {"loss": 1.7071, "grad_norm": 0.5413113236427307, "learning_rate": 0.0002, "epoch": 1.1243611584327087, "step": 1650}, {"loss": 1.7615, "grad_norm": 0.6858654022216797, "learning_rate": 0.0002, "epoch": 1.131175468483816, "step": 1660}, {"loss": 1.7029, "grad_norm": 0.3781903386116028, "learning_rate": 0.0002, "epoch": 1.1379897785349233, "step": 1670}, {"loss": 1.659, "grad_norm": 0.404864102602005, "learning_rate": 0.0002, "epoch": 1.1448040885860307, "step": 1680}, {"loss": 1.7212, "grad_norm": 0.3595100939273834, "learning_rate": 0.0002, "epoch": 1.151618398637138, "step": 1690}, {"loss": 1.7023, "grad_norm": 0.33682283759117126, "learning_rate": 0.0002, "epoch": 1.1584327086882453, "step": 1700}, {"loss": 1.7336, "grad_norm": 0.3877373933792114, "learning_rate": 0.0002, "epoch": 1.1652470187393527, "step": 1710}, {"loss": 1.7676, "grad_norm": 0.34606897830963135, "learning_rate": 0.0002, "epoch": 1.17206132879046, "step": 1720}, {"loss": 1.6889, "grad_norm": 0.3122918903827667, "learning_rate": 0.0002, "epoch": 1.1788756388415673, "step": 1730}, {"loss": 1.6585, "grad_norm": 0.34081900119781494, "learning_rate": 0.0002, "epoch": 1.1856899488926746, "step": 1740}, {"loss": 1.7794, "grad_norm": 0.3418807089328766, "learning_rate": 0.0002, "epoch": 1.192504258943782, "step": 1750}, {"loss": 1.7773, "grad_norm": 0.3495500981807709, "learning_rate": 0.0002, "epoch": 1.1993185689948893, "step": 1760}, {"loss": 1.6578, "grad_norm": 0.557288408279419, "learning_rate": 0.0002, "epoch": 1.2061328790459966, "step": 1770}, {"loss": 1.7488, "grad_norm": 0.3193778693675995, "learning_rate": 0.0002, "epoch": 1.212947189097104, "step": 1780}, {"loss": 1.6815, "grad_norm": 0.3306216895580292, "learning_rate": 0.0002, "epoch": 1.2197614991482113, "step": 1790}, {"loss": 1.6772, "grad_norm": 0.37998732924461365, "learning_rate": 0.0002, "epoch": 1.2265758091993186, "step": 1800}, {"loss": 1.7094, "grad_norm": 0.3255669176578522, "learning_rate": 0.0002, "epoch": 1.233390119250426, "step": 1810}, {"loss": 1.7096, "grad_norm": 0.3741287291049957, "learning_rate": 0.0002, "epoch": 1.2402044293015333, "step": 1820}, {"loss": 1.7317, "grad_norm": 0.36727291345596313, "learning_rate": 0.0002, "epoch": 1.2470187393526406, "step": 1830}, {"loss": 1.7418, "grad_norm": 0.3479527235031128, "learning_rate": 0.0002, "epoch": 1.253833049403748, "step": 1840}, {"loss": 1.7062, "grad_norm": 0.3472636640071869, "learning_rate": 0.0002, "epoch": 1.2606473594548553, "step": 1850}, {"loss": 1.6263, "grad_norm": 0.3702869415283203, "learning_rate": 0.0002, "epoch": 1.2674616695059626, "step": 1860}, {"loss": 1.6222, "grad_norm": 0.3934040069580078, "learning_rate": 0.0002, "epoch": 1.27427597955707, "step": 1870}, {"loss": 1.7616, "grad_norm": 0.46887534856796265, "learning_rate": 0.0002, "epoch": 1.2810902896081773, "step": 1880}, {"loss": 1.6071, "grad_norm": 0.3191998600959778, "learning_rate": 0.0002, "epoch": 1.2879045996592846, "step": 1890}, {"loss": 1.7001, "grad_norm": 0.34032225608825684, "learning_rate": 0.0002, "epoch": 1.294718909710392, "step": 1900}, {"loss": 1.8316, "grad_norm": 0.33453696966171265, "learning_rate": 0.0002, "epoch": 1.3015332197614993, "step": 1910}, {"loss": 1.6567, "grad_norm": 0.3451494872570038, "learning_rate": 0.0002, "epoch": 1.3083475298126066, "step": 1920}, {"loss": 1.708, "grad_norm": 0.36203092336654663, "learning_rate": 0.0002, "epoch": 1.315161839863714, "step": 1930}, {"loss": 1.7095, "grad_norm": 0.43794456124305725, "learning_rate": 0.0002, "epoch": 1.321976149914821, "step": 1940}, {"loss": 1.7264, "grad_norm": 0.3630591034889221, "learning_rate": 0.0002, "epoch": 1.3287904599659284, "step": 1950}, {"loss": 1.6529, "grad_norm": 0.36951911449432373, "learning_rate": 0.0002, "epoch": 1.3356047700170357, "step": 1960}, {"loss": 1.651, "grad_norm": 0.4001159965991974, "learning_rate": 0.0002, "epoch": 1.342419080068143, "step": 1970}, {"loss": 1.677, "grad_norm": 0.3820836544036865, "learning_rate": 0.0002, "epoch": 1.3492333901192504, "step": 1980}, {"loss": 1.583, "grad_norm": 0.3705870807170868, "learning_rate": 0.0002, "epoch": 1.3560477001703577, "step": 1990}, {"loss": 1.6207, "grad_norm": 0.3557972013950348, "learning_rate": 0.0002, "epoch": 1.362862010221465, "step": 2000}, {"loss": 1.7656, "grad_norm": 0.38546398282051086, "learning_rate": 0.0002, "epoch": 1.3696763202725724, "step": 2010}, {"loss": 1.6881, "grad_norm": 0.3908020853996277, "learning_rate": 0.0002, "epoch": 1.3764906303236797, "step": 2020}, {"loss": 1.727, "grad_norm": 0.3822040855884552, "learning_rate": 0.0002, "epoch": 1.383304940374787, "step": 2030}, {"loss": 1.7923, "grad_norm": 0.425327867269516, "learning_rate": 0.0002, "epoch": 1.3901192504258943, "step": 2040}, {"loss": 1.7032, "grad_norm": 0.3436259329319, "learning_rate": 0.0002, "epoch": 1.3969335604770017, "step": 2050}, {"loss": 1.6398, "grad_norm": 0.33124062418937683, "learning_rate": 0.0002, "epoch": 1.403747870528109, "step": 2060}, {"loss": 1.6815, "grad_norm": 0.3662424683570862, "learning_rate": 0.0002, "epoch": 1.4105621805792163, "step": 2070}, {"loss": 1.7258, "grad_norm": 0.3720635175704956, "learning_rate": 0.0002, "epoch": 1.4173764906303237, "step": 2080}, {"loss": 1.7186, "grad_norm": 0.3361680805683136, "learning_rate": 0.0002, "epoch": 1.424190800681431, "step": 2090}, {"loss": 1.7606, "grad_norm": 0.32751724123954773, "learning_rate": 0.0002, "epoch": 1.4310051107325383, "step": 2100}, {"loss": 1.7051, "grad_norm": 0.34333378076553345, "learning_rate": 0.0002, "epoch": 1.4378194207836457, "step": 2110}, {"loss": 1.6979, "grad_norm": 0.37777671217918396, "learning_rate": 0.0002, "epoch": 1.444633730834753, "step": 2120}, {"loss": 1.7085, "grad_norm": 0.37126365303993225, "learning_rate": 0.0002, "epoch": 1.4514480408858603, "step": 2130}, {"loss": 1.721, "grad_norm": 0.3602267801761627, "learning_rate": 0.0002, "epoch": 1.4582623509369677, "step": 2140}, {"loss": 1.8148, "grad_norm": 0.3287110924720764, "learning_rate": 0.0002, "epoch": 1.465076660988075, "step": 2150}, {"loss": 1.6966, "grad_norm": 0.3562135100364685, "learning_rate": 0.0002, "epoch": 1.4718909710391823, "step": 2160}, {"loss": 1.713, "grad_norm": 0.38292962312698364, "learning_rate": 0.0002, "epoch": 1.4787052810902896, "step": 2170}, {"loss": 1.7036, "grad_norm": 0.38220319151878357, "learning_rate": 0.0002, "epoch": 1.485519591141397, "step": 2180}, {"loss": 1.7297, "grad_norm": 0.3570062220096588, "learning_rate": 0.0002, "epoch": 1.4923339011925043, "step": 2190}, {"loss": 1.5652, "grad_norm": 0.363146036863327, "learning_rate": 0.0002, "epoch": 1.4991482112436116, "step": 2200}, {"loss": 1.6716, "grad_norm": 0.37393274903297424, "learning_rate": 0.0002, "epoch": 1.5059625212947187, "step": 2210}, {"loss": 1.6839, "grad_norm": 0.3628501892089844, "learning_rate": 0.0002, "epoch": 1.512776831345826, "step": 2220}, {"loss": 1.6627, "grad_norm": 0.33430740237236023, "learning_rate": 0.0002, "epoch": 1.5195911413969334, "step": 2230}, {"loss": 1.7495, "grad_norm": 0.35363978147506714, "learning_rate": 0.0002, "epoch": 1.5264054514480407, "step": 2240}, {"loss": 1.6364, "grad_norm": 0.37220337986946106, "learning_rate": 0.0002, "epoch": 1.533219761499148, "step": 2250}, {"loss": 1.7773, "grad_norm": 0.35020262002944946, "learning_rate": 0.0002, "epoch": 1.5400340715502554, "step": 2260}, {"loss": 1.7453, "grad_norm": 0.35274937748908997, "learning_rate": 0.0002, "epoch": 1.5468483816013627, "step": 2270}, {"loss": 1.7162, "grad_norm": 0.3797738552093506, "learning_rate": 0.0002, "epoch": 1.55366269165247, "step": 2280}, {"loss": 1.6197, "grad_norm": 0.4160412847995758, "learning_rate": 0.0002, "epoch": 1.5604770017035774, "step": 2290}, {"loss": 1.7101, "grad_norm": 0.38779592514038086, "learning_rate": 0.0002, "epoch": 1.5672913117546847, "step": 2300}, {"loss": 1.7254, "grad_norm": 0.39171287417411804, "learning_rate": 0.0002, "epoch": 1.574105621805792, "step": 2310}, {"loss": 1.7087, "grad_norm": 0.3606826663017273, "learning_rate": 0.0002, "epoch": 1.5809199318568994, "step": 2320}, {"loss": 1.7269, "grad_norm": 0.3745017647743225, "learning_rate": 0.0002, "epoch": 1.5877342419080067, "step": 2330}, {"loss": 1.7068, "grad_norm": 0.34933462738990784, "learning_rate": 0.0002, "epoch": 1.594548551959114, "step": 2340}, {"loss": 1.7073, "grad_norm": 0.37268444895744324, "learning_rate": 0.0002, "epoch": 1.6013628620102214, "step": 2350}, {"loss": 1.728, "grad_norm": 0.4603484869003296, "learning_rate": 0.0002, "epoch": 1.6081771720613287, "step": 2360}, {"loss": 1.7621, "grad_norm": 0.35689088702201843, "learning_rate": 0.0002, "epoch": 1.614991482112436, "step": 2370}, {"loss": 1.6989, "grad_norm": 0.3392031490802765, "learning_rate": 0.0002, "epoch": 1.6218057921635434, "step": 2380}, {"loss": 1.7077, "grad_norm": 0.394653856754303, "learning_rate": 0.0002, "epoch": 1.6286201022146507, "step": 2390}, {"loss": 1.7448, "grad_norm": 0.33972012996673584, "learning_rate": 0.0002, "epoch": 1.635434412265758, "step": 2400}, {"loss": 1.7681, "grad_norm": 0.3854375183582306, "learning_rate": 0.0002, "epoch": 1.6422487223168654, "step": 2410}, {"loss": 1.7102, "grad_norm": 0.36143961548805237, "learning_rate": 0.0002, "epoch": 1.6490630323679727, "step": 2420}, {"loss": 1.7432, "grad_norm": 0.35816189646720886, "learning_rate": 0.0002, "epoch": 1.65587734241908, "step": 2430}, {"loss": 1.6407, "grad_norm": 0.36298736929893494, "learning_rate": 0.0002, "epoch": 1.6626916524701874, "step": 2440}, {"loss": 1.723, "grad_norm": 0.36756929755210876, "learning_rate": 0.0002, "epoch": 1.6695059625212947, "step": 2450}, {"loss": 1.6824, "grad_norm": 0.35969603061676025, "learning_rate": 0.0002, "epoch": 1.676320272572402, "step": 2460}, {"loss": 1.7014, "grad_norm": 0.38449376821517944, "learning_rate": 0.0002, "epoch": 1.6831345826235093, "step": 2470}, {"loss": 1.7261, "grad_norm": 0.44511452317237854, "learning_rate": 0.0002, "epoch": 1.6899488926746167, "step": 2480}, {"loss": 1.7397, "grad_norm": 0.3831416368484497, "learning_rate": 0.0002, "epoch": 1.696763202725724, "step": 2490}, {"loss": 1.7046, "grad_norm": 0.3795325756072998, "learning_rate": 0.0002, "epoch": 1.7035775127768313, "step": 2500}, {"loss": 1.6993, "grad_norm": 0.34978193044662476, "learning_rate": 0.0002, "epoch": 1.7103918228279387, "step": 2510}, {"loss": 1.8307, "grad_norm": 0.35923877358436584, "learning_rate": 0.0002, "epoch": 1.717206132879046, "step": 2520}, {"loss": 1.6253, "grad_norm": 0.352999746799469, "learning_rate": 0.0002, "epoch": 1.7240204429301533, "step": 2530}, {"loss": 1.6953, "grad_norm": 0.43673479557037354, "learning_rate": 0.0002, "epoch": 1.7308347529812607, "step": 2540}, {"loss": 1.7079, "grad_norm": 0.4153687357902527, "learning_rate": 0.0002, "epoch": 1.737649063032368, "step": 2550}, {"loss": 1.6714, "grad_norm": 0.35541167855262756, "learning_rate": 0.0002, "epoch": 1.7444633730834753, "step": 2560}, {"loss": 1.696, "grad_norm": 0.3288775086402893, "learning_rate": 0.0002, "epoch": 1.7512776831345827, "step": 2570}, {"loss": 1.7486, "grad_norm": 0.3991123139858246, "learning_rate": 0.0002, "epoch": 1.75809199318569, "step": 2580}, {"loss": 1.645, "grad_norm": 0.39967241883277893, "learning_rate": 0.0002, "epoch": 1.7649063032367973, "step": 2590}, {"loss": 1.6855, "grad_norm": 0.41104283928871155, "learning_rate": 0.0002, "epoch": 1.7717206132879046, "step": 2600}, {"loss": 1.6993, "grad_norm": 0.44885286688804626, "learning_rate": 0.0002, "epoch": 1.778534923339012, "step": 2610}, {"loss": 1.7224, "grad_norm": 0.38384467363357544, "learning_rate": 0.0002, "epoch": 1.7853492333901193, "step": 2620}, {"loss": 1.7213, "grad_norm": 0.35020917654037476, "learning_rate": 0.0002, "epoch": 1.7921635434412266, "step": 2630}, {"loss": 1.6706, "grad_norm": 0.3360341489315033, "learning_rate": 0.0002, "epoch": 1.798977853492334, "step": 2640}, {"loss": 1.7037, "grad_norm": 0.38875144720077515, "learning_rate": 0.0002, "epoch": 1.8057921635434413, "step": 2650}, {"loss": 1.693, "grad_norm": 0.34876883029937744, "learning_rate": 0.0002, "epoch": 1.8126064735945486, "step": 2660}, {"loss": 1.7743, "grad_norm": 0.419979989528656, "learning_rate": 0.0002, "epoch": 1.819420783645656, "step": 2670}, {"loss": 1.7089, "grad_norm": 0.3648919463157654, "learning_rate": 0.0002, "epoch": 1.8262350936967633, "step": 2680}, {"loss": 1.7828, "grad_norm": 0.3485383987426758, "learning_rate": 0.0002, "epoch": 1.8330494037478706, "step": 2690}, {"loss": 1.705, "grad_norm": 0.3647468686103821, "learning_rate": 0.0002, "epoch": 1.839863713798978, "step": 2700}, {"loss": 1.7318, "grad_norm": 0.37003210186958313, "learning_rate": 0.0002, "epoch": 1.8466780238500853, "step": 2710}, {"loss": 1.6647, "grad_norm": 0.37031617760658264, "learning_rate": 0.0002, "epoch": 1.8534923339011926, "step": 2720}, {"loss": 1.69, "grad_norm": 0.3438796103000641, "learning_rate": 0.0002, "epoch": 1.8603066439523, "step": 2730}, {"loss": 1.5995, "grad_norm": 0.41574627161026, "learning_rate": 0.0002, "epoch": 1.8671209540034073, "step": 2740}, {"loss": 1.6877, "grad_norm": 0.35049930214881897, "learning_rate": 0.0002, "epoch": 1.8739352640545146, "step": 2750}, {"loss": 1.7048, "grad_norm": 0.3943989872932434, "learning_rate": 0.0002, "epoch": 1.880749574105622, "step": 2760}, {"loss": 1.7047, "grad_norm": 0.3384978175163269, "learning_rate": 0.0002, "epoch": 1.8875638841567293, "step": 2770}, {"loss": 1.7848, "grad_norm": 0.3501328229904175, "learning_rate": 0.0002, "epoch": 1.8943781942078366, "step": 2780}, {"loss": 1.6986, "grad_norm": 0.37484532594680786, "learning_rate": 0.0002, "epoch": 1.901192504258944, "step": 2790}, {"loss": 1.6867, "grad_norm": 0.34497788548469543, "learning_rate": 0.0002, "epoch": 1.9080068143100513, "step": 2800}, {"loss": 1.6513, "grad_norm": 0.3530851900577545, "learning_rate": 0.0002, "epoch": 1.9148211243611586, "step": 2810}, {"loss": 1.6369, "grad_norm": 0.3879254162311554, "learning_rate": 0.0002, "epoch": 1.921635434412266, "step": 2820}, {"loss": 1.6786, "grad_norm": 0.3885590136051178, "learning_rate": 0.0002, "epoch": 1.9284497444633732, "step": 2830}, {"loss": 1.8049, "grad_norm": 0.3868715465068817, "learning_rate": 0.0002, "epoch": 1.9352640545144804, "step": 2840}, {"loss": 1.7099, "grad_norm": 0.4152422249317169, "learning_rate": 0.0002, "epoch": 1.9420783645655877, "step": 2850}, {"loss": 1.6696, "grad_norm": 0.401714563369751, "learning_rate": 0.0002, "epoch": 1.948892674616695, "step": 2860}, {"loss": 1.7182, "grad_norm": 0.34825265407562256, "learning_rate": 0.0002, "epoch": 1.9557069846678024, "step": 2870}, {"loss": 1.6612, "grad_norm": 0.3620675504207611, "learning_rate": 0.0002, "epoch": 1.9625212947189097, "step": 2880}, {"loss": 1.7451, "grad_norm": 0.3977806866168976, "learning_rate": 0.0002, "epoch": 1.969335604770017, "step": 2890}, {"loss": 1.6514, "grad_norm": 0.3687497079372406, "learning_rate": 0.0002, "epoch": 1.9761499148211243, "step": 2900}, {"loss": 1.7712, "grad_norm": 0.408640056848526, "learning_rate": 0.0002, "epoch": 1.9829642248722317, "step": 2910}, {"loss": 1.695, "grad_norm": 0.34510108828544617, "learning_rate": 0.0002, "epoch": 1.989778534923339, "step": 2920}, {"loss": 1.662, "grad_norm": 0.3596334755420685, "learning_rate": 0.0002, "epoch": 1.9965928449744463, "step": 2930}, {"eval_loss": 1.8056600093841553, "eval_runtime": 60.7049, "eval_samples_per_second": 8.352, "eval_steps_per_second": 1.054, "epoch": 2.0, "step": 2935}, {"loss": 1.6918, "grad_norm": 0.3460802137851715, "learning_rate": 0.0002, "epoch": 2.0034071550255534, "step": 2940}, {"loss": 1.6036, "grad_norm": 0.4038620591163635, "learning_rate": 0.0002, "epoch": 2.0102214650766608, "step": 2950}, {"loss": 1.5386, "grad_norm": 0.3950219750404358, "learning_rate": 0.0002, "epoch": 2.017035775127768, "step": 2960}, {"loss": 1.5649, "grad_norm": 0.519116997718811, "learning_rate": 0.0002, "epoch": 2.0238500851788754, "step": 2970}, {"loss": 1.5397, "grad_norm": 0.4097684919834137, "learning_rate": 0.0002, "epoch": 2.0306643952299828, "step": 2980}, {"loss": 1.6044, "grad_norm": 0.4153544306755066, "learning_rate": 0.0002, "epoch": 2.03747870528109, "step": 2990}, {"loss": 1.6006, "grad_norm": 0.4351160526275635, "learning_rate": 0.0002, "epoch": 2.0442930153321974, "step": 3000}, {"loss": 1.5091, "grad_norm": 0.42036259174346924, "learning_rate": 0.0002, "epoch": 2.0511073253833048, "step": 3010}, {"loss": 1.5686, "grad_norm": 0.4433218836784363, "learning_rate": 0.0002, "epoch": 2.057921635434412, "step": 3020}, {"loss": 1.5478, "grad_norm": 0.46511581540107727, "learning_rate": 0.0002, "epoch": 2.0647359454855194, "step": 3030}, {"loss": 1.5554, "grad_norm": 0.4567560851573944, "learning_rate": 0.0002, "epoch": 2.0715502555366268, "step": 3040}, {"loss": 1.5561, "grad_norm": 0.45671048760414124, "learning_rate": 0.0002, "epoch": 2.078364565587734, "step": 3050}, {"loss": 1.606, "grad_norm": 0.4598552882671356, "learning_rate": 0.0002, "epoch": 2.0851788756388414, "step": 3060}, {"loss": 1.6357, "grad_norm": 0.4582861661911011, "learning_rate": 0.0002, "epoch": 2.0919931856899487, "step": 3070}, {"loss": 1.5853, "grad_norm": 0.4366969168186188, "learning_rate": 0.0002, "epoch": 2.098807495741056, "step": 3080}, {"loss": 1.5574, "grad_norm": 0.495917409658432, "learning_rate": 0.0002, "epoch": 2.1056218057921634, "step": 3090}, {"loss": 1.5798, "grad_norm": 1.6846044063568115, "learning_rate": 0.0002, "epoch": 2.1124361158432707, "step": 3100}, {"loss": 1.5877, "grad_norm": 0.4765092134475708, "learning_rate": 0.0002, "epoch": 2.119250425894378, "step": 3110}, {"loss": 1.6106, "grad_norm": 0.45029810070991516, "learning_rate": 0.0002, "epoch": 2.1260647359454854, "step": 3120}, {"loss": 1.5707, "grad_norm": 0.5706973075866699, "learning_rate": 0.0002, "epoch": 2.1328790459965927, "step": 3130}, {"loss": 1.589, "grad_norm": 0.4606274366378784, "learning_rate": 0.0002, "epoch": 2.1396933560477, "step": 3140}, {"loss": 1.564, "grad_norm": 2.199115753173828, "learning_rate": 0.0002, "epoch": 2.1465076660988074, "step": 3150}, {"loss": 1.5808, "grad_norm": 0.6231027245521545, "learning_rate": 0.0002, "epoch": 2.1533219761499147, "step": 3160}, {"loss": 1.5397, "grad_norm": 0.46918219327926636, "learning_rate": 0.0002, "epoch": 2.160136286201022, "step": 3170}, {"loss": 1.6082, "grad_norm": 0.5006393194198608, "learning_rate": 0.0002, "epoch": 2.1669505962521294, "step": 3180}, {"loss": 1.591, "grad_norm": 0.4745093286037445, "learning_rate": 0.0002, "epoch": 2.1737649063032367, "step": 3190}, {"loss": 1.6459, "grad_norm": 0.511576771736145, "learning_rate": 0.0002, "epoch": 2.180579216354344, "step": 3200}, {"loss": 1.6019, "grad_norm": 0.38622918725013733, "learning_rate": 0.0002, "epoch": 2.1873935264054514, "step": 3210}, {"loss": 1.6187, "grad_norm": 0.4425644278526306, "learning_rate": 0.0002, "epoch": 2.1942078364565587, "step": 3220}, {"loss": 1.6114, "grad_norm": 0.45643091201782227, "learning_rate": 0.0002, "epoch": 2.201022146507666, "step": 3230}, {"loss": 1.6224, "grad_norm": 0.4975406229496002, "learning_rate": 0.0002, "epoch": 2.2078364565587734, "step": 3240}, {"loss": 1.6654, "grad_norm": 0.4673331081867218, "learning_rate": 0.0002, "epoch": 2.2146507666098807, "step": 3250}, {"loss": 1.6155, "grad_norm": 0.5081731081008911, "learning_rate": 0.0002, "epoch": 2.221465076660988, "step": 3260}, {"loss": 1.53, "grad_norm": 0.4790806770324707, "learning_rate": 0.0002, "epoch": 2.2282793867120954, "step": 3270}, {"loss": 1.6362, "grad_norm": 0.5184140801429749, "learning_rate": 0.0002, "epoch": 2.2350936967632027, "step": 3280}, {"loss": 1.5646, "grad_norm": 0.5159083604812622, "learning_rate": 0.0002, "epoch": 2.24190800681431, "step": 3290}, {"loss": 1.6577, "grad_norm": 0.4876042604446411, "learning_rate": 0.0002, "epoch": 2.2487223168654173, "step": 3300}, {"loss": 1.5718, "grad_norm": 0.5454957485198975, "learning_rate": 0.0002, "epoch": 2.2555366269165247, "step": 3310}, {"loss": 1.5885, "grad_norm": 0.49866822361946106, "learning_rate": 0.0002, "epoch": 2.262350936967632, "step": 3320}, {"loss": 1.5923, "grad_norm": 0.42674365639686584, "learning_rate": 0.0002, "epoch": 2.2691652470187393, "step": 3330}, {"loss": 1.5588, "grad_norm": 0.5202316641807556, "learning_rate": 0.0002, "epoch": 2.2759795570698467, "step": 3340}, {"loss": 1.6032, "grad_norm": 0.4849465489387512, "learning_rate": 0.0002, "epoch": 2.282793867120954, "step": 3350}, {"loss": 1.6853, "grad_norm": 0.47202569246292114, "learning_rate": 0.0002, "epoch": 2.2896081771720613, "step": 3360}, {"loss": 1.6164, "grad_norm": 0.5311620235443115, "learning_rate": 0.0002, "epoch": 2.2964224872231687, "step": 3370}, {"loss": 1.6004, "grad_norm": 0.49011409282684326, "learning_rate": 0.0002, "epoch": 2.303236797274276, "step": 3380}, {"loss": 1.5484, "grad_norm": 0.4789247512817383, "learning_rate": 0.0002, "epoch": 2.3100511073253833, "step": 3390}, {"loss": 1.5866, "grad_norm": 0.46646103262901306, "learning_rate": 0.0002, "epoch": 2.3168654173764907, "step": 3400}, {"loss": 1.5308, "grad_norm": 0.5552441477775574, "learning_rate": 0.0002, "epoch": 2.323679727427598, "step": 3410}, {"loss": 1.5761, "grad_norm": 0.4530351758003235, "learning_rate": 0.0002, "epoch": 2.3304940374787053, "step": 3420}, {"loss": 1.5919, "grad_norm": 0.4806232750415802, "learning_rate": 0.0002, "epoch": 2.3373083475298126, "step": 3430}, {"loss": 1.5569, "grad_norm": 0.5998363494873047, "learning_rate": 0.0002, "epoch": 2.34412265758092, "step": 3440}, {"loss": 1.513, "grad_norm": 0.4918554425239563, "learning_rate": 0.0002, "epoch": 2.3509369676320273, "step": 3450}, {"loss": 1.6323, "grad_norm": 0.5359559655189514, "learning_rate": 0.0002, "epoch": 2.3577512776831346, "step": 3460}, {"loss": 1.5973, "grad_norm": 0.5053277611732483, "learning_rate": 0.0002, "epoch": 2.364565587734242, "step": 3470}, {"loss": 1.5673, "grad_norm": 0.5058915019035339, "learning_rate": 0.0002, "epoch": 2.3713798977853493, "step": 3480}, {"loss": 1.5434, "grad_norm": 0.5314899682998657, "learning_rate": 0.0002, "epoch": 2.3781942078364566, "step": 3490}, {"loss": 1.5203, "grad_norm": 0.48035913705825806, "learning_rate": 0.0002, "epoch": 2.385008517887564, "step": 3500}, {"loss": 1.5936, "grad_norm": 0.45864903926849365, "learning_rate": 0.0002, "epoch": 2.3918228279386713, "step": 3510}, {"loss": 1.6285, "grad_norm": 0.4553050398826599, "learning_rate": 0.0002, "epoch": 2.3986371379897786, "step": 3520}, {"loss": 1.5444, "grad_norm": 0.4483442008495331, "learning_rate": 0.0002, "epoch": 2.405451448040886, "step": 3530}, {"loss": 1.587, "grad_norm": 0.5043742060661316, "learning_rate": 0.0002, "epoch": 2.4122657580919933, "step": 3540}, {"loss": 1.5692, "grad_norm": 0.44807168841362, "learning_rate": 0.0002, "epoch": 2.4190800681431006, "step": 3550}, {"loss": 1.6306, "grad_norm": 0.5065137147903442, "learning_rate": 0.0002, "epoch": 2.425894378194208, "step": 3560}, {"loss": 1.5842, "grad_norm": 0.5186443328857422, "learning_rate": 0.0002, "epoch": 2.4327086882453153, "step": 3570}, {"loss": 1.5956, "grad_norm": 0.49743232131004333, "learning_rate": 0.0002, "epoch": 2.4395229982964226, "step": 3580}, {"loss": 1.6021, "grad_norm": 0.524450421333313, "learning_rate": 0.0002, "epoch": 2.44633730834753, "step": 3590}, {"loss": 1.6283, "grad_norm": 0.5053797364234924, "learning_rate": 0.0002, "epoch": 2.4531516183986373, "step": 3600}, {"loss": 1.6335, "grad_norm": 0.5223091840744019, "learning_rate": 0.0002, "epoch": 2.4599659284497446, "step": 3610}, {"loss": 1.6315, "grad_norm": 0.4763810932636261, "learning_rate": 0.0002, "epoch": 2.466780238500852, "step": 3620}, {"loss": 1.5623, "grad_norm": 0.5097282528877258, "learning_rate": 0.0002, "epoch": 2.4735945485519593, "step": 3630}, {"loss": 1.605, "grad_norm": 0.5831942558288574, "learning_rate": 0.0002, "epoch": 2.4804088586030666, "step": 3640}, {"loss": 1.6074, "grad_norm": 0.47573572397232056, "learning_rate": 0.0002, "epoch": 2.487223168654174, "step": 3650}, {"loss": 1.6411, "grad_norm": 0.49602726101875305, "learning_rate": 0.0002, "epoch": 2.4940374787052813, "step": 3660}, {"loss": 1.571, "grad_norm": 0.5468524694442749, "learning_rate": 0.0002, "epoch": 2.500851788756388, "step": 3670}, {"loss": 1.5782, "grad_norm": 0.45899373292922974, "learning_rate": 0.0002, "epoch": 2.507666098807496, "step": 3680}, {"loss": 1.5114, "grad_norm": 0.5031567215919495, "learning_rate": 0.0002, "epoch": 2.514480408858603, "step": 3690}, {"loss": 1.538, "grad_norm": 0.5224900841712952, "learning_rate": 0.0002, "epoch": 2.5212947189097106, "step": 3700}, {"loss": 1.5269, "grad_norm": 0.504769504070282, "learning_rate": 0.0002, "epoch": 2.5281090289608175, "step": 3710}, {"loss": 1.5141, "grad_norm": 0.6120529770851135, "learning_rate": 0.0002, "epoch": 2.5349233390119252, "step": 3720}, {"loss": 1.5666, "grad_norm": 0.47930678725242615, "learning_rate": 0.0002, "epoch": 2.541737649063032, "step": 3730}, {"loss": 1.6156, "grad_norm": 0.5039092302322388, "learning_rate": 0.0002, "epoch": 2.54855195911414, "step": 3740}, {"loss": 1.5585, "grad_norm": 0.49758994579315186, "learning_rate": 0.0002, "epoch": 2.555366269165247, "step": 3750}, {"loss": 1.5351, "grad_norm": 0.44739171862602234, "learning_rate": 0.0002, "epoch": 2.5621805792163546, "step": 3760}, {"loss": 1.5099, "grad_norm": 0.47056373953819275, "learning_rate": 0.0002, "epoch": 2.5689948892674614, "step": 3770}, {"loss": 1.5524, "grad_norm": 0.5077595114707947, "learning_rate": 0.0002, "epoch": 2.575809199318569, "step": 3780}, {"loss": 1.5524, "grad_norm": 0.4981902837753296, "learning_rate": 0.0002, "epoch": 2.582623509369676, "step": 3790}, {"loss": 1.5381, "grad_norm": 0.5736238360404968, "learning_rate": 0.0002, "epoch": 2.589437819420784, "step": 3800}, {"loss": 1.67, "grad_norm": 0.4898384213447571, "learning_rate": 0.0002, "epoch": 2.5962521294718908, "step": 3810}, {"loss": 1.5411, "grad_norm": 0.4611325263977051, "learning_rate": 0.0002, "epoch": 2.6030664395229985, "step": 3820}, {"loss": 1.5662, "grad_norm": 0.5285341739654541, "learning_rate": 0.0002, "epoch": 2.6098807495741054, "step": 3830}, {"loss": 1.5875, "grad_norm": 0.5679430961608887, "learning_rate": 0.0002, "epoch": 2.616695059625213, "step": 3840}, {"loss": 1.5544, "grad_norm": 0.48532548546791077, "learning_rate": 0.0002, "epoch": 2.62350936967632, "step": 3850}, {"loss": 1.579, "grad_norm": 0.45506492257118225, "learning_rate": 0.0002, "epoch": 2.630323679727428, "step": 3860}, {"loss": 1.5775, "grad_norm": 0.6552556753158569, "learning_rate": 0.0002, "epoch": 2.6371379897785348, "step": 3870}, {"loss": 1.608, "grad_norm": 0.537874698638916, "learning_rate": 0.0002, "epoch": 2.643952299829642, "step": 3880}, {"loss": 1.5653, "grad_norm": 0.46102389693260193, "learning_rate": 0.0002, "epoch": 2.6507666098807494, "step": 3890}, {"loss": 1.5293, "grad_norm": 0.45531195402145386, "learning_rate": 0.0002, "epoch": 2.6575809199318567, "step": 3900}, {"loss": 1.5492, "grad_norm": 0.5327293872833252, "learning_rate": 0.0002, "epoch": 2.664395229982964, "step": 3910}, {"loss": 1.5921, "grad_norm": 0.4968956410884857, "learning_rate": 0.0002, "epoch": 2.6712095400340714, "step": 3920}, {"loss": 1.5823, "grad_norm": 0.4790082275867462, "learning_rate": 0.0002, "epoch": 2.6780238500851787, "step": 3930}, {"loss": 1.615, "grad_norm": 0.5392967462539673, "learning_rate": 0.0002, "epoch": 2.684838160136286, "step": 3940}, {"loss": 1.6218, "grad_norm": 0.5076649785041809, "learning_rate": 0.0002, "epoch": 2.6916524701873934, "step": 3950}, {"loss": 1.6478, "grad_norm": 0.5628064274787903, "learning_rate": 0.0002, "epoch": 2.6984667802385007, "step": 3960}, {"loss": 1.5417, "grad_norm": 0.5012659430503845, "learning_rate": 0.0002, "epoch": 2.705281090289608, "step": 3970}, {"loss": 1.5339, "grad_norm": 0.4947647452354431, "learning_rate": 0.0002, "epoch": 2.7120954003407154, "step": 3980}, {"loss": 1.5724, "grad_norm": 0.4890969693660736, "learning_rate": 0.0002, "epoch": 2.7189097103918227, "step": 3990}, {"loss": 1.5746, "grad_norm": 0.4471694231033325, "learning_rate": 0.0002, "epoch": 2.72572402044293, "step": 4000}, {"loss": 1.6669, "grad_norm": 0.5116439461708069, "learning_rate": 0.0002, "epoch": 2.7325383304940374, "step": 4010}, {"loss": 1.584, "grad_norm": 0.5720411539077759, "learning_rate": 0.0002, "epoch": 2.7393526405451447, "step": 4020}, {"loss": 1.6151, "grad_norm": 0.5529406070709229, "learning_rate": 0.0002, "epoch": 2.746166950596252, "step": 4030}, {"loss": 1.6296, "grad_norm": 0.5229396820068359, "learning_rate": 0.0002, "epoch": 2.7529812606473594, "step": 4040}, {"loss": 1.5363, "grad_norm": 0.5270276069641113, "learning_rate": 0.0002, "epoch": 2.7597955706984667, "step": 4050}, {"loss": 1.6305, "grad_norm": 0.48413026332855225, "learning_rate": 0.0002, "epoch": 2.766609880749574, "step": 4060}, {"loss": 1.5488, "grad_norm": 0.5145403742790222, "learning_rate": 0.0002, "epoch": 2.7734241908006814, "step": 4070}, {"loss": 1.6023, "grad_norm": 0.48626071214675903, "learning_rate": 0.0002, "epoch": 2.7802385008517887, "step": 4080}, {"loss": 1.6082, "grad_norm": 0.5018984079360962, "learning_rate": 0.0002, "epoch": 2.787052810902896, "step": 4090}, {"loss": 1.7166, "grad_norm": 0.4881938695907593, "learning_rate": 0.0002, "epoch": 2.7938671209540034, "step": 4100}, {"loss": 1.515, "grad_norm": 0.5151546001434326, "learning_rate": 0.0002, "epoch": 2.8006814310051107, "step": 4110}, {"loss": 1.6069, "grad_norm": 0.5109850764274597, "learning_rate": 0.0002, "epoch": 2.807495741056218, "step": 4120}, {"loss": 1.6153, "grad_norm": 0.5109251141548157, "learning_rate": 0.0002, "epoch": 2.8143100511073254, "step": 4130}, {"loss": 1.6365, "grad_norm": 0.5025496482849121, "learning_rate": 0.0002, "epoch": 2.8211243611584327, "step": 4140}, {"loss": 1.6292, "grad_norm": 0.49027004837989807, "learning_rate": 0.0002, "epoch": 2.82793867120954, "step": 4150}, {"loss": 1.5591, "grad_norm": 0.4957362413406372, "learning_rate": 0.0002, "epoch": 2.8347529812606473, "step": 4160}, {"loss": 1.6759, "grad_norm": 0.5159927606582642, "learning_rate": 0.0002, "epoch": 2.8415672913117547, "step": 4170}, {"loss": 1.577, "grad_norm": 0.6040670871734619, "learning_rate": 0.0002, "epoch": 2.848381601362862, "step": 4180}, {"loss": 1.5295, "grad_norm": 0.5489953756332397, "learning_rate": 0.0002, "epoch": 2.8551959114139693, "step": 4190}, {"loss": 1.5909, "grad_norm": 0.5416634678840637, "learning_rate": 0.0002, "epoch": 2.8620102214650767, "step": 4200}, {"loss": 1.6014, "grad_norm": 0.5278245210647583, "learning_rate": 0.0002, "epoch": 2.868824531516184, "step": 4210}, {"loss": 1.4848, "grad_norm": 0.43382319808006287, "learning_rate": 0.0002, "epoch": 2.8756388415672913, "step": 4220}, {"loss": 1.6092, "grad_norm": 0.4724387228488922, "learning_rate": 0.0002, "epoch": 2.8824531516183987, "step": 4230}, {"loss": 1.5748, "grad_norm": 0.49824780225753784, "learning_rate": 0.0002, "epoch": 2.889267461669506, "step": 4240}, {"loss": 1.6163, "grad_norm": 0.5360262989997864, "learning_rate": 0.0002, "epoch": 2.8960817717206133, "step": 4250}, {"loss": 1.5934, "grad_norm": 0.49090322852134705, "learning_rate": 0.0002, "epoch": 2.9028960817717206, "step": 4260}, {"loss": 1.624, "grad_norm": 0.5613328218460083, "learning_rate": 0.0002, "epoch": 2.909710391822828, "step": 4270}, {"loss": 1.5713, "grad_norm": 0.4611356258392334, "learning_rate": 0.0002, "epoch": 2.9165247018739353, "step": 4280}, {"loss": 1.5457, "grad_norm": 0.550897479057312, "learning_rate": 0.0002, "epoch": 2.9233390119250426, "step": 4290}, {"loss": 1.6225, "grad_norm": 0.5089612603187561, "learning_rate": 0.0002, "epoch": 2.93015332197615, "step": 4300}, {"loss": 1.5897, "grad_norm": 0.5210904479026794, "learning_rate": 0.0002, "epoch": 2.9369676320272573, "step": 4310}, {"loss": 1.6198, "grad_norm": 0.5506424903869629, "learning_rate": 0.0002, "epoch": 2.9437819420783646, "step": 4320}, {"loss": 1.6395, "grad_norm": 0.5118561387062073, "learning_rate": 0.0002, "epoch": 2.950596252129472, "step": 4330}, {"loss": 1.704, "grad_norm": 0.5034464597702026, "learning_rate": 0.0002, "epoch": 2.9574105621805793, "step": 4340}, {"loss": 1.6314, "grad_norm": 0.5019990801811218, "learning_rate": 0.0002, "epoch": 2.9642248722316866, "step": 4350}, {"loss": 1.6161, "grad_norm": 0.5423325300216675, "learning_rate": 0.0002, "epoch": 2.971039182282794, "step": 4360}, {"loss": 1.6144, "grad_norm": 0.5287469625473022, "learning_rate": 0.0002, "epoch": 2.9778534923339013, "step": 4370}, {"loss": 1.5227, "grad_norm": 0.5206913352012634, "learning_rate": 0.0002, "epoch": 2.9846678023850086, "step": 4380}, {"loss": 1.6026, "grad_norm": 0.5407394170761108, "learning_rate": 0.0002, "epoch": 2.991482112436116, "step": 4390}, {"loss": 1.5908, "grad_norm": 0.5244600176811218, "learning_rate": 0.0002, "epoch": 2.9982964224872233, "step": 4400}]} +{"epoch": 4.0, "step": 5870, "epoch_duration": 2495.3220834732056, "total_accumulated_duration": 9727.539589166641, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7751.47119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 28746.0}, "peak_memory_reserved": {"GPU_0": 28746.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0988, "grad_norm": 0.635574460029602, "learning_rate": 0.0002, "epoch": 0.0068143100511073255, "step": 10}, {"loss": 2.5594, "grad_norm": 1.0401769876480103, "learning_rate": 0.0002, "epoch": 0.013628620102214651, "step": 20}, {"loss": 2.3587, "grad_norm": 0.4514131247997284, "learning_rate": 0.0002, "epoch": 0.020442930153321975, "step": 30}, {"loss": 2.121, "grad_norm": 0.6303355693817139, "learning_rate": 0.0002, "epoch": 0.027257240204429302, "step": 40}, {"loss": 1.9833, "grad_norm": 0.4648270606994629, "learning_rate": 0.0002, "epoch": 0.034071550255536626, "step": 50}, {"loss": 1.9384, "grad_norm": 0.42953479290008545, "learning_rate": 0.0002, "epoch": 0.04088586030664395, "step": 60}, {"loss": 1.9202, "grad_norm": 0.433614581823349, "learning_rate": 0.0002, "epoch": 0.04770017035775128, "step": 70}, {"loss": 1.911, "grad_norm": 0.45318254828453064, "learning_rate": 0.0002, "epoch": 0.054514480408858604, "step": 80}, {"loss": 1.8895, "grad_norm": 0.4023568630218506, "learning_rate": 0.0002, "epoch": 0.06132879045996593, "step": 90}, {"loss": 1.9257, "grad_norm": 0.43260207772254944, "learning_rate": 0.0002, "epoch": 0.06814310051107325, "step": 100}, {"loss": 1.9178, "grad_norm": 0.43389809131622314, "learning_rate": 0.0002, "epoch": 0.07495741056218058, "step": 110}, {"loss": 1.9071, "grad_norm": 0.39307987689971924, "learning_rate": 0.0002, "epoch": 0.0817717206132879, "step": 120}, {"loss": 1.9482, "grad_norm": 0.7703037261962891, "learning_rate": 0.0002, "epoch": 0.08858603066439523, "step": 130}, {"loss": 1.8394, "grad_norm": 0.38547563552856445, "learning_rate": 0.0002, "epoch": 0.09540034071550256, "step": 140}, {"loss": 1.9037, "grad_norm": 0.37948688864707947, "learning_rate": 0.0002, "epoch": 0.10221465076660988, "step": 150}, {"loss": 1.8664, "grad_norm": 0.33022379875183105, "learning_rate": 0.0002, "epoch": 0.10902896081771721, "step": 160}, {"loss": 1.8659, "grad_norm": 0.33703792095184326, "learning_rate": 0.0002, "epoch": 0.11584327086882454, "step": 170}, {"loss": 1.8389, "grad_norm": 0.3259912431240082, "learning_rate": 0.0002, "epoch": 0.12265758091993186, "step": 180}, {"loss": 1.8424, "grad_norm": 0.3593858778476715, "learning_rate": 0.0002, "epoch": 0.12947189097103917, "step": 190}, {"loss": 1.8457, "grad_norm": 0.3401614725589752, "learning_rate": 0.0002, "epoch": 0.1362862010221465, "step": 200}, {"loss": 1.7721, "grad_norm": 0.3892078697681427, "learning_rate": 0.0002, "epoch": 0.14310051107325383, "step": 210}, {"loss": 1.8351, "grad_norm": 0.315374493598938, "learning_rate": 0.0002, "epoch": 0.14991482112436116, "step": 220}, {"loss": 1.8307, "grad_norm": 0.3679497539997101, "learning_rate": 0.0002, "epoch": 0.1567291311754685, "step": 230}, {"loss": 1.816, "grad_norm": 0.336730033159256, "learning_rate": 0.0002, "epoch": 0.1635434412265758, "step": 240}, {"loss": 1.7849, "grad_norm": 0.36472755670547485, "learning_rate": 0.0002, "epoch": 0.17035775127768313, "step": 250}, {"loss": 1.7766, "grad_norm": 0.34864023327827454, "learning_rate": 0.0002, "epoch": 0.17717206132879046, "step": 260}, {"loss": 1.896, "grad_norm": 0.3350819945335388, "learning_rate": 0.0002, "epoch": 0.1839863713798978, "step": 270}, {"loss": 1.8742, "grad_norm": 0.3844246566295624, "learning_rate": 0.0002, "epoch": 0.19080068143100512, "step": 280}, {"loss": 1.8531, "grad_norm": 0.38413065671920776, "learning_rate": 0.0002, "epoch": 0.19761499148211242, "step": 290}, {"loss": 1.8415, "grad_norm": 0.4080047607421875, "learning_rate": 0.0002, "epoch": 0.20442930153321975, "step": 300}, {"loss": 1.9009, "grad_norm": 0.3546800911426544, "learning_rate": 0.0002, "epoch": 0.21124361158432708, "step": 310}, {"loss": 1.8092, "grad_norm": 0.38577890396118164, "learning_rate": 0.0002, "epoch": 0.21805792163543442, "step": 320}, {"loss": 1.7113, "grad_norm": 0.38979098200798035, "learning_rate": 0.0002, "epoch": 0.22487223168654175, "step": 330}, {"loss": 1.8162, "grad_norm": 0.35105520486831665, "learning_rate": 0.0002, "epoch": 0.23168654173764908, "step": 340}, {"loss": 1.8408, "grad_norm": 0.31671223044395447, "learning_rate": 0.0002, "epoch": 0.23850085178875638, "step": 350}, {"loss": 1.8014, "grad_norm": 0.33034196496009827, "learning_rate": 0.0002, "epoch": 0.2453151618398637, "step": 360}, {"loss": 1.7132, "grad_norm": 0.2990533709526062, "learning_rate": 0.0002, "epoch": 0.252129471890971, "step": 370}, {"loss": 1.8612, "grad_norm": 0.362208753824234, "learning_rate": 0.0002, "epoch": 0.25894378194207834, "step": 380}, {"loss": 1.8001, "grad_norm": 0.3269096612930298, "learning_rate": 0.0002, "epoch": 0.2657580919931857, "step": 390}, {"loss": 1.786, "grad_norm": 0.30555954575538635, "learning_rate": 0.0002, "epoch": 0.272572402044293, "step": 400}, {"loss": 1.8018, "grad_norm": 0.332933247089386, "learning_rate": 0.0002, "epoch": 0.27938671209540034, "step": 410}, {"loss": 1.8157, "grad_norm": 0.39454060792922974, "learning_rate": 0.0002, "epoch": 0.28620102214650767, "step": 420}, {"loss": 1.7862, "grad_norm": 0.34589633345603943, "learning_rate": 0.0002, "epoch": 0.293015332197615, "step": 430}, {"loss": 1.7612, "grad_norm": 0.3747332990169525, "learning_rate": 0.0002, "epoch": 0.29982964224872233, "step": 440}, {"loss": 1.8476, "grad_norm": 0.34825369715690613, "learning_rate": 0.0002, "epoch": 0.30664395229982966, "step": 450}, {"loss": 1.775, "grad_norm": 0.32906976342201233, "learning_rate": 0.0002, "epoch": 0.313458262350937, "step": 460}, {"loss": 1.7764, "grad_norm": 0.33108609914779663, "learning_rate": 0.0002, "epoch": 0.3202725724020443, "step": 470}, {"loss": 1.8012, "grad_norm": 0.3170463442802429, "learning_rate": 0.0002, "epoch": 0.3270868824531516, "step": 480}, {"loss": 1.8264, "grad_norm": 0.30792081356048584, "learning_rate": 0.0002, "epoch": 0.3339011925042589, "step": 490}, {"loss": 1.7361, "grad_norm": 0.31772997975349426, "learning_rate": 0.0002, "epoch": 0.34071550255536626, "step": 500}, {"loss": 1.8147, "grad_norm": 0.32714012265205383, "learning_rate": 0.0002, "epoch": 0.3475298126064736, "step": 510}, {"loss": 1.8332, "grad_norm": 0.3021100163459778, "learning_rate": 0.0002, "epoch": 0.3543441226575809, "step": 520}, {"loss": 1.7337, "grad_norm": 0.6045835018157959, "learning_rate": 0.0002, "epoch": 0.36115843270868825, "step": 530}, {"loss": 1.8661, "grad_norm": 0.3003896474838257, "learning_rate": 0.0002, "epoch": 0.3679727427597956, "step": 540}, {"loss": 1.8359, "grad_norm": 0.3678470551967621, "learning_rate": 0.0002, "epoch": 0.3747870528109029, "step": 550}, {"loss": 1.7512, "grad_norm": 0.35787615180015564, "learning_rate": 0.0002, "epoch": 0.38160136286201024, "step": 560}, {"loss": 1.8048, "grad_norm": 0.31882143020629883, "learning_rate": 0.0002, "epoch": 0.38841567291311757, "step": 570}, {"loss": 1.8108, "grad_norm": 0.3186313509941101, "learning_rate": 0.0002, "epoch": 0.39522998296422485, "step": 580}, {"loss": 1.8012, "grad_norm": 0.41443702578544617, "learning_rate": 0.0002, "epoch": 0.4020442930153322, "step": 590}, {"loss": 1.7794, "grad_norm": 0.28773069381713867, "learning_rate": 0.0002, "epoch": 0.4088586030664395, "step": 600}, {"loss": 1.8231, "grad_norm": 0.35743263363838196, "learning_rate": 0.0002, "epoch": 0.41567291311754684, "step": 610}, {"loss": 1.7531, "grad_norm": 0.3360286355018616, "learning_rate": 0.0002, "epoch": 0.42248722316865417, "step": 620}, {"loss": 1.7654, "grad_norm": 0.32838866114616394, "learning_rate": 0.0002, "epoch": 0.4293015332197615, "step": 630}, {"loss": 1.8176, "grad_norm": 0.2994388937950134, "learning_rate": 0.0002, "epoch": 0.43611584327086883, "step": 640}, {"loss": 1.7241, "grad_norm": 0.3306307792663574, "learning_rate": 0.0002, "epoch": 0.44293015332197616, "step": 650}, {"loss": 1.8201, "grad_norm": 0.3129560351371765, "learning_rate": 0.0002, "epoch": 0.4497444633730835, "step": 660}, {"loss": 1.803, "grad_norm": 0.3244289457798004, "learning_rate": 0.0002, "epoch": 0.4565587734241908, "step": 670}, {"loss": 1.8399, "grad_norm": 0.3196892738342285, "learning_rate": 0.0002, "epoch": 0.46337308347529815, "step": 680}, {"loss": 1.8291, "grad_norm": 0.3135230243206024, "learning_rate": 0.0002, "epoch": 0.47018739352640543, "step": 690}, {"loss": 1.7423, "grad_norm": 0.28677991032600403, "learning_rate": 0.0002, "epoch": 0.47700170357751276, "step": 700}, {"loss": 1.6982, "grad_norm": 0.3074065148830414, "learning_rate": 0.0002, "epoch": 0.4838160136286201, "step": 710}, {"loss": 1.8078, "grad_norm": 0.3354290723800659, "learning_rate": 0.0002, "epoch": 0.4906303236797274, "step": 720}, {"loss": 1.7333, "grad_norm": 0.324370801448822, "learning_rate": 0.0002, "epoch": 0.49744463373083475, "step": 730}, {"loss": 1.8506, "grad_norm": 0.29496142268180847, "learning_rate": 0.0002, "epoch": 0.504258943781942, "step": 740}, {"loss": 1.7761, "grad_norm": 0.30694130063056946, "learning_rate": 0.0002, "epoch": 0.5110732538330494, "step": 750}, {"loss": 1.7854, "grad_norm": 0.36168408393859863, "learning_rate": 0.0002, "epoch": 0.5178875638841567, "step": 760}, {"loss": 1.7149, "grad_norm": 0.2930343449115753, "learning_rate": 0.0002, "epoch": 0.524701873935264, "step": 770}, {"loss": 1.7924, "grad_norm": 0.3023432493209839, "learning_rate": 0.0002, "epoch": 0.5315161839863713, "step": 780}, {"loss": 1.8467, "grad_norm": 0.3272720277309418, "learning_rate": 0.0002, "epoch": 0.5383304940374787, "step": 790}, {"loss": 1.8639, "grad_norm": 0.2907974421977997, "learning_rate": 0.0002, "epoch": 0.545144804088586, "step": 800}, {"loss": 1.9018, "grad_norm": 0.32267168164253235, "learning_rate": 0.0002, "epoch": 0.5519591141396933, "step": 810}, {"loss": 1.8311, "grad_norm": 0.32059940695762634, "learning_rate": 0.0002, "epoch": 0.5587734241908007, "step": 820}, {"loss": 1.7234, "grad_norm": 0.30951258540153503, "learning_rate": 0.0002, "epoch": 0.565587734241908, "step": 830}, {"loss": 1.8063, "grad_norm": 0.33976122736930847, "learning_rate": 0.0002, "epoch": 0.5724020442930153, "step": 840}, {"loss": 1.7021, "grad_norm": 0.3195820450782776, "learning_rate": 0.0002, "epoch": 0.5792163543441227, "step": 850}, {"loss": 1.8073, "grad_norm": 0.2828562557697296, "learning_rate": 0.0002, "epoch": 0.58603066439523, "step": 860}, {"loss": 1.695, "grad_norm": 0.29591670632362366, "learning_rate": 0.0002, "epoch": 0.5928449744463373, "step": 870}, {"loss": 1.8109, "grad_norm": 0.3086104393005371, "learning_rate": 0.0002, "epoch": 0.5996592844974447, "step": 880}, {"loss": 1.8207, "grad_norm": 0.3592929542064667, "learning_rate": 0.0002, "epoch": 0.606473594548552, "step": 890}, {"loss": 1.7662, "grad_norm": 0.2830186188220978, "learning_rate": 0.0002, "epoch": 0.6132879045996593, "step": 900}, {"loss": 1.8344, "grad_norm": 0.3128598630428314, "learning_rate": 0.0002, "epoch": 0.6201022146507666, "step": 910}, {"loss": 1.7953, "grad_norm": 0.31957563757896423, "learning_rate": 0.0002, "epoch": 0.626916524701874, "step": 920}, {"loss": 1.7578, "grad_norm": 0.30994319915771484, "learning_rate": 0.0002, "epoch": 0.6337308347529813, "step": 930}, {"loss": 1.8494, "grad_norm": 0.3352845013141632, "learning_rate": 0.0002, "epoch": 0.6405451448040886, "step": 940}, {"loss": 1.7054, "grad_norm": 0.2960077226161957, "learning_rate": 0.0002, "epoch": 0.6473594548551959, "step": 950}, {"loss": 1.7209, "grad_norm": 0.32675081491470337, "learning_rate": 0.0002, "epoch": 0.6541737649063032, "step": 960}, {"loss": 1.7776, "grad_norm": 0.30042028427124023, "learning_rate": 0.0002, "epoch": 0.6609880749574105, "step": 970}, {"loss": 1.7597, "grad_norm": 0.3288673758506775, "learning_rate": 0.0002, "epoch": 0.6678023850085179, "step": 980}, {"loss": 1.7962, "grad_norm": 0.323215126991272, "learning_rate": 0.0002, "epoch": 0.6746166950596252, "step": 990}, {"loss": 1.821, "grad_norm": 0.30041399598121643, "learning_rate": 0.0002, "epoch": 0.6814310051107325, "step": 1000}, {"loss": 1.7716, "grad_norm": 0.3076179623603821, "learning_rate": 0.0002, "epoch": 0.6882453151618398, "step": 1010}, {"loss": 1.7827, "grad_norm": 0.2941909730434418, "learning_rate": 0.0002, "epoch": 0.6950596252129472, "step": 1020}, {"loss": 1.7964, "grad_norm": 0.32220420241355896, "learning_rate": 0.0002, "epoch": 0.7018739352640545, "step": 1030}, {"loss": 1.7101, "grad_norm": 0.2989702820777893, "learning_rate": 0.0002, "epoch": 0.7086882453151618, "step": 1040}, {"loss": 1.8749, "grad_norm": 0.3049640357494354, "learning_rate": 0.0002, "epoch": 0.7155025553662692, "step": 1050}, {"loss": 1.782, "grad_norm": 0.3183284103870392, "learning_rate": 0.0002, "epoch": 0.7223168654173765, "step": 1060}, {"loss": 1.785, "grad_norm": 0.3070095181465149, "learning_rate": 0.0002, "epoch": 0.7291311754684838, "step": 1070}, {"loss": 1.7832, "grad_norm": 0.33263063430786133, "learning_rate": 0.0002, "epoch": 0.7359454855195912, "step": 1080}, {"loss": 1.7627, "grad_norm": 0.28774312138557434, "learning_rate": 0.0002, "epoch": 0.7427597955706985, "step": 1090}, {"loss": 1.7343, "grad_norm": 0.29304224252700806, "learning_rate": 0.0002, "epoch": 0.7495741056218058, "step": 1100}, {"loss": 1.8082, "grad_norm": 0.27529507875442505, "learning_rate": 0.0002, "epoch": 0.7563884156729132, "step": 1110}, {"loss": 1.8071, "grad_norm": 0.32319945096969604, "learning_rate": 0.0002, "epoch": 0.7632027257240205, "step": 1120}, {"loss": 1.6998, "grad_norm": 0.33777597546577454, "learning_rate": 0.0002, "epoch": 0.7700170357751278, "step": 1130}, {"loss": 1.8488, "grad_norm": 0.29847201704978943, "learning_rate": 0.0002, "epoch": 0.7768313458262351, "step": 1140}, {"loss": 1.7014, "grad_norm": 0.31370633840560913, "learning_rate": 0.0002, "epoch": 0.7836456558773425, "step": 1150}, {"loss": 1.8175, "grad_norm": 0.31633856892585754, "learning_rate": 0.0002, "epoch": 0.7904599659284497, "step": 1160}, {"loss": 1.7637, "grad_norm": 0.33849263191223145, "learning_rate": 0.0002, "epoch": 0.797274275979557, "step": 1170}, {"loss": 1.8264, "grad_norm": 0.3306333124637604, "learning_rate": 0.0002, "epoch": 0.8040885860306644, "step": 1180}, {"loss": 1.777, "grad_norm": 0.3249678313732147, "learning_rate": 0.0002, "epoch": 0.8109028960817717, "step": 1190}, {"loss": 1.7631, "grad_norm": 0.3252817690372467, "learning_rate": 0.0002, "epoch": 0.817717206132879, "step": 1200}, {"loss": 1.7877, "grad_norm": 0.31772559881210327, "learning_rate": 0.0002, "epoch": 0.8245315161839863, "step": 1210}, {"loss": 1.8068, "grad_norm": 0.2803371846675873, "learning_rate": 0.0002, "epoch": 0.8313458262350937, "step": 1220}, {"loss": 1.738, "grad_norm": 0.26700571179389954, "learning_rate": 0.0002, "epoch": 0.838160136286201, "step": 1230}, {"loss": 1.8301, "grad_norm": 0.3060092031955719, "learning_rate": 0.0002, "epoch": 0.8449744463373083, "step": 1240}, {"loss": 1.7993, "grad_norm": 0.28831684589385986, "learning_rate": 0.0002, "epoch": 0.8517887563884157, "step": 1250}, {"loss": 1.6909, "grad_norm": 0.30708742141723633, "learning_rate": 0.0002, "epoch": 0.858603066439523, "step": 1260}, {"loss": 1.8506, "grad_norm": 0.2915987968444824, "learning_rate": 0.0002, "epoch": 0.8654173764906303, "step": 1270}, {"loss": 1.7536, "grad_norm": 0.2893589735031128, "learning_rate": 0.0002, "epoch": 0.8722316865417377, "step": 1280}, {"loss": 1.7437, "grad_norm": 0.29545632004737854, "learning_rate": 0.0002, "epoch": 0.879045996592845, "step": 1290}, {"loss": 1.859, "grad_norm": 0.3354771137237549, "learning_rate": 0.0002, "epoch": 0.8858603066439523, "step": 1300}, {"loss": 1.7644, "grad_norm": 0.37715399265289307, "learning_rate": 0.0002, "epoch": 0.8926746166950597, "step": 1310}, {"loss": 1.7731, "grad_norm": 0.28847193717956543, "learning_rate": 0.0002, "epoch": 0.899488926746167, "step": 1320}, {"loss": 1.9077, "grad_norm": 0.2780889868736267, "learning_rate": 0.0002, "epoch": 0.9063032367972743, "step": 1330}, {"loss": 1.7861, "grad_norm": 0.2714342176914215, "learning_rate": 0.0002, "epoch": 0.9131175468483816, "step": 1340}, {"loss": 1.8, "grad_norm": 0.2950133979320526, "learning_rate": 0.0002, "epoch": 0.919931856899489, "step": 1350}, {"loss": 1.7368, "grad_norm": 0.29097145795822144, "learning_rate": 0.0002, "epoch": 0.9267461669505963, "step": 1360}, {"loss": 1.7864, "grad_norm": 0.32540133595466614, "learning_rate": 0.0002, "epoch": 0.9335604770017035, "step": 1370}, {"loss": 1.7571, "grad_norm": 0.3076636493206024, "learning_rate": 0.0002, "epoch": 0.9403747870528109, "step": 1380}, {"loss": 1.8312, "grad_norm": 0.2962130308151245, "learning_rate": 0.0002, "epoch": 0.9471890971039182, "step": 1390}, {"loss": 1.7858, "grad_norm": 0.30086860060691833, "learning_rate": 0.0002, "epoch": 0.9540034071550255, "step": 1400}, {"loss": 1.7991, "grad_norm": 0.28634947538375854, "learning_rate": 0.0002, "epoch": 0.9608177172061328, "step": 1410}, {"loss": 1.8385, "grad_norm": 0.35314416885375977, "learning_rate": 0.0002, "epoch": 0.9676320272572402, "step": 1420}, {"loss": 1.8054, "grad_norm": 0.2939317524433136, "learning_rate": 0.0002, "epoch": 0.9744463373083475, "step": 1430}, {"loss": 1.7582, "grad_norm": 0.3010196089744568, "learning_rate": 0.0002, "epoch": 0.9812606473594548, "step": 1440}, {"loss": 1.758, "grad_norm": 0.30816152691841125, "learning_rate": 0.0002, "epoch": 0.9880749574105622, "step": 1450}, {"loss": 1.8159, "grad_norm": 0.28152793645858765, "learning_rate": 0.0002, "epoch": 0.9948892674616695, "step": 1460}, {"eval_loss": 1.8034634590148926, "eval_runtime": 53.6017, "eval_samples_per_second": 9.459, "eval_steps_per_second": 1.194, "epoch": 0.9996592844974447, "step": 1467}, {"loss": 1.7072, "grad_norm": 0.29246416687965393, "learning_rate": 0.0002, "epoch": 1.0017035775127767, "step": 1470}, {"loss": 1.8159, "grad_norm": 0.2668602168560028, "learning_rate": 0.0002, "epoch": 1.008517887563884, "step": 1480}, {"loss": 1.6868, "grad_norm": 0.2998567819595337, "learning_rate": 0.0002, "epoch": 1.0153321976149914, "step": 1490}, {"loss": 1.7331, "grad_norm": 0.3284934461116791, "learning_rate": 0.0002, "epoch": 1.0221465076660987, "step": 1500}, {"loss": 1.689, "grad_norm": 0.3275827169418335, "learning_rate": 0.0002, "epoch": 1.028960817717206, "step": 1510}, {"loss": 1.7092, "grad_norm": 0.3382718563079834, "learning_rate": 0.0002, "epoch": 1.0357751277683134, "step": 1520}, {"loss": 1.7215, "grad_norm": 0.36737215518951416, "learning_rate": 0.0002, "epoch": 1.0425894378194207, "step": 1530}, {"loss": 1.6993, "grad_norm": 0.3442603647708893, "learning_rate": 0.0002, "epoch": 1.049403747870528, "step": 1540}, {"loss": 1.6996, "grad_norm": 0.3323381245136261, "learning_rate": 0.0002, "epoch": 1.0562180579216354, "step": 1550}, {"loss": 1.7444, "grad_norm": 0.341227650642395, "learning_rate": 0.0002, "epoch": 1.0630323679727427, "step": 1560}, {"loss": 1.7419, "grad_norm": 0.3361579477787018, "learning_rate": 0.0002, "epoch": 1.06984667802385, "step": 1570}, {"loss": 1.7337, "grad_norm": 0.3556230962276459, "learning_rate": 0.0002, "epoch": 1.0766609880749574, "step": 1580}, {"loss": 1.6978, "grad_norm": 0.27130424976348877, "learning_rate": 0.0002, "epoch": 1.0834752981260647, "step": 1590}, {"loss": 1.6087, "grad_norm": 0.29366323351860046, "learning_rate": 0.0002, "epoch": 1.090289608177172, "step": 1600}, {"loss": 1.6721, "grad_norm": 0.3581245541572571, "learning_rate": 0.0002, "epoch": 1.0971039182282794, "step": 1610}, {"loss": 1.7639, "grad_norm": 0.3021670877933502, "learning_rate": 0.0002, "epoch": 1.1039182282793867, "step": 1620}, {"loss": 1.7314, "grad_norm": 0.3145572543144226, "learning_rate": 0.0002, "epoch": 1.110732538330494, "step": 1630}, {"loss": 1.7408, "grad_norm": 0.35362154245376587, "learning_rate": 0.0002, "epoch": 1.1175468483816013, "step": 1640}, {"loss": 1.7071, "grad_norm": 0.5413113236427307, "learning_rate": 0.0002, "epoch": 1.1243611584327087, "step": 1650}, {"loss": 1.7615, "grad_norm": 0.6858654022216797, "learning_rate": 0.0002, "epoch": 1.131175468483816, "step": 1660}, {"loss": 1.7029, "grad_norm": 0.3781903386116028, "learning_rate": 0.0002, "epoch": 1.1379897785349233, "step": 1670}, {"loss": 1.659, "grad_norm": 0.404864102602005, "learning_rate": 0.0002, "epoch": 1.1448040885860307, "step": 1680}, {"loss": 1.7212, "grad_norm": 0.3595100939273834, "learning_rate": 0.0002, "epoch": 1.151618398637138, "step": 1690}, {"loss": 1.7023, "grad_norm": 0.33682283759117126, "learning_rate": 0.0002, "epoch": 1.1584327086882453, "step": 1700}, {"loss": 1.7336, "grad_norm": 0.3877373933792114, "learning_rate": 0.0002, "epoch": 1.1652470187393527, "step": 1710}, {"loss": 1.7676, "grad_norm": 0.34606897830963135, "learning_rate": 0.0002, "epoch": 1.17206132879046, "step": 1720}, {"loss": 1.6889, "grad_norm": 0.3122918903827667, "learning_rate": 0.0002, "epoch": 1.1788756388415673, "step": 1730}, {"loss": 1.6585, "grad_norm": 0.34081900119781494, "learning_rate": 0.0002, "epoch": 1.1856899488926746, "step": 1740}, {"loss": 1.7794, "grad_norm": 0.3418807089328766, "learning_rate": 0.0002, "epoch": 1.192504258943782, "step": 1750}, {"loss": 1.7773, "grad_norm": 0.3495500981807709, "learning_rate": 0.0002, "epoch": 1.1993185689948893, "step": 1760}, {"loss": 1.6578, "grad_norm": 0.557288408279419, "learning_rate": 0.0002, "epoch": 1.2061328790459966, "step": 1770}, {"loss": 1.7488, "grad_norm": 0.3193778693675995, "learning_rate": 0.0002, "epoch": 1.212947189097104, "step": 1780}, {"loss": 1.6815, "grad_norm": 0.3306216895580292, "learning_rate": 0.0002, "epoch": 1.2197614991482113, "step": 1790}, {"loss": 1.6772, "grad_norm": 0.37998732924461365, "learning_rate": 0.0002, "epoch": 1.2265758091993186, "step": 1800}, {"loss": 1.7094, "grad_norm": 0.3255669176578522, "learning_rate": 0.0002, "epoch": 1.233390119250426, "step": 1810}, {"loss": 1.7096, "grad_norm": 0.3741287291049957, "learning_rate": 0.0002, "epoch": 1.2402044293015333, "step": 1820}, {"loss": 1.7317, "grad_norm": 0.36727291345596313, "learning_rate": 0.0002, "epoch": 1.2470187393526406, "step": 1830}, {"loss": 1.7418, "grad_norm": 0.3479527235031128, "learning_rate": 0.0002, "epoch": 1.253833049403748, "step": 1840}, {"loss": 1.7062, "grad_norm": 0.3472636640071869, "learning_rate": 0.0002, "epoch": 1.2606473594548553, "step": 1850}, {"loss": 1.6263, "grad_norm": 0.3702869415283203, "learning_rate": 0.0002, "epoch": 1.2674616695059626, "step": 1860}, {"loss": 1.6222, "grad_norm": 0.3934040069580078, "learning_rate": 0.0002, "epoch": 1.27427597955707, "step": 1870}, {"loss": 1.7616, "grad_norm": 0.46887534856796265, "learning_rate": 0.0002, "epoch": 1.2810902896081773, "step": 1880}, {"loss": 1.6071, "grad_norm": 0.3191998600959778, "learning_rate": 0.0002, "epoch": 1.2879045996592846, "step": 1890}, {"loss": 1.7001, "grad_norm": 0.34032225608825684, "learning_rate": 0.0002, "epoch": 1.294718909710392, "step": 1900}, {"loss": 1.8316, "grad_norm": 0.33453696966171265, "learning_rate": 0.0002, "epoch": 1.3015332197614993, "step": 1910}, {"loss": 1.6567, "grad_norm": 0.3451494872570038, "learning_rate": 0.0002, "epoch": 1.3083475298126066, "step": 1920}, {"loss": 1.708, "grad_norm": 0.36203092336654663, "learning_rate": 0.0002, "epoch": 1.315161839863714, "step": 1930}, {"loss": 1.7095, "grad_norm": 0.43794456124305725, "learning_rate": 0.0002, "epoch": 1.321976149914821, "step": 1940}, {"loss": 1.7264, "grad_norm": 0.3630591034889221, "learning_rate": 0.0002, "epoch": 1.3287904599659284, "step": 1950}, {"loss": 1.6529, "grad_norm": 0.36951911449432373, "learning_rate": 0.0002, "epoch": 1.3356047700170357, "step": 1960}, {"loss": 1.651, "grad_norm": 0.4001159965991974, "learning_rate": 0.0002, "epoch": 1.342419080068143, "step": 1970}, {"loss": 1.677, "grad_norm": 0.3820836544036865, "learning_rate": 0.0002, "epoch": 1.3492333901192504, "step": 1980}, {"loss": 1.583, "grad_norm": 0.3705870807170868, "learning_rate": 0.0002, "epoch": 1.3560477001703577, "step": 1990}, {"loss": 1.6207, "grad_norm": 0.3557972013950348, "learning_rate": 0.0002, "epoch": 1.362862010221465, "step": 2000}, {"loss": 1.7656, "grad_norm": 0.38546398282051086, "learning_rate": 0.0002, "epoch": 1.3696763202725724, "step": 2010}, {"loss": 1.6881, "grad_norm": 0.3908020853996277, "learning_rate": 0.0002, "epoch": 1.3764906303236797, "step": 2020}, {"loss": 1.727, "grad_norm": 0.3822040855884552, "learning_rate": 0.0002, "epoch": 1.383304940374787, "step": 2030}, {"loss": 1.7923, "grad_norm": 0.425327867269516, "learning_rate": 0.0002, "epoch": 1.3901192504258943, "step": 2040}, {"loss": 1.7032, "grad_norm": 0.3436259329319, "learning_rate": 0.0002, "epoch": 1.3969335604770017, "step": 2050}, {"loss": 1.6398, "grad_norm": 0.33124062418937683, "learning_rate": 0.0002, "epoch": 1.403747870528109, "step": 2060}, {"loss": 1.6815, "grad_norm": 0.3662424683570862, "learning_rate": 0.0002, "epoch": 1.4105621805792163, "step": 2070}, {"loss": 1.7258, "grad_norm": 0.3720635175704956, "learning_rate": 0.0002, "epoch": 1.4173764906303237, "step": 2080}, {"loss": 1.7186, "grad_norm": 0.3361680805683136, "learning_rate": 0.0002, "epoch": 1.424190800681431, "step": 2090}, {"loss": 1.7606, "grad_norm": 0.32751724123954773, "learning_rate": 0.0002, "epoch": 1.4310051107325383, "step": 2100}, {"loss": 1.7051, "grad_norm": 0.34333378076553345, "learning_rate": 0.0002, "epoch": 1.4378194207836457, "step": 2110}, {"loss": 1.6979, "grad_norm": 0.37777671217918396, "learning_rate": 0.0002, "epoch": 1.444633730834753, "step": 2120}, {"loss": 1.7085, "grad_norm": 0.37126365303993225, "learning_rate": 0.0002, "epoch": 1.4514480408858603, "step": 2130}, {"loss": 1.721, "grad_norm": 0.3602267801761627, "learning_rate": 0.0002, "epoch": 1.4582623509369677, "step": 2140}, {"loss": 1.8148, "grad_norm": 0.3287110924720764, "learning_rate": 0.0002, "epoch": 1.465076660988075, "step": 2150}, {"loss": 1.6966, "grad_norm": 0.3562135100364685, "learning_rate": 0.0002, "epoch": 1.4718909710391823, "step": 2160}, {"loss": 1.713, "grad_norm": 0.38292962312698364, "learning_rate": 0.0002, "epoch": 1.4787052810902896, "step": 2170}, {"loss": 1.7036, "grad_norm": 0.38220319151878357, "learning_rate": 0.0002, "epoch": 1.485519591141397, "step": 2180}, {"loss": 1.7297, "grad_norm": 0.3570062220096588, "learning_rate": 0.0002, "epoch": 1.4923339011925043, "step": 2190}, {"loss": 1.5652, "grad_norm": 0.363146036863327, "learning_rate": 0.0002, "epoch": 1.4991482112436116, "step": 2200}, {"loss": 1.6716, "grad_norm": 0.37393274903297424, "learning_rate": 0.0002, "epoch": 1.5059625212947187, "step": 2210}, {"loss": 1.6839, "grad_norm": 0.3628501892089844, "learning_rate": 0.0002, "epoch": 1.512776831345826, "step": 2220}, {"loss": 1.6627, "grad_norm": 0.33430740237236023, "learning_rate": 0.0002, "epoch": 1.5195911413969334, "step": 2230}, {"loss": 1.7495, "grad_norm": 0.35363978147506714, "learning_rate": 0.0002, "epoch": 1.5264054514480407, "step": 2240}, {"loss": 1.6364, "grad_norm": 0.37220337986946106, "learning_rate": 0.0002, "epoch": 1.533219761499148, "step": 2250}, {"loss": 1.7773, "grad_norm": 0.35020262002944946, "learning_rate": 0.0002, "epoch": 1.5400340715502554, "step": 2260}, {"loss": 1.7453, "grad_norm": 0.35274937748908997, "learning_rate": 0.0002, "epoch": 1.5468483816013627, "step": 2270}, {"loss": 1.7162, "grad_norm": 0.3797738552093506, "learning_rate": 0.0002, "epoch": 1.55366269165247, "step": 2280}, {"loss": 1.6197, "grad_norm": 0.4160412847995758, "learning_rate": 0.0002, "epoch": 1.5604770017035774, "step": 2290}, {"loss": 1.7101, "grad_norm": 0.38779592514038086, "learning_rate": 0.0002, "epoch": 1.5672913117546847, "step": 2300}, {"loss": 1.7254, "grad_norm": 0.39171287417411804, "learning_rate": 0.0002, "epoch": 1.574105621805792, "step": 2310}, {"loss": 1.7087, "grad_norm": 0.3606826663017273, "learning_rate": 0.0002, "epoch": 1.5809199318568994, "step": 2320}, {"loss": 1.7269, "grad_norm": 0.3745017647743225, "learning_rate": 0.0002, "epoch": 1.5877342419080067, "step": 2330}, {"loss": 1.7068, "grad_norm": 0.34933462738990784, "learning_rate": 0.0002, "epoch": 1.594548551959114, "step": 2340}, {"loss": 1.7073, "grad_norm": 0.37268444895744324, "learning_rate": 0.0002, "epoch": 1.6013628620102214, "step": 2350}, {"loss": 1.728, "grad_norm": 0.4603484869003296, "learning_rate": 0.0002, "epoch": 1.6081771720613287, "step": 2360}, {"loss": 1.7621, "grad_norm": 0.35689088702201843, "learning_rate": 0.0002, "epoch": 1.614991482112436, "step": 2370}, {"loss": 1.6989, "grad_norm": 0.3392031490802765, "learning_rate": 0.0002, "epoch": 1.6218057921635434, "step": 2380}, {"loss": 1.7077, "grad_norm": 0.394653856754303, "learning_rate": 0.0002, "epoch": 1.6286201022146507, "step": 2390}, {"loss": 1.7448, "grad_norm": 0.33972012996673584, "learning_rate": 0.0002, "epoch": 1.635434412265758, "step": 2400}, {"loss": 1.7681, "grad_norm": 0.3854375183582306, "learning_rate": 0.0002, "epoch": 1.6422487223168654, "step": 2410}, {"loss": 1.7102, "grad_norm": 0.36143961548805237, "learning_rate": 0.0002, "epoch": 1.6490630323679727, "step": 2420}, {"loss": 1.7432, "grad_norm": 0.35816189646720886, "learning_rate": 0.0002, "epoch": 1.65587734241908, "step": 2430}, {"loss": 1.6407, "grad_norm": 0.36298736929893494, "learning_rate": 0.0002, "epoch": 1.6626916524701874, "step": 2440}, {"loss": 1.723, "grad_norm": 0.36756929755210876, "learning_rate": 0.0002, "epoch": 1.6695059625212947, "step": 2450}, {"loss": 1.6824, "grad_norm": 0.35969603061676025, "learning_rate": 0.0002, "epoch": 1.676320272572402, "step": 2460}, {"loss": 1.7014, "grad_norm": 0.38449376821517944, "learning_rate": 0.0002, "epoch": 1.6831345826235093, "step": 2470}, {"loss": 1.7261, "grad_norm": 0.44511452317237854, "learning_rate": 0.0002, "epoch": 1.6899488926746167, "step": 2480}, {"loss": 1.7397, "grad_norm": 0.3831416368484497, "learning_rate": 0.0002, "epoch": 1.696763202725724, "step": 2490}, {"loss": 1.7046, "grad_norm": 0.3795325756072998, "learning_rate": 0.0002, "epoch": 1.7035775127768313, "step": 2500}, {"loss": 1.6993, "grad_norm": 0.34978193044662476, "learning_rate": 0.0002, "epoch": 1.7103918228279387, "step": 2510}, {"loss": 1.8307, "grad_norm": 0.35923877358436584, "learning_rate": 0.0002, "epoch": 1.717206132879046, "step": 2520}, {"loss": 1.6253, "grad_norm": 0.352999746799469, "learning_rate": 0.0002, "epoch": 1.7240204429301533, "step": 2530}, {"loss": 1.6953, "grad_norm": 0.43673479557037354, "learning_rate": 0.0002, "epoch": 1.7308347529812607, "step": 2540}, {"loss": 1.7079, "grad_norm": 0.4153687357902527, "learning_rate": 0.0002, "epoch": 1.737649063032368, "step": 2550}, {"loss": 1.6714, "grad_norm": 0.35541167855262756, "learning_rate": 0.0002, "epoch": 1.7444633730834753, "step": 2560}, {"loss": 1.696, "grad_norm": 0.3288775086402893, "learning_rate": 0.0002, "epoch": 1.7512776831345827, "step": 2570}, {"loss": 1.7486, "grad_norm": 0.3991123139858246, "learning_rate": 0.0002, "epoch": 1.75809199318569, "step": 2580}, {"loss": 1.645, "grad_norm": 0.39967241883277893, "learning_rate": 0.0002, "epoch": 1.7649063032367973, "step": 2590}, {"loss": 1.6855, "grad_norm": 0.41104283928871155, "learning_rate": 0.0002, "epoch": 1.7717206132879046, "step": 2600}, {"loss": 1.6993, "grad_norm": 0.44885286688804626, "learning_rate": 0.0002, "epoch": 1.778534923339012, "step": 2610}, {"loss": 1.7224, "grad_norm": 0.38384467363357544, "learning_rate": 0.0002, "epoch": 1.7853492333901193, "step": 2620}, {"loss": 1.7213, "grad_norm": 0.35020917654037476, "learning_rate": 0.0002, "epoch": 1.7921635434412266, "step": 2630}, {"loss": 1.6706, "grad_norm": 0.3360341489315033, "learning_rate": 0.0002, "epoch": 1.798977853492334, "step": 2640}, {"loss": 1.7037, "grad_norm": 0.38875144720077515, "learning_rate": 0.0002, "epoch": 1.8057921635434413, "step": 2650}, {"loss": 1.693, "grad_norm": 0.34876883029937744, "learning_rate": 0.0002, "epoch": 1.8126064735945486, "step": 2660}, {"loss": 1.7743, "grad_norm": 0.419979989528656, "learning_rate": 0.0002, "epoch": 1.819420783645656, "step": 2670}, {"loss": 1.7089, "grad_norm": 0.3648919463157654, "learning_rate": 0.0002, "epoch": 1.8262350936967633, "step": 2680}, {"loss": 1.7828, "grad_norm": 0.3485383987426758, "learning_rate": 0.0002, "epoch": 1.8330494037478706, "step": 2690}, {"loss": 1.705, "grad_norm": 0.3647468686103821, "learning_rate": 0.0002, "epoch": 1.839863713798978, "step": 2700}, {"loss": 1.7318, "grad_norm": 0.37003210186958313, "learning_rate": 0.0002, "epoch": 1.8466780238500853, "step": 2710}, {"loss": 1.6647, "grad_norm": 0.37031617760658264, "learning_rate": 0.0002, "epoch": 1.8534923339011926, "step": 2720}, {"loss": 1.69, "grad_norm": 0.3438796103000641, "learning_rate": 0.0002, "epoch": 1.8603066439523, "step": 2730}, {"loss": 1.5995, "grad_norm": 0.41574627161026, "learning_rate": 0.0002, "epoch": 1.8671209540034073, "step": 2740}, {"loss": 1.6877, "grad_norm": 0.35049930214881897, "learning_rate": 0.0002, "epoch": 1.8739352640545146, "step": 2750}, {"loss": 1.7048, "grad_norm": 0.3943989872932434, "learning_rate": 0.0002, "epoch": 1.880749574105622, "step": 2760}, {"loss": 1.7047, "grad_norm": 0.3384978175163269, "learning_rate": 0.0002, "epoch": 1.8875638841567293, "step": 2770}, {"loss": 1.7848, "grad_norm": 0.3501328229904175, "learning_rate": 0.0002, "epoch": 1.8943781942078366, "step": 2780}, {"loss": 1.6986, "grad_norm": 0.37484532594680786, "learning_rate": 0.0002, "epoch": 1.901192504258944, "step": 2790}, {"loss": 1.6867, "grad_norm": 0.34497788548469543, "learning_rate": 0.0002, "epoch": 1.9080068143100513, "step": 2800}, {"loss": 1.6513, "grad_norm": 0.3530851900577545, "learning_rate": 0.0002, "epoch": 1.9148211243611586, "step": 2810}, {"loss": 1.6369, "grad_norm": 0.3879254162311554, "learning_rate": 0.0002, "epoch": 1.921635434412266, "step": 2820}, {"loss": 1.6786, "grad_norm": 0.3885590136051178, "learning_rate": 0.0002, "epoch": 1.9284497444633732, "step": 2830}, {"loss": 1.8049, "grad_norm": 0.3868715465068817, "learning_rate": 0.0002, "epoch": 1.9352640545144804, "step": 2840}, {"loss": 1.7099, "grad_norm": 0.4152422249317169, "learning_rate": 0.0002, "epoch": 1.9420783645655877, "step": 2850}, {"loss": 1.6696, "grad_norm": 0.401714563369751, "learning_rate": 0.0002, "epoch": 1.948892674616695, "step": 2860}, {"loss": 1.7182, "grad_norm": 0.34825265407562256, "learning_rate": 0.0002, "epoch": 1.9557069846678024, "step": 2870}, {"loss": 1.6612, "grad_norm": 0.3620675504207611, "learning_rate": 0.0002, "epoch": 1.9625212947189097, "step": 2880}, {"loss": 1.7451, "grad_norm": 0.3977806866168976, "learning_rate": 0.0002, "epoch": 1.969335604770017, "step": 2890}, {"loss": 1.6514, "grad_norm": 0.3687497079372406, "learning_rate": 0.0002, "epoch": 1.9761499148211243, "step": 2900}, {"loss": 1.7712, "grad_norm": 0.408640056848526, "learning_rate": 0.0002, "epoch": 1.9829642248722317, "step": 2910}, {"loss": 1.695, "grad_norm": 0.34510108828544617, "learning_rate": 0.0002, "epoch": 1.989778534923339, "step": 2920}, {"loss": 1.662, "grad_norm": 0.3596334755420685, "learning_rate": 0.0002, "epoch": 1.9965928449744463, "step": 2930}, {"eval_loss": 1.8056600093841553, "eval_runtime": 60.7049, "eval_samples_per_second": 8.352, "eval_steps_per_second": 1.054, "epoch": 2.0, "step": 2935}, {"loss": 1.6918, "grad_norm": 0.3460802137851715, "learning_rate": 0.0002, "epoch": 2.0034071550255534, "step": 2940}, {"loss": 1.6036, "grad_norm": 0.4038620591163635, "learning_rate": 0.0002, "epoch": 2.0102214650766608, "step": 2950}, {"loss": 1.5386, "grad_norm": 0.3950219750404358, "learning_rate": 0.0002, "epoch": 2.017035775127768, "step": 2960}, {"loss": 1.5649, "grad_norm": 0.519116997718811, "learning_rate": 0.0002, "epoch": 2.0238500851788754, "step": 2970}, {"loss": 1.5397, "grad_norm": 0.4097684919834137, "learning_rate": 0.0002, "epoch": 2.0306643952299828, "step": 2980}, {"loss": 1.6044, "grad_norm": 0.4153544306755066, "learning_rate": 0.0002, "epoch": 2.03747870528109, "step": 2990}, {"loss": 1.6006, "grad_norm": 0.4351160526275635, "learning_rate": 0.0002, "epoch": 2.0442930153321974, "step": 3000}, {"loss": 1.5091, "grad_norm": 0.42036259174346924, "learning_rate": 0.0002, "epoch": 2.0511073253833048, "step": 3010}, {"loss": 1.5686, "grad_norm": 0.4433218836784363, "learning_rate": 0.0002, "epoch": 2.057921635434412, "step": 3020}, {"loss": 1.5478, "grad_norm": 0.46511581540107727, "learning_rate": 0.0002, "epoch": 2.0647359454855194, "step": 3030}, {"loss": 1.5554, "grad_norm": 0.4567560851573944, "learning_rate": 0.0002, "epoch": 2.0715502555366268, "step": 3040}, {"loss": 1.5561, "grad_norm": 0.45671048760414124, "learning_rate": 0.0002, "epoch": 2.078364565587734, "step": 3050}, {"loss": 1.606, "grad_norm": 0.4598552882671356, "learning_rate": 0.0002, "epoch": 2.0851788756388414, "step": 3060}, {"loss": 1.6357, "grad_norm": 0.4582861661911011, "learning_rate": 0.0002, "epoch": 2.0919931856899487, "step": 3070}, {"loss": 1.5853, "grad_norm": 0.4366969168186188, "learning_rate": 0.0002, "epoch": 2.098807495741056, "step": 3080}, {"loss": 1.5574, "grad_norm": 0.495917409658432, "learning_rate": 0.0002, "epoch": 2.1056218057921634, "step": 3090}, {"loss": 1.5798, "grad_norm": 1.6846044063568115, "learning_rate": 0.0002, "epoch": 2.1124361158432707, "step": 3100}, {"loss": 1.5877, "grad_norm": 0.4765092134475708, "learning_rate": 0.0002, "epoch": 2.119250425894378, "step": 3110}, {"loss": 1.6106, "grad_norm": 0.45029810070991516, "learning_rate": 0.0002, "epoch": 2.1260647359454854, "step": 3120}, {"loss": 1.5707, "grad_norm": 0.5706973075866699, "learning_rate": 0.0002, "epoch": 2.1328790459965927, "step": 3130}, {"loss": 1.589, "grad_norm": 0.4606274366378784, "learning_rate": 0.0002, "epoch": 2.1396933560477, "step": 3140}, {"loss": 1.564, "grad_norm": 2.199115753173828, "learning_rate": 0.0002, "epoch": 2.1465076660988074, "step": 3150}, {"loss": 1.5808, "grad_norm": 0.6231027245521545, "learning_rate": 0.0002, "epoch": 2.1533219761499147, "step": 3160}, {"loss": 1.5397, "grad_norm": 0.46918219327926636, "learning_rate": 0.0002, "epoch": 2.160136286201022, "step": 3170}, {"loss": 1.6082, "grad_norm": 0.5006393194198608, "learning_rate": 0.0002, "epoch": 2.1669505962521294, "step": 3180}, {"loss": 1.591, "grad_norm": 0.4745093286037445, "learning_rate": 0.0002, "epoch": 2.1737649063032367, "step": 3190}, {"loss": 1.6459, "grad_norm": 0.511576771736145, "learning_rate": 0.0002, "epoch": 2.180579216354344, "step": 3200}, {"loss": 1.6019, "grad_norm": 0.38622918725013733, "learning_rate": 0.0002, "epoch": 2.1873935264054514, "step": 3210}, {"loss": 1.6187, "grad_norm": 0.4425644278526306, "learning_rate": 0.0002, "epoch": 2.1942078364565587, "step": 3220}, {"loss": 1.6114, "grad_norm": 0.45643091201782227, "learning_rate": 0.0002, "epoch": 2.201022146507666, "step": 3230}, {"loss": 1.6224, "grad_norm": 0.4975406229496002, "learning_rate": 0.0002, "epoch": 2.2078364565587734, "step": 3240}, {"loss": 1.6654, "grad_norm": 0.4673331081867218, "learning_rate": 0.0002, "epoch": 2.2146507666098807, "step": 3250}, {"loss": 1.6155, "grad_norm": 0.5081731081008911, "learning_rate": 0.0002, "epoch": 2.221465076660988, "step": 3260}, {"loss": 1.53, "grad_norm": 0.4790806770324707, "learning_rate": 0.0002, "epoch": 2.2282793867120954, "step": 3270}, {"loss": 1.6362, "grad_norm": 0.5184140801429749, "learning_rate": 0.0002, "epoch": 2.2350936967632027, "step": 3280}, {"loss": 1.5646, "grad_norm": 0.5159083604812622, "learning_rate": 0.0002, "epoch": 2.24190800681431, "step": 3290}, {"loss": 1.6577, "grad_norm": 0.4876042604446411, "learning_rate": 0.0002, "epoch": 2.2487223168654173, "step": 3300}, {"loss": 1.5718, "grad_norm": 0.5454957485198975, "learning_rate": 0.0002, "epoch": 2.2555366269165247, "step": 3310}, {"loss": 1.5885, "grad_norm": 0.49866822361946106, "learning_rate": 0.0002, "epoch": 2.262350936967632, "step": 3320}, {"loss": 1.5923, "grad_norm": 0.42674365639686584, "learning_rate": 0.0002, "epoch": 2.2691652470187393, "step": 3330}, {"loss": 1.5588, "grad_norm": 0.5202316641807556, "learning_rate": 0.0002, "epoch": 2.2759795570698467, "step": 3340}, {"loss": 1.6032, "grad_norm": 0.4849465489387512, "learning_rate": 0.0002, "epoch": 2.282793867120954, "step": 3350}, {"loss": 1.6853, "grad_norm": 0.47202569246292114, "learning_rate": 0.0002, "epoch": 2.2896081771720613, "step": 3360}, {"loss": 1.6164, "grad_norm": 0.5311620235443115, "learning_rate": 0.0002, "epoch": 2.2964224872231687, "step": 3370}, {"loss": 1.6004, "grad_norm": 0.49011409282684326, "learning_rate": 0.0002, "epoch": 2.303236797274276, "step": 3380}, {"loss": 1.5484, "grad_norm": 0.4789247512817383, "learning_rate": 0.0002, "epoch": 2.3100511073253833, "step": 3390}, {"loss": 1.5866, "grad_norm": 0.46646103262901306, "learning_rate": 0.0002, "epoch": 2.3168654173764907, "step": 3400}, {"loss": 1.5308, "grad_norm": 0.5552441477775574, "learning_rate": 0.0002, "epoch": 2.323679727427598, "step": 3410}, {"loss": 1.5761, "grad_norm": 0.4530351758003235, "learning_rate": 0.0002, "epoch": 2.3304940374787053, "step": 3420}, {"loss": 1.5919, "grad_norm": 0.4806232750415802, "learning_rate": 0.0002, "epoch": 2.3373083475298126, "step": 3430}, {"loss": 1.5569, "grad_norm": 0.5998363494873047, "learning_rate": 0.0002, "epoch": 2.34412265758092, "step": 3440}, {"loss": 1.513, "grad_norm": 0.4918554425239563, "learning_rate": 0.0002, "epoch": 2.3509369676320273, "step": 3450}, {"loss": 1.6323, "grad_norm": 0.5359559655189514, "learning_rate": 0.0002, "epoch": 2.3577512776831346, "step": 3460}, {"loss": 1.5973, "grad_norm": 0.5053277611732483, "learning_rate": 0.0002, "epoch": 2.364565587734242, "step": 3470}, {"loss": 1.5673, "grad_norm": 0.5058915019035339, "learning_rate": 0.0002, "epoch": 2.3713798977853493, "step": 3480}, {"loss": 1.5434, "grad_norm": 0.5314899682998657, "learning_rate": 0.0002, "epoch": 2.3781942078364566, "step": 3490}, {"loss": 1.5203, "grad_norm": 0.48035913705825806, "learning_rate": 0.0002, "epoch": 2.385008517887564, "step": 3500}, {"loss": 1.5936, "grad_norm": 0.45864903926849365, "learning_rate": 0.0002, "epoch": 2.3918228279386713, "step": 3510}, {"loss": 1.6285, "grad_norm": 0.4553050398826599, "learning_rate": 0.0002, "epoch": 2.3986371379897786, "step": 3520}, {"loss": 1.5444, "grad_norm": 0.4483442008495331, "learning_rate": 0.0002, "epoch": 2.405451448040886, "step": 3530}, {"loss": 1.587, "grad_norm": 0.5043742060661316, "learning_rate": 0.0002, "epoch": 2.4122657580919933, "step": 3540}, {"loss": 1.5692, "grad_norm": 0.44807168841362, "learning_rate": 0.0002, "epoch": 2.4190800681431006, "step": 3550}, {"loss": 1.6306, "grad_norm": 0.5065137147903442, "learning_rate": 0.0002, "epoch": 2.425894378194208, "step": 3560}, {"loss": 1.5842, "grad_norm": 0.5186443328857422, "learning_rate": 0.0002, "epoch": 2.4327086882453153, "step": 3570}, {"loss": 1.5956, "grad_norm": 0.49743232131004333, "learning_rate": 0.0002, "epoch": 2.4395229982964226, "step": 3580}, {"loss": 1.6021, "grad_norm": 0.524450421333313, "learning_rate": 0.0002, "epoch": 2.44633730834753, "step": 3590}, {"loss": 1.6283, "grad_norm": 0.5053797364234924, "learning_rate": 0.0002, "epoch": 2.4531516183986373, "step": 3600}, {"loss": 1.6335, "grad_norm": 0.5223091840744019, "learning_rate": 0.0002, "epoch": 2.4599659284497446, "step": 3610}, {"loss": 1.6315, "grad_norm": 0.4763810932636261, "learning_rate": 0.0002, "epoch": 2.466780238500852, "step": 3620}, {"loss": 1.5623, "grad_norm": 0.5097282528877258, "learning_rate": 0.0002, "epoch": 2.4735945485519593, "step": 3630}, {"loss": 1.605, "grad_norm": 0.5831942558288574, "learning_rate": 0.0002, "epoch": 2.4804088586030666, "step": 3640}, {"loss": 1.6074, "grad_norm": 0.47573572397232056, "learning_rate": 0.0002, "epoch": 2.487223168654174, "step": 3650}, {"loss": 1.6411, "grad_norm": 0.49602726101875305, "learning_rate": 0.0002, "epoch": 2.4940374787052813, "step": 3660}, {"loss": 1.571, "grad_norm": 0.5468524694442749, "learning_rate": 0.0002, "epoch": 2.500851788756388, "step": 3670}, {"loss": 1.5782, "grad_norm": 0.45899373292922974, "learning_rate": 0.0002, "epoch": 2.507666098807496, "step": 3680}, {"loss": 1.5114, "grad_norm": 0.5031567215919495, "learning_rate": 0.0002, "epoch": 2.514480408858603, "step": 3690}, {"loss": 1.538, "grad_norm": 0.5224900841712952, "learning_rate": 0.0002, "epoch": 2.5212947189097106, "step": 3700}, {"loss": 1.5269, "grad_norm": 0.504769504070282, "learning_rate": 0.0002, "epoch": 2.5281090289608175, "step": 3710}, {"loss": 1.5141, "grad_norm": 0.6120529770851135, "learning_rate": 0.0002, "epoch": 2.5349233390119252, "step": 3720}, {"loss": 1.5666, "grad_norm": 0.47930678725242615, "learning_rate": 0.0002, "epoch": 2.541737649063032, "step": 3730}, {"loss": 1.6156, "grad_norm": 0.5039092302322388, "learning_rate": 0.0002, "epoch": 2.54855195911414, "step": 3740}, {"loss": 1.5585, "grad_norm": 0.49758994579315186, "learning_rate": 0.0002, "epoch": 2.555366269165247, "step": 3750}, {"loss": 1.5351, "grad_norm": 0.44739171862602234, "learning_rate": 0.0002, "epoch": 2.5621805792163546, "step": 3760}, {"loss": 1.5099, "grad_norm": 0.47056373953819275, "learning_rate": 0.0002, "epoch": 2.5689948892674614, "step": 3770}, {"loss": 1.5524, "grad_norm": 0.5077595114707947, "learning_rate": 0.0002, "epoch": 2.575809199318569, "step": 3780}, {"loss": 1.5524, "grad_norm": 0.4981902837753296, "learning_rate": 0.0002, "epoch": 2.582623509369676, "step": 3790}, {"loss": 1.5381, "grad_norm": 0.5736238360404968, "learning_rate": 0.0002, "epoch": 2.589437819420784, "step": 3800}, {"loss": 1.67, "grad_norm": 0.4898384213447571, "learning_rate": 0.0002, "epoch": 2.5962521294718908, "step": 3810}, {"loss": 1.5411, "grad_norm": 0.4611325263977051, "learning_rate": 0.0002, "epoch": 2.6030664395229985, "step": 3820}, {"loss": 1.5662, "grad_norm": 0.5285341739654541, "learning_rate": 0.0002, "epoch": 2.6098807495741054, "step": 3830}, {"loss": 1.5875, "grad_norm": 0.5679430961608887, "learning_rate": 0.0002, "epoch": 2.616695059625213, "step": 3840}, {"loss": 1.5544, "grad_norm": 0.48532548546791077, "learning_rate": 0.0002, "epoch": 2.62350936967632, "step": 3850}, {"loss": 1.579, "grad_norm": 0.45506492257118225, "learning_rate": 0.0002, "epoch": 2.630323679727428, "step": 3860}, {"loss": 1.5775, "grad_norm": 0.6552556753158569, "learning_rate": 0.0002, "epoch": 2.6371379897785348, "step": 3870}, {"loss": 1.608, "grad_norm": 0.537874698638916, "learning_rate": 0.0002, "epoch": 2.643952299829642, "step": 3880}, {"loss": 1.5653, "grad_norm": 0.46102389693260193, "learning_rate": 0.0002, "epoch": 2.6507666098807494, "step": 3890}, {"loss": 1.5293, "grad_norm": 0.45531195402145386, "learning_rate": 0.0002, "epoch": 2.6575809199318567, "step": 3900}, {"loss": 1.5492, "grad_norm": 0.5327293872833252, "learning_rate": 0.0002, "epoch": 2.664395229982964, "step": 3910}, {"loss": 1.5921, "grad_norm": 0.4968956410884857, "learning_rate": 0.0002, "epoch": 2.6712095400340714, "step": 3920}, {"loss": 1.5823, "grad_norm": 0.4790082275867462, "learning_rate": 0.0002, "epoch": 2.6780238500851787, "step": 3930}, {"loss": 1.615, "grad_norm": 0.5392967462539673, "learning_rate": 0.0002, "epoch": 2.684838160136286, "step": 3940}, {"loss": 1.6218, "grad_norm": 0.5076649785041809, "learning_rate": 0.0002, "epoch": 2.6916524701873934, "step": 3950}, {"loss": 1.6478, "grad_norm": 0.5628064274787903, "learning_rate": 0.0002, "epoch": 2.6984667802385007, "step": 3960}, {"loss": 1.5417, "grad_norm": 0.5012659430503845, "learning_rate": 0.0002, "epoch": 2.705281090289608, "step": 3970}, {"loss": 1.5339, "grad_norm": 0.4947647452354431, "learning_rate": 0.0002, "epoch": 2.7120954003407154, "step": 3980}, {"loss": 1.5724, "grad_norm": 0.4890969693660736, "learning_rate": 0.0002, "epoch": 2.7189097103918227, "step": 3990}, {"loss": 1.5746, "grad_norm": 0.4471694231033325, "learning_rate": 0.0002, "epoch": 2.72572402044293, "step": 4000}, {"loss": 1.6669, "grad_norm": 0.5116439461708069, "learning_rate": 0.0002, "epoch": 2.7325383304940374, "step": 4010}, {"loss": 1.584, "grad_norm": 0.5720411539077759, "learning_rate": 0.0002, "epoch": 2.7393526405451447, "step": 4020}, {"loss": 1.6151, "grad_norm": 0.5529406070709229, "learning_rate": 0.0002, "epoch": 2.746166950596252, "step": 4030}, {"loss": 1.6296, "grad_norm": 0.5229396820068359, "learning_rate": 0.0002, "epoch": 2.7529812606473594, "step": 4040}, {"loss": 1.5363, "grad_norm": 0.5270276069641113, "learning_rate": 0.0002, "epoch": 2.7597955706984667, "step": 4050}, {"loss": 1.6305, "grad_norm": 0.48413026332855225, "learning_rate": 0.0002, "epoch": 2.766609880749574, "step": 4060}, {"loss": 1.5488, "grad_norm": 0.5145403742790222, "learning_rate": 0.0002, "epoch": 2.7734241908006814, "step": 4070}, {"loss": 1.6023, "grad_norm": 0.48626071214675903, "learning_rate": 0.0002, "epoch": 2.7802385008517887, "step": 4080}, {"loss": 1.6082, "grad_norm": 0.5018984079360962, "learning_rate": 0.0002, "epoch": 2.787052810902896, "step": 4090}, {"loss": 1.7166, "grad_norm": 0.4881938695907593, "learning_rate": 0.0002, "epoch": 2.7938671209540034, "step": 4100}, {"loss": 1.515, "grad_norm": 0.5151546001434326, "learning_rate": 0.0002, "epoch": 2.8006814310051107, "step": 4110}, {"loss": 1.6069, "grad_norm": 0.5109850764274597, "learning_rate": 0.0002, "epoch": 2.807495741056218, "step": 4120}, {"loss": 1.6153, "grad_norm": 0.5109251141548157, "learning_rate": 0.0002, "epoch": 2.8143100511073254, "step": 4130}, {"loss": 1.6365, "grad_norm": 0.5025496482849121, "learning_rate": 0.0002, "epoch": 2.8211243611584327, "step": 4140}, {"loss": 1.6292, "grad_norm": 0.49027004837989807, "learning_rate": 0.0002, "epoch": 2.82793867120954, "step": 4150}, {"loss": 1.5591, "grad_norm": 0.4957362413406372, "learning_rate": 0.0002, "epoch": 2.8347529812606473, "step": 4160}, {"loss": 1.6759, "grad_norm": 0.5159927606582642, "learning_rate": 0.0002, "epoch": 2.8415672913117547, "step": 4170}, {"loss": 1.577, "grad_norm": 0.6040670871734619, "learning_rate": 0.0002, "epoch": 2.848381601362862, "step": 4180}, {"loss": 1.5295, "grad_norm": 0.5489953756332397, "learning_rate": 0.0002, "epoch": 2.8551959114139693, "step": 4190}, {"loss": 1.5909, "grad_norm": 0.5416634678840637, "learning_rate": 0.0002, "epoch": 2.8620102214650767, "step": 4200}, {"loss": 1.6014, "grad_norm": 0.5278245210647583, "learning_rate": 0.0002, "epoch": 2.868824531516184, "step": 4210}, {"loss": 1.4848, "grad_norm": 0.43382319808006287, "learning_rate": 0.0002, "epoch": 2.8756388415672913, "step": 4220}, {"loss": 1.6092, "grad_norm": 0.4724387228488922, "learning_rate": 0.0002, "epoch": 2.8824531516183987, "step": 4230}, {"loss": 1.5748, "grad_norm": 0.49824780225753784, "learning_rate": 0.0002, "epoch": 2.889267461669506, "step": 4240}, {"loss": 1.6163, "grad_norm": 0.5360262989997864, "learning_rate": 0.0002, "epoch": 2.8960817717206133, "step": 4250}, {"loss": 1.5934, "grad_norm": 0.49090322852134705, "learning_rate": 0.0002, "epoch": 2.9028960817717206, "step": 4260}, {"loss": 1.624, "grad_norm": 0.5613328218460083, "learning_rate": 0.0002, "epoch": 2.909710391822828, "step": 4270}, {"loss": 1.5713, "grad_norm": 0.4611356258392334, "learning_rate": 0.0002, "epoch": 2.9165247018739353, "step": 4280}, {"loss": 1.5457, "grad_norm": 0.550897479057312, "learning_rate": 0.0002, "epoch": 2.9233390119250426, "step": 4290}, {"loss": 1.6225, "grad_norm": 0.5089612603187561, "learning_rate": 0.0002, "epoch": 2.93015332197615, "step": 4300}, {"loss": 1.5897, "grad_norm": 0.5210904479026794, "learning_rate": 0.0002, "epoch": 2.9369676320272573, "step": 4310}, {"loss": 1.6198, "grad_norm": 0.5506424903869629, "learning_rate": 0.0002, "epoch": 2.9437819420783646, "step": 4320}, {"loss": 1.6395, "grad_norm": 0.5118561387062073, "learning_rate": 0.0002, "epoch": 2.950596252129472, "step": 4330}, {"loss": 1.704, "grad_norm": 0.5034464597702026, "learning_rate": 0.0002, "epoch": 2.9574105621805793, "step": 4340}, {"loss": 1.6314, "grad_norm": 0.5019990801811218, "learning_rate": 0.0002, "epoch": 2.9642248722316866, "step": 4350}, {"loss": 1.6161, "grad_norm": 0.5423325300216675, "learning_rate": 0.0002, "epoch": 2.971039182282794, "step": 4360}, {"loss": 1.6144, "grad_norm": 0.5287469625473022, "learning_rate": 0.0002, "epoch": 2.9778534923339013, "step": 4370}, {"loss": 1.5227, "grad_norm": 0.5206913352012634, "learning_rate": 0.0002, "epoch": 2.9846678023850086, "step": 4380}, {"loss": 1.6026, "grad_norm": 0.5407394170761108, "learning_rate": 0.0002, "epoch": 2.991482112436116, "step": 4390}, {"loss": 1.5908, "grad_norm": 0.5244600176811218, "learning_rate": 0.0002, "epoch": 2.9982964224872233, "step": 4400}, {"eval_loss": 1.8412635326385498, "eval_runtime": 65.5583, "eval_samples_per_second": 7.734, "eval_steps_per_second": 0.976, "epoch": 2.9996592844974446, "step": 4402}, {"loss": 1.5157, "grad_norm": 0.5172150731086731, "learning_rate": 0.0002, "epoch": 3.0051107325383306, "step": 4410}, {"loss": 1.398, "grad_norm": 0.6882525086402893, "learning_rate": 0.0002, "epoch": 3.011925042589438, "step": 4420}, {"loss": 1.3884, "grad_norm": 0.6435003280639648, "learning_rate": 0.0002, "epoch": 3.0187393526405453, "step": 4430}, {"loss": 1.4493, "grad_norm": 0.7126057147979736, "learning_rate": 0.0002, "epoch": 3.0255536626916526, "step": 4440}, {"loss": 1.4397, "grad_norm": 0.6634385585784912, "learning_rate": 0.0002, "epoch": 3.03236797274276, "step": 4450}, {"loss": 1.3674, "grad_norm": 0.6468435525894165, "learning_rate": 0.0002, "epoch": 3.0391822827938673, "step": 4460}, {"loss": 1.4045, "grad_norm": 0.5690478086471558, "learning_rate": 0.0002, "epoch": 3.0459965928449746, "step": 4470}, {"loss": 1.3742, "grad_norm": 0.7323708534240723, "learning_rate": 0.0002, "epoch": 3.052810902896082, "step": 4480}, {"loss": 1.3281, "grad_norm": 0.6989302039146423, "learning_rate": 0.0002, "epoch": 3.0596252129471893, "step": 4490}, {"loss": 1.379, "grad_norm": 0.6704450845718384, "learning_rate": 0.0002, "epoch": 3.0664395229982966, "step": 4500}, {"loss": 1.4028, "grad_norm": 0.769137442111969, "learning_rate": 0.0002, "epoch": 3.073253833049404, "step": 4510}, {"loss": 1.4295, "grad_norm": 0.6556448936462402, "learning_rate": 0.0002, "epoch": 3.0800681431005112, "step": 4520}, {"loss": 1.2763, "grad_norm": 0.7143950462341309, "learning_rate": 0.0002, "epoch": 3.0868824531516186, "step": 4530}, {"loss": 1.4806, "grad_norm": 0.7060510516166687, "learning_rate": 0.0002, "epoch": 3.093696763202726, "step": 4540}, {"loss": 1.4097, "grad_norm": 0.6637526750564575, "learning_rate": 0.0002, "epoch": 3.1005110732538332, "step": 4550}, {"loss": 1.4752, "grad_norm": 0.822989284992218, "learning_rate": 0.0002, "epoch": 3.1073253833049406, "step": 4560}, {"loss": 1.4994, "grad_norm": 0.5542152523994446, "learning_rate": 0.0002, "epoch": 3.114139693356048, "step": 4570}, {"loss": 1.4306, "grad_norm": 0.7780306935310364, "learning_rate": 0.0002, "epoch": 3.1209540034071552, "step": 4580}, {"loss": 1.3909, "grad_norm": 0.7372637987136841, "learning_rate": 0.0002, "epoch": 3.1277683134582626, "step": 4590}, {"loss": 1.3989, "grad_norm": 0.6730087995529175, "learning_rate": 0.0002, "epoch": 3.1345826235093694, "step": 4600}, {"loss": 1.3591, "grad_norm": 0.6687398552894592, "learning_rate": 0.0002, "epoch": 3.1413969335604772, "step": 4610}, {"loss": 1.436, "grad_norm": 0.7645083665847778, "learning_rate": 0.0002, "epoch": 3.148211243611584, "step": 4620}, {"loss": 1.3681, "grad_norm": 0.6770380139350891, "learning_rate": 0.0002, "epoch": 3.155025553662692, "step": 4630}, {"loss": 1.405, "grad_norm": 0.7200576663017273, "learning_rate": 0.0002, "epoch": 3.1618398637137988, "step": 4640}, {"loss": 1.3752, "grad_norm": 0.6663638949394226, "learning_rate": 0.0002, "epoch": 3.168654173764906, "step": 4650}, {"loss": 1.4099, "grad_norm": 0.6602960228919983, "learning_rate": 0.0002, "epoch": 3.1754684838160134, "step": 4660}, {"loss": 1.4003, "grad_norm": 0.7838228344917297, "learning_rate": 0.0002, "epoch": 3.1822827938671208, "step": 4670}, {"loss": 1.3853, "grad_norm": 0.7559184432029724, "learning_rate": 0.0002, "epoch": 3.189097103918228, "step": 4680}, {"loss": 1.4516, "grad_norm": 0.6609814167022705, "learning_rate": 0.0002, "epoch": 3.1959114139693354, "step": 4690}, {"loss": 1.4464, "grad_norm": 0.8470419645309448, "learning_rate": 0.0002, "epoch": 3.2027257240204428, "step": 4700}, {"loss": 1.428, "grad_norm": 0.7282822728157043, "learning_rate": 0.0002, "epoch": 3.20954003407155, "step": 4710}, {"loss": 1.5261, "grad_norm": 0.6722773313522339, "learning_rate": 0.0002, "epoch": 3.2163543441226574, "step": 4720}, {"loss": 1.3809, "grad_norm": 0.7630265355110168, "learning_rate": 0.0002, "epoch": 3.2231686541737647, "step": 4730}, {"loss": 1.42, "grad_norm": 0.7102773785591125, "learning_rate": 0.0002, "epoch": 3.229982964224872, "step": 4740}, {"loss": 1.3529, "grad_norm": 0.7778299450874329, "learning_rate": 0.0002, "epoch": 3.2367972742759794, "step": 4750}, {"loss": 1.4715, "grad_norm": 0.7189921736717224, "learning_rate": 0.0002, "epoch": 3.2436115843270867, "step": 4760}, {"loss": 1.4328, "grad_norm": 0.7708092331886292, "learning_rate": 0.0002, "epoch": 3.250425894378194, "step": 4770}, {"loss": 1.3855, "grad_norm": 0.7208452224731445, "learning_rate": 0.0002, "epoch": 3.2572402044293014, "step": 4780}, {"loss": 1.3206, "grad_norm": 0.7220432758331299, "learning_rate": 0.0002, "epoch": 3.2640545144804087, "step": 4790}, {"loss": 1.463, "grad_norm": 0.7064954042434692, "learning_rate": 0.0002, "epoch": 3.270868824531516, "step": 4800}, {"loss": 1.4236, "grad_norm": 0.6618382334709167, "learning_rate": 0.0002, "epoch": 3.2776831345826234, "step": 4810}, {"loss": 1.3878, "grad_norm": 0.6854256391525269, "learning_rate": 0.0002, "epoch": 3.2844974446337307, "step": 4820}, {"loss": 1.4236, "grad_norm": 0.6036319136619568, "learning_rate": 0.0002, "epoch": 3.291311754684838, "step": 4830}, {"loss": 1.4796, "grad_norm": 0.714678943157196, "learning_rate": 0.0002, "epoch": 3.2981260647359454, "step": 4840}, {"loss": 1.4273, "grad_norm": 0.7218600511550903, "learning_rate": 0.0002, "epoch": 3.3049403747870527, "step": 4850}, {"loss": 1.3915, "grad_norm": 0.7243074774742126, "learning_rate": 0.0002, "epoch": 3.31175468483816, "step": 4860}, {"loss": 1.4088, "grad_norm": 0.7058630585670471, "learning_rate": 0.0002, "epoch": 3.3185689948892674, "step": 4870}, {"loss": 1.3837, "grad_norm": 0.7091076970100403, "learning_rate": 0.0002, "epoch": 3.3253833049403747, "step": 4880}, {"loss": 1.4745, "grad_norm": 0.7375147342681885, "learning_rate": 0.0002, "epoch": 3.332197614991482, "step": 4890}, {"loss": 1.4826, "grad_norm": 0.9426755309104919, "learning_rate": 0.0002, "epoch": 3.3390119250425894, "step": 4900}, {"loss": 1.369, "grad_norm": 0.6508213877677917, "learning_rate": 0.0002, "epoch": 3.3458262350936967, "step": 4910}, {"loss": 1.3839, "grad_norm": 0.6945043206214905, "learning_rate": 0.0002, "epoch": 3.352640545144804, "step": 4920}, {"loss": 1.3571, "grad_norm": 0.6335888504981995, "learning_rate": 0.0002, "epoch": 3.3594548551959114, "step": 4930}, {"loss": 1.4391, "grad_norm": 0.6947107911109924, "learning_rate": 0.0002, "epoch": 3.3662691652470187, "step": 4940}, {"loss": 1.3885, "grad_norm": 0.8204733729362488, "learning_rate": 0.0002, "epoch": 3.373083475298126, "step": 4950}, {"loss": 1.4886, "grad_norm": 0.7212244868278503, "learning_rate": 0.0002, "epoch": 3.3798977853492334, "step": 4960}, {"loss": 1.4581, "grad_norm": 0.6053042411804199, "learning_rate": 0.0002, "epoch": 3.3867120954003407, "step": 4970}, {"loss": 1.3863, "grad_norm": 0.7820029854774475, "learning_rate": 0.0002, "epoch": 3.393526405451448, "step": 4980}, {"loss": 1.4326, "grad_norm": 0.6866770386695862, "learning_rate": 0.0002, "epoch": 3.4003407155025553, "step": 4990}, {"loss": 1.4287, "grad_norm": 0.6652463674545288, "learning_rate": 0.0002, "epoch": 3.4071550255536627, "step": 5000}, {"loss": 1.3667, "grad_norm": 1.1209032535552979, "learning_rate": 0.0002, "epoch": 3.41396933560477, "step": 5010}, {"loss": 1.4461, "grad_norm": 0.8390814661979675, "learning_rate": 0.0002, "epoch": 3.4207836456558773, "step": 5020}, {"loss": 1.4556, "grad_norm": 0.7541858553886414, "learning_rate": 0.0002, "epoch": 3.4275979557069847, "step": 5030}, {"loss": 1.4245, "grad_norm": 0.6902772784233093, "learning_rate": 0.0002, "epoch": 3.434412265758092, "step": 5040}, {"loss": 1.3953, "grad_norm": 0.7070329785346985, "learning_rate": 0.0002, "epoch": 3.4412265758091993, "step": 5050}, {"loss": 1.3903, "grad_norm": 0.8075643181800842, "learning_rate": 0.0002, "epoch": 3.4480408858603067, "step": 5060}, {"loss": 1.3929, "grad_norm": 0.7133861780166626, "learning_rate": 0.0002, "epoch": 3.454855195911414, "step": 5070}, {"loss": 1.4632, "grad_norm": 0.6631823182106018, "learning_rate": 0.0002, "epoch": 3.4616695059625213, "step": 5080}, {"loss": 1.4162, "grad_norm": 0.673870325088501, "learning_rate": 0.0002, "epoch": 3.4684838160136287, "step": 5090}, {"loss": 1.4247, "grad_norm": 0.6438634395599365, "learning_rate": 0.0002, "epoch": 3.475298126064736, "step": 5100}, {"loss": 1.4421, "grad_norm": 0.7560495734214783, "learning_rate": 0.0002, "epoch": 3.4821124361158433, "step": 5110}, {"loss": 1.4125, "grad_norm": 0.6877814531326294, "learning_rate": 0.0002, "epoch": 3.4889267461669506, "step": 5120}, {"loss": 1.4308, "grad_norm": 0.7031328678131104, "learning_rate": 0.0002, "epoch": 3.495741056218058, "step": 5130}, {"loss": 1.3705, "grad_norm": 0.6797195672988892, "learning_rate": 0.0002, "epoch": 3.5025553662691653, "step": 5140}, {"loss": 1.4687, "grad_norm": 0.6766413450241089, "learning_rate": 0.0002, "epoch": 3.5093696763202726, "step": 5150}, {"loss": 1.4194, "grad_norm": 0.666656494140625, "learning_rate": 0.0002, "epoch": 3.51618398637138, "step": 5160}, {"loss": 1.469, "grad_norm": 0.74996417760849, "learning_rate": 0.0002, "epoch": 3.5229982964224873, "step": 5170}, {"loss": 1.4848, "grad_norm": 0.7370911836624146, "learning_rate": 0.0002, "epoch": 3.5298126064735946, "step": 5180}, {"loss": 1.4523, "grad_norm": 0.9063456654548645, "learning_rate": 0.0002, "epoch": 3.536626916524702, "step": 5190}, {"loss": 1.4726, "grad_norm": 0.6861422657966614, "learning_rate": 0.0002, "epoch": 3.5434412265758093, "step": 5200}, {"loss": 1.4803, "grad_norm": 0.7104039788246155, "learning_rate": 0.0002, "epoch": 3.5502555366269166, "step": 5210}, {"loss": 1.4313, "grad_norm": 0.6578653454780579, "learning_rate": 0.0002, "epoch": 3.557069846678024, "step": 5220}, {"loss": 1.4596, "grad_norm": 0.7336562275886536, "learning_rate": 0.0002, "epoch": 3.5638841567291313, "step": 5230}, {"loss": 1.4591, "grad_norm": 0.7163010835647583, "learning_rate": 0.0002, "epoch": 3.5706984667802386, "step": 5240}, {"loss": 1.3814, "grad_norm": 0.8112391233444214, "learning_rate": 0.0002, "epoch": 3.577512776831346, "step": 5250}, {"loss": 1.4249, "grad_norm": 0.7260391116142273, "learning_rate": 0.0002, "epoch": 3.5843270868824533, "step": 5260}, {"loss": 1.4249, "grad_norm": 0.7038731575012207, "learning_rate": 0.0002, "epoch": 3.5911413969335606, "step": 5270}, {"loss": 1.4172, "grad_norm": 0.7864376902580261, "learning_rate": 0.0002, "epoch": 3.597955706984668, "step": 5280}, {"loss": 1.4637, "grad_norm": 0.6968383193016052, "learning_rate": 0.0002, "epoch": 3.6047700170357753, "step": 5290}, {"loss": 1.5269, "grad_norm": 0.6726206541061401, "learning_rate": 0.0002, "epoch": 3.6115843270868826, "step": 5300}, {"loss": 1.4199, "grad_norm": 0.6716854572296143, "learning_rate": 0.0002, "epoch": 3.61839863713799, "step": 5310}, {"loss": 1.4686, "grad_norm": 0.7229742407798767, "learning_rate": 0.0002, "epoch": 3.6252129471890973, "step": 5320}, {"loss": 1.4441, "grad_norm": 0.7338683009147644, "learning_rate": 0.0002, "epoch": 3.6320272572402046, "step": 5330}, {"loss": 1.4116, "grad_norm": 0.771672785282135, "learning_rate": 0.0002, "epoch": 3.638841567291312, "step": 5340}, {"loss": 1.4007, "grad_norm": 0.7024078369140625, "learning_rate": 0.0002, "epoch": 3.645655877342419, "step": 5350}, {"loss": 1.4996, "grad_norm": 0.6847538352012634, "learning_rate": 0.0002, "epoch": 3.6524701873935266, "step": 5360}, {"loss": 1.4111, "grad_norm": 0.71802818775177, "learning_rate": 0.0002, "epoch": 3.6592844974446335, "step": 5370}, {"loss": 1.4224, "grad_norm": 0.78530353307724, "learning_rate": 0.0002, "epoch": 3.6660988074957412, "step": 5380}, {"loss": 1.4582, "grad_norm": 0.7262226939201355, "learning_rate": 0.0002, "epoch": 3.672913117546848, "step": 5390}, {"loss": 1.4704, "grad_norm": 0.7608316540718079, "learning_rate": 0.0002, "epoch": 3.679727427597956, "step": 5400}, {"loss": 1.3742, "grad_norm": 0.6994926333427429, "learning_rate": 0.0002, "epoch": 3.686541737649063, "step": 5410}, {"loss": 1.4738, "grad_norm": 0.7888479828834534, "learning_rate": 0.0002, "epoch": 3.6933560477001706, "step": 5420}, {"loss": 1.4213, "grad_norm": 0.7053858041763306, "learning_rate": 0.0002, "epoch": 3.7001703577512775, "step": 5430}, {"loss": 1.4988, "grad_norm": 0.7063165903091431, "learning_rate": 0.0002, "epoch": 3.7069846678023852, "step": 5440}, {"loss": 1.4386, "grad_norm": 0.6603744626045227, "learning_rate": 0.0002, "epoch": 3.713798977853492, "step": 5450}, {"loss": 1.4695, "grad_norm": 0.7043602466583252, "learning_rate": 0.0002, "epoch": 3.7206132879046, "step": 5460}, {"loss": 1.5051, "grad_norm": 0.7026081681251526, "learning_rate": 0.0002, "epoch": 3.7274275979557068, "step": 5470}, {"loss": 1.5613, "grad_norm": 0.7200090289115906, "learning_rate": 0.0002, "epoch": 3.7342419080068145, "step": 5480}, {"loss": 1.4182, "grad_norm": 0.7170904278755188, "learning_rate": 0.0002, "epoch": 3.7410562180579214, "step": 5490}, {"loss": 1.4344, "grad_norm": 0.7489104866981506, "learning_rate": 0.0002, "epoch": 3.747870528109029, "step": 5500}, {"loss": 1.4911, "grad_norm": 0.6540989875793457, "learning_rate": 0.0002, "epoch": 3.754684838160136, "step": 5510}, {"loss": 1.4955, "grad_norm": 0.6654048562049866, "learning_rate": 0.0002, "epoch": 3.761499148211244, "step": 5520}, {"loss": 1.4487, "grad_norm": 0.6577395796775818, "learning_rate": 0.0002, "epoch": 3.7683134582623508, "step": 5530}, {"loss": 1.4283, "grad_norm": 0.7762192487716675, "learning_rate": 0.0002, "epoch": 3.7751277683134585, "step": 5540}, {"loss": 1.4727, "grad_norm": 0.6336314678192139, "learning_rate": 0.0002, "epoch": 3.7819420783645654, "step": 5550}, {"loss": 1.4588, "grad_norm": 0.7098057866096497, "learning_rate": 0.0002, "epoch": 3.7887563884156727, "step": 5560}, {"loss": 1.4679, "grad_norm": 0.7379715442657471, "learning_rate": 0.0002, "epoch": 3.79557069846678, "step": 5570}, {"loss": 1.4633, "grad_norm": 0.6726924777030945, "learning_rate": 0.0002, "epoch": 3.8023850085178874, "step": 5580}, {"loss": 1.4751, "grad_norm": 1.1212009191513062, "learning_rate": 0.0002, "epoch": 3.8091993185689947, "step": 5590}, {"loss": 1.4503, "grad_norm": 0.6503795981407166, "learning_rate": 0.0002, "epoch": 3.816013628620102, "step": 5600}, {"loss": 1.4754, "grad_norm": 0.7041325569152832, "learning_rate": 0.0002, "epoch": 3.8228279386712094, "step": 5610}, {"loss": 1.4199, "grad_norm": 0.7962933778762817, "learning_rate": 0.0002, "epoch": 3.8296422487223167, "step": 5620}, {"loss": 1.4672, "grad_norm": 0.6613591909408569, "learning_rate": 0.0002, "epoch": 3.836456558773424, "step": 5630}, {"loss": 1.5688, "grad_norm": 0.7293516397476196, "learning_rate": 0.0002, "epoch": 3.8432708688245314, "step": 5640}, {"loss": 1.4149, "grad_norm": 0.7388607859611511, "learning_rate": 0.0002, "epoch": 3.8500851788756387, "step": 5650}, {"loss": 1.4743, "grad_norm": 0.6440677642822266, "learning_rate": 0.0002, "epoch": 3.856899488926746, "step": 5660}, {"loss": 1.5082, "grad_norm": 0.7729013562202454, "learning_rate": 0.0002, "epoch": 3.8637137989778534, "step": 5670}, {"loss": 1.4608, "grad_norm": 0.6696794033050537, "learning_rate": 0.0002, "epoch": 3.8705281090289607, "step": 5680}, {"loss": 1.472, "grad_norm": 0.7151781320571899, "learning_rate": 0.0002, "epoch": 3.877342419080068, "step": 5690}, {"loss": 1.4923, "grad_norm": 0.6736966371536255, "learning_rate": 0.0002, "epoch": 3.8841567291311754, "step": 5700}, {"loss": 1.4453, "grad_norm": 0.7444243431091309, "learning_rate": 0.0002, "epoch": 3.8909710391822827, "step": 5710}, {"loss": 1.4562, "grad_norm": 0.6701464653015137, "learning_rate": 0.0002, "epoch": 3.89778534923339, "step": 5720}, {"loss": 1.4478, "grad_norm": 0.7231952548027039, "learning_rate": 0.0002, "epoch": 3.9045996592844974, "step": 5730}, {"loss": 1.4539, "grad_norm": 0.831954300403595, "learning_rate": 0.0002, "epoch": 3.9114139693356047, "step": 5740}, {"loss": 1.5122, "grad_norm": 0.7697733640670776, "learning_rate": 0.0002, "epoch": 3.918228279386712, "step": 5750}, {"loss": 1.4552, "grad_norm": 0.6964395046234131, "learning_rate": 0.0002, "epoch": 3.9250425894378194, "step": 5760}, {"loss": 1.4688, "grad_norm": 0.6942925453186035, "learning_rate": 0.0002, "epoch": 3.9318568994889267, "step": 5770}, {"loss": 1.4668, "grad_norm": 0.6491202712059021, "learning_rate": 0.0002, "epoch": 3.938671209540034, "step": 5780}, {"loss": 1.4404, "grad_norm": 0.7004382610321045, "learning_rate": 0.0002, "epoch": 3.9454855195911414, "step": 5790}, {"loss": 1.5022, "grad_norm": 0.7337747812271118, "learning_rate": 0.0002, "epoch": 3.9522998296422487, "step": 5800}, {"loss": 1.5314, "grad_norm": 0.6923640966415405, "learning_rate": 0.0002, "epoch": 3.959114139693356, "step": 5810}, {"loss": 1.4811, "grad_norm": 0.6815266609191895, "learning_rate": 0.0002, "epoch": 3.9659284497444633, "step": 5820}, {"loss": 1.437, "grad_norm": 0.6755654811859131, "learning_rate": 0.0002, "epoch": 3.9727427597955707, "step": 5830}, {"loss": 1.4277, "grad_norm": 0.6912487149238586, "learning_rate": 0.0002, "epoch": 3.979557069846678, "step": 5840}, {"loss": 1.4654, "grad_norm": 0.6948044896125793, "learning_rate": 0.0002, "epoch": 3.9863713798977853, "step": 5850}, {"loss": 1.4779, "grad_norm": 0.6735455989837646, "learning_rate": 0.0002, "epoch": 3.9931856899488927, "step": 5860}, {"loss": 1.5102, "grad_norm": 0.7005048990249634, "learning_rate": 0.0002, "epoch": 4.0, "step": 5870}]} +{"epoch": 4.999659284497445, "step": 7337, "epoch_duration": 2273.973286151886, "total_accumulated_duration": 12001.512875318527, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 28746.0}, "peak_memory_reserved": {"GPU_0": 28746.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0988, "grad_norm": 0.635574460029602, "learning_rate": 0.0002, "epoch": 0.0068143100511073255, "step": 10}, {"loss": 2.5594, "grad_norm": 1.0401769876480103, "learning_rate": 0.0002, "epoch": 0.013628620102214651, "step": 20}, {"loss": 2.3587, "grad_norm": 0.4514131247997284, "learning_rate": 0.0002, "epoch": 0.020442930153321975, "step": 30}, {"loss": 2.121, "grad_norm": 0.6303355693817139, "learning_rate": 0.0002, "epoch": 0.027257240204429302, "step": 40}, {"loss": 1.9833, "grad_norm": 0.4648270606994629, "learning_rate": 0.0002, "epoch": 0.034071550255536626, "step": 50}, {"loss": 1.9384, "grad_norm": 0.42953479290008545, "learning_rate": 0.0002, "epoch": 0.04088586030664395, "step": 60}, {"loss": 1.9202, "grad_norm": 0.433614581823349, "learning_rate": 0.0002, "epoch": 0.04770017035775128, "step": 70}, {"loss": 1.911, "grad_norm": 0.45318254828453064, "learning_rate": 0.0002, "epoch": 0.054514480408858604, "step": 80}, {"loss": 1.8895, "grad_norm": 0.4023568630218506, "learning_rate": 0.0002, "epoch": 0.06132879045996593, "step": 90}, {"loss": 1.9257, "grad_norm": 0.43260207772254944, "learning_rate": 0.0002, "epoch": 0.06814310051107325, "step": 100}, {"loss": 1.9178, "grad_norm": 0.43389809131622314, "learning_rate": 0.0002, "epoch": 0.07495741056218058, "step": 110}, {"loss": 1.9071, "grad_norm": 0.39307987689971924, "learning_rate": 0.0002, "epoch": 0.0817717206132879, "step": 120}, {"loss": 1.9482, "grad_norm": 0.7703037261962891, "learning_rate": 0.0002, "epoch": 0.08858603066439523, "step": 130}, {"loss": 1.8394, "grad_norm": 0.38547563552856445, "learning_rate": 0.0002, "epoch": 0.09540034071550256, "step": 140}, {"loss": 1.9037, "grad_norm": 0.37948688864707947, "learning_rate": 0.0002, "epoch": 0.10221465076660988, "step": 150}, {"loss": 1.8664, "grad_norm": 0.33022379875183105, "learning_rate": 0.0002, "epoch": 0.10902896081771721, "step": 160}, {"loss": 1.8659, "grad_norm": 0.33703792095184326, "learning_rate": 0.0002, "epoch": 0.11584327086882454, "step": 170}, {"loss": 1.8389, "grad_norm": 0.3259912431240082, "learning_rate": 0.0002, "epoch": 0.12265758091993186, "step": 180}, {"loss": 1.8424, "grad_norm": 0.3593858778476715, "learning_rate": 0.0002, "epoch": 0.12947189097103917, "step": 190}, {"loss": 1.8457, "grad_norm": 0.3401614725589752, "learning_rate": 0.0002, "epoch": 0.1362862010221465, "step": 200}, {"loss": 1.7721, "grad_norm": 0.3892078697681427, "learning_rate": 0.0002, "epoch": 0.14310051107325383, "step": 210}, {"loss": 1.8351, "grad_norm": 0.315374493598938, "learning_rate": 0.0002, "epoch": 0.14991482112436116, "step": 220}, {"loss": 1.8307, "grad_norm": 0.3679497539997101, "learning_rate": 0.0002, "epoch": 0.1567291311754685, "step": 230}, {"loss": 1.816, "grad_norm": 0.336730033159256, "learning_rate": 0.0002, "epoch": 0.1635434412265758, "step": 240}, {"loss": 1.7849, "grad_norm": 0.36472755670547485, "learning_rate": 0.0002, "epoch": 0.17035775127768313, "step": 250}, {"loss": 1.7766, "grad_norm": 0.34864023327827454, "learning_rate": 0.0002, "epoch": 0.17717206132879046, "step": 260}, {"loss": 1.896, "grad_norm": 0.3350819945335388, "learning_rate": 0.0002, "epoch": 0.1839863713798978, "step": 270}, {"loss": 1.8742, "grad_norm": 0.3844246566295624, "learning_rate": 0.0002, "epoch": 0.19080068143100512, "step": 280}, {"loss": 1.8531, "grad_norm": 0.38413065671920776, "learning_rate": 0.0002, "epoch": 0.19761499148211242, "step": 290}, {"loss": 1.8415, "grad_norm": 0.4080047607421875, "learning_rate": 0.0002, "epoch": 0.20442930153321975, "step": 300}, {"loss": 1.9009, "grad_norm": 0.3546800911426544, "learning_rate": 0.0002, "epoch": 0.21124361158432708, "step": 310}, {"loss": 1.8092, "grad_norm": 0.38577890396118164, "learning_rate": 0.0002, "epoch": 0.21805792163543442, "step": 320}, {"loss": 1.7113, "grad_norm": 0.38979098200798035, "learning_rate": 0.0002, "epoch": 0.22487223168654175, "step": 330}, {"loss": 1.8162, "grad_norm": 0.35105520486831665, "learning_rate": 0.0002, "epoch": 0.23168654173764908, "step": 340}, {"loss": 1.8408, "grad_norm": 0.31671223044395447, "learning_rate": 0.0002, "epoch": 0.23850085178875638, "step": 350}, {"loss": 1.8014, "grad_norm": 0.33034196496009827, "learning_rate": 0.0002, "epoch": 0.2453151618398637, "step": 360}, {"loss": 1.7132, "grad_norm": 0.2990533709526062, "learning_rate": 0.0002, "epoch": 0.252129471890971, "step": 370}, {"loss": 1.8612, "grad_norm": 0.362208753824234, "learning_rate": 0.0002, "epoch": 0.25894378194207834, "step": 380}, {"loss": 1.8001, "grad_norm": 0.3269096612930298, "learning_rate": 0.0002, "epoch": 0.2657580919931857, "step": 390}, {"loss": 1.786, "grad_norm": 0.30555954575538635, "learning_rate": 0.0002, "epoch": 0.272572402044293, "step": 400}, {"loss": 1.8018, "grad_norm": 0.332933247089386, "learning_rate": 0.0002, "epoch": 0.27938671209540034, "step": 410}, {"loss": 1.8157, "grad_norm": 0.39454060792922974, "learning_rate": 0.0002, "epoch": 0.28620102214650767, "step": 420}, {"loss": 1.7862, "grad_norm": 0.34589633345603943, "learning_rate": 0.0002, "epoch": 0.293015332197615, "step": 430}, {"loss": 1.7612, "grad_norm": 0.3747332990169525, "learning_rate": 0.0002, "epoch": 0.29982964224872233, "step": 440}, {"loss": 1.8476, "grad_norm": 0.34825369715690613, "learning_rate": 0.0002, "epoch": 0.30664395229982966, "step": 450}, {"loss": 1.775, "grad_norm": 0.32906976342201233, "learning_rate": 0.0002, "epoch": 0.313458262350937, "step": 460}, {"loss": 1.7764, "grad_norm": 0.33108609914779663, "learning_rate": 0.0002, "epoch": 0.3202725724020443, "step": 470}, {"loss": 1.8012, "grad_norm": 0.3170463442802429, "learning_rate": 0.0002, "epoch": 0.3270868824531516, "step": 480}, {"loss": 1.8264, "grad_norm": 0.30792081356048584, "learning_rate": 0.0002, "epoch": 0.3339011925042589, "step": 490}, {"loss": 1.7361, "grad_norm": 0.31772997975349426, "learning_rate": 0.0002, "epoch": 0.34071550255536626, "step": 500}, {"loss": 1.8147, "grad_norm": 0.32714012265205383, "learning_rate": 0.0002, "epoch": 0.3475298126064736, "step": 510}, {"loss": 1.8332, "grad_norm": 0.3021100163459778, "learning_rate": 0.0002, "epoch": 0.3543441226575809, "step": 520}, {"loss": 1.7337, "grad_norm": 0.6045835018157959, "learning_rate": 0.0002, "epoch": 0.36115843270868825, "step": 530}, {"loss": 1.8661, "grad_norm": 0.3003896474838257, "learning_rate": 0.0002, "epoch": 0.3679727427597956, "step": 540}, {"loss": 1.8359, "grad_norm": 0.3678470551967621, "learning_rate": 0.0002, "epoch": 0.3747870528109029, "step": 550}, {"loss": 1.7512, "grad_norm": 0.35787615180015564, "learning_rate": 0.0002, "epoch": 0.38160136286201024, "step": 560}, {"loss": 1.8048, "grad_norm": 0.31882143020629883, "learning_rate": 0.0002, "epoch": 0.38841567291311757, "step": 570}, {"loss": 1.8108, "grad_norm": 0.3186313509941101, "learning_rate": 0.0002, "epoch": 0.39522998296422485, "step": 580}, {"loss": 1.8012, "grad_norm": 0.41443702578544617, "learning_rate": 0.0002, "epoch": 0.4020442930153322, "step": 590}, {"loss": 1.7794, "grad_norm": 0.28773069381713867, "learning_rate": 0.0002, "epoch": 0.4088586030664395, "step": 600}, {"loss": 1.8231, "grad_norm": 0.35743263363838196, "learning_rate": 0.0002, "epoch": 0.41567291311754684, "step": 610}, {"loss": 1.7531, "grad_norm": 0.3360286355018616, "learning_rate": 0.0002, "epoch": 0.42248722316865417, "step": 620}, {"loss": 1.7654, "grad_norm": 0.32838866114616394, "learning_rate": 0.0002, "epoch": 0.4293015332197615, "step": 630}, {"loss": 1.8176, "grad_norm": 0.2994388937950134, "learning_rate": 0.0002, "epoch": 0.43611584327086883, "step": 640}, {"loss": 1.7241, "grad_norm": 0.3306307792663574, "learning_rate": 0.0002, "epoch": 0.44293015332197616, "step": 650}, {"loss": 1.8201, "grad_norm": 0.3129560351371765, "learning_rate": 0.0002, "epoch": 0.4497444633730835, "step": 660}, {"loss": 1.803, "grad_norm": 0.3244289457798004, "learning_rate": 0.0002, "epoch": 0.4565587734241908, "step": 670}, {"loss": 1.8399, "grad_norm": 0.3196892738342285, "learning_rate": 0.0002, "epoch": 0.46337308347529815, "step": 680}, {"loss": 1.8291, "grad_norm": 0.3135230243206024, "learning_rate": 0.0002, "epoch": 0.47018739352640543, "step": 690}, {"loss": 1.7423, "grad_norm": 0.28677991032600403, "learning_rate": 0.0002, "epoch": 0.47700170357751276, "step": 700}, {"loss": 1.6982, "grad_norm": 0.3074065148830414, "learning_rate": 0.0002, "epoch": 0.4838160136286201, "step": 710}, {"loss": 1.8078, "grad_norm": 0.3354290723800659, "learning_rate": 0.0002, "epoch": 0.4906303236797274, "step": 720}, {"loss": 1.7333, "grad_norm": 0.324370801448822, "learning_rate": 0.0002, "epoch": 0.49744463373083475, "step": 730}, {"loss": 1.8506, "grad_norm": 0.29496142268180847, "learning_rate": 0.0002, "epoch": 0.504258943781942, "step": 740}, {"loss": 1.7761, "grad_norm": 0.30694130063056946, "learning_rate": 0.0002, "epoch": 0.5110732538330494, "step": 750}, {"loss": 1.7854, "grad_norm": 0.36168408393859863, "learning_rate": 0.0002, "epoch": 0.5178875638841567, "step": 760}, {"loss": 1.7149, "grad_norm": 0.2930343449115753, "learning_rate": 0.0002, "epoch": 0.524701873935264, "step": 770}, {"loss": 1.7924, "grad_norm": 0.3023432493209839, "learning_rate": 0.0002, "epoch": 0.5315161839863713, "step": 780}, {"loss": 1.8467, "grad_norm": 0.3272720277309418, "learning_rate": 0.0002, "epoch": 0.5383304940374787, "step": 790}, {"loss": 1.8639, "grad_norm": 0.2907974421977997, "learning_rate": 0.0002, "epoch": 0.545144804088586, "step": 800}, {"loss": 1.9018, "grad_norm": 0.32267168164253235, "learning_rate": 0.0002, "epoch": 0.5519591141396933, "step": 810}, {"loss": 1.8311, "grad_norm": 0.32059940695762634, "learning_rate": 0.0002, "epoch": 0.5587734241908007, "step": 820}, {"loss": 1.7234, "grad_norm": 0.30951258540153503, "learning_rate": 0.0002, "epoch": 0.565587734241908, "step": 830}, {"loss": 1.8063, "grad_norm": 0.33976122736930847, "learning_rate": 0.0002, "epoch": 0.5724020442930153, "step": 840}, {"loss": 1.7021, "grad_norm": 0.3195820450782776, "learning_rate": 0.0002, "epoch": 0.5792163543441227, "step": 850}, {"loss": 1.8073, "grad_norm": 0.2828562557697296, "learning_rate": 0.0002, "epoch": 0.58603066439523, "step": 860}, {"loss": 1.695, "grad_norm": 0.29591670632362366, "learning_rate": 0.0002, "epoch": 0.5928449744463373, "step": 870}, {"loss": 1.8109, "grad_norm": 0.3086104393005371, "learning_rate": 0.0002, "epoch": 0.5996592844974447, "step": 880}, {"loss": 1.8207, "grad_norm": 0.3592929542064667, "learning_rate": 0.0002, "epoch": 0.606473594548552, "step": 890}, {"loss": 1.7662, "grad_norm": 0.2830186188220978, "learning_rate": 0.0002, "epoch": 0.6132879045996593, "step": 900}, {"loss": 1.8344, "grad_norm": 0.3128598630428314, "learning_rate": 0.0002, "epoch": 0.6201022146507666, "step": 910}, {"loss": 1.7953, "grad_norm": 0.31957563757896423, "learning_rate": 0.0002, "epoch": 0.626916524701874, "step": 920}, {"loss": 1.7578, "grad_norm": 0.30994319915771484, "learning_rate": 0.0002, "epoch": 0.6337308347529813, "step": 930}, {"loss": 1.8494, "grad_norm": 0.3352845013141632, "learning_rate": 0.0002, "epoch": 0.6405451448040886, "step": 940}, {"loss": 1.7054, "grad_norm": 0.2960077226161957, "learning_rate": 0.0002, "epoch": 0.6473594548551959, "step": 950}, {"loss": 1.7209, "grad_norm": 0.32675081491470337, "learning_rate": 0.0002, "epoch": 0.6541737649063032, "step": 960}, {"loss": 1.7776, "grad_norm": 0.30042028427124023, "learning_rate": 0.0002, "epoch": 0.6609880749574105, "step": 970}, {"loss": 1.7597, "grad_norm": 0.3288673758506775, "learning_rate": 0.0002, "epoch": 0.6678023850085179, "step": 980}, {"loss": 1.7962, "grad_norm": 0.323215126991272, "learning_rate": 0.0002, "epoch": 0.6746166950596252, "step": 990}, {"loss": 1.821, "grad_norm": 0.30041399598121643, "learning_rate": 0.0002, "epoch": 0.6814310051107325, "step": 1000}, {"loss": 1.7716, "grad_norm": 0.3076179623603821, "learning_rate": 0.0002, "epoch": 0.6882453151618398, "step": 1010}, {"loss": 1.7827, "grad_norm": 0.2941909730434418, "learning_rate": 0.0002, "epoch": 0.6950596252129472, "step": 1020}, {"loss": 1.7964, "grad_norm": 0.32220420241355896, "learning_rate": 0.0002, "epoch": 0.7018739352640545, "step": 1030}, {"loss": 1.7101, "grad_norm": 0.2989702820777893, "learning_rate": 0.0002, "epoch": 0.7086882453151618, "step": 1040}, {"loss": 1.8749, "grad_norm": 0.3049640357494354, "learning_rate": 0.0002, "epoch": 0.7155025553662692, "step": 1050}, {"loss": 1.782, "grad_norm": 0.3183284103870392, "learning_rate": 0.0002, "epoch": 0.7223168654173765, "step": 1060}, {"loss": 1.785, "grad_norm": 0.3070095181465149, "learning_rate": 0.0002, "epoch": 0.7291311754684838, "step": 1070}, {"loss": 1.7832, "grad_norm": 0.33263063430786133, "learning_rate": 0.0002, "epoch": 0.7359454855195912, "step": 1080}, {"loss": 1.7627, "grad_norm": 0.28774312138557434, "learning_rate": 0.0002, "epoch": 0.7427597955706985, "step": 1090}, {"loss": 1.7343, "grad_norm": 0.29304224252700806, "learning_rate": 0.0002, "epoch": 0.7495741056218058, "step": 1100}, {"loss": 1.8082, "grad_norm": 0.27529507875442505, "learning_rate": 0.0002, "epoch": 0.7563884156729132, "step": 1110}, {"loss": 1.8071, "grad_norm": 0.32319945096969604, "learning_rate": 0.0002, "epoch": 0.7632027257240205, "step": 1120}, {"loss": 1.6998, "grad_norm": 0.33777597546577454, "learning_rate": 0.0002, "epoch": 0.7700170357751278, "step": 1130}, {"loss": 1.8488, "grad_norm": 0.29847201704978943, "learning_rate": 0.0002, "epoch": 0.7768313458262351, "step": 1140}, {"loss": 1.7014, "grad_norm": 0.31370633840560913, "learning_rate": 0.0002, "epoch": 0.7836456558773425, "step": 1150}, {"loss": 1.8175, "grad_norm": 0.31633856892585754, "learning_rate": 0.0002, "epoch": 0.7904599659284497, "step": 1160}, {"loss": 1.7637, "grad_norm": 0.33849263191223145, "learning_rate": 0.0002, "epoch": 0.797274275979557, "step": 1170}, {"loss": 1.8264, "grad_norm": 0.3306333124637604, "learning_rate": 0.0002, "epoch": 0.8040885860306644, "step": 1180}, {"loss": 1.777, "grad_norm": 0.3249678313732147, "learning_rate": 0.0002, "epoch": 0.8109028960817717, "step": 1190}, {"loss": 1.7631, "grad_norm": 0.3252817690372467, "learning_rate": 0.0002, "epoch": 0.817717206132879, "step": 1200}, {"loss": 1.7877, "grad_norm": 0.31772559881210327, "learning_rate": 0.0002, "epoch": 0.8245315161839863, "step": 1210}, {"loss": 1.8068, "grad_norm": 0.2803371846675873, "learning_rate": 0.0002, "epoch": 0.8313458262350937, "step": 1220}, {"loss": 1.738, "grad_norm": 0.26700571179389954, "learning_rate": 0.0002, "epoch": 0.838160136286201, "step": 1230}, {"loss": 1.8301, "grad_norm": 0.3060092031955719, "learning_rate": 0.0002, "epoch": 0.8449744463373083, "step": 1240}, {"loss": 1.7993, "grad_norm": 0.28831684589385986, "learning_rate": 0.0002, "epoch": 0.8517887563884157, "step": 1250}, {"loss": 1.6909, "grad_norm": 0.30708742141723633, "learning_rate": 0.0002, "epoch": 0.858603066439523, "step": 1260}, {"loss": 1.8506, "grad_norm": 0.2915987968444824, "learning_rate": 0.0002, "epoch": 0.8654173764906303, "step": 1270}, {"loss": 1.7536, "grad_norm": 0.2893589735031128, "learning_rate": 0.0002, "epoch": 0.8722316865417377, "step": 1280}, {"loss": 1.7437, "grad_norm": 0.29545632004737854, "learning_rate": 0.0002, "epoch": 0.879045996592845, "step": 1290}, {"loss": 1.859, "grad_norm": 0.3354771137237549, "learning_rate": 0.0002, "epoch": 0.8858603066439523, "step": 1300}, {"loss": 1.7644, "grad_norm": 0.37715399265289307, "learning_rate": 0.0002, "epoch": 0.8926746166950597, "step": 1310}, {"loss": 1.7731, "grad_norm": 0.28847193717956543, "learning_rate": 0.0002, "epoch": 0.899488926746167, "step": 1320}, {"loss": 1.9077, "grad_norm": 0.2780889868736267, "learning_rate": 0.0002, "epoch": 0.9063032367972743, "step": 1330}, {"loss": 1.7861, "grad_norm": 0.2714342176914215, "learning_rate": 0.0002, "epoch": 0.9131175468483816, "step": 1340}, {"loss": 1.8, "grad_norm": 0.2950133979320526, "learning_rate": 0.0002, "epoch": 0.919931856899489, "step": 1350}, {"loss": 1.7368, "grad_norm": 0.29097145795822144, "learning_rate": 0.0002, "epoch": 0.9267461669505963, "step": 1360}, {"loss": 1.7864, "grad_norm": 0.32540133595466614, "learning_rate": 0.0002, "epoch": 0.9335604770017035, "step": 1370}, {"loss": 1.7571, "grad_norm": 0.3076636493206024, "learning_rate": 0.0002, "epoch": 0.9403747870528109, "step": 1380}, {"loss": 1.8312, "grad_norm": 0.2962130308151245, "learning_rate": 0.0002, "epoch": 0.9471890971039182, "step": 1390}, {"loss": 1.7858, "grad_norm": 0.30086860060691833, "learning_rate": 0.0002, "epoch": 0.9540034071550255, "step": 1400}, {"loss": 1.7991, "grad_norm": 0.28634947538375854, "learning_rate": 0.0002, "epoch": 0.9608177172061328, "step": 1410}, {"loss": 1.8385, "grad_norm": 0.35314416885375977, "learning_rate": 0.0002, "epoch": 0.9676320272572402, "step": 1420}, {"loss": 1.8054, "grad_norm": 0.2939317524433136, "learning_rate": 0.0002, "epoch": 0.9744463373083475, "step": 1430}, {"loss": 1.7582, "grad_norm": 0.3010196089744568, "learning_rate": 0.0002, "epoch": 0.9812606473594548, "step": 1440}, {"loss": 1.758, "grad_norm": 0.30816152691841125, "learning_rate": 0.0002, "epoch": 0.9880749574105622, "step": 1450}, {"loss": 1.8159, "grad_norm": 0.28152793645858765, "learning_rate": 0.0002, "epoch": 0.9948892674616695, "step": 1460}, {"eval_loss": 1.8034634590148926, "eval_runtime": 53.6017, "eval_samples_per_second": 9.459, "eval_steps_per_second": 1.194, "epoch": 0.9996592844974447, "step": 1467}, {"loss": 1.7072, "grad_norm": 0.29246416687965393, "learning_rate": 0.0002, "epoch": 1.0017035775127767, "step": 1470}, {"loss": 1.8159, "grad_norm": 0.2668602168560028, "learning_rate": 0.0002, "epoch": 1.008517887563884, "step": 1480}, {"loss": 1.6868, "grad_norm": 0.2998567819595337, "learning_rate": 0.0002, "epoch": 1.0153321976149914, "step": 1490}, {"loss": 1.7331, "grad_norm": 0.3284934461116791, "learning_rate": 0.0002, "epoch": 1.0221465076660987, "step": 1500}, {"loss": 1.689, "grad_norm": 0.3275827169418335, "learning_rate": 0.0002, "epoch": 1.028960817717206, "step": 1510}, {"loss": 1.7092, "grad_norm": 0.3382718563079834, "learning_rate": 0.0002, "epoch": 1.0357751277683134, "step": 1520}, {"loss": 1.7215, "grad_norm": 0.36737215518951416, "learning_rate": 0.0002, "epoch": 1.0425894378194207, "step": 1530}, {"loss": 1.6993, "grad_norm": 0.3442603647708893, "learning_rate": 0.0002, "epoch": 1.049403747870528, "step": 1540}, {"loss": 1.6996, "grad_norm": 0.3323381245136261, "learning_rate": 0.0002, "epoch": 1.0562180579216354, "step": 1550}, {"loss": 1.7444, "grad_norm": 0.341227650642395, "learning_rate": 0.0002, "epoch": 1.0630323679727427, "step": 1560}, {"loss": 1.7419, "grad_norm": 0.3361579477787018, "learning_rate": 0.0002, "epoch": 1.06984667802385, "step": 1570}, {"loss": 1.7337, "grad_norm": 0.3556230962276459, "learning_rate": 0.0002, "epoch": 1.0766609880749574, "step": 1580}, {"loss": 1.6978, "grad_norm": 0.27130424976348877, "learning_rate": 0.0002, "epoch": 1.0834752981260647, "step": 1590}, {"loss": 1.6087, "grad_norm": 0.29366323351860046, "learning_rate": 0.0002, "epoch": 1.090289608177172, "step": 1600}, {"loss": 1.6721, "grad_norm": 0.3581245541572571, "learning_rate": 0.0002, "epoch": 1.0971039182282794, "step": 1610}, {"loss": 1.7639, "grad_norm": 0.3021670877933502, "learning_rate": 0.0002, "epoch": 1.1039182282793867, "step": 1620}, {"loss": 1.7314, "grad_norm": 0.3145572543144226, "learning_rate": 0.0002, "epoch": 1.110732538330494, "step": 1630}, {"loss": 1.7408, "grad_norm": 0.35362154245376587, "learning_rate": 0.0002, "epoch": 1.1175468483816013, "step": 1640}, {"loss": 1.7071, "grad_norm": 0.5413113236427307, "learning_rate": 0.0002, "epoch": 1.1243611584327087, "step": 1650}, {"loss": 1.7615, "grad_norm": 0.6858654022216797, "learning_rate": 0.0002, "epoch": 1.131175468483816, "step": 1660}, {"loss": 1.7029, "grad_norm": 0.3781903386116028, "learning_rate": 0.0002, "epoch": 1.1379897785349233, "step": 1670}, {"loss": 1.659, "grad_norm": 0.404864102602005, "learning_rate": 0.0002, "epoch": 1.1448040885860307, "step": 1680}, {"loss": 1.7212, "grad_norm": 0.3595100939273834, "learning_rate": 0.0002, "epoch": 1.151618398637138, "step": 1690}, {"loss": 1.7023, "grad_norm": 0.33682283759117126, "learning_rate": 0.0002, "epoch": 1.1584327086882453, "step": 1700}, {"loss": 1.7336, "grad_norm": 0.3877373933792114, "learning_rate": 0.0002, "epoch": 1.1652470187393527, "step": 1710}, {"loss": 1.7676, "grad_norm": 0.34606897830963135, "learning_rate": 0.0002, "epoch": 1.17206132879046, "step": 1720}, {"loss": 1.6889, "grad_norm": 0.3122918903827667, "learning_rate": 0.0002, "epoch": 1.1788756388415673, "step": 1730}, {"loss": 1.6585, "grad_norm": 0.34081900119781494, "learning_rate": 0.0002, "epoch": 1.1856899488926746, "step": 1740}, {"loss": 1.7794, "grad_norm": 0.3418807089328766, "learning_rate": 0.0002, "epoch": 1.192504258943782, "step": 1750}, {"loss": 1.7773, "grad_norm": 0.3495500981807709, "learning_rate": 0.0002, "epoch": 1.1993185689948893, "step": 1760}, {"loss": 1.6578, "grad_norm": 0.557288408279419, "learning_rate": 0.0002, "epoch": 1.2061328790459966, "step": 1770}, {"loss": 1.7488, "grad_norm": 0.3193778693675995, "learning_rate": 0.0002, "epoch": 1.212947189097104, "step": 1780}, {"loss": 1.6815, "grad_norm": 0.3306216895580292, "learning_rate": 0.0002, "epoch": 1.2197614991482113, "step": 1790}, {"loss": 1.6772, "grad_norm": 0.37998732924461365, "learning_rate": 0.0002, "epoch": 1.2265758091993186, "step": 1800}, {"loss": 1.7094, "grad_norm": 0.3255669176578522, "learning_rate": 0.0002, "epoch": 1.233390119250426, "step": 1810}, {"loss": 1.7096, "grad_norm": 0.3741287291049957, "learning_rate": 0.0002, "epoch": 1.2402044293015333, "step": 1820}, {"loss": 1.7317, "grad_norm": 0.36727291345596313, "learning_rate": 0.0002, "epoch": 1.2470187393526406, "step": 1830}, {"loss": 1.7418, "grad_norm": 0.3479527235031128, "learning_rate": 0.0002, "epoch": 1.253833049403748, "step": 1840}, {"loss": 1.7062, "grad_norm": 0.3472636640071869, "learning_rate": 0.0002, "epoch": 1.2606473594548553, "step": 1850}, {"loss": 1.6263, "grad_norm": 0.3702869415283203, "learning_rate": 0.0002, "epoch": 1.2674616695059626, "step": 1860}, {"loss": 1.6222, "grad_norm": 0.3934040069580078, "learning_rate": 0.0002, "epoch": 1.27427597955707, "step": 1870}, {"loss": 1.7616, "grad_norm": 0.46887534856796265, "learning_rate": 0.0002, "epoch": 1.2810902896081773, "step": 1880}, {"loss": 1.6071, "grad_norm": 0.3191998600959778, "learning_rate": 0.0002, "epoch": 1.2879045996592846, "step": 1890}, {"loss": 1.7001, "grad_norm": 0.34032225608825684, "learning_rate": 0.0002, "epoch": 1.294718909710392, "step": 1900}, {"loss": 1.8316, "grad_norm": 0.33453696966171265, "learning_rate": 0.0002, "epoch": 1.3015332197614993, "step": 1910}, {"loss": 1.6567, "grad_norm": 0.3451494872570038, "learning_rate": 0.0002, "epoch": 1.3083475298126066, "step": 1920}, {"loss": 1.708, "grad_norm": 0.36203092336654663, "learning_rate": 0.0002, "epoch": 1.315161839863714, "step": 1930}, {"loss": 1.7095, "grad_norm": 0.43794456124305725, "learning_rate": 0.0002, "epoch": 1.321976149914821, "step": 1940}, {"loss": 1.7264, "grad_norm": 0.3630591034889221, "learning_rate": 0.0002, "epoch": 1.3287904599659284, "step": 1950}, {"loss": 1.6529, "grad_norm": 0.36951911449432373, "learning_rate": 0.0002, "epoch": 1.3356047700170357, "step": 1960}, {"loss": 1.651, "grad_norm": 0.4001159965991974, "learning_rate": 0.0002, "epoch": 1.342419080068143, "step": 1970}, {"loss": 1.677, "grad_norm": 0.3820836544036865, "learning_rate": 0.0002, "epoch": 1.3492333901192504, "step": 1980}, {"loss": 1.583, "grad_norm": 0.3705870807170868, "learning_rate": 0.0002, "epoch": 1.3560477001703577, "step": 1990}, {"loss": 1.6207, "grad_norm": 0.3557972013950348, "learning_rate": 0.0002, "epoch": 1.362862010221465, "step": 2000}, {"loss": 1.7656, "grad_norm": 0.38546398282051086, "learning_rate": 0.0002, "epoch": 1.3696763202725724, "step": 2010}, {"loss": 1.6881, "grad_norm": 0.3908020853996277, "learning_rate": 0.0002, "epoch": 1.3764906303236797, "step": 2020}, {"loss": 1.727, "grad_norm": 0.3822040855884552, "learning_rate": 0.0002, "epoch": 1.383304940374787, "step": 2030}, {"loss": 1.7923, "grad_norm": 0.425327867269516, "learning_rate": 0.0002, "epoch": 1.3901192504258943, "step": 2040}, {"loss": 1.7032, "grad_norm": 0.3436259329319, "learning_rate": 0.0002, "epoch": 1.3969335604770017, "step": 2050}, {"loss": 1.6398, "grad_norm": 0.33124062418937683, "learning_rate": 0.0002, "epoch": 1.403747870528109, "step": 2060}, {"loss": 1.6815, "grad_norm": 0.3662424683570862, "learning_rate": 0.0002, "epoch": 1.4105621805792163, "step": 2070}, {"loss": 1.7258, "grad_norm": 0.3720635175704956, "learning_rate": 0.0002, "epoch": 1.4173764906303237, "step": 2080}, {"loss": 1.7186, "grad_norm": 0.3361680805683136, "learning_rate": 0.0002, "epoch": 1.424190800681431, "step": 2090}, {"loss": 1.7606, "grad_norm": 0.32751724123954773, "learning_rate": 0.0002, "epoch": 1.4310051107325383, "step": 2100}, {"loss": 1.7051, "grad_norm": 0.34333378076553345, "learning_rate": 0.0002, "epoch": 1.4378194207836457, "step": 2110}, {"loss": 1.6979, "grad_norm": 0.37777671217918396, "learning_rate": 0.0002, "epoch": 1.444633730834753, "step": 2120}, {"loss": 1.7085, "grad_norm": 0.37126365303993225, "learning_rate": 0.0002, "epoch": 1.4514480408858603, "step": 2130}, {"loss": 1.721, "grad_norm": 0.3602267801761627, "learning_rate": 0.0002, "epoch": 1.4582623509369677, "step": 2140}, {"loss": 1.8148, "grad_norm": 0.3287110924720764, "learning_rate": 0.0002, "epoch": 1.465076660988075, "step": 2150}, {"loss": 1.6966, "grad_norm": 0.3562135100364685, "learning_rate": 0.0002, "epoch": 1.4718909710391823, "step": 2160}, {"loss": 1.713, "grad_norm": 0.38292962312698364, "learning_rate": 0.0002, "epoch": 1.4787052810902896, "step": 2170}, {"loss": 1.7036, "grad_norm": 0.38220319151878357, "learning_rate": 0.0002, "epoch": 1.485519591141397, "step": 2180}, {"loss": 1.7297, "grad_norm": 0.3570062220096588, "learning_rate": 0.0002, "epoch": 1.4923339011925043, "step": 2190}, {"loss": 1.5652, "grad_norm": 0.363146036863327, "learning_rate": 0.0002, "epoch": 1.4991482112436116, "step": 2200}, {"loss": 1.6716, "grad_norm": 0.37393274903297424, "learning_rate": 0.0002, "epoch": 1.5059625212947187, "step": 2210}, {"loss": 1.6839, "grad_norm": 0.3628501892089844, "learning_rate": 0.0002, "epoch": 1.512776831345826, "step": 2220}, {"loss": 1.6627, "grad_norm": 0.33430740237236023, "learning_rate": 0.0002, "epoch": 1.5195911413969334, "step": 2230}, {"loss": 1.7495, "grad_norm": 0.35363978147506714, "learning_rate": 0.0002, "epoch": 1.5264054514480407, "step": 2240}, {"loss": 1.6364, "grad_norm": 0.37220337986946106, "learning_rate": 0.0002, "epoch": 1.533219761499148, "step": 2250}, {"loss": 1.7773, "grad_norm": 0.35020262002944946, "learning_rate": 0.0002, "epoch": 1.5400340715502554, "step": 2260}, {"loss": 1.7453, "grad_norm": 0.35274937748908997, "learning_rate": 0.0002, "epoch": 1.5468483816013627, "step": 2270}, {"loss": 1.7162, "grad_norm": 0.3797738552093506, "learning_rate": 0.0002, "epoch": 1.55366269165247, "step": 2280}, {"loss": 1.6197, "grad_norm": 0.4160412847995758, "learning_rate": 0.0002, "epoch": 1.5604770017035774, "step": 2290}, {"loss": 1.7101, "grad_norm": 0.38779592514038086, "learning_rate": 0.0002, "epoch": 1.5672913117546847, "step": 2300}, {"loss": 1.7254, "grad_norm": 0.39171287417411804, "learning_rate": 0.0002, "epoch": 1.574105621805792, "step": 2310}, {"loss": 1.7087, "grad_norm": 0.3606826663017273, "learning_rate": 0.0002, "epoch": 1.5809199318568994, "step": 2320}, {"loss": 1.7269, "grad_norm": 0.3745017647743225, "learning_rate": 0.0002, "epoch": 1.5877342419080067, "step": 2330}, {"loss": 1.7068, "grad_norm": 0.34933462738990784, "learning_rate": 0.0002, "epoch": 1.594548551959114, "step": 2340}, {"loss": 1.7073, "grad_norm": 0.37268444895744324, "learning_rate": 0.0002, "epoch": 1.6013628620102214, "step": 2350}, {"loss": 1.728, "grad_norm": 0.4603484869003296, "learning_rate": 0.0002, "epoch": 1.6081771720613287, "step": 2360}, {"loss": 1.7621, "grad_norm": 0.35689088702201843, "learning_rate": 0.0002, "epoch": 1.614991482112436, "step": 2370}, {"loss": 1.6989, "grad_norm": 0.3392031490802765, "learning_rate": 0.0002, "epoch": 1.6218057921635434, "step": 2380}, {"loss": 1.7077, "grad_norm": 0.394653856754303, "learning_rate": 0.0002, "epoch": 1.6286201022146507, "step": 2390}, {"loss": 1.7448, "grad_norm": 0.33972012996673584, "learning_rate": 0.0002, "epoch": 1.635434412265758, "step": 2400}, {"loss": 1.7681, "grad_norm": 0.3854375183582306, "learning_rate": 0.0002, "epoch": 1.6422487223168654, "step": 2410}, {"loss": 1.7102, "grad_norm": 0.36143961548805237, "learning_rate": 0.0002, "epoch": 1.6490630323679727, "step": 2420}, {"loss": 1.7432, "grad_norm": 0.35816189646720886, "learning_rate": 0.0002, "epoch": 1.65587734241908, "step": 2430}, {"loss": 1.6407, "grad_norm": 0.36298736929893494, "learning_rate": 0.0002, "epoch": 1.6626916524701874, "step": 2440}, {"loss": 1.723, "grad_norm": 0.36756929755210876, "learning_rate": 0.0002, "epoch": 1.6695059625212947, "step": 2450}, {"loss": 1.6824, "grad_norm": 0.35969603061676025, "learning_rate": 0.0002, "epoch": 1.676320272572402, "step": 2460}, {"loss": 1.7014, "grad_norm": 0.38449376821517944, "learning_rate": 0.0002, "epoch": 1.6831345826235093, "step": 2470}, {"loss": 1.7261, "grad_norm": 0.44511452317237854, "learning_rate": 0.0002, "epoch": 1.6899488926746167, "step": 2480}, {"loss": 1.7397, "grad_norm": 0.3831416368484497, "learning_rate": 0.0002, "epoch": 1.696763202725724, "step": 2490}, {"loss": 1.7046, "grad_norm": 0.3795325756072998, "learning_rate": 0.0002, "epoch": 1.7035775127768313, "step": 2500}, {"loss": 1.6993, "grad_norm": 0.34978193044662476, "learning_rate": 0.0002, "epoch": 1.7103918228279387, "step": 2510}, {"loss": 1.8307, "grad_norm": 0.35923877358436584, "learning_rate": 0.0002, "epoch": 1.717206132879046, "step": 2520}, {"loss": 1.6253, "grad_norm": 0.352999746799469, "learning_rate": 0.0002, "epoch": 1.7240204429301533, "step": 2530}, {"loss": 1.6953, "grad_norm": 0.43673479557037354, "learning_rate": 0.0002, "epoch": 1.7308347529812607, "step": 2540}, {"loss": 1.7079, "grad_norm": 0.4153687357902527, "learning_rate": 0.0002, "epoch": 1.737649063032368, "step": 2550}, {"loss": 1.6714, "grad_norm": 0.35541167855262756, "learning_rate": 0.0002, "epoch": 1.7444633730834753, "step": 2560}, {"loss": 1.696, "grad_norm": 0.3288775086402893, "learning_rate": 0.0002, "epoch": 1.7512776831345827, "step": 2570}, {"loss": 1.7486, "grad_norm": 0.3991123139858246, "learning_rate": 0.0002, "epoch": 1.75809199318569, "step": 2580}, {"loss": 1.645, "grad_norm": 0.39967241883277893, "learning_rate": 0.0002, "epoch": 1.7649063032367973, "step": 2590}, {"loss": 1.6855, "grad_norm": 0.41104283928871155, "learning_rate": 0.0002, "epoch": 1.7717206132879046, "step": 2600}, {"loss": 1.6993, "grad_norm": 0.44885286688804626, "learning_rate": 0.0002, "epoch": 1.778534923339012, "step": 2610}, {"loss": 1.7224, "grad_norm": 0.38384467363357544, "learning_rate": 0.0002, "epoch": 1.7853492333901193, "step": 2620}, {"loss": 1.7213, "grad_norm": 0.35020917654037476, "learning_rate": 0.0002, "epoch": 1.7921635434412266, "step": 2630}, {"loss": 1.6706, "grad_norm": 0.3360341489315033, "learning_rate": 0.0002, "epoch": 1.798977853492334, "step": 2640}, {"loss": 1.7037, "grad_norm": 0.38875144720077515, "learning_rate": 0.0002, "epoch": 1.8057921635434413, "step": 2650}, {"loss": 1.693, "grad_norm": 0.34876883029937744, "learning_rate": 0.0002, "epoch": 1.8126064735945486, "step": 2660}, {"loss": 1.7743, "grad_norm": 0.419979989528656, "learning_rate": 0.0002, "epoch": 1.819420783645656, "step": 2670}, {"loss": 1.7089, "grad_norm": 0.3648919463157654, "learning_rate": 0.0002, "epoch": 1.8262350936967633, "step": 2680}, {"loss": 1.7828, "grad_norm": 0.3485383987426758, "learning_rate": 0.0002, "epoch": 1.8330494037478706, "step": 2690}, {"loss": 1.705, "grad_norm": 0.3647468686103821, "learning_rate": 0.0002, "epoch": 1.839863713798978, "step": 2700}, {"loss": 1.7318, "grad_norm": 0.37003210186958313, "learning_rate": 0.0002, "epoch": 1.8466780238500853, "step": 2710}, {"loss": 1.6647, "grad_norm": 0.37031617760658264, "learning_rate": 0.0002, "epoch": 1.8534923339011926, "step": 2720}, {"loss": 1.69, "grad_norm": 0.3438796103000641, "learning_rate": 0.0002, "epoch": 1.8603066439523, "step": 2730}, {"loss": 1.5995, "grad_norm": 0.41574627161026, "learning_rate": 0.0002, "epoch": 1.8671209540034073, "step": 2740}, {"loss": 1.6877, "grad_norm": 0.35049930214881897, "learning_rate": 0.0002, "epoch": 1.8739352640545146, "step": 2750}, {"loss": 1.7048, "grad_norm": 0.3943989872932434, "learning_rate": 0.0002, "epoch": 1.880749574105622, "step": 2760}, {"loss": 1.7047, "grad_norm": 0.3384978175163269, "learning_rate": 0.0002, "epoch": 1.8875638841567293, "step": 2770}, {"loss": 1.7848, "grad_norm": 0.3501328229904175, "learning_rate": 0.0002, "epoch": 1.8943781942078366, "step": 2780}, {"loss": 1.6986, "grad_norm": 0.37484532594680786, "learning_rate": 0.0002, "epoch": 1.901192504258944, "step": 2790}, {"loss": 1.6867, "grad_norm": 0.34497788548469543, "learning_rate": 0.0002, "epoch": 1.9080068143100513, "step": 2800}, {"loss": 1.6513, "grad_norm": 0.3530851900577545, "learning_rate": 0.0002, "epoch": 1.9148211243611586, "step": 2810}, {"loss": 1.6369, "grad_norm": 0.3879254162311554, "learning_rate": 0.0002, "epoch": 1.921635434412266, "step": 2820}, {"loss": 1.6786, "grad_norm": 0.3885590136051178, "learning_rate": 0.0002, "epoch": 1.9284497444633732, "step": 2830}, {"loss": 1.8049, "grad_norm": 0.3868715465068817, "learning_rate": 0.0002, "epoch": 1.9352640545144804, "step": 2840}, {"loss": 1.7099, "grad_norm": 0.4152422249317169, "learning_rate": 0.0002, "epoch": 1.9420783645655877, "step": 2850}, {"loss": 1.6696, "grad_norm": 0.401714563369751, "learning_rate": 0.0002, "epoch": 1.948892674616695, "step": 2860}, {"loss": 1.7182, "grad_norm": 0.34825265407562256, "learning_rate": 0.0002, "epoch": 1.9557069846678024, "step": 2870}, {"loss": 1.6612, "grad_norm": 0.3620675504207611, "learning_rate": 0.0002, "epoch": 1.9625212947189097, "step": 2880}, {"loss": 1.7451, "grad_norm": 0.3977806866168976, "learning_rate": 0.0002, "epoch": 1.969335604770017, "step": 2890}, {"loss": 1.6514, "grad_norm": 0.3687497079372406, "learning_rate": 0.0002, "epoch": 1.9761499148211243, "step": 2900}, {"loss": 1.7712, "grad_norm": 0.408640056848526, "learning_rate": 0.0002, "epoch": 1.9829642248722317, "step": 2910}, {"loss": 1.695, "grad_norm": 0.34510108828544617, "learning_rate": 0.0002, "epoch": 1.989778534923339, "step": 2920}, {"loss": 1.662, "grad_norm": 0.3596334755420685, "learning_rate": 0.0002, "epoch": 1.9965928449744463, "step": 2930}, {"eval_loss": 1.8056600093841553, "eval_runtime": 60.7049, "eval_samples_per_second": 8.352, "eval_steps_per_second": 1.054, "epoch": 2.0, "step": 2935}, {"loss": 1.6918, "grad_norm": 0.3460802137851715, "learning_rate": 0.0002, "epoch": 2.0034071550255534, "step": 2940}, {"loss": 1.6036, "grad_norm": 0.4038620591163635, "learning_rate": 0.0002, "epoch": 2.0102214650766608, "step": 2950}, {"loss": 1.5386, "grad_norm": 0.3950219750404358, "learning_rate": 0.0002, "epoch": 2.017035775127768, "step": 2960}, {"loss": 1.5649, "grad_norm": 0.519116997718811, "learning_rate": 0.0002, "epoch": 2.0238500851788754, "step": 2970}, {"loss": 1.5397, "grad_norm": 0.4097684919834137, "learning_rate": 0.0002, "epoch": 2.0306643952299828, "step": 2980}, {"loss": 1.6044, "grad_norm": 0.4153544306755066, "learning_rate": 0.0002, "epoch": 2.03747870528109, "step": 2990}, {"loss": 1.6006, "grad_norm": 0.4351160526275635, "learning_rate": 0.0002, "epoch": 2.0442930153321974, "step": 3000}, {"loss": 1.5091, "grad_norm": 0.42036259174346924, "learning_rate": 0.0002, "epoch": 2.0511073253833048, "step": 3010}, {"loss": 1.5686, "grad_norm": 0.4433218836784363, "learning_rate": 0.0002, "epoch": 2.057921635434412, "step": 3020}, {"loss": 1.5478, "grad_norm": 0.46511581540107727, "learning_rate": 0.0002, "epoch": 2.0647359454855194, "step": 3030}, {"loss": 1.5554, "grad_norm": 0.4567560851573944, "learning_rate": 0.0002, "epoch": 2.0715502555366268, "step": 3040}, {"loss": 1.5561, "grad_norm": 0.45671048760414124, "learning_rate": 0.0002, "epoch": 2.078364565587734, "step": 3050}, {"loss": 1.606, "grad_norm": 0.4598552882671356, "learning_rate": 0.0002, "epoch": 2.0851788756388414, "step": 3060}, {"loss": 1.6357, "grad_norm": 0.4582861661911011, "learning_rate": 0.0002, "epoch": 2.0919931856899487, "step": 3070}, {"loss": 1.5853, "grad_norm": 0.4366969168186188, "learning_rate": 0.0002, "epoch": 2.098807495741056, "step": 3080}, {"loss": 1.5574, "grad_norm": 0.495917409658432, "learning_rate": 0.0002, "epoch": 2.1056218057921634, "step": 3090}, {"loss": 1.5798, "grad_norm": 1.6846044063568115, "learning_rate": 0.0002, "epoch": 2.1124361158432707, "step": 3100}, {"loss": 1.5877, "grad_norm": 0.4765092134475708, "learning_rate": 0.0002, "epoch": 2.119250425894378, "step": 3110}, {"loss": 1.6106, "grad_norm": 0.45029810070991516, "learning_rate": 0.0002, "epoch": 2.1260647359454854, "step": 3120}, {"loss": 1.5707, "grad_norm": 0.5706973075866699, "learning_rate": 0.0002, "epoch": 2.1328790459965927, "step": 3130}, {"loss": 1.589, "grad_norm": 0.4606274366378784, "learning_rate": 0.0002, "epoch": 2.1396933560477, "step": 3140}, {"loss": 1.564, "grad_norm": 2.199115753173828, "learning_rate": 0.0002, "epoch": 2.1465076660988074, "step": 3150}, {"loss": 1.5808, "grad_norm": 0.6231027245521545, "learning_rate": 0.0002, "epoch": 2.1533219761499147, "step": 3160}, {"loss": 1.5397, "grad_norm": 0.46918219327926636, "learning_rate": 0.0002, "epoch": 2.160136286201022, "step": 3170}, {"loss": 1.6082, "grad_norm": 0.5006393194198608, "learning_rate": 0.0002, "epoch": 2.1669505962521294, "step": 3180}, {"loss": 1.591, "grad_norm": 0.4745093286037445, "learning_rate": 0.0002, "epoch": 2.1737649063032367, "step": 3190}, {"loss": 1.6459, "grad_norm": 0.511576771736145, "learning_rate": 0.0002, "epoch": 2.180579216354344, "step": 3200}, {"loss": 1.6019, "grad_norm": 0.38622918725013733, "learning_rate": 0.0002, "epoch": 2.1873935264054514, "step": 3210}, {"loss": 1.6187, "grad_norm": 0.4425644278526306, "learning_rate": 0.0002, "epoch": 2.1942078364565587, "step": 3220}, {"loss": 1.6114, "grad_norm": 0.45643091201782227, "learning_rate": 0.0002, "epoch": 2.201022146507666, "step": 3230}, {"loss": 1.6224, "grad_norm": 0.4975406229496002, "learning_rate": 0.0002, "epoch": 2.2078364565587734, "step": 3240}, {"loss": 1.6654, "grad_norm": 0.4673331081867218, "learning_rate": 0.0002, "epoch": 2.2146507666098807, "step": 3250}, {"loss": 1.6155, "grad_norm": 0.5081731081008911, "learning_rate": 0.0002, "epoch": 2.221465076660988, "step": 3260}, {"loss": 1.53, "grad_norm": 0.4790806770324707, "learning_rate": 0.0002, "epoch": 2.2282793867120954, "step": 3270}, {"loss": 1.6362, "grad_norm": 0.5184140801429749, "learning_rate": 0.0002, "epoch": 2.2350936967632027, "step": 3280}, {"loss": 1.5646, "grad_norm": 0.5159083604812622, "learning_rate": 0.0002, "epoch": 2.24190800681431, "step": 3290}, {"loss": 1.6577, "grad_norm": 0.4876042604446411, "learning_rate": 0.0002, "epoch": 2.2487223168654173, "step": 3300}, {"loss": 1.5718, "grad_norm": 0.5454957485198975, "learning_rate": 0.0002, "epoch": 2.2555366269165247, "step": 3310}, {"loss": 1.5885, "grad_norm": 0.49866822361946106, "learning_rate": 0.0002, "epoch": 2.262350936967632, "step": 3320}, {"loss": 1.5923, "grad_norm": 0.42674365639686584, "learning_rate": 0.0002, "epoch": 2.2691652470187393, "step": 3330}, {"loss": 1.5588, "grad_norm": 0.5202316641807556, "learning_rate": 0.0002, "epoch": 2.2759795570698467, "step": 3340}, {"loss": 1.6032, "grad_norm": 0.4849465489387512, "learning_rate": 0.0002, "epoch": 2.282793867120954, "step": 3350}, {"loss": 1.6853, "grad_norm": 0.47202569246292114, "learning_rate": 0.0002, "epoch": 2.2896081771720613, "step": 3360}, {"loss": 1.6164, "grad_norm": 0.5311620235443115, "learning_rate": 0.0002, "epoch": 2.2964224872231687, "step": 3370}, {"loss": 1.6004, "grad_norm": 0.49011409282684326, "learning_rate": 0.0002, "epoch": 2.303236797274276, "step": 3380}, {"loss": 1.5484, "grad_norm": 0.4789247512817383, "learning_rate": 0.0002, "epoch": 2.3100511073253833, "step": 3390}, {"loss": 1.5866, "grad_norm": 0.46646103262901306, "learning_rate": 0.0002, "epoch": 2.3168654173764907, "step": 3400}, {"loss": 1.5308, "grad_norm": 0.5552441477775574, "learning_rate": 0.0002, "epoch": 2.323679727427598, "step": 3410}, {"loss": 1.5761, "grad_norm": 0.4530351758003235, "learning_rate": 0.0002, "epoch": 2.3304940374787053, "step": 3420}, {"loss": 1.5919, "grad_norm": 0.4806232750415802, "learning_rate": 0.0002, "epoch": 2.3373083475298126, "step": 3430}, {"loss": 1.5569, "grad_norm": 0.5998363494873047, "learning_rate": 0.0002, "epoch": 2.34412265758092, "step": 3440}, {"loss": 1.513, "grad_norm": 0.4918554425239563, "learning_rate": 0.0002, "epoch": 2.3509369676320273, "step": 3450}, {"loss": 1.6323, "grad_norm": 0.5359559655189514, "learning_rate": 0.0002, "epoch": 2.3577512776831346, "step": 3460}, {"loss": 1.5973, "grad_norm": 0.5053277611732483, "learning_rate": 0.0002, "epoch": 2.364565587734242, "step": 3470}, {"loss": 1.5673, "grad_norm": 0.5058915019035339, "learning_rate": 0.0002, "epoch": 2.3713798977853493, "step": 3480}, {"loss": 1.5434, "grad_norm": 0.5314899682998657, "learning_rate": 0.0002, "epoch": 2.3781942078364566, "step": 3490}, {"loss": 1.5203, "grad_norm": 0.48035913705825806, "learning_rate": 0.0002, "epoch": 2.385008517887564, "step": 3500}, {"loss": 1.5936, "grad_norm": 0.45864903926849365, "learning_rate": 0.0002, "epoch": 2.3918228279386713, "step": 3510}, {"loss": 1.6285, "grad_norm": 0.4553050398826599, "learning_rate": 0.0002, "epoch": 2.3986371379897786, "step": 3520}, {"loss": 1.5444, "grad_norm": 0.4483442008495331, "learning_rate": 0.0002, "epoch": 2.405451448040886, "step": 3530}, {"loss": 1.587, "grad_norm": 0.5043742060661316, "learning_rate": 0.0002, "epoch": 2.4122657580919933, "step": 3540}, {"loss": 1.5692, "grad_norm": 0.44807168841362, "learning_rate": 0.0002, "epoch": 2.4190800681431006, "step": 3550}, {"loss": 1.6306, "grad_norm": 0.5065137147903442, "learning_rate": 0.0002, "epoch": 2.425894378194208, "step": 3560}, {"loss": 1.5842, "grad_norm": 0.5186443328857422, "learning_rate": 0.0002, "epoch": 2.4327086882453153, "step": 3570}, {"loss": 1.5956, "grad_norm": 0.49743232131004333, "learning_rate": 0.0002, "epoch": 2.4395229982964226, "step": 3580}, {"loss": 1.6021, "grad_norm": 0.524450421333313, "learning_rate": 0.0002, "epoch": 2.44633730834753, "step": 3590}, {"loss": 1.6283, "grad_norm": 0.5053797364234924, "learning_rate": 0.0002, "epoch": 2.4531516183986373, "step": 3600}, {"loss": 1.6335, "grad_norm": 0.5223091840744019, "learning_rate": 0.0002, "epoch": 2.4599659284497446, "step": 3610}, {"loss": 1.6315, "grad_norm": 0.4763810932636261, "learning_rate": 0.0002, "epoch": 2.466780238500852, "step": 3620}, {"loss": 1.5623, "grad_norm": 0.5097282528877258, "learning_rate": 0.0002, "epoch": 2.4735945485519593, "step": 3630}, {"loss": 1.605, "grad_norm": 0.5831942558288574, "learning_rate": 0.0002, "epoch": 2.4804088586030666, "step": 3640}, {"loss": 1.6074, "grad_norm": 0.47573572397232056, "learning_rate": 0.0002, "epoch": 2.487223168654174, "step": 3650}, {"loss": 1.6411, "grad_norm": 0.49602726101875305, "learning_rate": 0.0002, "epoch": 2.4940374787052813, "step": 3660}, {"loss": 1.571, "grad_norm": 0.5468524694442749, "learning_rate": 0.0002, "epoch": 2.500851788756388, "step": 3670}, {"loss": 1.5782, "grad_norm": 0.45899373292922974, "learning_rate": 0.0002, "epoch": 2.507666098807496, "step": 3680}, {"loss": 1.5114, "grad_norm": 0.5031567215919495, "learning_rate": 0.0002, "epoch": 2.514480408858603, "step": 3690}, {"loss": 1.538, "grad_norm": 0.5224900841712952, "learning_rate": 0.0002, "epoch": 2.5212947189097106, "step": 3700}, {"loss": 1.5269, "grad_norm": 0.504769504070282, "learning_rate": 0.0002, "epoch": 2.5281090289608175, "step": 3710}, {"loss": 1.5141, "grad_norm": 0.6120529770851135, "learning_rate": 0.0002, "epoch": 2.5349233390119252, "step": 3720}, {"loss": 1.5666, "grad_norm": 0.47930678725242615, "learning_rate": 0.0002, "epoch": 2.541737649063032, "step": 3730}, {"loss": 1.6156, "grad_norm": 0.5039092302322388, "learning_rate": 0.0002, "epoch": 2.54855195911414, "step": 3740}, {"loss": 1.5585, "grad_norm": 0.49758994579315186, "learning_rate": 0.0002, "epoch": 2.555366269165247, "step": 3750}, {"loss": 1.5351, "grad_norm": 0.44739171862602234, "learning_rate": 0.0002, "epoch": 2.5621805792163546, "step": 3760}, {"loss": 1.5099, "grad_norm": 0.47056373953819275, "learning_rate": 0.0002, "epoch": 2.5689948892674614, "step": 3770}, {"loss": 1.5524, "grad_norm": 0.5077595114707947, "learning_rate": 0.0002, "epoch": 2.575809199318569, "step": 3780}, {"loss": 1.5524, "grad_norm": 0.4981902837753296, "learning_rate": 0.0002, "epoch": 2.582623509369676, "step": 3790}, {"loss": 1.5381, "grad_norm": 0.5736238360404968, "learning_rate": 0.0002, "epoch": 2.589437819420784, "step": 3800}, {"loss": 1.67, "grad_norm": 0.4898384213447571, "learning_rate": 0.0002, "epoch": 2.5962521294718908, "step": 3810}, {"loss": 1.5411, "grad_norm": 0.4611325263977051, "learning_rate": 0.0002, "epoch": 2.6030664395229985, "step": 3820}, {"loss": 1.5662, "grad_norm": 0.5285341739654541, "learning_rate": 0.0002, "epoch": 2.6098807495741054, "step": 3830}, {"loss": 1.5875, "grad_norm": 0.5679430961608887, "learning_rate": 0.0002, "epoch": 2.616695059625213, "step": 3840}, {"loss": 1.5544, "grad_norm": 0.48532548546791077, "learning_rate": 0.0002, "epoch": 2.62350936967632, "step": 3850}, {"loss": 1.579, "grad_norm": 0.45506492257118225, "learning_rate": 0.0002, "epoch": 2.630323679727428, "step": 3860}, {"loss": 1.5775, "grad_norm": 0.6552556753158569, "learning_rate": 0.0002, "epoch": 2.6371379897785348, "step": 3870}, {"loss": 1.608, "grad_norm": 0.537874698638916, "learning_rate": 0.0002, "epoch": 2.643952299829642, "step": 3880}, {"loss": 1.5653, "grad_norm": 0.46102389693260193, "learning_rate": 0.0002, "epoch": 2.6507666098807494, "step": 3890}, {"loss": 1.5293, "grad_norm": 0.45531195402145386, "learning_rate": 0.0002, "epoch": 2.6575809199318567, "step": 3900}, {"loss": 1.5492, "grad_norm": 0.5327293872833252, "learning_rate": 0.0002, "epoch": 2.664395229982964, "step": 3910}, {"loss": 1.5921, "grad_norm": 0.4968956410884857, "learning_rate": 0.0002, "epoch": 2.6712095400340714, "step": 3920}, {"loss": 1.5823, "grad_norm": 0.4790082275867462, "learning_rate": 0.0002, "epoch": 2.6780238500851787, "step": 3930}, {"loss": 1.615, "grad_norm": 0.5392967462539673, "learning_rate": 0.0002, "epoch": 2.684838160136286, "step": 3940}, {"loss": 1.6218, "grad_norm": 0.5076649785041809, "learning_rate": 0.0002, "epoch": 2.6916524701873934, "step": 3950}, {"loss": 1.6478, "grad_norm": 0.5628064274787903, "learning_rate": 0.0002, "epoch": 2.6984667802385007, "step": 3960}, {"loss": 1.5417, "grad_norm": 0.5012659430503845, "learning_rate": 0.0002, "epoch": 2.705281090289608, "step": 3970}, {"loss": 1.5339, "grad_norm": 0.4947647452354431, "learning_rate": 0.0002, "epoch": 2.7120954003407154, "step": 3980}, {"loss": 1.5724, "grad_norm": 0.4890969693660736, "learning_rate": 0.0002, "epoch": 2.7189097103918227, "step": 3990}, {"loss": 1.5746, "grad_norm": 0.4471694231033325, "learning_rate": 0.0002, "epoch": 2.72572402044293, "step": 4000}, {"loss": 1.6669, "grad_norm": 0.5116439461708069, "learning_rate": 0.0002, "epoch": 2.7325383304940374, "step": 4010}, {"loss": 1.584, "grad_norm": 0.5720411539077759, "learning_rate": 0.0002, "epoch": 2.7393526405451447, "step": 4020}, {"loss": 1.6151, "grad_norm": 0.5529406070709229, "learning_rate": 0.0002, "epoch": 2.746166950596252, "step": 4030}, {"loss": 1.6296, "grad_norm": 0.5229396820068359, "learning_rate": 0.0002, "epoch": 2.7529812606473594, "step": 4040}, {"loss": 1.5363, "grad_norm": 0.5270276069641113, "learning_rate": 0.0002, "epoch": 2.7597955706984667, "step": 4050}, {"loss": 1.6305, "grad_norm": 0.48413026332855225, "learning_rate": 0.0002, "epoch": 2.766609880749574, "step": 4060}, {"loss": 1.5488, "grad_norm": 0.5145403742790222, "learning_rate": 0.0002, "epoch": 2.7734241908006814, "step": 4070}, {"loss": 1.6023, "grad_norm": 0.48626071214675903, "learning_rate": 0.0002, "epoch": 2.7802385008517887, "step": 4080}, {"loss": 1.6082, "grad_norm": 0.5018984079360962, "learning_rate": 0.0002, "epoch": 2.787052810902896, "step": 4090}, {"loss": 1.7166, "grad_norm": 0.4881938695907593, "learning_rate": 0.0002, "epoch": 2.7938671209540034, "step": 4100}, {"loss": 1.515, "grad_norm": 0.5151546001434326, "learning_rate": 0.0002, "epoch": 2.8006814310051107, "step": 4110}, {"loss": 1.6069, "grad_norm": 0.5109850764274597, "learning_rate": 0.0002, "epoch": 2.807495741056218, "step": 4120}, {"loss": 1.6153, "grad_norm": 0.5109251141548157, "learning_rate": 0.0002, "epoch": 2.8143100511073254, "step": 4130}, {"loss": 1.6365, "grad_norm": 0.5025496482849121, "learning_rate": 0.0002, "epoch": 2.8211243611584327, "step": 4140}, {"loss": 1.6292, "grad_norm": 0.49027004837989807, "learning_rate": 0.0002, "epoch": 2.82793867120954, "step": 4150}, {"loss": 1.5591, "grad_norm": 0.4957362413406372, "learning_rate": 0.0002, "epoch": 2.8347529812606473, "step": 4160}, {"loss": 1.6759, "grad_norm": 0.5159927606582642, "learning_rate": 0.0002, "epoch": 2.8415672913117547, "step": 4170}, {"loss": 1.577, "grad_norm": 0.6040670871734619, "learning_rate": 0.0002, "epoch": 2.848381601362862, "step": 4180}, {"loss": 1.5295, "grad_norm": 0.5489953756332397, "learning_rate": 0.0002, "epoch": 2.8551959114139693, "step": 4190}, {"loss": 1.5909, "grad_norm": 0.5416634678840637, "learning_rate": 0.0002, "epoch": 2.8620102214650767, "step": 4200}, {"loss": 1.6014, "grad_norm": 0.5278245210647583, "learning_rate": 0.0002, "epoch": 2.868824531516184, "step": 4210}, {"loss": 1.4848, "grad_norm": 0.43382319808006287, "learning_rate": 0.0002, "epoch": 2.8756388415672913, "step": 4220}, {"loss": 1.6092, "grad_norm": 0.4724387228488922, "learning_rate": 0.0002, "epoch": 2.8824531516183987, "step": 4230}, {"loss": 1.5748, "grad_norm": 0.49824780225753784, "learning_rate": 0.0002, "epoch": 2.889267461669506, "step": 4240}, {"loss": 1.6163, "grad_norm": 0.5360262989997864, "learning_rate": 0.0002, "epoch": 2.8960817717206133, "step": 4250}, {"loss": 1.5934, "grad_norm": 0.49090322852134705, "learning_rate": 0.0002, "epoch": 2.9028960817717206, "step": 4260}, {"loss": 1.624, "grad_norm": 0.5613328218460083, "learning_rate": 0.0002, "epoch": 2.909710391822828, "step": 4270}, {"loss": 1.5713, "grad_norm": 0.4611356258392334, "learning_rate": 0.0002, "epoch": 2.9165247018739353, "step": 4280}, {"loss": 1.5457, "grad_norm": 0.550897479057312, "learning_rate": 0.0002, "epoch": 2.9233390119250426, "step": 4290}, {"loss": 1.6225, "grad_norm": 0.5089612603187561, "learning_rate": 0.0002, "epoch": 2.93015332197615, "step": 4300}, {"loss": 1.5897, "grad_norm": 0.5210904479026794, "learning_rate": 0.0002, "epoch": 2.9369676320272573, "step": 4310}, {"loss": 1.6198, "grad_norm": 0.5506424903869629, "learning_rate": 0.0002, "epoch": 2.9437819420783646, "step": 4320}, {"loss": 1.6395, "grad_norm": 0.5118561387062073, "learning_rate": 0.0002, "epoch": 2.950596252129472, "step": 4330}, {"loss": 1.704, "grad_norm": 0.5034464597702026, "learning_rate": 0.0002, "epoch": 2.9574105621805793, "step": 4340}, {"loss": 1.6314, "grad_norm": 0.5019990801811218, "learning_rate": 0.0002, "epoch": 2.9642248722316866, "step": 4350}, {"loss": 1.6161, "grad_norm": 0.5423325300216675, "learning_rate": 0.0002, "epoch": 2.971039182282794, "step": 4360}, {"loss": 1.6144, "grad_norm": 0.5287469625473022, "learning_rate": 0.0002, "epoch": 2.9778534923339013, "step": 4370}, {"loss": 1.5227, "grad_norm": 0.5206913352012634, "learning_rate": 0.0002, "epoch": 2.9846678023850086, "step": 4380}, {"loss": 1.6026, "grad_norm": 0.5407394170761108, "learning_rate": 0.0002, "epoch": 2.991482112436116, "step": 4390}, {"loss": 1.5908, "grad_norm": 0.5244600176811218, "learning_rate": 0.0002, "epoch": 2.9982964224872233, "step": 4400}, {"eval_loss": 1.8412635326385498, "eval_runtime": 65.5583, "eval_samples_per_second": 7.734, "eval_steps_per_second": 0.976, "epoch": 2.9996592844974446, "step": 4402}, {"loss": 1.5157, "grad_norm": 0.5172150731086731, "learning_rate": 0.0002, "epoch": 3.0051107325383306, "step": 4410}, {"loss": 1.398, "grad_norm": 0.6882525086402893, "learning_rate": 0.0002, "epoch": 3.011925042589438, "step": 4420}, {"loss": 1.3884, "grad_norm": 0.6435003280639648, "learning_rate": 0.0002, "epoch": 3.0187393526405453, "step": 4430}, {"loss": 1.4493, "grad_norm": 0.7126057147979736, "learning_rate": 0.0002, "epoch": 3.0255536626916526, "step": 4440}, {"loss": 1.4397, "grad_norm": 0.6634385585784912, "learning_rate": 0.0002, "epoch": 3.03236797274276, "step": 4450}, {"loss": 1.3674, "grad_norm": 0.6468435525894165, "learning_rate": 0.0002, "epoch": 3.0391822827938673, "step": 4460}, {"loss": 1.4045, "grad_norm": 0.5690478086471558, "learning_rate": 0.0002, "epoch": 3.0459965928449746, "step": 4470}, {"loss": 1.3742, "grad_norm": 0.7323708534240723, "learning_rate": 0.0002, "epoch": 3.052810902896082, "step": 4480}, {"loss": 1.3281, "grad_norm": 0.6989302039146423, "learning_rate": 0.0002, "epoch": 3.0596252129471893, "step": 4490}, {"loss": 1.379, "grad_norm": 0.6704450845718384, "learning_rate": 0.0002, "epoch": 3.0664395229982966, "step": 4500}, {"loss": 1.4028, "grad_norm": 0.769137442111969, "learning_rate": 0.0002, "epoch": 3.073253833049404, "step": 4510}, {"loss": 1.4295, "grad_norm": 0.6556448936462402, "learning_rate": 0.0002, "epoch": 3.0800681431005112, "step": 4520}, {"loss": 1.2763, "grad_norm": 0.7143950462341309, "learning_rate": 0.0002, "epoch": 3.0868824531516186, "step": 4530}, {"loss": 1.4806, "grad_norm": 0.7060510516166687, "learning_rate": 0.0002, "epoch": 3.093696763202726, "step": 4540}, {"loss": 1.4097, "grad_norm": 0.6637526750564575, "learning_rate": 0.0002, "epoch": 3.1005110732538332, "step": 4550}, {"loss": 1.4752, "grad_norm": 0.822989284992218, "learning_rate": 0.0002, "epoch": 3.1073253833049406, "step": 4560}, {"loss": 1.4994, "grad_norm": 0.5542152523994446, "learning_rate": 0.0002, "epoch": 3.114139693356048, "step": 4570}, {"loss": 1.4306, "grad_norm": 0.7780306935310364, "learning_rate": 0.0002, "epoch": 3.1209540034071552, "step": 4580}, {"loss": 1.3909, "grad_norm": 0.7372637987136841, "learning_rate": 0.0002, "epoch": 3.1277683134582626, "step": 4590}, {"loss": 1.3989, "grad_norm": 0.6730087995529175, "learning_rate": 0.0002, "epoch": 3.1345826235093694, "step": 4600}, {"loss": 1.3591, "grad_norm": 0.6687398552894592, "learning_rate": 0.0002, "epoch": 3.1413969335604772, "step": 4610}, {"loss": 1.436, "grad_norm": 0.7645083665847778, "learning_rate": 0.0002, "epoch": 3.148211243611584, "step": 4620}, {"loss": 1.3681, "grad_norm": 0.6770380139350891, "learning_rate": 0.0002, "epoch": 3.155025553662692, "step": 4630}, {"loss": 1.405, "grad_norm": 0.7200576663017273, "learning_rate": 0.0002, "epoch": 3.1618398637137988, "step": 4640}, {"loss": 1.3752, "grad_norm": 0.6663638949394226, "learning_rate": 0.0002, "epoch": 3.168654173764906, "step": 4650}, {"loss": 1.4099, "grad_norm": 0.6602960228919983, "learning_rate": 0.0002, "epoch": 3.1754684838160134, "step": 4660}, {"loss": 1.4003, "grad_norm": 0.7838228344917297, "learning_rate": 0.0002, "epoch": 3.1822827938671208, "step": 4670}, {"loss": 1.3853, "grad_norm": 0.7559184432029724, "learning_rate": 0.0002, "epoch": 3.189097103918228, "step": 4680}, {"loss": 1.4516, "grad_norm": 0.6609814167022705, "learning_rate": 0.0002, "epoch": 3.1959114139693354, "step": 4690}, {"loss": 1.4464, "grad_norm": 0.8470419645309448, "learning_rate": 0.0002, "epoch": 3.2027257240204428, "step": 4700}, {"loss": 1.428, "grad_norm": 0.7282822728157043, "learning_rate": 0.0002, "epoch": 3.20954003407155, "step": 4710}, {"loss": 1.5261, "grad_norm": 0.6722773313522339, "learning_rate": 0.0002, "epoch": 3.2163543441226574, "step": 4720}, {"loss": 1.3809, "grad_norm": 0.7630265355110168, "learning_rate": 0.0002, "epoch": 3.2231686541737647, "step": 4730}, {"loss": 1.42, "grad_norm": 0.7102773785591125, "learning_rate": 0.0002, "epoch": 3.229982964224872, "step": 4740}, {"loss": 1.3529, "grad_norm": 0.7778299450874329, "learning_rate": 0.0002, "epoch": 3.2367972742759794, "step": 4750}, {"loss": 1.4715, "grad_norm": 0.7189921736717224, "learning_rate": 0.0002, "epoch": 3.2436115843270867, "step": 4760}, {"loss": 1.4328, "grad_norm": 0.7708092331886292, "learning_rate": 0.0002, "epoch": 3.250425894378194, "step": 4770}, {"loss": 1.3855, "grad_norm": 0.7208452224731445, "learning_rate": 0.0002, "epoch": 3.2572402044293014, "step": 4780}, {"loss": 1.3206, "grad_norm": 0.7220432758331299, "learning_rate": 0.0002, "epoch": 3.2640545144804087, "step": 4790}, {"loss": 1.463, "grad_norm": 0.7064954042434692, "learning_rate": 0.0002, "epoch": 3.270868824531516, "step": 4800}, {"loss": 1.4236, "grad_norm": 0.6618382334709167, "learning_rate": 0.0002, "epoch": 3.2776831345826234, "step": 4810}, {"loss": 1.3878, "grad_norm": 0.6854256391525269, "learning_rate": 0.0002, "epoch": 3.2844974446337307, "step": 4820}, {"loss": 1.4236, "grad_norm": 0.6036319136619568, "learning_rate": 0.0002, "epoch": 3.291311754684838, "step": 4830}, {"loss": 1.4796, "grad_norm": 0.714678943157196, "learning_rate": 0.0002, "epoch": 3.2981260647359454, "step": 4840}, {"loss": 1.4273, "grad_norm": 0.7218600511550903, "learning_rate": 0.0002, "epoch": 3.3049403747870527, "step": 4850}, {"loss": 1.3915, "grad_norm": 0.7243074774742126, "learning_rate": 0.0002, "epoch": 3.31175468483816, "step": 4860}, {"loss": 1.4088, "grad_norm": 0.7058630585670471, "learning_rate": 0.0002, "epoch": 3.3185689948892674, "step": 4870}, {"loss": 1.3837, "grad_norm": 0.7091076970100403, "learning_rate": 0.0002, "epoch": 3.3253833049403747, "step": 4880}, {"loss": 1.4745, "grad_norm": 0.7375147342681885, "learning_rate": 0.0002, "epoch": 3.332197614991482, "step": 4890}, {"loss": 1.4826, "grad_norm": 0.9426755309104919, "learning_rate": 0.0002, "epoch": 3.3390119250425894, "step": 4900}, {"loss": 1.369, "grad_norm": 0.6508213877677917, "learning_rate": 0.0002, "epoch": 3.3458262350936967, "step": 4910}, {"loss": 1.3839, "grad_norm": 0.6945043206214905, "learning_rate": 0.0002, "epoch": 3.352640545144804, "step": 4920}, {"loss": 1.3571, "grad_norm": 0.6335888504981995, "learning_rate": 0.0002, "epoch": 3.3594548551959114, "step": 4930}, {"loss": 1.4391, "grad_norm": 0.6947107911109924, "learning_rate": 0.0002, "epoch": 3.3662691652470187, "step": 4940}, {"loss": 1.3885, "grad_norm": 0.8204733729362488, "learning_rate": 0.0002, "epoch": 3.373083475298126, "step": 4950}, {"loss": 1.4886, "grad_norm": 0.7212244868278503, "learning_rate": 0.0002, "epoch": 3.3798977853492334, "step": 4960}, {"loss": 1.4581, "grad_norm": 0.6053042411804199, "learning_rate": 0.0002, "epoch": 3.3867120954003407, "step": 4970}, {"loss": 1.3863, "grad_norm": 0.7820029854774475, "learning_rate": 0.0002, "epoch": 3.393526405451448, "step": 4980}, {"loss": 1.4326, "grad_norm": 0.6866770386695862, "learning_rate": 0.0002, "epoch": 3.4003407155025553, "step": 4990}, {"loss": 1.4287, "grad_norm": 0.6652463674545288, "learning_rate": 0.0002, "epoch": 3.4071550255536627, "step": 5000}, {"loss": 1.3667, "grad_norm": 1.1209032535552979, "learning_rate": 0.0002, "epoch": 3.41396933560477, "step": 5010}, {"loss": 1.4461, "grad_norm": 0.8390814661979675, "learning_rate": 0.0002, "epoch": 3.4207836456558773, "step": 5020}, {"loss": 1.4556, "grad_norm": 0.7541858553886414, "learning_rate": 0.0002, "epoch": 3.4275979557069847, "step": 5030}, {"loss": 1.4245, "grad_norm": 0.6902772784233093, "learning_rate": 0.0002, "epoch": 3.434412265758092, "step": 5040}, {"loss": 1.3953, "grad_norm": 0.7070329785346985, "learning_rate": 0.0002, "epoch": 3.4412265758091993, "step": 5050}, {"loss": 1.3903, "grad_norm": 0.8075643181800842, "learning_rate": 0.0002, "epoch": 3.4480408858603067, "step": 5060}, {"loss": 1.3929, "grad_norm": 0.7133861780166626, "learning_rate": 0.0002, "epoch": 3.454855195911414, "step": 5070}, {"loss": 1.4632, "grad_norm": 0.6631823182106018, "learning_rate": 0.0002, "epoch": 3.4616695059625213, "step": 5080}, {"loss": 1.4162, "grad_norm": 0.673870325088501, "learning_rate": 0.0002, "epoch": 3.4684838160136287, "step": 5090}, {"loss": 1.4247, "grad_norm": 0.6438634395599365, "learning_rate": 0.0002, "epoch": 3.475298126064736, "step": 5100}, {"loss": 1.4421, "grad_norm": 0.7560495734214783, "learning_rate": 0.0002, "epoch": 3.4821124361158433, "step": 5110}, {"loss": 1.4125, "grad_norm": 0.6877814531326294, "learning_rate": 0.0002, "epoch": 3.4889267461669506, "step": 5120}, {"loss": 1.4308, "grad_norm": 0.7031328678131104, "learning_rate": 0.0002, "epoch": 3.495741056218058, "step": 5130}, {"loss": 1.3705, "grad_norm": 0.6797195672988892, "learning_rate": 0.0002, "epoch": 3.5025553662691653, "step": 5140}, {"loss": 1.4687, "grad_norm": 0.6766413450241089, "learning_rate": 0.0002, "epoch": 3.5093696763202726, "step": 5150}, {"loss": 1.4194, "grad_norm": 0.666656494140625, "learning_rate": 0.0002, "epoch": 3.51618398637138, "step": 5160}, {"loss": 1.469, "grad_norm": 0.74996417760849, "learning_rate": 0.0002, "epoch": 3.5229982964224873, "step": 5170}, {"loss": 1.4848, "grad_norm": 0.7370911836624146, "learning_rate": 0.0002, "epoch": 3.5298126064735946, "step": 5180}, {"loss": 1.4523, "grad_norm": 0.9063456654548645, "learning_rate": 0.0002, "epoch": 3.536626916524702, "step": 5190}, {"loss": 1.4726, "grad_norm": 0.6861422657966614, "learning_rate": 0.0002, "epoch": 3.5434412265758093, "step": 5200}, {"loss": 1.4803, "grad_norm": 0.7104039788246155, "learning_rate": 0.0002, "epoch": 3.5502555366269166, "step": 5210}, {"loss": 1.4313, "grad_norm": 0.6578653454780579, "learning_rate": 0.0002, "epoch": 3.557069846678024, "step": 5220}, {"loss": 1.4596, "grad_norm": 0.7336562275886536, "learning_rate": 0.0002, "epoch": 3.5638841567291313, "step": 5230}, {"loss": 1.4591, "grad_norm": 0.7163010835647583, "learning_rate": 0.0002, "epoch": 3.5706984667802386, "step": 5240}, {"loss": 1.3814, "grad_norm": 0.8112391233444214, "learning_rate": 0.0002, "epoch": 3.577512776831346, "step": 5250}, {"loss": 1.4249, "grad_norm": 0.7260391116142273, "learning_rate": 0.0002, "epoch": 3.5843270868824533, "step": 5260}, {"loss": 1.4249, "grad_norm": 0.7038731575012207, "learning_rate": 0.0002, "epoch": 3.5911413969335606, "step": 5270}, {"loss": 1.4172, "grad_norm": 0.7864376902580261, "learning_rate": 0.0002, "epoch": 3.597955706984668, "step": 5280}, {"loss": 1.4637, "grad_norm": 0.6968383193016052, "learning_rate": 0.0002, "epoch": 3.6047700170357753, "step": 5290}, {"loss": 1.5269, "grad_norm": 0.6726206541061401, "learning_rate": 0.0002, "epoch": 3.6115843270868826, "step": 5300}, {"loss": 1.4199, "grad_norm": 0.6716854572296143, "learning_rate": 0.0002, "epoch": 3.61839863713799, "step": 5310}, {"loss": 1.4686, "grad_norm": 0.7229742407798767, "learning_rate": 0.0002, "epoch": 3.6252129471890973, "step": 5320}, {"loss": 1.4441, "grad_norm": 0.7338683009147644, "learning_rate": 0.0002, "epoch": 3.6320272572402046, "step": 5330}, {"loss": 1.4116, "grad_norm": 0.771672785282135, "learning_rate": 0.0002, "epoch": 3.638841567291312, "step": 5340}, {"loss": 1.4007, "grad_norm": 0.7024078369140625, "learning_rate": 0.0002, "epoch": 3.645655877342419, "step": 5350}, {"loss": 1.4996, "grad_norm": 0.6847538352012634, "learning_rate": 0.0002, "epoch": 3.6524701873935266, "step": 5360}, {"loss": 1.4111, "grad_norm": 0.71802818775177, "learning_rate": 0.0002, "epoch": 3.6592844974446335, "step": 5370}, {"loss": 1.4224, "grad_norm": 0.78530353307724, "learning_rate": 0.0002, "epoch": 3.6660988074957412, "step": 5380}, {"loss": 1.4582, "grad_norm": 0.7262226939201355, "learning_rate": 0.0002, "epoch": 3.672913117546848, "step": 5390}, {"loss": 1.4704, "grad_norm": 0.7608316540718079, "learning_rate": 0.0002, "epoch": 3.679727427597956, "step": 5400}, {"loss": 1.3742, "grad_norm": 0.6994926333427429, "learning_rate": 0.0002, "epoch": 3.686541737649063, "step": 5410}, {"loss": 1.4738, "grad_norm": 0.7888479828834534, "learning_rate": 0.0002, "epoch": 3.6933560477001706, "step": 5420}, {"loss": 1.4213, "grad_norm": 0.7053858041763306, "learning_rate": 0.0002, "epoch": 3.7001703577512775, "step": 5430}, {"loss": 1.4988, "grad_norm": 0.7063165903091431, "learning_rate": 0.0002, "epoch": 3.7069846678023852, "step": 5440}, {"loss": 1.4386, "grad_norm": 0.6603744626045227, "learning_rate": 0.0002, "epoch": 3.713798977853492, "step": 5450}, {"loss": 1.4695, "grad_norm": 0.7043602466583252, "learning_rate": 0.0002, "epoch": 3.7206132879046, "step": 5460}, {"loss": 1.5051, "grad_norm": 0.7026081681251526, "learning_rate": 0.0002, "epoch": 3.7274275979557068, "step": 5470}, {"loss": 1.5613, "grad_norm": 0.7200090289115906, "learning_rate": 0.0002, "epoch": 3.7342419080068145, "step": 5480}, {"loss": 1.4182, "grad_norm": 0.7170904278755188, "learning_rate": 0.0002, "epoch": 3.7410562180579214, "step": 5490}, {"loss": 1.4344, "grad_norm": 0.7489104866981506, "learning_rate": 0.0002, "epoch": 3.747870528109029, "step": 5500}, {"loss": 1.4911, "grad_norm": 0.6540989875793457, "learning_rate": 0.0002, "epoch": 3.754684838160136, "step": 5510}, {"loss": 1.4955, "grad_norm": 0.6654048562049866, "learning_rate": 0.0002, "epoch": 3.761499148211244, "step": 5520}, {"loss": 1.4487, "grad_norm": 0.6577395796775818, "learning_rate": 0.0002, "epoch": 3.7683134582623508, "step": 5530}, {"loss": 1.4283, "grad_norm": 0.7762192487716675, "learning_rate": 0.0002, "epoch": 3.7751277683134585, "step": 5540}, {"loss": 1.4727, "grad_norm": 0.6336314678192139, "learning_rate": 0.0002, "epoch": 3.7819420783645654, "step": 5550}, {"loss": 1.4588, "grad_norm": 0.7098057866096497, "learning_rate": 0.0002, "epoch": 3.7887563884156727, "step": 5560}, {"loss": 1.4679, "grad_norm": 0.7379715442657471, "learning_rate": 0.0002, "epoch": 3.79557069846678, "step": 5570}, {"loss": 1.4633, "grad_norm": 0.6726924777030945, "learning_rate": 0.0002, "epoch": 3.8023850085178874, "step": 5580}, {"loss": 1.4751, "grad_norm": 1.1212009191513062, "learning_rate": 0.0002, "epoch": 3.8091993185689947, "step": 5590}, {"loss": 1.4503, "grad_norm": 0.6503795981407166, "learning_rate": 0.0002, "epoch": 3.816013628620102, "step": 5600}, {"loss": 1.4754, "grad_norm": 0.7041325569152832, "learning_rate": 0.0002, "epoch": 3.8228279386712094, "step": 5610}, {"loss": 1.4199, "grad_norm": 0.7962933778762817, "learning_rate": 0.0002, "epoch": 3.8296422487223167, "step": 5620}, {"loss": 1.4672, "grad_norm": 0.6613591909408569, "learning_rate": 0.0002, "epoch": 3.836456558773424, "step": 5630}, {"loss": 1.5688, "grad_norm": 0.7293516397476196, "learning_rate": 0.0002, "epoch": 3.8432708688245314, "step": 5640}, {"loss": 1.4149, "grad_norm": 0.7388607859611511, "learning_rate": 0.0002, "epoch": 3.8500851788756387, "step": 5650}, {"loss": 1.4743, "grad_norm": 0.6440677642822266, "learning_rate": 0.0002, "epoch": 3.856899488926746, "step": 5660}, {"loss": 1.5082, "grad_norm": 0.7729013562202454, "learning_rate": 0.0002, "epoch": 3.8637137989778534, "step": 5670}, {"loss": 1.4608, "grad_norm": 0.6696794033050537, "learning_rate": 0.0002, "epoch": 3.8705281090289607, "step": 5680}, {"loss": 1.472, "grad_norm": 0.7151781320571899, "learning_rate": 0.0002, "epoch": 3.877342419080068, "step": 5690}, {"loss": 1.4923, "grad_norm": 0.6736966371536255, "learning_rate": 0.0002, "epoch": 3.8841567291311754, "step": 5700}, {"loss": 1.4453, "grad_norm": 0.7444243431091309, "learning_rate": 0.0002, "epoch": 3.8909710391822827, "step": 5710}, {"loss": 1.4562, "grad_norm": 0.6701464653015137, "learning_rate": 0.0002, "epoch": 3.89778534923339, "step": 5720}, {"loss": 1.4478, "grad_norm": 0.7231952548027039, "learning_rate": 0.0002, "epoch": 3.9045996592844974, "step": 5730}, {"loss": 1.4539, "grad_norm": 0.831954300403595, "learning_rate": 0.0002, "epoch": 3.9114139693356047, "step": 5740}, {"loss": 1.5122, "grad_norm": 0.7697733640670776, "learning_rate": 0.0002, "epoch": 3.918228279386712, "step": 5750}, {"loss": 1.4552, "grad_norm": 0.6964395046234131, "learning_rate": 0.0002, "epoch": 3.9250425894378194, "step": 5760}, {"loss": 1.4688, "grad_norm": 0.6942925453186035, "learning_rate": 0.0002, "epoch": 3.9318568994889267, "step": 5770}, {"loss": 1.4668, "grad_norm": 0.6491202712059021, "learning_rate": 0.0002, "epoch": 3.938671209540034, "step": 5780}, {"loss": 1.4404, "grad_norm": 0.7004382610321045, "learning_rate": 0.0002, "epoch": 3.9454855195911414, "step": 5790}, {"loss": 1.5022, "grad_norm": 0.7337747812271118, "learning_rate": 0.0002, "epoch": 3.9522998296422487, "step": 5800}, {"loss": 1.5314, "grad_norm": 0.6923640966415405, "learning_rate": 0.0002, "epoch": 3.959114139693356, "step": 5810}, {"loss": 1.4811, "grad_norm": 0.6815266609191895, "learning_rate": 0.0002, "epoch": 3.9659284497444633, "step": 5820}, {"loss": 1.437, "grad_norm": 0.6755654811859131, "learning_rate": 0.0002, "epoch": 3.9727427597955707, "step": 5830}, {"loss": 1.4277, "grad_norm": 0.6912487149238586, "learning_rate": 0.0002, "epoch": 3.979557069846678, "step": 5840}, {"loss": 1.4654, "grad_norm": 0.6948044896125793, "learning_rate": 0.0002, "epoch": 3.9863713798977853, "step": 5850}, {"loss": 1.4779, "grad_norm": 0.6735455989837646, "learning_rate": 0.0002, "epoch": 3.9931856899488927, "step": 5860}, {"loss": 1.5102, "grad_norm": 0.7005048990249634, "learning_rate": 0.0002, "epoch": 4.0, "step": 5870}, {"eval_loss": 1.923058032989502, "eval_runtime": 58.9903, "eval_samples_per_second": 8.595, "eval_steps_per_second": 1.085, "epoch": 4.0, "step": 5870}, {"loss": 1.2417, "grad_norm": 0.809018075466156, "learning_rate": 0.0002, "epoch": 4.006814310051107, "step": 5880}, {"loss": 1.2874, "grad_norm": 0.9499403238296509, "learning_rate": 0.0002, "epoch": 4.013628620102215, "step": 5890}, {"loss": 1.2245, "grad_norm": 0.7944574356079102, "learning_rate": 0.0002, "epoch": 4.0204429301533215, "step": 5900}, {"loss": 1.2751, "grad_norm": 0.9501046538352966, "learning_rate": 0.0002, "epoch": 4.027257240204429, "step": 5910}, {"loss": 1.2706, "grad_norm": 0.8247923254966736, "learning_rate": 0.0002, "epoch": 4.034071550255536, "step": 5920}, {"loss": 1.2762, "grad_norm": 0.9358038902282715, "learning_rate": 0.0002, "epoch": 4.040885860306644, "step": 5930}, {"loss": 1.2953, "grad_norm": 1.0102452039718628, "learning_rate": 0.0002, "epoch": 4.047700170357751, "step": 5940}, {"loss": 1.216, "grad_norm": 1.0248252153396606, "learning_rate": 0.0002, "epoch": 4.054514480408859, "step": 5950}, {"loss": 1.2115, "grad_norm": 1.0438553094863892, "learning_rate": 0.0002, "epoch": 4.0613287904599655, "step": 5960}, {"loss": 1.2516, "grad_norm": 0.7964957356452942, "learning_rate": 0.0002, "epoch": 4.068143100511073, "step": 5970}, {"loss": 1.1555, "grad_norm": 0.9757015109062195, "learning_rate": 0.0002, "epoch": 4.07495741056218, "step": 5980}, {"loss": 1.2243, "grad_norm": 0.9157161116600037, "learning_rate": 0.0002, "epoch": 4.081771720613288, "step": 5990}, {"loss": 1.2481, "grad_norm": 0.9372851848602295, "learning_rate": 0.0002, "epoch": 4.088586030664395, "step": 6000}, {"loss": 1.2091, "grad_norm": 1.240779995918274, "learning_rate": 0.0002, "epoch": 4.095400340715503, "step": 6010}, {"loss": 1.1727, "grad_norm": 0.8394840359687805, "learning_rate": 0.0002, "epoch": 4.1022146507666095, "step": 6020}, {"loss": 1.2926, "grad_norm": 1.1081455945968628, "learning_rate": 0.0002, "epoch": 4.109028960817717, "step": 6030}, {"loss": 1.2417, "grad_norm": 0.9227745532989502, "learning_rate": 0.0002, "epoch": 4.115843270868824, "step": 6040}, {"loss": 1.1994, "grad_norm": 0.8487664461135864, "learning_rate": 0.0002, "epoch": 4.122657580919932, "step": 6050}, {"loss": 1.2378, "grad_norm": 0.9643339514732361, "learning_rate": 0.0002, "epoch": 4.129471890971039, "step": 6060}, {"loss": 1.2254, "grad_norm": 1.0296099185943604, "learning_rate": 0.0002, "epoch": 4.136286201022147, "step": 6070}, {"loss": 1.2419, "grad_norm": 0.9534215927124023, "learning_rate": 0.0002, "epoch": 4.1431005110732535, "step": 6080}, {"loss": 1.1849, "grad_norm": 0.9647086262702942, "learning_rate": 0.0002, "epoch": 4.149914821124361, "step": 6090}, {"loss": 1.2713, "grad_norm": 1.084836721420288, "learning_rate": 0.0002, "epoch": 4.156729131175468, "step": 6100}, {"loss": 1.1788, "grad_norm": 0.9315235614776611, "learning_rate": 0.0002, "epoch": 4.163543441226576, "step": 6110}, {"loss": 1.17, "grad_norm": 0.9541679620742798, "learning_rate": 0.0002, "epoch": 4.170357751277683, "step": 6120}, {"loss": 1.1407, "grad_norm": 0.9792100191116333, "learning_rate": 0.0002, "epoch": 4.177172061328791, "step": 6130}, {"loss": 1.2069, "grad_norm": 1.065783143043518, "learning_rate": 0.0002, "epoch": 4.1839863713798975, "step": 6140}, {"loss": 1.2512, "grad_norm": 1.036161184310913, "learning_rate": 0.0002, "epoch": 4.190800681431005, "step": 6150}, {"loss": 1.2371, "grad_norm": 0.8979679942131042, "learning_rate": 0.0002, "epoch": 4.197614991482112, "step": 6160}, {"loss": 1.2212, "grad_norm": 0.7584333419799805, "learning_rate": 0.0002, "epoch": 4.20442930153322, "step": 6170}, {"loss": 1.2128, "grad_norm": 1.1970131397247314, "learning_rate": 0.0002, "epoch": 4.211243611584327, "step": 6180}, {"loss": 1.1982, "grad_norm": 2.6447298526763916, "learning_rate": 0.0002, "epoch": 4.218057921635435, "step": 6190}, {"loss": 1.2465, "grad_norm": 0.9357487559318542, "learning_rate": 0.0002, "epoch": 4.2248722316865415, "step": 6200}, {"loss": 1.2963, "grad_norm": 0.9141183495521545, "learning_rate": 0.0002, "epoch": 4.231686541737649, "step": 6210}, {"loss": 1.1959, "grad_norm": 1.0606296062469482, "learning_rate": 0.0002, "epoch": 4.238500851788756, "step": 6220}, {"loss": 1.2629, "grad_norm": 0.9999088048934937, "learning_rate": 0.0002, "epoch": 4.245315161839864, "step": 6230}, {"loss": 1.1471, "grad_norm": 0.9469764232635498, "learning_rate": 0.0002, "epoch": 4.252129471890971, "step": 6240}, {"loss": 1.223, "grad_norm": 1.1508198976516724, "learning_rate": 0.0002, "epoch": 4.258943781942079, "step": 6250}, {"loss": 1.2677, "grad_norm": 1.2576130628585815, "learning_rate": 0.0002, "epoch": 4.2657580919931855, "step": 6260}, {"loss": 1.2216, "grad_norm": 0.9435968399047852, "learning_rate": 0.0002, "epoch": 4.272572402044293, "step": 6270}, {"loss": 1.2788, "grad_norm": 0.9290348887443542, "learning_rate": 0.0002, "epoch": 4.2793867120954, "step": 6280}, {"loss": 1.2631, "grad_norm": 0.9973701238632202, "learning_rate": 0.0002, "epoch": 4.286201022146508, "step": 6290}, {"loss": 1.2276, "grad_norm": 1.012855887413025, "learning_rate": 0.0002, "epoch": 4.293015332197615, "step": 6300}, {"loss": 1.2115, "grad_norm": 0.8371705412864685, "learning_rate": 0.0002, "epoch": 4.2998296422487225, "step": 6310}, {"loss": 1.2423, "grad_norm": 1.0867925882339478, "learning_rate": 0.0002, "epoch": 4.306643952299829, "step": 6320}, {"loss": 1.2262, "grad_norm": 0.9763767123222351, "learning_rate": 0.0002, "epoch": 4.313458262350937, "step": 6330}, {"loss": 1.2557, "grad_norm": 1.1844252347946167, "learning_rate": 0.0002, "epoch": 4.320272572402044, "step": 6340}, {"loss": 1.2635, "grad_norm": 0.8292830586433411, "learning_rate": 0.0002, "epoch": 4.327086882453152, "step": 6350}, {"loss": 1.262, "grad_norm": 0.9351436495780945, "learning_rate": 0.0002, "epoch": 4.333901192504259, "step": 6360}, {"loss": 1.2678, "grad_norm": 1.0425835847854614, "learning_rate": 0.0002, "epoch": 4.3407155025553665, "step": 6370}, {"loss": 1.2476, "grad_norm": 0.8894261121749878, "learning_rate": 0.0002, "epoch": 4.347529812606473, "step": 6380}, {"loss": 1.2965, "grad_norm": 0.9663366079330444, "learning_rate": 0.0002, "epoch": 4.354344122657581, "step": 6390}, {"loss": 1.2529, "grad_norm": 0.8915578126907349, "learning_rate": 0.0002, "epoch": 4.361158432708688, "step": 6400}, {"loss": 1.2573, "grad_norm": 1.0393000841140747, "learning_rate": 0.0002, "epoch": 4.367972742759796, "step": 6410}, {"loss": 1.2254, "grad_norm": 0.917398989200592, "learning_rate": 0.0002, "epoch": 4.374787052810903, "step": 6420}, {"loss": 1.3115, "grad_norm": 1.0496646165847778, "learning_rate": 0.0002, "epoch": 4.3816013628620105, "step": 6430}, {"loss": 1.2607, "grad_norm": 0.9349859356880188, "learning_rate": 0.0002, "epoch": 4.388415672913117, "step": 6440}, {"loss": 1.3414, "grad_norm": 1.0981004238128662, "learning_rate": 0.0002, "epoch": 4.395229982964225, "step": 6450}, {"loss": 1.2391, "grad_norm": 0.9794871807098389, "learning_rate": 0.0002, "epoch": 4.402044293015332, "step": 6460}, {"loss": 1.208, "grad_norm": 0.9321421384811401, "learning_rate": 0.0002, "epoch": 4.40885860306644, "step": 6470}, {"loss": 1.3398, "grad_norm": 0.9158342480659485, "learning_rate": 0.0002, "epoch": 4.415672913117547, "step": 6480}, {"loss": 1.1832, "grad_norm": 0.9462087750434875, "learning_rate": 0.0002, "epoch": 4.4224872231686545, "step": 6490}, {"loss": 1.2366, "grad_norm": 0.9740175604820251, "learning_rate": 0.0002, "epoch": 4.429301533219761, "step": 6500}, {"loss": 1.3074, "grad_norm": 0.8477463126182556, "learning_rate": 0.0002, "epoch": 4.436115843270869, "step": 6510}, {"loss": 1.2719, "grad_norm": 1.0296647548675537, "learning_rate": 0.0002, "epoch": 4.442930153321976, "step": 6520}, {"loss": 1.2647, "grad_norm": 0.9437751173973083, "learning_rate": 0.0002, "epoch": 4.449744463373084, "step": 6530}, {"loss": 1.2043, "grad_norm": 1.011192798614502, "learning_rate": 0.0002, "epoch": 4.456558773424191, "step": 6540}, {"loss": 1.3673, "grad_norm": 0.8836222290992737, "learning_rate": 0.0002, "epoch": 4.4633730834752985, "step": 6550}, {"loss": 1.3028, "grad_norm": 1.2799941301345825, "learning_rate": 0.0002, "epoch": 4.470187393526405, "step": 6560}, {"loss": 1.2789, "grad_norm": 0.925910472869873, "learning_rate": 0.0002, "epoch": 4.477001703577513, "step": 6570}, {"loss": 1.2723, "grad_norm": 0.957401692867279, "learning_rate": 0.0002, "epoch": 4.48381601362862, "step": 6580}, {"loss": 1.242, "grad_norm": 1.0789544582366943, "learning_rate": 0.0002, "epoch": 4.490630323679728, "step": 6590}, {"loss": 1.2553, "grad_norm": 0.8874586820602417, "learning_rate": 0.0002, "epoch": 4.497444633730835, "step": 6600}, {"loss": 1.2779, "grad_norm": 0.9394784569740295, "learning_rate": 0.0002, "epoch": 4.504258943781942, "step": 6610}, {"loss": 1.2744, "grad_norm": 1.029640793800354, "learning_rate": 0.0002, "epoch": 4.511073253833049, "step": 6620}, {"loss": 1.2634, "grad_norm": 0.9510841965675354, "learning_rate": 0.0002, "epoch": 4.517887563884157, "step": 6630}, {"loss": 1.2562, "grad_norm": 0.9992963671684265, "learning_rate": 0.0002, "epoch": 4.524701873935264, "step": 6640}, {"loss": 1.2942, "grad_norm": 0.9312878847122192, "learning_rate": 0.0002, "epoch": 4.531516183986371, "step": 6650}, {"loss": 1.2572, "grad_norm": 0.9406482577323914, "learning_rate": 0.0002, "epoch": 4.538330494037479, "step": 6660}, {"loss": 1.2283, "grad_norm": 1.1058286428451538, "learning_rate": 0.0002, "epoch": 4.5451448040885865, "step": 6670}, {"loss": 1.2391, "grad_norm": 0.9389635920524597, "learning_rate": 0.0002, "epoch": 4.551959114139693, "step": 6680}, {"loss": 1.2696, "grad_norm": 1.0356028079986572, "learning_rate": 0.0002, "epoch": 4.5587734241908, "step": 6690}, {"loss": 1.2935, "grad_norm": 0.9370909929275513, "learning_rate": 0.0002, "epoch": 4.565587734241908, "step": 6700}, {"loss": 1.2914, "grad_norm": 0.9917567372322083, "learning_rate": 0.0002, "epoch": 4.572402044293016, "step": 6710}, {"loss": 1.3318, "grad_norm": 0.9065384864807129, "learning_rate": 0.0002, "epoch": 4.579216354344123, "step": 6720}, {"loss": 1.2909, "grad_norm": 1.3347833156585693, "learning_rate": 0.0002, "epoch": 4.5860306643952296, "step": 6730}, {"loss": 1.3322, "grad_norm": 0.910632312297821, "learning_rate": 0.0002, "epoch": 4.592844974446337, "step": 6740}, {"loss": 1.2584, "grad_norm": 0.8874805569648743, "learning_rate": 0.0002, "epoch": 4.599659284497445, "step": 6750}, {"loss": 1.3173, "grad_norm": 0.9355664253234863, "learning_rate": 0.0002, "epoch": 4.606473594548552, "step": 6760}, {"loss": 1.3515, "grad_norm": 0.9360204339027405, "learning_rate": 0.0002, "epoch": 4.613287904599659, "step": 6770}, {"loss": 1.2326, "grad_norm": 0.9931750893592834, "learning_rate": 0.0002, "epoch": 4.620102214650767, "step": 6780}, {"loss": 1.2677, "grad_norm": 0.9195131063461304, "learning_rate": 0.0002, "epoch": 4.626916524701874, "step": 6790}, {"loss": 1.3417, "grad_norm": 0.9448373913764954, "learning_rate": 0.0002, "epoch": 4.633730834752981, "step": 6800}, {"loss": 1.2658, "grad_norm": 1.162890911102295, "learning_rate": 0.0002, "epoch": 4.640545144804088, "step": 6810}, {"loss": 1.2841, "grad_norm": 0.9739466905593872, "learning_rate": 0.0002, "epoch": 4.647359454855196, "step": 6820}, {"loss": 1.3068, "grad_norm": 0.9462909698486328, "learning_rate": 0.0002, "epoch": 4.654173764906303, "step": 6830}, {"loss": 1.284, "grad_norm": 1.042639970779419, "learning_rate": 0.0002, "epoch": 4.660988074957411, "step": 6840}, {"loss": 1.3337, "grad_norm": 0.8910539150238037, "learning_rate": 0.0002, "epoch": 4.6678023850085175, "step": 6850}, {"loss": 1.3025, "grad_norm": 1.0806447267532349, "learning_rate": 0.0002, "epoch": 4.674616695059625, "step": 6860}, {"loss": 1.2258, "grad_norm": 1.0054864883422852, "learning_rate": 0.0002, "epoch": 4.681431005110732, "step": 6870}, {"loss": 1.3261, "grad_norm": 0.7774158120155334, "learning_rate": 0.0002, "epoch": 4.68824531516184, "step": 6880}, {"loss": 1.2545, "grad_norm": 0.9729512333869934, "learning_rate": 0.0002, "epoch": 4.695059625212947, "step": 6890}, {"loss": 1.3251, "grad_norm": 1.2025411128997803, "learning_rate": 0.0002, "epoch": 4.701873935264055, "step": 6900}, {"loss": 1.3418, "grad_norm": 1.1654069423675537, "learning_rate": 0.0002, "epoch": 4.7086882453151615, "step": 6910}, {"loss": 1.3091, "grad_norm": 1.1501442193984985, "learning_rate": 0.0002, "epoch": 4.715502555366269, "step": 6920}, {"loss": 1.2627, "grad_norm": 1.1083979606628418, "learning_rate": 0.0002, "epoch": 4.722316865417376, "step": 6930}, {"loss": 1.2836, "grad_norm": 0.9431378841400146, "learning_rate": 0.0002, "epoch": 4.729131175468484, "step": 6940}, {"loss": 1.3381, "grad_norm": 0.9722502827644348, "learning_rate": 0.0002, "epoch": 4.735945485519591, "step": 6950}, {"loss": 1.3228, "grad_norm": 0.9094559550285339, "learning_rate": 0.0002, "epoch": 4.742759795570699, "step": 6960}, {"loss": 1.3474, "grad_norm": 0.9918473958969116, "learning_rate": 0.0002, "epoch": 4.7495741056218055, "step": 6970}, {"loss": 1.3352, "grad_norm": 0.9999690651893616, "learning_rate": 0.0002, "epoch": 4.756388415672913, "step": 6980}, {"loss": 1.3579, "grad_norm": 1.0453810691833496, "learning_rate": 0.0002, "epoch": 4.76320272572402, "step": 6990}, {"loss": 1.294, "grad_norm": 1.0167806148529053, "learning_rate": 0.0002, "epoch": 4.770017035775128, "step": 7000}, {"loss": 1.3247, "grad_norm": 0.8133894801139832, "learning_rate": 0.0002, "epoch": 4.776831345826235, "step": 7010}, {"loss": 1.2577, "grad_norm": 0.8000897765159607, "learning_rate": 0.0002, "epoch": 4.783645655877343, "step": 7020}, {"loss": 1.2802, "grad_norm": 0.992080569267273, "learning_rate": 0.0002, "epoch": 4.7904599659284495, "step": 7030}, {"loss": 1.3269, "grad_norm": 0.9824522137641907, "learning_rate": 0.0002, "epoch": 4.797274275979557, "step": 7040}, {"loss": 1.279, "grad_norm": 0.9808870553970337, "learning_rate": 0.0002, "epoch": 4.804088586030664, "step": 7050}, {"loss": 1.3342, "grad_norm": 0.9679701924324036, "learning_rate": 0.0002, "epoch": 4.810902896081772, "step": 7060}, {"loss": 1.2711, "grad_norm": 0.9895215034484863, "learning_rate": 0.0002, "epoch": 4.817717206132879, "step": 7070}, {"loss": 1.3008, "grad_norm": 1.052246332168579, "learning_rate": 0.0002, "epoch": 4.824531516183987, "step": 7080}, {"loss": 1.2874, "grad_norm": 0.9243564605712891, "learning_rate": 0.0002, "epoch": 4.8313458262350935, "step": 7090}, {"loss": 1.2835, "grad_norm": 0.9545369744300842, "learning_rate": 0.0002, "epoch": 4.838160136286201, "step": 7100}, {"loss": 1.31, "grad_norm": 0.9655884504318237, "learning_rate": 0.0002, "epoch": 4.844974446337308, "step": 7110}, {"loss": 1.2862, "grad_norm": 0.9708049893379211, "learning_rate": 0.0002, "epoch": 4.851788756388416, "step": 7120}, {"loss": 1.3425, "grad_norm": 1.0064880847930908, "learning_rate": 0.0002, "epoch": 4.858603066439523, "step": 7130}, {"loss": 1.2899, "grad_norm": 0.939943790435791, "learning_rate": 0.0002, "epoch": 4.8654173764906306, "step": 7140}, {"loss": 1.2887, "grad_norm": 1.0750784873962402, "learning_rate": 0.0002, "epoch": 4.872231686541737, "step": 7150}, {"loss": 1.3367, "grad_norm": 0.9708989262580872, "learning_rate": 0.0002, "epoch": 4.879045996592845, "step": 7160}, {"loss": 1.2797, "grad_norm": 1.0228253602981567, "learning_rate": 0.0002, "epoch": 4.885860306643952, "step": 7170}, {"loss": 1.2695, "grad_norm": 0.8963132500648499, "learning_rate": 0.0002, "epoch": 4.89267461669506, "step": 7180}, {"loss": 1.3473, "grad_norm": 0.9198015928268433, "learning_rate": 0.0002, "epoch": 4.899488926746167, "step": 7190}, {"loss": 1.2541, "grad_norm": 1.099906086921692, "learning_rate": 0.0002, "epoch": 4.9063032367972745, "step": 7200}, {"loss": 1.3188, "grad_norm": 1.0624815225601196, "learning_rate": 0.0002, "epoch": 4.913117546848381, "step": 7210}, {"loss": 1.3026, "grad_norm": 0.9688444137573242, "learning_rate": 0.0002, "epoch": 4.919931856899489, "step": 7220}, {"loss": 1.3379, "grad_norm": 0.867011547088623, "learning_rate": 0.0002, "epoch": 4.926746166950596, "step": 7230}, {"loss": 1.289, "grad_norm": 0.9600282311439514, "learning_rate": 0.0002, "epoch": 4.933560477001704, "step": 7240}, {"loss": 1.2751, "grad_norm": 0.8979372978210449, "learning_rate": 0.0002, "epoch": 4.940374787052811, "step": 7250}, {"loss": 1.3426, "grad_norm": 0.951474130153656, "learning_rate": 0.0002, "epoch": 4.9471890971039185, "step": 7260}, {"loss": 1.2726, "grad_norm": 0.824851393699646, "learning_rate": 0.0002, "epoch": 4.954003407155025, "step": 7270}, {"loss": 1.2679, "grad_norm": 1.2926591634750366, "learning_rate": 0.0002, "epoch": 4.960817717206133, "step": 7280}, {"loss": 1.2974, "grad_norm": 1.1057835817337036, "learning_rate": 0.0002, "epoch": 4.96763202725724, "step": 7290}, {"loss": 1.2275, "grad_norm": 0.9814816117286682, "learning_rate": 0.0002, "epoch": 4.974446337308348, "step": 7300}, {"loss": 1.3001, "grad_norm": 1.0251333713531494, "learning_rate": 0.0002, "epoch": 4.981260647359455, "step": 7310}, {"loss": 1.3113, "grad_norm": 0.9748668074607849, "learning_rate": 0.0002, "epoch": 4.9880749574105625, "step": 7320}, {"loss": 1.3595, "grad_norm": 0.8552228808403015, "learning_rate": 0.0002, "epoch": 4.994889267461669, "step": 7330}]} +{"epoch": 6.0, "step": 8805, "epoch_duration": 2960.479438304901, "total_accumulated_duration": 14961.992313623428, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7751.47119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 28746.0}, "peak_memory_reserved": {"GPU_0": 28746.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0988, "grad_norm": 0.635574460029602, "learning_rate": 0.0002, "epoch": 0.0068143100511073255, "step": 10}, {"loss": 2.5594, "grad_norm": 1.0401769876480103, "learning_rate": 0.0002, "epoch": 0.013628620102214651, "step": 20}, {"loss": 2.3587, "grad_norm": 0.4514131247997284, "learning_rate": 0.0002, "epoch": 0.020442930153321975, "step": 30}, {"loss": 2.121, "grad_norm": 0.6303355693817139, "learning_rate": 0.0002, "epoch": 0.027257240204429302, "step": 40}, {"loss": 1.9833, "grad_norm": 0.4648270606994629, "learning_rate": 0.0002, "epoch": 0.034071550255536626, "step": 50}, {"loss": 1.9384, "grad_norm": 0.42953479290008545, "learning_rate": 0.0002, "epoch": 0.04088586030664395, "step": 60}, {"loss": 1.9202, "grad_norm": 0.433614581823349, "learning_rate": 0.0002, "epoch": 0.04770017035775128, "step": 70}, {"loss": 1.911, "grad_norm": 0.45318254828453064, "learning_rate": 0.0002, "epoch": 0.054514480408858604, "step": 80}, {"loss": 1.8895, "grad_norm": 0.4023568630218506, "learning_rate": 0.0002, "epoch": 0.06132879045996593, "step": 90}, {"loss": 1.9257, "grad_norm": 0.43260207772254944, "learning_rate": 0.0002, "epoch": 0.06814310051107325, "step": 100}, {"loss": 1.9178, "grad_norm": 0.43389809131622314, "learning_rate": 0.0002, "epoch": 0.07495741056218058, "step": 110}, {"loss": 1.9071, "grad_norm": 0.39307987689971924, "learning_rate": 0.0002, "epoch": 0.0817717206132879, "step": 120}, {"loss": 1.9482, "grad_norm": 0.7703037261962891, "learning_rate": 0.0002, "epoch": 0.08858603066439523, "step": 130}, {"loss": 1.8394, "grad_norm": 0.38547563552856445, "learning_rate": 0.0002, "epoch": 0.09540034071550256, "step": 140}, {"loss": 1.9037, "grad_norm": 0.37948688864707947, "learning_rate": 0.0002, "epoch": 0.10221465076660988, "step": 150}, {"loss": 1.8664, "grad_norm": 0.33022379875183105, "learning_rate": 0.0002, "epoch": 0.10902896081771721, "step": 160}, {"loss": 1.8659, "grad_norm": 0.33703792095184326, "learning_rate": 0.0002, "epoch": 0.11584327086882454, "step": 170}, {"loss": 1.8389, "grad_norm": 0.3259912431240082, "learning_rate": 0.0002, "epoch": 0.12265758091993186, "step": 180}, {"loss": 1.8424, "grad_norm": 0.3593858778476715, "learning_rate": 0.0002, "epoch": 0.12947189097103917, "step": 190}, {"loss": 1.8457, "grad_norm": 0.3401614725589752, "learning_rate": 0.0002, "epoch": 0.1362862010221465, "step": 200}, {"loss": 1.7721, "grad_norm": 0.3892078697681427, "learning_rate": 0.0002, "epoch": 0.14310051107325383, "step": 210}, {"loss": 1.8351, "grad_norm": 0.315374493598938, "learning_rate": 0.0002, "epoch": 0.14991482112436116, "step": 220}, {"loss": 1.8307, "grad_norm": 0.3679497539997101, "learning_rate": 0.0002, "epoch": 0.1567291311754685, "step": 230}, {"loss": 1.816, "grad_norm": 0.336730033159256, "learning_rate": 0.0002, "epoch": 0.1635434412265758, "step": 240}, {"loss": 1.7849, "grad_norm": 0.36472755670547485, "learning_rate": 0.0002, "epoch": 0.17035775127768313, "step": 250}, {"loss": 1.7766, "grad_norm": 0.34864023327827454, "learning_rate": 0.0002, "epoch": 0.17717206132879046, "step": 260}, {"loss": 1.896, "grad_norm": 0.3350819945335388, "learning_rate": 0.0002, "epoch": 0.1839863713798978, "step": 270}, {"loss": 1.8742, "grad_norm": 0.3844246566295624, "learning_rate": 0.0002, "epoch": 0.19080068143100512, "step": 280}, {"loss": 1.8531, "grad_norm": 0.38413065671920776, "learning_rate": 0.0002, "epoch": 0.19761499148211242, "step": 290}, {"loss": 1.8415, "grad_norm": 0.4080047607421875, "learning_rate": 0.0002, "epoch": 0.20442930153321975, "step": 300}, {"loss": 1.9009, "grad_norm": 0.3546800911426544, "learning_rate": 0.0002, "epoch": 0.21124361158432708, "step": 310}, {"loss": 1.8092, "grad_norm": 0.38577890396118164, "learning_rate": 0.0002, "epoch": 0.21805792163543442, "step": 320}, {"loss": 1.7113, "grad_norm": 0.38979098200798035, "learning_rate": 0.0002, "epoch": 0.22487223168654175, "step": 330}, {"loss": 1.8162, "grad_norm": 0.35105520486831665, "learning_rate": 0.0002, "epoch": 0.23168654173764908, "step": 340}, {"loss": 1.8408, "grad_norm": 0.31671223044395447, "learning_rate": 0.0002, "epoch": 0.23850085178875638, "step": 350}, {"loss": 1.8014, "grad_norm": 0.33034196496009827, "learning_rate": 0.0002, "epoch": 0.2453151618398637, "step": 360}, {"loss": 1.7132, "grad_norm": 0.2990533709526062, "learning_rate": 0.0002, "epoch": 0.252129471890971, "step": 370}, {"loss": 1.8612, "grad_norm": 0.362208753824234, "learning_rate": 0.0002, "epoch": 0.25894378194207834, "step": 380}, {"loss": 1.8001, "grad_norm": 0.3269096612930298, "learning_rate": 0.0002, "epoch": 0.2657580919931857, "step": 390}, {"loss": 1.786, "grad_norm": 0.30555954575538635, "learning_rate": 0.0002, "epoch": 0.272572402044293, "step": 400}, {"loss": 1.8018, "grad_norm": 0.332933247089386, "learning_rate": 0.0002, "epoch": 0.27938671209540034, "step": 410}, {"loss": 1.8157, "grad_norm": 0.39454060792922974, "learning_rate": 0.0002, "epoch": 0.28620102214650767, "step": 420}, {"loss": 1.7862, "grad_norm": 0.34589633345603943, "learning_rate": 0.0002, "epoch": 0.293015332197615, "step": 430}, {"loss": 1.7612, "grad_norm": 0.3747332990169525, "learning_rate": 0.0002, "epoch": 0.29982964224872233, "step": 440}, {"loss": 1.8476, "grad_norm": 0.34825369715690613, "learning_rate": 0.0002, "epoch": 0.30664395229982966, "step": 450}, {"loss": 1.775, "grad_norm": 0.32906976342201233, "learning_rate": 0.0002, "epoch": 0.313458262350937, "step": 460}, {"loss": 1.7764, "grad_norm": 0.33108609914779663, "learning_rate": 0.0002, "epoch": 0.3202725724020443, "step": 470}, {"loss": 1.8012, "grad_norm": 0.3170463442802429, "learning_rate": 0.0002, "epoch": 0.3270868824531516, "step": 480}, {"loss": 1.8264, "grad_norm": 0.30792081356048584, "learning_rate": 0.0002, "epoch": 0.3339011925042589, "step": 490}, {"loss": 1.7361, "grad_norm": 0.31772997975349426, "learning_rate": 0.0002, "epoch": 0.34071550255536626, "step": 500}, {"loss": 1.8147, "grad_norm": 0.32714012265205383, "learning_rate": 0.0002, "epoch": 0.3475298126064736, "step": 510}, {"loss": 1.8332, "grad_norm": 0.3021100163459778, "learning_rate": 0.0002, "epoch": 0.3543441226575809, "step": 520}, {"loss": 1.7337, "grad_norm": 0.6045835018157959, "learning_rate": 0.0002, "epoch": 0.36115843270868825, "step": 530}, {"loss": 1.8661, "grad_norm": 0.3003896474838257, "learning_rate": 0.0002, "epoch": 0.3679727427597956, "step": 540}, {"loss": 1.8359, "grad_norm": 0.3678470551967621, "learning_rate": 0.0002, "epoch": 0.3747870528109029, "step": 550}, {"loss": 1.7512, "grad_norm": 0.35787615180015564, "learning_rate": 0.0002, "epoch": 0.38160136286201024, "step": 560}, {"loss": 1.8048, "grad_norm": 0.31882143020629883, "learning_rate": 0.0002, "epoch": 0.38841567291311757, "step": 570}, {"loss": 1.8108, "grad_norm": 0.3186313509941101, "learning_rate": 0.0002, "epoch": 0.39522998296422485, "step": 580}, {"loss": 1.8012, "grad_norm": 0.41443702578544617, "learning_rate": 0.0002, "epoch": 0.4020442930153322, "step": 590}, {"loss": 1.7794, "grad_norm": 0.28773069381713867, "learning_rate": 0.0002, "epoch": 0.4088586030664395, "step": 600}, {"loss": 1.8231, "grad_norm": 0.35743263363838196, "learning_rate": 0.0002, "epoch": 0.41567291311754684, "step": 610}, {"loss": 1.7531, "grad_norm": 0.3360286355018616, "learning_rate": 0.0002, "epoch": 0.42248722316865417, "step": 620}, {"loss": 1.7654, "grad_norm": 0.32838866114616394, "learning_rate": 0.0002, "epoch": 0.4293015332197615, "step": 630}, {"loss": 1.8176, "grad_norm": 0.2994388937950134, "learning_rate": 0.0002, "epoch": 0.43611584327086883, "step": 640}, {"loss": 1.7241, "grad_norm": 0.3306307792663574, "learning_rate": 0.0002, "epoch": 0.44293015332197616, "step": 650}, {"loss": 1.8201, "grad_norm": 0.3129560351371765, "learning_rate": 0.0002, "epoch": 0.4497444633730835, "step": 660}, {"loss": 1.803, "grad_norm": 0.3244289457798004, "learning_rate": 0.0002, "epoch": 0.4565587734241908, "step": 670}, {"loss": 1.8399, "grad_norm": 0.3196892738342285, "learning_rate": 0.0002, "epoch": 0.46337308347529815, "step": 680}, {"loss": 1.8291, "grad_norm": 0.3135230243206024, "learning_rate": 0.0002, "epoch": 0.47018739352640543, "step": 690}, {"loss": 1.7423, "grad_norm": 0.28677991032600403, "learning_rate": 0.0002, "epoch": 0.47700170357751276, "step": 700}, {"loss": 1.6982, "grad_norm": 0.3074065148830414, "learning_rate": 0.0002, "epoch": 0.4838160136286201, "step": 710}, {"loss": 1.8078, "grad_norm": 0.3354290723800659, "learning_rate": 0.0002, "epoch": 0.4906303236797274, "step": 720}, {"loss": 1.7333, "grad_norm": 0.324370801448822, "learning_rate": 0.0002, "epoch": 0.49744463373083475, "step": 730}, {"loss": 1.8506, "grad_norm": 0.29496142268180847, "learning_rate": 0.0002, "epoch": 0.504258943781942, "step": 740}, {"loss": 1.7761, "grad_norm": 0.30694130063056946, "learning_rate": 0.0002, "epoch": 0.5110732538330494, "step": 750}, {"loss": 1.7854, "grad_norm": 0.36168408393859863, "learning_rate": 0.0002, "epoch": 0.5178875638841567, "step": 760}, {"loss": 1.7149, "grad_norm": 0.2930343449115753, "learning_rate": 0.0002, "epoch": 0.524701873935264, "step": 770}, {"loss": 1.7924, "grad_norm": 0.3023432493209839, "learning_rate": 0.0002, "epoch": 0.5315161839863713, "step": 780}, {"loss": 1.8467, "grad_norm": 0.3272720277309418, "learning_rate": 0.0002, "epoch": 0.5383304940374787, "step": 790}, {"loss": 1.8639, "grad_norm": 0.2907974421977997, "learning_rate": 0.0002, "epoch": 0.545144804088586, "step": 800}, {"loss": 1.9018, "grad_norm": 0.32267168164253235, "learning_rate": 0.0002, "epoch": 0.5519591141396933, "step": 810}, {"loss": 1.8311, "grad_norm": 0.32059940695762634, "learning_rate": 0.0002, "epoch": 0.5587734241908007, "step": 820}, {"loss": 1.7234, "grad_norm": 0.30951258540153503, "learning_rate": 0.0002, "epoch": 0.565587734241908, "step": 830}, {"loss": 1.8063, "grad_norm": 0.33976122736930847, "learning_rate": 0.0002, "epoch": 0.5724020442930153, "step": 840}, {"loss": 1.7021, "grad_norm": 0.3195820450782776, "learning_rate": 0.0002, "epoch": 0.5792163543441227, "step": 850}, {"loss": 1.8073, "grad_norm": 0.2828562557697296, "learning_rate": 0.0002, "epoch": 0.58603066439523, "step": 860}, {"loss": 1.695, "grad_norm": 0.29591670632362366, "learning_rate": 0.0002, "epoch": 0.5928449744463373, "step": 870}, {"loss": 1.8109, "grad_norm": 0.3086104393005371, "learning_rate": 0.0002, "epoch": 0.5996592844974447, "step": 880}, {"loss": 1.8207, "grad_norm": 0.3592929542064667, "learning_rate": 0.0002, "epoch": 0.606473594548552, "step": 890}, {"loss": 1.7662, "grad_norm": 0.2830186188220978, "learning_rate": 0.0002, "epoch": 0.6132879045996593, "step": 900}, {"loss": 1.8344, "grad_norm": 0.3128598630428314, "learning_rate": 0.0002, "epoch": 0.6201022146507666, "step": 910}, {"loss": 1.7953, "grad_norm": 0.31957563757896423, "learning_rate": 0.0002, "epoch": 0.626916524701874, "step": 920}, {"loss": 1.7578, "grad_norm": 0.30994319915771484, "learning_rate": 0.0002, "epoch": 0.6337308347529813, "step": 930}, {"loss": 1.8494, "grad_norm": 0.3352845013141632, "learning_rate": 0.0002, "epoch": 0.6405451448040886, "step": 940}, {"loss": 1.7054, "grad_norm": 0.2960077226161957, "learning_rate": 0.0002, "epoch": 0.6473594548551959, "step": 950}, {"loss": 1.7209, "grad_norm": 0.32675081491470337, "learning_rate": 0.0002, "epoch": 0.6541737649063032, "step": 960}, {"loss": 1.7776, "grad_norm": 0.30042028427124023, "learning_rate": 0.0002, "epoch": 0.6609880749574105, "step": 970}, {"loss": 1.7597, "grad_norm": 0.3288673758506775, "learning_rate": 0.0002, "epoch": 0.6678023850085179, "step": 980}, {"loss": 1.7962, "grad_norm": 0.323215126991272, "learning_rate": 0.0002, "epoch": 0.6746166950596252, "step": 990}, {"loss": 1.821, "grad_norm": 0.30041399598121643, "learning_rate": 0.0002, "epoch": 0.6814310051107325, "step": 1000}, {"loss": 1.7716, "grad_norm": 0.3076179623603821, "learning_rate": 0.0002, "epoch": 0.6882453151618398, "step": 1010}, {"loss": 1.7827, "grad_norm": 0.2941909730434418, "learning_rate": 0.0002, "epoch": 0.6950596252129472, "step": 1020}, {"loss": 1.7964, "grad_norm": 0.32220420241355896, "learning_rate": 0.0002, "epoch": 0.7018739352640545, "step": 1030}, {"loss": 1.7101, "grad_norm": 0.2989702820777893, "learning_rate": 0.0002, "epoch": 0.7086882453151618, "step": 1040}, {"loss": 1.8749, "grad_norm": 0.3049640357494354, "learning_rate": 0.0002, "epoch": 0.7155025553662692, "step": 1050}, {"loss": 1.782, "grad_norm": 0.3183284103870392, "learning_rate": 0.0002, "epoch": 0.7223168654173765, "step": 1060}, {"loss": 1.785, "grad_norm": 0.3070095181465149, "learning_rate": 0.0002, "epoch": 0.7291311754684838, "step": 1070}, {"loss": 1.7832, "grad_norm": 0.33263063430786133, "learning_rate": 0.0002, "epoch": 0.7359454855195912, "step": 1080}, {"loss": 1.7627, "grad_norm": 0.28774312138557434, "learning_rate": 0.0002, "epoch": 0.7427597955706985, "step": 1090}, {"loss": 1.7343, "grad_norm": 0.29304224252700806, "learning_rate": 0.0002, "epoch": 0.7495741056218058, "step": 1100}, {"loss": 1.8082, "grad_norm": 0.27529507875442505, "learning_rate": 0.0002, "epoch": 0.7563884156729132, "step": 1110}, {"loss": 1.8071, "grad_norm": 0.32319945096969604, "learning_rate": 0.0002, "epoch": 0.7632027257240205, "step": 1120}, {"loss": 1.6998, "grad_norm": 0.33777597546577454, "learning_rate": 0.0002, "epoch": 0.7700170357751278, "step": 1130}, {"loss": 1.8488, "grad_norm": 0.29847201704978943, "learning_rate": 0.0002, "epoch": 0.7768313458262351, "step": 1140}, {"loss": 1.7014, "grad_norm": 0.31370633840560913, "learning_rate": 0.0002, "epoch": 0.7836456558773425, "step": 1150}, {"loss": 1.8175, "grad_norm": 0.31633856892585754, "learning_rate": 0.0002, "epoch": 0.7904599659284497, "step": 1160}, {"loss": 1.7637, "grad_norm": 0.33849263191223145, "learning_rate": 0.0002, "epoch": 0.797274275979557, "step": 1170}, {"loss": 1.8264, "grad_norm": 0.3306333124637604, "learning_rate": 0.0002, "epoch": 0.8040885860306644, "step": 1180}, {"loss": 1.777, "grad_norm": 0.3249678313732147, "learning_rate": 0.0002, "epoch": 0.8109028960817717, "step": 1190}, {"loss": 1.7631, "grad_norm": 0.3252817690372467, "learning_rate": 0.0002, "epoch": 0.817717206132879, "step": 1200}, {"loss": 1.7877, "grad_norm": 0.31772559881210327, "learning_rate": 0.0002, "epoch": 0.8245315161839863, "step": 1210}, {"loss": 1.8068, "grad_norm": 0.2803371846675873, "learning_rate": 0.0002, "epoch": 0.8313458262350937, "step": 1220}, {"loss": 1.738, "grad_norm": 0.26700571179389954, "learning_rate": 0.0002, "epoch": 0.838160136286201, "step": 1230}, {"loss": 1.8301, "grad_norm": 0.3060092031955719, "learning_rate": 0.0002, "epoch": 0.8449744463373083, "step": 1240}, {"loss": 1.7993, "grad_norm": 0.28831684589385986, "learning_rate": 0.0002, "epoch": 0.8517887563884157, "step": 1250}, {"loss": 1.6909, "grad_norm": 0.30708742141723633, "learning_rate": 0.0002, "epoch": 0.858603066439523, "step": 1260}, {"loss": 1.8506, "grad_norm": 0.2915987968444824, "learning_rate": 0.0002, "epoch": 0.8654173764906303, "step": 1270}, {"loss": 1.7536, "grad_norm": 0.2893589735031128, "learning_rate": 0.0002, "epoch": 0.8722316865417377, "step": 1280}, {"loss": 1.7437, "grad_norm": 0.29545632004737854, "learning_rate": 0.0002, "epoch": 0.879045996592845, "step": 1290}, {"loss": 1.859, "grad_norm": 0.3354771137237549, "learning_rate": 0.0002, "epoch": 0.8858603066439523, "step": 1300}, {"loss": 1.7644, "grad_norm": 0.37715399265289307, "learning_rate": 0.0002, "epoch": 0.8926746166950597, "step": 1310}, {"loss": 1.7731, "grad_norm": 0.28847193717956543, "learning_rate": 0.0002, "epoch": 0.899488926746167, "step": 1320}, {"loss": 1.9077, "grad_norm": 0.2780889868736267, "learning_rate": 0.0002, "epoch": 0.9063032367972743, "step": 1330}, {"loss": 1.7861, "grad_norm": 0.2714342176914215, "learning_rate": 0.0002, "epoch": 0.9131175468483816, "step": 1340}, {"loss": 1.8, "grad_norm": 0.2950133979320526, "learning_rate": 0.0002, "epoch": 0.919931856899489, "step": 1350}, {"loss": 1.7368, "grad_norm": 0.29097145795822144, "learning_rate": 0.0002, "epoch": 0.9267461669505963, "step": 1360}, {"loss": 1.7864, "grad_norm": 0.32540133595466614, "learning_rate": 0.0002, "epoch": 0.9335604770017035, "step": 1370}, {"loss": 1.7571, "grad_norm": 0.3076636493206024, "learning_rate": 0.0002, "epoch": 0.9403747870528109, "step": 1380}, {"loss": 1.8312, "grad_norm": 0.2962130308151245, "learning_rate": 0.0002, "epoch": 0.9471890971039182, "step": 1390}, {"loss": 1.7858, "grad_norm": 0.30086860060691833, "learning_rate": 0.0002, "epoch": 0.9540034071550255, "step": 1400}, {"loss": 1.7991, "grad_norm": 0.28634947538375854, "learning_rate": 0.0002, "epoch": 0.9608177172061328, "step": 1410}, {"loss": 1.8385, "grad_norm": 0.35314416885375977, "learning_rate": 0.0002, "epoch": 0.9676320272572402, "step": 1420}, {"loss": 1.8054, "grad_norm": 0.2939317524433136, "learning_rate": 0.0002, "epoch": 0.9744463373083475, "step": 1430}, {"loss": 1.7582, "grad_norm": 0.3010196089744568, "learning_rate": 0.0002, "epoch": 0.9812606473594548, "step": 1440}, {"loss": 1.758, "grad_norm": 0.30816152691841125, "learning_rate": 0.0002, "epoch": 0.9880749574105622, "step": 1450}, {"loss": 1.8159, "grad_norm": 0.28152793645858765, "learning_rate": 0.0002, "epoch": 0.9948892674616695, "step": 1460}, {"eval_loss": 1.8034634590148926, "eval_runtime": 53.6017, "eval_samples_per_second": 9.459, "eval_steps_per_second": 1.194, "epoch": 0.9996592844974447, "step": 1467}, {"loss": 1.7072, "grad_norm": 0.29246416687965393, "learning_rate": 0.0002, "epoch": 1.0017035775127767, "step": 1470}, {"loss": 1.8159, "grad_norm": 0.2668602168560028, "learning_rate": 0.0002, "epoch": 1.008517887563884, "step": 1480}, {"loss": 1.6868, "grad_norm": 0.2998567819595337, "learning_rate": 0.0002, "epoch": 1.0153321976149914, "step": 1490}, {"loss": 1.7331, "grad_norm": 0.3284934461116791, "learning_rate": 0.0002, "epoch": 1.0221465076660987, "step": 1500}, {"loss": 1.689, "grad_norm": 0.3275827169418335, "learning_rate": 0.0002, "epoch": 1.028960817717206, "step": 1510}, {"loss": 1.7092, "grad_norm": 0.3382718563079834, "learning_rate": 0.0002, "epoch": 1.0357751277683134, "step": 1520}, {"loss": 1.7215, "grad_norm": 0.36737215518951416, "learning_rate": 0.0002, "epoch": 1.0425894378194207, "step": 1530}, {"loss": 1.6993, "grad_norm": 0.3442603647708893, "learning_rate": 0.0002, "epoch": 1.049403747870528, "step": 1540}, {"loss": 1.6996, "grad_norm": 0.3323381245136261, "learning_rate": 0.0002, "epoch": 1.0562180579216354, "step": 1550}, {"loss": 1.7444, "grad_norm": 0.341227650642395, "learning_rate": 0.0002, "epoch": 1.0630323679727427, "step": 1560}, {"loss": 1.7419, "grad_norm": 0.3361579477787018, "learning_rate": 0.0002, "epoch": 1.06984667802385, "step": 1570}, {"loss": 1.7337, "grad_norm": 0.3556230962276459, "learning_rate": 0.0002, "epoch": 1.0766609880749574, "step": 1580}, {"loss": 1.6978, "grad_norm": 0.27130424976348877, "learning_rate": 0.0002, "epoch": 1.0834752981260647, "step": 1590}, {"loss": 1.6087, "grad_norm": 0.29366323351860046, "learning_rate": 0.0002, "epoch": 1.090289608177172, "step": 1600}, {"loss": 1.6721, "grad_norm": 0.3581245541572571, "learning_rate": 0.0002, "epoch": 1.0971039182282794, "step": 1610}, {"loss": 1.7639, "grad_norm": 0.3021670877933502, "learning_rate": 0.0002, "epoch": 1.1039182282793867, "step": 1620}, {"loss": 1.7314, "grad_norm": 0.3145572543144226, "learning_rate": 0.0002, "epoch": 1.110732538330494, "step": 1630}, {"loss": 1.7408, "grad_norm": 0.35362154245376587, "learning_rate": 0.0002, "epoch": 1.1175468483816013, "step": 1640}, {"loss": 1.7071, "grad_norm": 0.5413113236427307, "learning_rate": 0.0002, "epoch": 1.1243611584327087, "step": 1650}, {"loss": 1.7615, "grad_norm": 0.6858654022216797, "learning_rate": 0.0002, "epoch": 1.131175468483816, "step": 1660}, {"loss": 1.7029, "grad_norm": 0.3781903386116028, "learning_rate": 0.0002, "epoch": 1.1379897785349233, "step": 1670}, {"loss": 1.659, "grad_norm": 0.404864102602005, "learning_rate": 0.0002, "epoch": 1.1448040885860307, "step": 1680}, {"loss": 1.7212, "grad_norm": 0.3595100939273834, "learning_rate": 0.0002, "epoch": 1.151618398637138, "step": 1690}, {"loss": 1.7023, "grad_norm": 0.33682283759117126, "learning_rate": 0.0002, "epoch": 1.1584327086882453, "step": 1700}, {"loss": 1.7336, "grad_norm": 0.3877373933792114, "learning_rate": 0.0002, "epoch": 1.1652470187393527, "step": 1710}, {"loss": 1.7676, "grad_norm": 0.34606897830963135, "learning_rate": 0.0002, "epoch": 1.17206132879046, "step": 1720}, {"loss": 1.6889, "grad_norm": 0.3122918903827667, "learning_rate": 0.0002, "epoch": 1.1788756388415673, "step": 1730}, {"loss": 1.6585, "grad_norm": 0.34081900119781494, "learning_rate": 0.0002, "epoch": 1.1856899488926746, "step": 1740}, {"loss": 1.7794, "grad_norm": 0.3418807089328766, "learning_rate": 0.0002, "epoch": 1.192504258943782, "step": 1750}, {"loss": 1.7773, "grad_norm": 0.3495500981807709, "learning_rate": 0.0002, "epoch": 1.1993185689948893, "step": 1760}, {"loss": 1.6578, "grad_norm": 0.557288408279419, "learning_rate": 0.0002, "epoch": 1.2061328790459966, "step": 1770}, {"loss": 1.7488, "grad_norm": 0.3193778693675995, "learning_rate": 0.0002, "epoch": 1.212947189097104, "step": 1780}, {"loss": 1.6815, "grad_norm": 0.3306216895580292, "learning_rate": 0.0002, "epoch": 1.2197614991482113, "step": 1790}, {"loss": 1.6772, "grad_norm": 0.37998732924461365, "learning_rate": 0.0002, "epoch": 1.2265758091993186, "step": 1800}, {"loss": 1.7094, "grad_norm": 0.3255669176578522, "learning_rate": 0.0002, "epoch": 1.233390119250426, "step": 1810}, {"loss": 1.7096, "grad_norm": 0.3741287291049957, "learning_rate": 0.0002, "epoch": 1.2402044293015333, "step": 1820}, {"loss": 1.7317, "grad_norm": 0.36727291345596313, "learning_rate": 0.0002, "epoch": 1.2470187393526406, "step": 1830}, {"loss": 1.7418, "grad_norm": 0.3479527235031128, "learning_rate": 0.0002, "epoch": 1.253833049403748, "step": 1840}, {"loss": 1.7062, "grad_norm": 0.3472636640071869, "learning_rate": 0.0002, "epoch": 1.2606473594548553, "step": 1850}, {"loss": 1.6263, "grad_norm": 0.3702869415283203, "learning_rate": 0.0002, "epoch": 1.2674616695059626, "step": 1860}, {"loss": 1.6222, "grad_norm": 0.3934040069580078, "learning_rate": 0.0002, "epoch": 1.27427597955707, "step": 1870}, {"loss": 1.7616, "grad_norm": 0.46887534856796265, "learning_rate": 0.0002, "epoch": 1.2810902896081773, "step": 1880}, {"loss": 1.6071, "grad_norm": 0.3191998600959778, "learning_rate": 0.0002, "epoch": 1.2879045996592846, "step": 1890}, {"loss": 1.7001, "grad_norm": 0.34032225608825684, "learning_rate": 0.0002, "epoch": 1.294718909710392, "step": 1900}, {"loss": 1.8316, "grad_norm": 0.33453696966171265, "learning_rate": 0.0002, "epoch": 1.3015332197614993, "step": 1910}, {"loss": 1.6567, "grad_norm": 0.3451494872570038, "learning_rate": 0.0002, "epoch": 1.3083475298126066, "step": 1920}, {"loss": 1.708, "grad_norm": 0.36203092336654663, "learning_rate": 0.0002, "epoch": 1.315161839863714, "step": 1930}, {"loss": 1.7095, "grad_norm": 0.43794456124305725, "learning_rate": 0.0002, "epoch": 1.321976149914821, "step": 1940}, {"loss": 1.7264, "grad_norm": 0.3630591034889221, "learning_rate": 0.0002, "epoch": 1.3287904599659284, "step": 1950}, {"loss": 1.6529, "grad_norm": 0.36951911449432373, "learning_rate": 0.0002, "epoch": 1.3356047700170357, "step": 1960}, {"loss": 1.651, "grad_norm": 0.4001159965991974, "learning_rate": 0.0002, "epoch": 1.342419080068143, "step": 1970}, {"loss": 1.677, "grad_norm": 0.3820836544036865, "learning_rate": 0.0002, "epoch": 1.3492333901192504, "step": 1980}, {"loss": 1.583, "grad_norm": 0.3705870807170868, "learning_rate": 0.0002, "epoch": 1.3560477001703577, "step": 1990}, {"loss": 1.6207, "grad_norm": 0.3557972013950348, "learning_rate": 0.0002, "epoch": 1.362862010221465, "step": 2000}, {"loss": 1.7656, "grad_norm": 0.38546398282051086, "learning_rate": 0.0002, "epoch": 1.3696763202725724, "step": 2010}, {"loss": 1.6881, "grad_norm": 0.3908020853996277, "learning_rate": 0.0002, "epoch": 1.3764906303236797, "step": 2020}, {"loss": 1.727, "grad_norm": 0.3822040855884552, "learning_rate": 0.0002, "epoch": 1.383304940374787, "step": 2030}, {"loss": 1.7923, "grad_norm": 0.425327867269516, "learning_rate": 0.0002, "epoch": 1.3901192504258943, "step": 2040}, {"loss": 1.7032, "grad_norm": 0.3436259329319, "learning_rate": 0.0002, "epoch": 1.3969335604770017, "step": 2050}, {"loss": 1.6398, "grad_norm": 0.33124062418937683, "learning_rate": 0.0002, "epoch": 1.403747870528109, "step": 2060}, {"loss": 1.6815, "grad_norm": 0.3662424683570862, "learning_rate": 0.0002, "epoch": 1.4105621805792163, "step": 2070}, {"loss": 1.7258, "grad_norm": 0.3720635175704956, "learning_rate": 0.0002, "epoch": 1.4173764906303237, "step": 2080}, {"loss": 1.7186, "grad_norm": 0.3361680805683136, "learning_rate": 0.0002, "epoch": 1.424190800681431, "step": 2090}, {"loss": 1.7606, "grad_norm": 0.32751724123954773, "learning_rate": 0.0002, "epoch": 1.4310051107325383, "step": 2100}, {"loss": 1.7051, "grad_norm": 0.34333378076553345, "learning_rate": 0.0002, "epoch": 1.4378194207836457, "step": 2110}, {"loss": 1.6979, "grad_norm": 0.37777671217918396, "learning_rate": 0.0002, "epoch": 1.444633730834753, "step": 2120}, {"loss": 1.7085, "grad_norm": 0.37126365303993225, "learning_rate": 0.0002, "epoch": 1.4514480408858603, "step": 2130}, {"loss": 1.721, "grad_norm": 0.3602267801761627, "learning_rate": 0.0002, "epoch": 1.4582623509369677, "step": 2140}, {"loss": 1.8148, "grad_norm": 0.3287110924720764, "learning_rate": 0.0002, "epoch": 1.465076660988075, "step": 2150}, {"loss": 1.6966, "grad_norm": 0.3562135100364685, "learning_rate": 0.0002, "epoch": 1.4718909710391823, "step": 2160}, {"loss": 1.713, "grad_norm": 0.38292962312698364, "learning_rate": 0.0002, "epoch": 1.4787052810902896, "step": 2170}, {"loss": 1.7036, "grad_norm": 0.38220319151878357, "learning_rate": 0.0002, "epoch": 1.485519591141397, "step": 2180}, {"loss": 1.7297, "grad_norm": 0.3570062220096588, "learning_rate": 0.0002, "epoch": 1.4923339011925043, "step": 2190}, {"loss": 1.5652, "grad_norm": 0.363146036863327, "learning_rate": 0.0002, "epoch": 1.4991482112436116, "step": 2200}, {"loss": 1.6716, "grad_norm": 0.37393274903297424, "learning_rate": 0.0002, "epoch": 1.5059625212947187, "step": 2210}, {"loss": 1.6839, "grad_norm": 0.3628501892089844, "learning_rate": 0.0002, "epoch": 1.512776831345826, "step": 2220}, {"loss": 1.6627, "grad_norm": 0.33430740237236023, "learning_rate": 0.0002, "epoch": 1.5195911413969334, "step": 2230}, {"loss": 1.7495, "grad_norm": 0.35363978147506714, "learning_rate": 0.0002, "epoch": 1.5264054514480407, "step": 2240}, {"loss": 1.6364, "grad_norm": 0.37220337986946106, "learning_rate": 0.0002, "epoch": 1.533219761499148, "step": 2250}, {"loss": 1.7773, "grad_norm": 0.35020262002944946, "learning_rate": 0.0002, "epoch": 1.5400340715502554, "step": 2260}, {"loss": 1.7453, "grad_norm": 0.35274937748908997, "learning_rate": 0.0002, "epoch": 1.5468483816013627, "step": 2270}, {"loss": 1.7162, "grad_norm": 0.3797738552093506, "learning_rate": 0.0002, "epoch": 1.55366269165247, "step": 2280}, {"loss": 1.6197, "grad_norm": 0.4160412847995758, "learning_rate": 0.0002, "epoch": 1.5604770017035774, "step": 2290}, {"loss": 1.7101, "grad_norm": 0.38779592514038086, "learning_rate": 0.0002, "epoch": 1.5672913117546847, "step": 2300}, {"loss": 1.7254, "grad_norm": 0.39171287417411804, "learning_rate": 0.0002, "epoch": 1.574105621805792, "step": 2310}, {"loss": 1.7087, "grad_norm": 0.3606826663017273, "learning_rate": 0.0002, "epoch": 1.5809199318568994, "step": 2320}, {"loss": 1.7269, "grad_norm": 0.3745017647743225, "learning_rate": 0.0002, "epoch": 1.5877342419080067, "step": 2330}, {"loss": 1.7068, "grad_norm": 0.34933462738990784, "learning_rate": 0.0002, "epoch": 1.594548551959114, "step": 2340}, {"loss": 1.7073, "grad_norm": 0.37268444895744324, "learning_rate": 0.0002, "epoch": 1.6013628620102214, "step": 2350}, {"loss": 1.728, "grad_norm": 0.4603484869003296, "learning_rate": 0.0002, "epoch": 1.6081771720613287, "step": 2360}, {"loss": 1.7621, "grad_norm": 0.35689088702201843, "learning_rate": 0.0002, "epoch": 1.614991482112436, "step": 2370}, {"loss": 1.6989, "grad_norm": 0.3392031490802765, "learning_rate": 0.0002, "epoch": 1.6218057921635434, "step": 2380}, {"loss": 1.7077, "grad_norm": 0.394653856754303, "learning_rate": 0.0002, "epoch": 1.6286201022146507, "step": 2390}, {"loss": 1.7448, "grad_norm": 0.33972012996673584, "learning_rate": 0.0002, "epoch": 1.635434412265758, "step": 2400}, {"loss": 1.7681, "grad_norm": 0.3854375183582306, "learning_rate": 0.0002, "epoch": 1.6422487223168654, "step": 2410}, {"loss": 1.7102, "grad_norm": 0.36143961548805237, "learning_rate": 0.0002, "epoch": 1.6490630323679727, "step": 2420}, {"loss": 1.7432, "grad_norm": 0.35816189646720886, "learning_rate": 0.0002, "epoch": 1.65587734241908, "step": 2430}, {"loss": 1.6407, "grad_norm": 0.36298736929893494, "learning_rate": 0.0002, "epoch": 1.6626916524701874, "step": 2440}, {"loss": 1.723, "grad_norm": 0.36756929755210876, "learning_rate": 0.0002, "epoch": 1.6695059625212947, "step": 2450}, {"loss": 1.6824, "grad_norm": 0.35969603061676025, "learning_rate": 0.0002, "epoch": 1.676320272572402, "step": 2460}, {"loss": 1.7014, "grad_norm": 0.38449376821517944, "learning_rate": 0.0002, "epoch": 1.6831345826235093, "step": 2470}, {"loss": 1.7261, "grad_norm": 0.44511452317237854, "learning_rate": 0.0002, "epoch": 1.6899488926746167, "step": 2480}, {"loss": 1.7397, "grad_norm": 0.3831416368484497, "learning_rate": 0.0002, "epoch": 1.696763202725724, "step": 2490}, {"loss": 1.7046, "grad_norm": 0.3795325756072998, "learning_rate": 0.0002, "epoch": 1.7035775127768313, "step": 2500}, {"loss": 1.6993, "grad_norm": 0.34978193044662476, "learning_rate": 0.0002, "epoch": 1.7103918228279387, "step": 2510}, {"loss": 1.8307, "grad_norm": 0.35923877358436584, "learning_rate": 0.0002, "epoch": 1.717206132879046, "step": 2520}, {"loss": 1.6253, "grad_norm": 0.352999746799469, "learning_rate": 0.0002, "epoch": 1.7240204429301533, "step": 2530}, {"loss": 1.6953, "grad_norm": 0.43673479557037354, "learning_rate": 0.0002, "epoch": 1.7308347529812607, "step": 2540}, {"loss": 1.7079, "grad_norm": 0.4153687357902527, "learning_rate": 0.0002, "epoch": 1.737649063032368, "step": 2550}, {"loss": 1.6714, "grad_norm": 0.35541167855262756, "learning_rate": 0.0002, "epoch": 1.7444633730834753, "step": 2560}, {"loss": 1.696, "grad_norm": 0.3288775086402893, "learning_rate": 0.0002, "epoch": 1.7512776831345827, "step": 2570}, {"loss": 1.7486, "grad_norm": 0.3991123139858246, "learning_rate": 0.0002, "epoch": 1.75809199318569, "step": 2580}, {"loss": 1.645, "grad_norm": 0.39967241883277893, "learning_rate": 0.0002, "epoch": 1.7649063032367973, "step": 2590}, {"loss": 1.6855, "grad_norm": 0.41104283928871155, "learning_rate": 0.0002, "epoch": 1.7717206132879046, "step": 2600}, {"loss": 1.6993, "grad_norm": 0.44885286688804626, "learning_rate": 0.0002, "epoch": 1.778534923339012, "step": 2610}, {"loss": 1.7224, "grad_norm": 0.38384467363357544, "learning_rate": 0.0002, "epoch": 1.7853492333901193, "step": 2620}, {"loss": 1.7213, "grad_norm": 0.35020917654037476, "learning_rate": 0.0002, "epoch": 1.7921635434412266, "step": 2630}, {"loss": 1.6706, "grad_norm": 0.3360341489315033, "learning_rate": 0.0002, "epoch": 1.798977853492334, "step": 2640}, {"loss": 1.7037, "grad_norm": 0.38875144720077515, "learning_rate": 0.0002, "epoch": 1.8057921635434413, "step": 2650}, {"loss": 1.693, "grad_norm": 0.34876883029937744, "learning_rate": 0.0002, "epoch": 1.8126064735945486, "step": 2660}, {"loss": 1.7743, "grad_norm": 0.419979989528656, "learning_rate": 0.0002, "epoch": 1.819420783645656, "step": 2670}, {"loss": 1.7089, "grad_norm": 0.3648919463157654, "learning_rate": 0.0002, "epoch": 1.8262350936967633, "step": 2680}, {"loss": 1.7828, "grad_norm": 0.3485383987426758, "learning_rate": 0.0002, "epoch": 1.8330494037478706, "step": 2690}, {"loss": 1.705, "grad_norm": 0.3647468686103821, "learning_rate": 0.0002, "epoch": 1.839863713798978, "step": 2700}, {"loss": 1.7318, "grad_norm": 0.37003210186958313, "learning_rate": 0.0002, "epoch": 1.8466780238500853, "step": 2710}, {"loss": 1.6647, "grad_norm": 0.37031617760658264, "learning_rate": 0.0002, "epoch": 1.8534923339011926, "step": 2720}, {"loss": 1.69, "grad_norm": 0.3438796103000641, "learning_rate": 0.0002, "epoch": 1.8603066439523, "step": 2730}, {"loss": 1.5995, "grad_norm": 0.41574627161026, "learning_rate": 0.0002, "epoch": 1.8671209540034073, "step": 2740}, {"loss": 1.6877, "grad_norm": 0.35049930214881897, "learning_rate": 0.0002, "epoch": 1.8739352640545146, "step": 2750}, {"loss": 1.7048, "grad_norm": 0.3943989872932434, "learning_rate": 0.0002, "epoch": 1.880749574105622, "step": 2760}, {"loss": 1.7047, "grad_norm": 0.3384978175163269, "learning_rate": 0.0002, "epoch": 1.8875638841567293, "step": 2770}, {"loss": 1.7848, "grad_norm": 0.3501328229904175, "learning_rate": 0.0002, "epoch": 1.8943781942078366, "step": 2780}, {"loss": 1.6986, "grad_norm": 0.37484532594680786, "learning_rate": 0.0002, "epoch": 1.901192504258944, "step": 2790}, {"loss": 1.6867, "grad_norm": 0.34497788548469543, "learning_rate": 0.0002, "epoch": 1.9080068143100513, "step": 2800}, {"loss": 1.6513, "grad_norm": 0.3530851900577545, "learning_rate": 0.0002, "epoch": 1.9148211243611586, "step": 2810}, {"loss": 1.6369, "grad_norm": 0.3879254162311554, "learning_rate": 0.0002, "epoch": 1.921635434412266, "step": 2820}, {"loss": 1.6786, "grad_norm": 0.3885590136051178, "learning_rate": 0.0002, "epoch": 1.9284497444633732, "step": 2830}, {"loss": 1.8049, "grad_norm": 0.3868715465068817, "learning_rate": 0.0002, "epoch": 1.9352640545144804, "step": 2840}, {"loss": 1.7099, "grad_norm": 0.4152422249317169, "learning_rate": 0.0002, "epoch": 1.9420783645655877, "step": 2850}, {"loss": 1.6696, "grad_norm": 0.401714563369751, "learning_rate": 0.0002, "epoch": 1.948892674616695, "step": 2860}, {"loss": 1.7182, "grad_norm": 0.34825265407562256, "learning_rate": 0.0002, "epoch": 1.9557069846678024, "step": 2870}, {"loss": 1.6612, "grad_norm": 0.3620675504207611, "learning_rate": 0.0002, "epoch": 1.9625212947189097, "step": 2880}, {"loss": 1.7451, "grad_norm": 0.3977806866168976, "learning_rate": 0.0002, "epoch": 1.969335604770017, "step": 2890}, {"loss": 1.6514, "grad_norm": 0.3687497079372406, "learning_rate": 0.0002, "epoch": 1.9761499148211243, "step": 2900}, {"loss": 1.7712, "grad_norm": 0.408640056848526, "learning_rate": 0.0002, "epoch": 1.9829642248722317, "step": 2910}, {"loss": 1.695, "grad_norm": 0.34510108828544617, "learning_rate": 0.0002, "epoch": 1.989778534923339, "step": 2920}, {"loss": 1.662, "grad_norm": 0.3596334755420685, "learning_rate": 0.0002, "epoch": 1.9965928449744463, "step": 2930}, {"eval_loss": 1.8056600093841553, "eval_runtime": 60.7049, "eval_samples_per_second": 8.352, "eval_steps_per_second": 1.054, "epoch": 2.0, "step": 2935}, {"loss": 1.6918, "grad_norm": 0.3460802137851715, "learning_rate": 0.0002, "epoch": 2.0034071550255534, "step": 2940}, {"loss": 1.6036, "grad_norm": 0.4038620591163635, "learning_rate": 0.0002, "epoch": 2.0102214650766608, "step": 2950}, {"loss": 1.5386, "grad_norm": 0.3950219750404358, "learning_rate": 0.0002, "epoch": 2.017035775127768, "step": 2960}, {"loss": 1.5649, "grad_norm": 0.519116997718811, "learning_rate": 0.0002, "epoch": 2.0238500851788754, "step": 2970}, {"loss": 1.5397, "grad_norm": 0.4097684919834137, "learning_rate": 0.0002, "epoch": 2.0306643952299828, "step": 2980}, {"loss": 1.6044, "grad_norm": 0.4153544306755066, "learning_rate": 0.0002, "epoch": 2.03747870528109, "step": 2990}, {"loss": 1.6006, "grad_norm": 0.4351160526275635, "learning_rate": 0.0002, "epoch": 2.0442930153321974, "step": 3000}, {"loss": 1.5091, "grad_norm": 0.42036259174346924, "learning_rate": 0.0002, "epoch": 2.0511073253833048, "step": 3010}, {"loss": 1.5686, "grad_norm": 0.4433218836784363, "learning_rate": 0.0002, "epoch": 2.057921635434412, "step": 3020}, {"loss": 1.5478, "grad_norm": 0.46511581540107727, "learning_rate": 0.0002, "epoch": 2.0647359454855194, "step": 3030}, {"loss": 1.5554, "grad_norm": 0.4567560851573944, "learning_rate": 0.0002, "epoch": 2.0715502555366268, "step": 3040}, {"loss": 1.5561, "grad_norm": 0.45671048760414124, "learning_rate": 0.0002, "epoch": 2.078364565587734, "step": 3050}, {"loss": 1.606, "grad_norm": 0.4598552882671356, "learning_rate": 0.0002, "epoch": 2.0851788756388414, "step": 3060}, {"loss": 1.6357, "grad_norm": 0.4582861661911011, "learning_rate": 0.0002, "epoch": 2.0919931856899487, "step": 3070}, {"loss": 1.5853, "grad_norm": 0.4366969168186188, "learning_rate": 0.0002, "epoch": 2.098807495741056, "step": 3080}, {"loss": 1.5574, "grad_norm": 0.495917409658432, "learning_rate": 0.0002, "epoch": 2.1056218057921634, "step": 3090}, {"loss": 1.5798, "grad_norm": 1.6846044063568115, "learning_rate": 0.0002, "epoch": 2.1124361158432707, "step": 3100}, {"loss": 1.5877, "grad_norm": 0.4765092134475708, "learning_rate": 0.0002, "epoch": 2.119250425894378, "step": 3110}, {"loss": 1.6106, "grad_norm": 0.45029810070991516, "learning_rate": 0.0002, "epoch": 2.1260647359454854, "step": 3120}, {"loss": 1.5707, "grad_norm": 0.5706973075866699, "learning_rate": 0.0002, "epoch": 2.1328790459965927, "step": 3130}, {"loss": 1.589, "grad_norm": 0.4606274366378784, "learning_rate": 0.0002, "epoch": 2.1396933560477, "step": 3140}, {"loss": 1.564, "grad_norm": 2.199115753173828, "learning_rate": 0.0002, "epoch": 2.1465076660988074, "step": 3150}, {"loss": 1.5808, "grad_norm": 0.6231027245521545, "learning_rate": 0.0002, "epoch": 2.1533219761499147, "step": 3160}, {"loss": 1.5397, "grad_norm": 0.46918219327926636, "learning_rate": 0.0002, "epoch": 2.160136286201022, "step": 3170}, {"loss": 1.6082, "grad_norm": 0.5006393194198608, "learning_rate": 0.0002, "epoch": 2.1669505962521294, "step": 3180}, {"loss": 1.591, "grad_norm": 0.4745093286037445, "learning_rate": 0.0002, "epoch": 2.1737649063032367, "step": 3190}, {"loss": 1.6459, "grad_norm": 0.511576771736145, "learning_rate": 0.0002, "epoch": 2.180579216354344, "step": 3200}, {"loss": 1.6019, "grad_norm": 0.38622918725013733, "learning_rate": 0.0002, "epoch": 2.1873935264054514, "step": 3210}, {"loss": 1.6187, "grad_norm": 0.4425644278526306, "learning_rate": 0.0002, "epoch": 2.1942078364565587, "step": 3220}, {"loss": 1.6114, "grad_norm": 0.45643091201782227, "learning_rate": 0.0002, "epoch": 2.201022146507666, "step": 3230}, {"loss": 1.6224, "grad_norm": 0.4975406229496002, "learning_rate": 0.0002, "epoch": 2.2078364565587734, "step": 3240}, {"loss": 1.6654, "grad_norm": 0.4673331081867218, "learning_rate": 0.0002, "epoch": 2.2146507666098807, "step": 3250}, {"loss": 1.6155, "grad_norm": 0.5081731081008911, "learning_rate": 0.0002, "epoch": 2.221465076660988, "step": 3260}, {"loss": 1.53, "grad_norm": 0.4790806770324707, "learning_rate": 0.0002, "epoch": 2.2282793867120954, "step": 3270}, {"loss": 1.6362, "grad_norm": 0.5184140801429749, "learning_rate": 0.0002, "epoch": 2.2350936967632027, "step": 3280}, {"loss": 1.5646, "grad_norm": 0.5159083604812622, "learning_rate": 0.0002, "epoch": 2.24190800681431, "step": 3290}, {"loss": 1.6577, "grad_norm": 0.4876042604446411, "learning_rate": 0.0002, "epoch": 2.2487223168654173, "step": 3300}, {"loss": 1.5718, "grad_norm": 0.5454957485198975, "learning_rate": 0.0002, "epoch": 2.2555366269165247, "step": 3310}, {"loss": 1.5885, "grad_norm": 0.49866822361946106, "learning_rate": 0.0002, "epoch": 2.262350936967632, "step": 3320}, {"loss": 1.5923, "grad_norm": 0.42674365639686584, "learning_rate": 0.0002, "epoch": 2.2691652470187393, "step": 3330}, {"loss": 1.5588, "grad_norm": 0.5202316641807556, "learning_rate": 0.0002, "epoch": 2.2759795570698467, "step": 3340}, {"loss": 1.6032, "grad_norm": 0.4849465489387512, "learning_rate": 0.0002, "epoch": 2.282793867120954, "step": 3350}, {"loss": 1.6853, "grad_norm": 0.47202569246292114, "learning_rate": 0.0002, "epoch": 2.2896081771720613, "step": 3360}, {"loss": 1.6164, "grad_norm": 0.5311620235443115, "learning_rate": 0.0002, "epoch": 2.2964224872231687, "step": 3370}, {"loss": 1.6004, "grad_norm": 0.49011409282684326, "learning_rate": 0.0002, "epoch": 2.303236797274276, "step": 3380}, {"loss": 1.5484, "grad_norm": 0.4789247512817383, "learning_rate": 0.0002, "epoch": 2.3100511073253833, "step": 3390}, {"loss": 1.5866, "grad_norm": 0.46646103262901306, "learning_rate": 0.0002, "epoch": 2.3168654173764907, "step": 3400}, {"loss": 1.5308, "grad_norm": 0.5552441477775574, "learning_rate": 0.0002, "epoch": 2.323679727427598, "step": 3410}, {"loss": 1.5761, "grad_norm": 0.4530351758003235, "learning_rate": 0.0002, "epoch": 2.3304940374787053, "step": 3420}, {"loss": 1.5919, "grad_norm": 0.4806232750415802, "learning_rate": 0.0002, "epoch": 2.3373083475298126, "step": 3430}, {"loss": 1.5569, "grad_norm": 0.5998363494873047, "learning_rate": 0.0002, "epoch": 2.34412265758092, "step": 3440}, {"loss": 1.513, "grad_norm": 0.4918554425239563, "learning_rate": 0.0002, "epoch": 2.3509369676320273, "step": 3450}, {"loss": 1.6323, "grad_norm": 0.5359559655189514, "learning_rate": 0.0002, "epoch": 2.3577512776831346, "step": 3460}, {"loss": 1.5973, "grad_norm": 0.5053277611732483, "learning_rate": 0.0002, "epoch": 2.364565587734242, "step": 3470}, {"loss": 1.5673, "grad_norm": 0.5058915019035339, "learning_rate": 0.0002, "epoch": 2.3713798977853493, "step": 3480}, {"loss": 1.5434, "grad_norm": 0.5314899682998657, "learning_rate": 0.0002, "epoch": 2.3781942078364566, "step": 3490}, {"loss": 1.5203, "grad_norm": 0.48035913705825806, "learning_rate": 0.0002, "epoch": 2.385008517887564, "step": 3500}, {"loss": 1.5936, "grad_norm": 0.45864903926849365, "learning_rate": 0.0002, "epoch": 2.3918228279386713, "step": 3510}, {"loss": 1.6285, "grad_norm": 0.4553050398826599, "learning_rate": 0.0002, "epoch": 2.3986371379897786, "step": 3520}, {"loss": 1.5444, "grad_norm": 0.4483442008495331, "learning_rate": 0.0002, "epoch": 2.405451448040886, "step": 3530}, {"loss": 1.587, "grad_norm": 0.5043742060661316, "learning_rate": 0.0002, "epoch": 2.4122657580919933, "step": 3540}, {"loss": 1.5692, "grad_norm": 0.44807168841362, "learning_rate": 0.0002, "epoch": 2.4190800681431006, "step": 3550}, {"loss": 1.6306, "grad_norm": 0.5065137147903442, "learning_rate": 0.0002, "epoch": 2.425894378194208, "step": 3560}, {"loss": 1.5842, "grad_norm": 0.5186443328857422, "learning_rate": 0.0002, "epoch": 2.4327086882453153, "step": 3570}, {"loss": 1.5956, "grad_norm": 0.49743232131004333, "learning_rate": 0.0002, "epoch": 2.4395229982964226, "step": 3580}, {"loss": 1.6021, "grad_norm": 0.524450421333313, "learning_rate": 0.0002, "epoch": 2.44633730834753, "step": 3590}, {"loss": 1.6283, "grad_norm": 0.5053797364234924, "learning_rate": 0.0002, "epoch": 2.4531516183986373, "step": 3600}, {"loss": 1.6335, "grad_norm": 0.5223091840744019, "learning_rate": 0.0002, "epoch": 2.4599659284497446, "step": 3610}, {"loss": 1.6315, "grad_norm": 0.4763810932636261, "learning_rate": 0.0002, "epoch": 2.466780238500852, "step": 3620}, {"loss": 1.5623, "grad_norm": 0.5097282528877258, "learning_rate": 0.0002, "epoch": 2.4735945485519593, "step": 3630}, {"loss": 1.605, "grad_norm": 0.5831942558288574, "learning_rate": 0.0002, "epoch": 2.4804088586030666, "step": 3640}, {"loss": 1.6074, "grad_norm": 0.47573572397232056, "learning_rate": 0.0002, "epoch": 2.487223168654174, "step": 3650}, {"loss": 1.6411, "grad_norm": 0.49602726101875305, "learning_rate": 0.0002, "epoch": 2.4940374787052813, "step": 3660}, {"loss": 1.571, "grad_norm": 0.5468524694442749, "learning_rate": 0.0002, "epoch": 2.500851788756388, "step": 3670}, {"loss": 1.5782, "grad_norm": 0.45899373292922974, "learning_rate": 0.0002, "epoch": 2.507666098807496, "step": 3680}, {"loss": 1.5114, "grad_norm": 0.5031567215919495, "learning_rate": 0.0002, "epoch": 2.514480408858603, "step": 3690}, {"loss": 1.538, "grad_norm": 0.5224900841712952, "learning_rate": 0.0002, "epoch": 2.5212947189097106, "step": 3700}, {"loss": 1.5269, "grad_norm": 0.504769504070282, "learning_rate": 0.0002, "epoch": 2.5281090289608175, "step": 3710}, {"loss": 1.5141, "grad_norm": 0.6120529770851135, "learning_rate": 0.0002, "epoch": 2.5349233390119252, "step": 3720}, {"loss": 1.5666, "grad_norm": 0.47930678725242615, "learning_rate": 0.0002, "epoch": 2.541737649063032, "step": 3730}, {"loss": 1.6156, "grad_norm": 0.5039092302322388, "learning_rate": 0.0002, "epoch": 2.54855195911414, "step": 3740}, {"loss": 1.5585, "grad_norm": 0.49758994579315186, "learning_rate": 0.0002, "epoch": 2.555366269165247, "step": 3750}, {"loss": 1.5351, "grad_norm": 0.44739171862602234, "learning_rate": 0.0002, "epoch": 2.5621805792163546, "step": 3760}, {"loss": 1.5099, "grad_norm": 0.47056373953819275, "learning_rate": 0.0002, "epoch": 2.5689948892674614, "step": 3770}, {"loss": 1.5524, "grad_norm": 0.5077595114707947, "learning_rate": 0.0002, "epoch": 2.575809199318569, "step": 3780}, {"loss": 1.5524, "grad_norm": 0.4981902837753296, "learning_rate": 0.0002, "epoch": 2.582623509369676, "step": 3790}, {"loss": 1.5381, "grad_norm": 0.5736238360404968, "learning_rate": 0.0002, "epoch": 2.589437819420784, "step": 3800}, {"loss": 1.67, "grad_norm": 0.4898384213447571, "learning_rate": 0.0002, "epoch": 2.5962521294718908, "step": 3810}, {"loss": 1.5411, "grad_norm": 0.4611325263977051, "learning_rate": 0.0002, "epoch": 2.6030664395229985, "step": 3820}, {"loss": 1.5662, "grad_norm": 0.5285341739654541, "learning_rate": 0.0002, "epoch": 2.6098807495741054, "step": 3830}, {"loss": 1.5875, "grad_norm": 0.5679430961608887, "learning_rate": 0.0002, "epoch": 2.616695059625213, "step": 3840}, {"loss": 1.5544, "grad_norm": 0.48532548546791077, "learning_rate": 0.0002, "epoch": 2.62350936967632, "step": 3850}, {"loss": 1.579, "grad_norm": 0.45506492257118225, "learning_rate": 0.0002, "epoch": 2.630323679727428, "step": 3860}, {"loss": 1.5775, "grad_norm": 0.6552556753158569, "learning_rate": 0.0002, "epoch": 2.6371379897785348, "step": 3870}, {"loss": 1.608, "grad_norm": 0.537874698638916, "learning_rate": 0.0002, "epoch": 2.643952299829642, "step": 3880}, {"loss": 1.5653, "grad_norm": 0.46102389693260193, "learning_rate": 0.0002, "epoch": 2.6507666098807494, "step": 3890}, {"loss": 1.5293, "grad_norm": 0.45531195402145386, "learning_rate": 0.0002, "epoch": 2.6575809199318567, "step": 3900}, {"loss": 1.5492, "grad_norm": 0.5327293872833252, "learning_rate": 0.0002, "epoch": 2.664395229982964, "step": 3910}, {"loss": 1.5921, "grad_norm": 0.4968956410884857, "learning_rate": 0.0002, "epoch": 2.6712095400340714, "step": 3920}, {"loss": 1.5823, "grad_norm": 0.4790082275867462, "learning_rate": 0.0002, "epoch": 2.6780238500851787, "step": 3930}, {"loss": 1.615, "grad_norm": 0.5392967462539673, "learning_rate": 0.0002, "epoch": 2.684838160136286, "step": 3940}, {"loss": 1.6218, "grad_norm": 0.5076649785041809, "learning_rate": 0.0002, "epoch": 2.6916524701873934, "step": 3950}, {"loss": 1.6478, "grad_norm": 0.5628064274787903, "learning_rate": 0.0002, "epoch": 2.6984667802385007, "step": 3960}, {"loss": 1.5417, "grad_norm": 0.5012659430503845, "learning_rate": 0.0002, "epoch": 2.705281090289608, "step": 3970}, {"loss": 1.5339, "grad_norm": 0.4947647452354431, "learning_rate": 0.0002, "epoch": 2.7120954003407154, "step": 3980}, {"loss": 1.5724, "grad_norm": 0.4890969693660736, "learning_rate": 0.0002, "epoch": 2.7189097103918227, "step": 3990}, {"loss": 1.5746, "grad_norm": 0.4471694231033325, "learning_rate": 0.0002, "epoch": 2.72572402044293, "step": 4000}, {"loss": 1.6669, "grad_norm": 0.5116439461708069, "learning_rate": 0.0002, "epoch": 2.7325383304940374, "step": 4010}, {"loss": 1.584, "grad_norm": 0.5720411539077759, "learning_rate": 0.0002, "epoch": 2.7393526405451447, "step": 4020}, {"loss": 1.6151, "grad_norm": 0.5529406070709229, "learning_rate": 0.0002, "epoch": 2.746166950596252, "step": 4030}, {"loss": 1.6296, "grad_norm": 0.5229396820068359, "learning_rate": 0.0002, "epoch": 2.7529812606473594, "step": 4040}, {"loss": 1.5363, "grad_norm": 0.5270276069641113, "learning_rate": 0.0002, "epoch": 2.7597955706984667, "step": 4050}, {"loss": 1.6305, "grad_norm": 0.48413026332855225, "learning_rate": 0.0002, "epoch": 2.766609880749574, "step": 4060}, {"loss": 1.5488, "grad_norm": 0.5145403742790222, "learning_rate": 0.0002, "epoch": 2.7734241908006814, "step": 4070}, {"loss": 1.6023, "grad_norm": 0.48626071214675903, "learning_rate": 0.0002, "epoch": 2.7802385008517887, "step": 4080}, {"loss": 1.6082, "grad_norm": 0.5018984079360962, "learning_rate": 0.0002, "epoch": 2.787052810902896, "step": 4090}, {"loss": 1.7166, "grad_norm": 0.4881938695907593, "learning_rate": 0.0002, "epoch": 2.7938671209540034, "step": 4100}, {"loss": 1.515, "grad_norm": 0.5151546001434326, "learning_rate": 0.0002, "epoch": 2.8006814310051107, "step": 4110}, {"loss": 1.6069, "grad_norm": 0.5109850764274597, "learning_rate": 0.0002, "epoch": 2.807495741056218, "step": 4120}, {"loss": 1.6153, "grad_norm": 0.5109251141548157, "learning_rate": 0.0002, "epoch": 2.8143100511073254, "step": 4130}, {"loss": 1.6365, "grad_norm": 0.5025496482849121, "learning_rate": 0.0002, "epoch": 2.8211243611584327, "step": 4140}, {"loss": 1.6292, "grad_norm": 0.49027004837989807, "learning_rate": 0.0002, "epoch": 2.82793867120954, "step": 4150}, {"loss": 1.5591, "grad_norm": 0.4957362413406372, "learning_rate": 0.0002, "epoch": 2.8347529812606473, "step": 4160}, {"loss": 1.6759, "grad_norm": 0.5159927606582642, "learning_rate": 0.0002, "epoch": 2.8415672913117547, "step": 4170}, {"loss": 1.577, "grad_norm": 0.6040670871734619, "learning_rate": 0.0002, "epoch": 2.848381601362862, "step": 4180}, {"loss": 1.5295, "grad_norm": 0.5489953756332397, "learning_rate": 0.0002, "epoch": 2.8551959114139693, "step": 4190}, {"loss": 1.5909, "grad_norm": 0.5416634678840637, "learning_rate": 0.0002, "epoch": 2.8620102214650767, "step": 4200}, {"loss": 1.6014, "grad_norm": 0.5278245210647583, "learning_rate": 0.0002, "epoch": 2.868824531516184, "step": 4210}, {"loss": 1.4848, "grad_norm": 0.43382319808006287, "learning_rate": 0.0002, "epoch": 2.8756388415672913, "step": 4220}, {"loss": 1.6092, "grad_norm": 0.4724387228488922, "learning_rate": 0.0002, "epoch": 2.8824531516183987, "step": 4230}, {"loss": 1.5748, "grad_norm": 0.49824780225753784, "learning_rate": 0.0002, "epoch": 2.889267461669506, "step": 4240}, {"loss": 1.6163, "grad_norm": 0.5360262989997864, "learning_rate": 0.0002, "epoch": 2.8960817717206133, "step": 4250}, {"loss": 1.5934, "grad_norm": 0.49090322852134705, "learning_rate": 0.0002, "epoch": 2.9028960817717206, "step": 4260}, {"loss": 1.624, "grad_norm": 0.5613328218460083, "learning_rate": 0.0002, "epoch": 2.909710391822828, "step": 4270}, {"loss": 1.5713, "grad_norm": 0.4611356258392334, "learning_rate": 0.0002, "epoch": 2.9165247018739353, "step": 4280}, {"loss": 1.5457, "grad_norm": 0.550897479057312, "learning_rate": 0.0002, "epoch": 2.9233390119250426, "step": 4290}, {"loss": 1.6225, "grad_norm": 0.5089612603187561, "learning_rate": 0.0002, "epoch": 2.93015332197615, "step": 4300}, {"loss": 1.5897, "grad_norm": 0.5210904479026794, "learning_rate": 0.0002, "epoch": 2.9369676320272573, "step": 4310}, {"loss": 1.6198, "grad_norm": 0.5506424903869629, "learning_rate": 0.0002, "epoch": 2.9437819420783646, "step": 4320}, {"loss": 1.6395, "grad_norm": 0.5118561387062073, "learning_rate": 0.0002, "epoch": 2.950596252129472, "step": 4330}, {"loss": 1.704, "grad_norm": 0.5034464597702026, "learning_rate": 0.0002, "epoch": 2.9574105621805793, "step": 4340}, {"loss": 1.6314, "grad_norm": 0.5019990801811218, "learning_rate": 0.0002, "epoch": 2.9642248722316866, "step": 4350}, {"loss": 1.6161, "grad_norm": 0.5423325300216675, "learning_rate": 0.0002, "epoch": 2.971039182282794, "step": 4360}, {"loss": 1.6144, "grad_norm": 0.5287469625473022, "learning_rate": 0.0002, "epoch": 2.9778534923339013, "step": 4370}, {"loss": 1.5227, "grad_norm": 0.5206913352012634, "learning_rate": 0.0002, "epoch": 2.9846678023850086, "step": 4380}, {"loss": 1.6026, "grad_norm": 0.5407394170761108, "learning_rate": 0.0002, "epoch": 2.991482112436116, "step": 4390}, {"loss": 1.5908, "grad_norm": 0.5244600176811218, "learning_rate": 0.0002, "epoch": 2.9982964224872233, "step": 4400}, {"eval_loss": 1.8412635326385498, "eval_runtime": 65.5583, "eval_samples_per_second": 7.734, "eval_steps_per_second": 0.976, "epoch": 2.9996592844974446, "step": 4402}, {"loss": 1.5157, "grad_norm": 0.5172150731086731, "learning_rate": 0.0002, "epoch": 3.0051107325383306, "step": 4410}, {"loss": 1.398, "grad_norm": 0.6882525086402893, "learning_rate": 0.0002, "epoch": 3.011925042589438, "step": 4420}, {"loss": 1.3884, "grad_norm": 0.6435003280639648, "learning_rate": 0.0002, "epoch": 3.0187393526405453, "step": 4430}, {"loss": 1.4493, "grad_norm": 0.7126057147979736, "learning_rate": 0.0002, "epoch": 3.0255536626916526, "step": 4440}, {"loss": 1.4397, "grad_norm": 0.6634385585784912, "learning_rate": 0.0002, "epoch": 3.03236797274276, "step": 4450}, {"loss": 1.3674, "grad_norm": 0.6468435525894165, "learning_rate": 0.0002, "epoch": 3.0391822827938673, "step": 4460}, {"loss": 1.4045, "grad_norm": 0.5690478086471558, "learning_rate": 0.0002, "epoch": 3.0459965928449746, "step": 4470}, {"loss": 1.3742, "grad_norm": 0.7323708534240723, "learning_rate": 0.0002, "epoch": 3.052810902896082, "step": 4480}, {"loss": 1.3281, "grad_norm": 0.6989302039146423, "learning_rate": 0.0002, "epoch": 3.0596252129471893, "step": 4490}, {"loss": 1.379, "grad_norm": 0.6704450845718384, "learning_rate": 0.0002, "epoch": 3.0664395229982966, "step": 4500}, {"loss": 1.4028, "grad_norm": 0.769137442111969, "learning_rate": 0.0002, "epoch": 3.073253833049404, "step": 4510}, {"loss": 1.4295, "grad_norm": 0.6556448936462402, "learning_rate": 0.0002, "epoch": 3.0800681431005112, "step": 4520}, {"loss": 1.2763, "grad_norm": 0.7143950462341309, "learning_rate": 0.0002, "epoch": 3.0868824531516186, "step": 4530}, {"loss": 1.4806, "grad_norm": 0.7060510516166687, "learning_rate": 0.0002, "epoch": 3.093696763202726, "step": 4540}, {"loss": 1.4097, "grad_norm": 0.6637526750564575, "learning_rate": 0.0002, "epoch": 3.1005110732538332, "step": 4550}, {"loss": 1.4752, "grad_norm": 0.822989284992218, "learning_rate": 0.0002, "epoch": 3.1073253833049406, "step": 4560}, {"loss": 1.4994, "grad_norm": 0.5542152523994446, "learning_rate": 0.0002, "epoch": 3.114139693356048, "step": 4570}, {"loss": 1.4306, "grad_norm": 0.7780306935310364, "learning_rate": 0.0002, "epoch": 3.1209540034071552, "step": 4580}, {"loss": 1.3909, "grad_norm": 0.7372637987136841, "learning_rate": 0.0002, "epoch": 3.1277683134582626, "step": 4590}, {"loss": 1.3989, "grad_norm": 0.6730087995529175, "learning_rate": 0.0002, "epoch": 3.1345826235093694, "step": 4600}, {"loss": 1.3591, "grad_norm": 0.6687398552894592, "learning_rate": 0.0002, "epoch": 3.1413969335604772, "step": 4610}, {"loss": 1.436, "grad_norm": 0.7645083665847778, "learning_rate": 0.0002, "epoch": 3.148211243611584, "step": 4620}, {"loss": 1.3681, "grad_norm": 0.6770380139350891, "learning_rate": 0.0002, "epoch": 3.155025553662692, "step": 4630}, {"loss": 1.405, "grad_norm": 0.7200576663017273, "learning_rate": 0.0002, "epoch": 3.1618398637137988, "step": 4640}, {"loss": 1.3752, "grad_norm": 0.6663638949394226, "learning_rate": 0.0002, "epoch": 3.168654173764906, "step": 4650}, {"loss": 1.4099, "grad_norm": 0.6602960228919983, "learning_rate": 0.0002, "epoch": 3.1754684838160134, "step": 4660}, {"loss": 1.4003, "grad_norm": 0.7838228344917297, "learning_rate": 0.0002, "epoch": 3.1822827938671208, "step": 4670}, {"loss": 1.3853, "grad_norm": 0.7559184432029724, "learning_rate": 0.0002, "epoch": 3.189097103918228, "step": 4680}, {"loss": 1.4516, "grad_norm": 0.6609814167022705, "learning_rate": 0.0002, "epoch": 3.1959114139693354, "step": 4690}, {"loss": 1.4464, "grad_norm": 0.8470419645309448, "learning_rate": 0.0002, "epoch": 3.2027257240204428, "step": 4700}, {"loss": 1.428, "grad_norm": 0.7282822728157043, "learning_rate": 0.0002, "epoch": 3.20954003407155, "step": 4710}, {"loss": 1.5261, "grad_norm": 0.6722773313522339, "learning_rate": 0.0002, "epoch": 3.2163543441226574, "step": 4720}, {"loss": 1.3809, "grad_norm": 0.7630265355110168, "learning_rate": 0.0002, "epoch": 3.2231686541737647, "step": 4730}, {"loss": 1.42, "grad_norm": 0.7102773785591125, "learning_rate": 0.0002, "epoch": 3.229982964224872, "step": 4740}, {"loss": 1.3529, "grad_norm": 0.7778299450874329, "learning_rate": 0.0002, "epoch": 3.2367972742759794, "step": 4750}, {"loss": 1.4715, "grad_norm": 0.7189921736717224, "learning_rate": 0.0002, "epoch": 3.2436115843270867, "step": 4760}, {"loss": 1.4328, "grad_norm": 0.7708092331886292, "learning_rate": 0.0002, "epoch": 3.250425894378194, "step": 4770}, {"loss": 1.3855, "grad_norm": 0.7208452224731445, "learning_rate": 0.0002, "epoch": 3.2572402044293014, "step": 4780}, {"loss": 1.3206, "grad_norm": 0.7220432758331299, "learning_rate": 0.0002, "epoch": 3.2640545144804087, "step": 4790}, {"loss": 1.463, "grad_norm": 0.7064954042434692, "learning_rate": 0.0002, "epoch": 3.270868824531516, "step": 4800}, {"loss": 1.4236, "grad_norm": 0.6618382334709167, "learning_rate": 0.0002, "epoch": 3.2776831345826234, "step": 4810}, {"loss": 1.3878, "grad_norm": 0.6854256391525269, "learning_rate": 0.0002, "epoch": 3.2844974446337307, "step": 4820}, {"loss": 1.4236, "grad_norm": 0.6036319136619568, "learning_rate": 0.0002, "epoch": 3.291311754684838, "step": 4830}, {"loss": 1.4796, "grad_norm": 0.714678943157196, "learning_rate": 0.0002, "epoch": 3.2981260647359454, "step": 4840}, {"loss": 1.4273, "grad_norm": 0.7218600511550903, "learning_rate": 0.0002, "epoch": 3.3049403747870527, "step": 4850}, {"loss": 1.3915, "grad_norm": 0.7243074774742126, "learning_rate": 0.0002, "epoch": 3.31175468483816, "step": 4860}, {"loss": 1.4088, "grad_norm": 0.7058630585670471, "learning_rate": 0.0002, "epoch": 3.3185689948892674, "step": 4870}, {"loss": 1.3837, "grad_norm": 0.7091076970100403, "learning_rate": 0.0002, "epoch": 3.3253833049403747, "step": 4880}, {"loss": 1.4745, "grad_norm": 0.7375147342681885, "learning_rate": 0.0002, "epoch": 3.332197614991482, "step": 4890}, {"loss": 1.4826, "grad_norm": 0.9426755309104919, "learning_rate": 0.0002, "epoch": 3.3390119250425894, "step": 4900}, {"loss": 1.369, "grad_norm": 0.6508213877677917, "learning_rate": 0.0002, "epoch": 3.3458262350936967, "step": 4910}, {"loss": 1.3839, "grad_norm": 0.6945043206214905, "learning_rate": 0.0002, "epoch": 3.352640545144804, "step": 4920}, {"loss": 1.3571, "grad_norm": 0.6335888504981995, "learning_rate": 0.0002, "epoch": 3.3594548551959114, "step": 4930}, {"loss": 1.4391, "grad_norm": 0.6947107911109924, "learning_rate": 0.0002, "epoch": 3.3662691652470187, "step": 4940}, {"loss": 1.3885, "grad_norm": 0.8204733729362488, "learning_rate": 0.0002, "epoch": 3.373083475298126, "step": 4950}, {"loss": 1.4886, "grad_norm": 0.7212244868278503, "learning_rate": 0.0002, "epoch": 3.3798977853492334, "step": 4960}, {"loss": 1.4581, "grad_norm": 0.6053042411804199, "learning_rate": 0.0002, "epoch": 3.3867120954003407, "step": 4970}, {"loss": 1.3863, "grad_norm": 0.7820029854774475, "learning_rate": 0.0002, "epoch": 3.393526405451448, "step": 4980}, {"loss": 1.4326, "grad_norm": 0.6866770386695862, "learning_rate": 0.0002, "epoch": 3.4003407155025553, "step": 4990}, {"loss": 1.4287, "grad_norm": 0.6652463674545288, "learning_rate": 0.0002, "epoch": 3.4071550255536627, "step": 5000}, {"loss": 1.3667, "grad_norm": 1.1209032535552979, "learning_rate": 0.0002, "epoch": 3.41396933560477, "step": 5010}, {"loss": 1.4461, "grad_norm": 0.8390814661979675, "learning_rate": 0.0002, "epoch": 3.4207836456558773, "step": 5020}, {"loss": 1.4556, "grad_norm": 0.7541858553886414, "learning_rate": 0.0002, "epoch": 3.4275979557069847, "step": 5030}, {"loss": 1.4245, "grad_norm": 0.6902772784233093, "learning_rate": 0.0002, "epoch": 3.434412265758092, "step": 5040}, {"loss": 1.3953, "grad_norm": 0.7070329785346985, "learning_rate": 0.0002, "epoch": 3.4412265758091993, "step": 5050}, {"loss": 1.3903, "grad_norm": 0.8075643181800842, "learning_rate": 0.0002, "epoch": 3.4480408858603067, "step": 5060}, {"loss": 1.3929, "grad_norm": 0.7133861780166626, "learning_rate": 0.0002, "epoch": 3.454855195911414, "step": 5070}, {"loss": 1.4632, "grad_norm": 0.6631823182106018, "learning_rate": 0.0002, "epoch": 3.4616695059625213, "step": 5080}, {"loss": 1.4162, "grad_norm": 0.673870325088501, "learning_rate": 0.0002, "epoch": 3.4684838160136287, "step": 5090}, {"loss": 1.4247, "grad_norm": 0.6438634395599365, "learning_rate": 0.0002, "epoch": 3.475298126064736, "step": 5100}, {"loss": 1.4421, "grad_norm": 0.7560495734214783, "learning_rate": 0.0002, "epoch": 3.4821124361158433, "step": 5110}, {"loss": 1.4125, "grad_norm": 0.6877814531326294, "learning_rate": 0.0002, "epoch": 3.4889267461669506, "step": 5120}, {"loss": 1.4308, "grad_norm": 0.7031328678131104, "learning_rate": 0.0002, "epoch": 3.495741056218058, "step": 5130}, {"loss": 1.3705, "grad_norm": 0.6797195672988892, "learning_rate": 0.0002, "epoch": 3.5025553662691653, "step": 5140}, {"loss": 1.4687, "grad_norm": 0.6766413450241089, "learning_rate": 0.0002, "epoch": 3.5093696763202726, "step": 5150}, {"loss": 1.4194, "grad_norm": 0.666656494140625, "learning_rate": 0.0002, "epoch": 3.51618398637138, "step": 5160}, {"loss": 1.469, "grad_norm": 0.74996417760849, "learning_rate": 0.0002, "epoch": 3.5229982964224873, "step": 5170}, {"loss": 1.4848, "grad_norm": 0.7370911836624146, "learning_rate": 0.0002, "epoch": 3.5298126064735946, "step": 5180}, {"loss": 1.4523, "grad_norm": 0.9063456654548645, "learning_rate": 0.0002, "epoch": 3.536626916524702, "step": 5190}, {"loss": 1.4726, "grad_norm": 0.6861422657966614, "learning_rate": 0.0002, "epoch": 3.5434412265758093, "step": 5200}, {"loss": 1.4803, "grad_norm": 0.7104039788246155, "learning_rate": 0.0002, "epoch": 3.5502555366269166, "step": 5210}, {"loss": 1.4313, "grad_norm": 0.6578653454780579, "learning_rate": 0.0002, "epoch": 3.557069846678024, "step": 5220}, {"loss": 1.4596, "grad_norm": 0.7336562275886536, "learning_rate": 0.0002, "epoch": 3.5638841567291313, "step": 5230}, {"loss": 1.4591, "grad_norm": 0.7163010835647583, "learning_rate": 0.0002, "epoch": 3.5706984667802386, "step": 5240}, {"loss": 1.3814, "grad_norm": 0.8112391233444214, "learning_rate": 0.0002, "epoch": 3.577512776831346, "step": 5250}, {"loss": 1.4249, "grad_norm": 0.7260391116142273, "learning_rate": 0.0002, "epoch": 3.5843270868824533, "step": 5260}, {"loss": 1.4249, "grad_norm": 0.7038731575012207, "learning_rate": 0.0002, "epoch": 3.5911413969335606, "step": 5270}, {"loss": 1.4172, "grad_norm": 0.7864376902580261, "learning_rate": 0.0002, "epoch": 3.597955706984668, "step": 5280}, {"loss": 1.4637, "grad_norm": 0.6968383193016052, "learning_rate": 0.0002, "epoch": 3.6047700170357753, "step": 5290}, {"loss": 1.5269, "grad_norm": 0.6726206541061401, "learning_rate": 0.0002, "epoch": 3.6115843270868826, "step": 5300}, {"loss": 1.4199, "grad_norm": 0.6716854572296143, "learning_rate": 0.0002, "epoch": 3.61839863713799, "step": 5310}, {"loss": 1.4686, "grad_norm": 0.7229742407798767, "learning_rate": 0.0002, "epoch": 3.6252129471890973, "step": 5320}, {"loss": 1.4441, "grad_norm": 0.7338683009147644, "learning_rate": 0.0002, "epoch": 3.6320272572402046, "step": 5330}, {"loss": 1.4116, "grad_norm": 0.771672785282135, "learning_rate": 0.0002, "epoch": 3.638841567291312, "step": 5340}, {"loss": 1.4007, "grad_norm": 0.7024078369140625, "learning_rate": 0.0002, "epoch": 3.645655877342419, "step": 5350}, {"loss": 1.4996, "grad_norm": 0.6847538352012634, "learning_rate": 0.0002, "epoch": 3.6524701873935266, "step": 5360}, {"loss": 1.4111, "grad_norm": 0.71802818775177, "learning_rate": 0.0002, "epoch": 3.6592844974446335, "step": 5370}, {"loss": 1.4224, "grad_norm": 0.78530353307724, "learning_rate": 0.0002, "epoch": 3.6660988074957412, "step": 5380}, {"loss": 1.4582, "grad_norm": 0.7262226939201355, "learning_rate": 0.0002, "epoch": 3.672913117546848, "step": 5390}, {"loss": 1.4704, "grad_norm": 0.7608316540718079, "learning_rate": 0.0002, "epoch": 3.679727427597956, "step": 5400}, {"loss": 1.3742, "grad_norm": 0.6994926333427429, "learning_rate": 0.0002, "epoch": 3.686541737649063, "step": 5410}, {"loss": 1.4738, "grad_norm": 0.7888479828834534, "learning_rate": 0.0002, "epoch": 3.6933560477001706, "step": 5420}, {"loss": 1.4213, "grad_norm": 0.7053858041763306, "learning_rate": 0.0002, "epoch": 3.7001703577512775, "step": 5430}, {"loss": 1.4988, "grad_norm": 0.7063165903091431, "learning_rate": 0.0002, "epoch": 3.7069846678023852, "step": 5440}, {"loss": 1.4386, "grad_norm": 0.6603744626045227, "learning_rate": 0.0002, "epoch": 3.713798977853492, "step": 5450}, {"loss": 1.4695, "grad_norm": 0.7043602466583252, "learning_rate": 0.0002, "epoch": 3.7206132879046, "step": 5460}, {"loss": 1.5051, "grad_norm": 0.7026081681251526, "learning_rate": 0.0002, "epoch": 3.7274275979557068, "step": 5470}, {"loss": 1.5613, "grad_norm": 0.7200090289115906, "learning_rate": 0.0002, "epoch": 3.7342419080068145, "step": 5480}, {"loss": 1.4182, "grad_norm": 0.7170904278755188, "learning_rate": 0.0002, "epoch": 3.7410562180579214, "step": 5490}, {"loss": 1.4344, "grad_norm": 0.7489104866981506, "learning_rate": 0.0002, "epoch": 3.747870528109029, "step": 5500}, {"loss": 1.4911, "grad_norm": 0.6540989875793457, "learning_rate": 0.0002, "epoch": 3.754684838160136, "step": 5510}, {"loss": 1.4955, "grad_norm": 0.6654048562049866, "learning_rate": 0.0002, "epoch": 3.761499148211244, "step": 5520}, {"loss": 1.4487, "grad_norm": 0.6577395796775818, "learning_rate": 0.0002, "epoch": 3.7683134582623508, "step": 5530}, {"loss": 1.4283, "grad_norm": 0.7762192487716675, "learning_rate": 0.0002, "epoch": 3.7751277683134585, "step": 5540}, {"loss": 1.4727, "grad_norm": 0.6336314678192139, "learning_rate": 0.0002, "epoch": 3.7819420783645654, "step": 5550}, {"loss": 1.4588, "grad_norm": 0.7098057866096497, "learning_rate": 0.0002, "epoch": 3.7887563884156727, "step": 5560}, {"loss": 1.4679, "grad_norm": 0.7379715442657471, "learning_rate": 0.0002, "epoch": 3.79557069846678, "step": 5570}, {"loss": 1.4633, "grad_norm": 0.6726924777030945, "learning_rate": 0.0002, "epoch": 3.8023850085178874, "step": 5580}, {"loss": 1.4751, "grad_norm": 1.1212009191513062, "learning_rate": 0.0002, "epoch": 3.8091993185689947, "step": 5590}, {"loss": 1.4503, "grad_norm": 0.6503795981407166, "learning_rate": 0.0002, "epoch": 3.816013628620102, "step": 5600}, {"loss": 1.4754, "grad_norm": 0.7041325569152832, "learning_rate": 0.0002, "epoch": 3.8228279386712094, "step": 5610}, {"loss": 1.4199, "grad_norm": 0.7962933778762817, "learning_rate": 0.0002, "epoch": 3.8296422487223167, "step": 5620}, {"loss": 1.4672, "grad_norm": 0.6613591909408569, "learning_rate": 0.0002, "epoch": 3.836456558773424, "step": 5630}, {"loss": 1.5688, "grad_norm": 0.7293516397476196, "learning_rate": 0.0002, "epoch": 3.8432708688245314, "step": 5640}, {"loss": 1.4149, "grad_norm": 0.7388607859611511, "learning_rate": 0.0002, "epoch": 3.8500851788756387, "step": 5650}, {"loss": 1.4743, "grad_norm": 0.6440677642822266, "learning_rate": 0.0002, "epoch": 3.856899488926746, "step": 5660}, {"loss": 1.5082, "grad_norm": 0.7729013562202454, "learning_rate": 0.0002, "epoch": 3.8637137989778534, "step": 5670}, {"loss": 1.4608, "grad_norm": 0.6696794033050537, "learning_rate": 0.0002, "epoch": 3.8705281090289607, "step": 5680}, {"loss": 1.472, "grad_norm": 0.7151781320571899, "learning_rate": 0.0002, "epoch": 3.877342419080068, "step": 5690}, {"loss": 1.4923, "grad_norm": 0.6736966371536255, "learning_rate": 0.0002, "epoch": 3.8841567291311754, "step": 5700}, {"loss": 1.4453, "grad_norm": 0.7444243431091309, "learning_rate": 0.0002, "epoch": 3.8909710391822827, "step": 5710}, {"loss": 1.4562, "grad_norm": 0.6701464653015137, "learning_rate": 0.0002, "epoch": 3.89778534923339, "step": 5720}, {"loss": 1.4478, "grad_norm": 0.7231952548027039, "learning_rate": 0.0002, "epoch": 3.9045996592844974, "step": 5730}, {"loss": 1.4539, "grad_norm": 0.831954300403595, "learning_rate": 0.0002, "epoch": 3.9114139693356047, "step": 5740}, {"loss": 1.5122, "grad_norm": 0.7697733640670776, "learning_rate": 0.0002, "epoch": 3.918228279386712, "step": 5750}, {"loss": 1.4552, "grad_norm": 0.6964395046234131, "learning_rate": 0.0002, "epoch": 3.9250425894378194, "step": 5760}, {"loss": 1.4688, "grad_norm": 0.6942925453186035, "learning_rate": 0.0002, "epoch": 3.9318568994889267, "step": 5770}, {"loss": 1.4668, "grad_norm": 0.6491202712059021, "learning_rate": 0.0002, "epoch": 3.938671209540034, "step": 5780}, {"loss": 1.4404, "grad_norm": 0.7004382610321045, "learning_rate": 0.0002, "epoch": 3.9454855195911414, "step": 5790}, {"loss": 1.5022, "grad_norm": 0.7337747812271118, "learning_rate": 0.0002, "epoch": 3.9522998296422487, "step": 5800}, {"loss": 1.5314, "grad_norm": 0.6923640966415405, "learning_rate": 0.0002, "epoch": 3.959114139693356, "step": 5810}, {"loss": 1.4811, "grad_norm": 0.6815266609191895, "learning_rate": 0.0002, "epoch": 3.9659284497444633, "step": 5820}, {"loss": 1.437, "grad_norm": 0.6755654811859131, "learning_rate": 0.0002, "epoch": 3.9727427597955707, "step": 5830}, {"loss": 1.4277, "grad_norm": 0.6912487149238586, "learning_rate": 0.0002, "epoch": 3.979557069846678, "step": 5840}, {"loss": 1.4654, "grad_norm": 0.6948044896125793, "learning_rate": 0.0002, "epoch": 3.9863713798977853, "step": 5850}, {"loss": 1.4779, "grad_norm": 0.6735455989837646, "learning_rate": 0.0002, "epoch": 3.9931856899488927, "step": 5860}, {"loss": 1.5102, "grad_norm": 0.7005048990249634, "learning_rate": 0.0002, "epoch": 4.0, "step": 5870}, {"eval_loss": 1.923058032989502, "eval_runtime": 58.9903, "eval_samples_per_second": 8.595, "eval_steps_per_second": 1.085, "epoch": 4.0, "step": 5870}, {"loss": 1.2417, "grad_norm": 0.809018075466156, "learning_rate": 0.0002, "epoch": 4.006814310051107, "step": 5880}, {"loss": 1.2874, "grad_norm": 0.9499403238296509, "learning_rate": 0.0002, "epoch": 4.013628620102215, "step": 5890}, {"loss": 1.2245, "grad_norm": 0.7944574356079102, "learning_rate": 0.0002, "epoch": 4.0204429301533215, "step": 5900}, {"loss": 1.2751, "grad_norm": 0.9501046538352966, "learning_rate": 0.0002, "epoch": 4.027257240204429, "step": 5910}, {"loss": 1.2706, "grad_norm": 0.8247923254966736, "learning_rate": 0.0002, "epoch": 4.034071550255536, "step": 5920}, {"loss": 1.2762, "grad_norm": 0.9358038902282715, "learning_rate": 0.0002, "epoch": 4.040885860306644, "step": 5930}, {"loss": 1.2953, "grad_norm": 1.0102452039718628, "learning_rate": 0.0002, "epoch": 4.047700170357751, "step": 5940}, {"loss": 1.216, "grad_norm": 1.0248252153396606, "learning_rate": 0.0002, "epoch": 4.054514480408859, "step": 5950}, {"loss": 1.2115, "grad_norm": 1.0438553094863892, "learning_rate": 0.0002, "epoch": 4.0613287904599655, "step": 5960}, {"loss": 1.2516, "grad_norm": 0.7964957356452942, "learning_rate": 0.0002, "epoch": 4.068143100511073, "step": 5970}, {"loss": 1.1555, "grad_norm": 0.9757015109062195, "learning_rate": 0.0002, "epoch": 4.07495741056218, "step": 5980}, {"loss": 1.2243, "grad_norm": 0.9157161116600037, "learning_rate": 0.0002, "epoch": 4.081771720613288, "step": 5990}, {"loss": 1.2481, "grad_norm": 0.9372851848602295, "learning_rate": 0.0002, "epoch": 4.088586030664395, "step": 6000}, {"loss": 1.2091, "grad_norm": 1.240779995918274, "learning_rate": 0.0002, "epoch": 4.095400340715503, "step": 6010}, {"loss": 1.1727, "grad_norm": 0.8394840359687805, "learning_rate": 0.0002, "epoch": 4.1022146507666095, "step": 6020}, {"loss": 1.2926, "grad_norm": 1.1081455945968628, "learning_rate": 0.0002, "epoch": 4.109028960817717, "step": 6030}, {"loss": 1.2417, "grad_norm": 0.9227745532989502, "learning_rate": 0.0002, "epoch": 4.115843270868824, "step": 6040}, {"loss": 1.1994, "grad_norm": 0.8487664461135864, "learning_rate": 0.0002, "epoch": 4.122657580919932, "step": 6050}, {"loss": 1.2378, "grad_norm": 0.9643339514732361, "learning_rate": 0.0002, "epoch": 4.129471890971039, "step": 6060}, {"loss": 1.2254, "grad_norm": 1.0296099185943604, "learning_rate": 0.0002, "epoch": 4.136286201022147, "step": 6070}, {"loss": 1.2419, "grad_norm": 0.9534215927124023, "learning_rate": 0.0002, "epoch": 4.1431005110732535, "step": 6080}, {"loss": 1.1849, "grad_norm": 0.9647086262702942, "learning_rate": 0.0002, "epoch": 4.149914821124361, "step": 6090}, {"loss": 1.2713, "grad_norm": 1.084836721420288, "learning_rate": 0.0002, "epoch": 4.156729131175468, "step": 6100}, {"loss": 1.1788, "grad_norm": 0.9315235614776611, "learning_rate": 0.0002, "epoch": 4.163543441226576, "step": 6110}, {"loss": 1.17, "grad_norm": 0.9541679620742798, "learning_rate": 0.0002, "epoch": 4.170357751277683, "step": 6120}, {"loss": 1.1407, "grad_norm": 0.9792100191116333, "learning_rate": 0.0002, "epoch": 4.177172061328791, "step": 6130}, {"loss": 1.2069, "grad_norm": 1.065783143043518, "learning_rate": 0.0002, "epoch": 4.1839863713798975, "step": 6140}, {"loss": 1.2512, "grad_norm": 1.036161184310913, "learning_rate": 0.0002, "epoch": 4.190800681431005, "step": 6150}, {"loss": 1.2371, "grad_norm": 0.8979679942131042, "learning_rate": 0.0002, "epoch": 4.197614991482112, "step": 6160}, {"loss": 1.2212, "grad_norm": 0.7584333419799805, "learning_rate": 0.0002, "epoch": 4.20442930153322, "step": 6170}, {"loss": 1.2128, "grad_norm": 1.1970131397247314, "learning_rate": 0.0002, "epoch": 4.211243611584327, "step": 6180}, {"loss": 1.1982, "grad_norm": 2.6447298526763916, "learning_rate": 0.0002, "epoch": 4.218057921635435, "step": 6190}, {"loss": 1.2465, "grad_norm": 0.9357487559318542, "learning_rate": 0.0002, "epoch": 4.2248722316865415, "step": 6200}, {"loss": 1.2963, "grad_norm": 0.9141183495521545, "learning_rate": 0.0002, "epoch": 4.231686541737649, "step": 6210}, {"loss": 1.1959, "grad_norm": 1.0606296062469482, "learning_rate": 0.0002, "epoch": 4.238500851788756, "step": 6220}, {"loss": 1.2629, "grad_norm": 0.9999088048934937, "learning_rate": 0.0002, "epoch": 4.245315161839864, "step": 6230}, {"loss": 1.1471, "grad_norm": 0.9469764232635498, "learning_rate": 0.0002, "epoch": 4.252129471890971, "step": 6240}, {"loss": 1.223, "grad_norm": 1.1508198976516724, "learning_rate": 0.0002, "epoch": 4.258943781942079, "step": 6250}, {"loss": 1.2677, "grad_norm": 1.2576130628585815, "learning_rate": 0.0002, "epoch": 4.2657580919931855, "step": 6260}, {"loss": 1.2216, "grad_norm": 0.9435968399047852, "learning_rate": 0.0002, "epoch": 4.272572402044293, "step": 6270}, {"loss": 1.2788, "grad_norm": 0.9290348887443542, "learning_rate": 0.0002, "epoch": 4.2793867120954, "step": 6280}, {"loss": 1.2631, "grad_norm": 0.9973701238632202, "learning_rate": 0.0002, "epoch": 4.286201022146508, "step": 6290}, {"loss": 1.2276, "grad_norm": 1.012855887413025, "learning_rate": 0.0002, "epoch": 4.293015332197615, "step": 6300}, {"loss": 1.2115, "grad_norm": 0.8371705412864685, "learning_rate": 0.0002, "epoch": 4.2998296422487225, "step": 6310}, {"loss": 1.2423, "grad_norm": 1.0867925882339478, "learning_rate": 0.0002, "epoch": 4.306643952299829, "step": 6320}, {"loss": 1.2262, "grad_norm": 0.9763767123222351, "learning_rate": 0.0002, "epoch": 4.313458262350937, "step": 6330}, {"loss": 1.2557, "grad_norm": 1.1844252347946167, "learning_rate": 0.0002, "epoch": 4.320272572402044, "step": 6340}, {"loss": 1.2635, "grad_norm": 0.8292830586433411, "learning_rate": 0.0002, "epoch": 4.327086882453152, "step": 6350}, {"loss": 1.262, "grad_norm": 0.9351436495780945, "learning_rate": 0.0002, "epoch": 4.333901192504259, "step": 6360}, {"loss": 1.2678, "grad_norm": 1.0425835847854614, "learning_rate": 0.0002, "epoch": 4.3407155025553665, "step": 6370}, {"loss": 1.2476, "grad_norm": 0.8894261121749878, "learning_rate": 0.0002, "epoch": 4.347529812606473, "step": 6380}, {"loss": 1.2965, "grad_norm": 0.9663366079330444, "learning_rate": 0.0002, "epoch": 4.354344122657581, "step": 6390}, {"loss": 1.2529, "grad_norm": 0.8915578126907349, "learning_rate": 0.0002, "epoch": 4.361158432708688, "step": 6400}, {"loss": 1.2573, "grad_norm": 1.0393000841140747, "learning_rate": 0.0002, "epoch": 4.367972742759796, "step": 6410}, {"loss": 1.2254, "grad_norm": 0.917398989200592, "learning_rate": 0.0002, "epoch": 4.374787052810903, "step": 6420}, {"loss": 1.3115, "grad_norm": 1.0496646165847778, "learning_rate": 0.0002, "epoch": 4.3816013628620105, "step": 6430}, {"loss": 1.2607, "grad_norm": 0.9349859356880188, "learning_rate": 0.0002, "epoch": 4.388415672913117, "step": 6440}, {"loss": 1.3414, "grad_norm": 1.0981004238128662, "learning_rate": 0.0002, "epoch": 4.395229982964225, "step": 6450}, {"loss": 1.2391, "grad_norm": 0.9794871807098389, "learning_rate": 0.0002, "epoch": 4.402044293015332, "step": 6460}, {"loss": 1.208, "grad_norm": 0.9321421384811401, "learning_rate": 0.0002, "epoch": 4.40885860306644, "step": 6470}, {"loss": 1.3398, "grad_norm": 0.9158342480659485, "learning_rate": 0.0002, "epoch": 4.415672913117547, "step": 6480}, {"loss": 1.1832, "grad_norm": 0.9462087750434875, "learning_rate": 0.0002, "epoch": 4.4224872231686545, "step": 6490}, {"loss": 1.2366, "grad_norm": 0.9740175604820251, "learning_rate": 0.0002, "epoch": 4.429301533219761, "step": 6500}, {"loss": 1.3074, "grad_norm": 0.8477463126182556, "learning_rate": 0.0002, "epoch": 4.436115843270869, "step": 6510}, {"loss": 1.2719, "grad_norm": 1.0296647548675537, "learning_rate": 0.0002, "epoch": 4.442930153321976, "step": 6520}, {"loss": 1.2647, "grad_norm": 0.9437751173973083, "learning_rate": 0.0002, "epoch": 4.449744463373084, "step": 6530}, {"loss": 1.2043, "grad_norm": 1.011192798614502, "learning_rate": 0.0002, "epoch": 4.456558773424191, "step": 6540}, {"loss": 1.3673, "grad_norm": 0.8836222290992737, "learning_rate": 0.0002, "epoch": 4.4633730834752985, "step": 6550}, {"loss": 1.3028, "grad_norm": 1.2799941301345825, "learning_rate": 0.0002, "epoch": 4.470187393526405, "step": 6560}, {"loss": 1.2789, "grad_norm": 0.925910472869873, "learning_rate": 0.0002, "epoch": 4.477001703577513, "step": 6570}, {"loss": 1.2723, "grad_norm": 0.957401692867279, "learning_rate": 0.0002, "epoch": 4.48381601362862, "step": 6580}, {"loss": 1.242, "grad_norm": 1.0789544582366943, "learning_rate": 0.0002, "epoch": 4.490630323679728, "step": 6590}, {"loss": 1.2553, "grad_norm": 0.8874586820602417, "learning_rate": 0.0002, "epoch": 4.497444633730835, "step": 6600}, {"loss": 1.2779, "grad_norm": 0.9394784569740295, "learning_rate": 0.0002, "epoch": 4.504258943781942, "step": 6610}, {"loss": 1.2744, "grad_norm": 1.029640793800354, "learning_rate": 0.0002, "epoch": 4.511073253833049, "step": 6620}, {"loss": 1.2634, "grad_norm": 0.9510841965675354, "learning_rate": 0.0002, "epoch": 4.517887563884157, "step": 6630}, {"loss": 1.2562, "grad_norm": 0.9992963671684265, "learning_rate": 0.0002, "epoch": 4.524701873935264, "step": 6640}, {"loss": 1.2942, "grad_norm": 0.9312878847122192, "learning_rate": 0.0002, "epoch": 4.531516183986371, "step": 6650}, {"loss": 1.2572, "grad_norm": 0.9406482577323914, "learning_rate": 0.0002, "epoch": 4.538330494037479, "step": 6660}, {"loss": 1.2283, "grad_norm": 1.1058286428451538, "learning_rate": 0.0002, "epoch": 4.5451448040885865, "step": 6670}, {"loss": 1.2391, "grad_norm": 0.9389635920524597, "learning_rate": 0.0002, "epoch": 4.551959114139693, "step": 6680}, {"loss": 1.2696, "grad_norm": 1.0356028079986572, "learning_rate": 0.0002, "epoch": 4.5587734241908, "step": 6690}, {"loss": 1.2935, "grad_norm": 0.9370909929275513, "learning_rate": 0.0002, "epoch": 4.565587734241908, "step": 6700}, {"loss": 1.2914, "grad_norm": 0.9917567372322083, "learning_rate": 0.0002, "epoch": 4.572402044293016, "step": 6710}, {"loss": 1.3318, "grad_norm": 0.9065384864807129, "learning_rate": 0.0002, "epoch": 4.579216354344123, "step": 6720}, {"loss": 1.2909, "grad_norm": 1.3347833156585693, "learning_rate": 0.0002, "epoch": 4.5860306643952296, "step": 6730}, {"loss": 1.3322, "grad_norm": 0.910632312297821, "learning_rate": 0.0002, "epoch": 4.592844974446337, "step": 6740}, {"loss": 1.2584, "grad_norm": 0.8874805569648743, "learning_rate": 0.0002, "epoch": 4.599659284497445, "step": 6750}, {"loss": 1.3173, "grad_norm": 0.9355664253234863, "learning_rate": 0.0002, "epoch": 4.606473594548552, "step": 6760}, {"loss": 1.3515, "grad_norm": 0.9360204339027405, "learning_rate": 0.0002, "epoch": 4.613287904599659, "step": 6770}, {"loss": 1.2326, "grad_norm": 0.9931750893592834, "learning_rate": 0.0002, "epoch": 4.620102214650767, "step": 6780}, {"loss": 1.2677, "grad_norm": 0.9195131063461304, "learning_rate": 0.0002, "epoch": 4.626916524701874, "step": 6790}, {"loss": 1.3417, "grad_norm": 0.9448373913764954, "learning_rate": 0.0002, "epoch": 4.633730834752981, "step": 6800}, {"loss": 1.2658, "grad_norm": 1.162890911102295, "learning_rate": 0.0002, "epoch": 4.640545144804088, "step": 6810}, {"loss": 1.2841, "grad_norm": 0.9739466905593872, "learning_rate": 0.0002, "epoch": 4.647359454855196, "step": 6820}, {"loss": 1.3068, "grad_norm": 0.9462909698486328, "learning_rate": 0.0002, "epoch": 4.654173764906303, "step": 6830}, {"loss": 1.284, "grad_norm": 1.042639970779419, "learning_rate": 0.0002, "epoch": 4.660988074957411, "step": 6840}, {"loss": 1.3337, "grad_norm": 0.8910539150238037, "learning_rate": 0.0002, "epoch": 4.6678023850085175, "step": 6850}, {"loss": 1.3025, "grad_norm": 1.0806447267532349, "learning_rate": 0.0002, "epoch": 4.674616695059625, "step": 6860}, {"loss": 1.2258, "grad_norm": 1.0054864883422852, "learning_rate": 0.0002, "epoch": 4.681431005110732, "step": 6870}, {"loss": 1.3261, "grad_norm": 0.7774158120155334, "learning_rate": 0.0002, "epoch": 4.68824531516184, "step": 6880}, {"loss": 1.2545, "grad_norm": 0.9729512333869934, "learning_rate": 0.0002, "epoch": 4.695059625212947, "step": 6890}, {"loss": 1.3251, "grad_norm": 1.2025411128997803, "learning_rate": 0.0002, "epoch": 4.701873935264055, "step": 6900}, {"loss": 1.3418, "grad_norm": 1.1654069423675537, "learning_rate": 0.0002, "epoch": 4.7086882453151615, "step": 6910}, {"loss": 1.3091, "grad_norm": 1.1501442193984985, "learning_rate": 0.0002, "epoch": 4.715502555366269, "step": 6920}, {"loss": 1.2627, "grad_norm": 1.1083979606628418, "learning_rate": 0.0002, "epoch": 4.722316865417376, "step": 6930}, {"loss": 1.2836, "grad_norm": 0.9431378841400146, "learning_rate": 0.0002, "epoch": 4.729131175468484, "step": 6940}, {"loss": 1.3381, "grad_norm": 0.9722502827644348, "learning_rate": 0.0002, "epoch": 4.735945485519591, "step": 6950}, {"loss": 1.3228, "grad_norm": 0.9094559550285339, "learning_rate": 0.0002, "epoch": 4.742759795570699, "step": 6960}, {"loss": 1.3474, "grad_norm": 0.9918473958969116, "learning_rate": 0.0002, "epoch": 4.7495741056218055, "step": 6970}, {"loss": 1.3352, "grad_norm": 0.9999690651893616, "learning_rate": 0.0002, "epoch": 4.756388415672913, "step": 6980}, {"loss": 1.3579, "grad_norm": 1.0453810691833496, "learning_rate": 0.0002, "epoch": 4.76320272572402, "step": 6990}, {"loss": 1.294, "grad_norm": 1.0167806148529053, "learning_rate": 0.0002, "epoch": 4.770017035775128, "step": 7000}, {"loss": 1.3247, "grad_norm": 0.8133894801139832, "learning_rate": 0.0002, "epoch": 4.776831345826235, "step": 7010}, {"loss": 1.2577, "grad_norm": 0.8000897765159607, "learning_rate": 0.0002, "epoch": 4.783645655877343, "step": 7020}, {"loss": 1.2802, "grad_norm": 0.992080569267273, "learning_rate": 0.0002, "epoch": 4.7904599659284495, "step": 7030}, {"loss": 1.3269, "grad_norm": 0.9824522137641907, "learning_rate": 0.0002, "epoch": 4.797274275979557, "step": 7040}, {"loss": 1.279, "grad_norm": 0.9808870553970337, "learning_rate": 0.0002, "epoch": 4.804088586030664, "step": 7050}, {"loss": 1.3342, "grad_norm": 0.9679701924324036, "learning_rate": 0.0002, "epoch": 4.810902896081772, "step": 7060}, {"loss": 1.2711, "grad_norm": 0.9895215034484863, "learning_rate": 0.0002, "epoch": 4.817717206132879, "step": 7070}, {"loss": 1.3008, "grad_norm": 1.052246332168579, "learning_rate": 0.0002, "epoch": 4.824531516183987, "step": 7080}, {"loss": 1.2874, "grad_norm": 0.9243564605712891, "learning_rate": 0.0002, "epoch": 4.8313458262350935, "step": 7090}, {"loss": 1.2835, "grad_norm": 0.9545369744300842, "learning_rate": 0.0002, "epoch": 4.838160136286201, "step": 7100}, {"loss": 1.31, "grad_norm": 0.9655884504318237, "learning_rate": 0.0002, "epoch": 4.844974446337308, "step": 7110}, {"loss": 1.2862, "grad_norm": 0.9708049893379211, "learning_rate": 0.0002, "epoch": 4.851788756388416, "step": 7120}, {"loss": 1.3425, "grad_norm": 1.0064880847930908, "learning_rate": 0.0002, "epoch": 4.858603066439523, "step": 7130}, {"loss": 1.2899, "grad_norm": 0.939943790435791, "learning_rate": 0.0002, "epoch": 4.8654173764906306, "step": 7140}, {"loss": 1.2887, "grad_norm": 1.0750784873962402, "learning_rate": 0.0002, "epoch": 4.872231686541737, "step": 7150}, {"loss": 1.3367, "grad_norm": 0.9708989262580872, "learning_rate": 0.0002, "epoch": 4.879045996592845, "step": 7160}, {"loss": 1.2797, "grad_norm": 1.0228253602981567, "learning_rate": 0.0002, "epoch": 4.885860306643952, "step": 7170}, {"loss": 1.2695, "grad_norm": 0.8963132500648499, "learning_rate": 0.0002, "epoch": 4.89267461669506, "step": 7180}, {"loss": 1.3473, "grad_norm": 0.9198015928268433, "learning_rate": 0.0002, "epoch": 4.899488926746167, "step": 7190}, {"loss": 1.2541, "grad_norm": 1.099906086921692, "learning_rate": 0.0002, "epoch": 4.9063032367972745, "step": 7200}, {"loss": 1.3188, "grad_norm": 1.0624815225601196, "learning_rate": 0.0002, "epoch": 4.913117546848381, "step": 7210}, {"loss": 1.3026, "grad_norm": 0.9688444137573242, "learning_rate": 0.0002, "epoch": 4.919931856899489, "step": 7220}, {"loss": 1.3379, "grad_norm": 0.867011547088623, "learning_rate": 0.0002, "epoch": 4.926746166950596, "step": 7230}, {"loss": 1.289, "grad_norm": 0.9600282311439514, "learning_rate": 0.0002, "epoch": 4.933560477001704, "step": 7240}, {"loss": 1.2751, "grad_norm": 0.8979372978210449, "learning_rate": 0.0002, "epoch": 4.940374787052811, "step": 7250}, {"loss": 1.3426, "grad_norm": 0.951474130153656, "learning_rate": 0.0002, "epoch": 4.9471890971039185, "step": 7260}, {"loss": 1.2726, "grad_norm": 0.824851393699646, "learning_rate": 0.0002, "epoch": 4.954003407155025, "step": 7270}, {"loss": 1.2679, "grad_norm": 1.2926591634750366, "learning_rate": 0.0002, "epoch": 4.960817717206133, "step": 7280}, {"loss": 1.2974, "grad_norm": 1.1057835817337036, "learning_rate": 0.0002, "epoch": 4.96763202725724, "step": 7290}, {"loss": 1.2275, "grad_norm": 0.9814816117286682, "learning_rate": 0.0002, "epoch": 4.974446337308348, "step": 7300}, {"loss": 1.3001, "grad_norm": 1.0251333713531494, "learning_rate": 0.0002, "epoch": 4.981260647359455, "step": 7310}, {"loss": 1.3113, "grad_norm": 0.9748668074607849, "learning_rate": 0.0002, "epoch": 4.9880749574105625, "step": 7320}, {"loss": 1.3595, "grad_norm": 0.8552228808403015, "learning_rate": 0.0002, "epoch": 4.994889267461669, "step": 7330}, {"eval_loss": 2.03971004486084, "eval_runtime": 67.4144, "eval_samples_per_second": 7.521, "eval_steps_per_second": 0.949, "epoch": 4.999659284497445, "step": 7337}, {"loss": 1.2464, "grad_norm": 0.8210785388946533, "learning_rate": 0.0002, "epoch": 5.001703577512777, "step": 7340}, {"loss": 1.0356, "grad_norm": 1.2577511072158813, "learning_rate": 0.0002, "epoch": 5.008517887563884, "step": 7350}, {"loss": 0.9944, "grad_norm": 1.280604362487793, "learning_rate": 0.0002, "epoch": 5.015332197614992, "step": 7360}, {"loss": 1.0858, "grad_norm": 1.3985474109649658, "learning_rate": 0.0002, "epoch": 5.022146507666099, "step": 7370}, {"loss": 1.0122, "grad_norm": 1.1621310710906982, "learning_rate": 0.0002, "epoch": 5.0289608177172065, "step": 7380}, {"loss": 1.05, "grad_norm": 1.3278541564941406, "learning_rate": 0.0002, "epoch": 5.035775127768313, "step": 7390}, {"loss": 1.0237, "grad_norm": 1.1166491508483887, "learning_rate": 0.0002, "epoch": 5.042589437819421, "step": 7400}, {"loss": 1.0397, "grad_norm": 1.8087667226791382, "learning_rate": 0.0002, "epoch": 5.049403747870528, "step": 7410}, {"loss": 1.0191, "grad_norm": 1.1517921686172485, "learning_rate": 0.0002, "epoch": 5.056218057921636, "step": 7420}, {"loss": 1.025, "grad_norm": 1.2875889539718628, "learning_rate": 0.0002, "epoch": 5.063032367972743, "step": 7430}, {"loss": 1.043, "grad_norm": 1.199702262878418, "learning_rate": 0.0002, "epoch": 5.0698466780238505, "step": 7440}, {"loss": 1.0176, "grad_norm": 1.2912452220916748, "learning_rate": 0.0002, "epoch": 5.076660988074957, "step": 7450}, {"loss": 1.0042, "grad_norm": 1.1446452140808105, "learning_rate": 0.0002, "epoch": 5.083475298126065, "step": 7460}, {"loss": 1.047, "grad_norm": 1.3625746965408325, "learning_rate": 0.0002, "epoch": 5.090289608177172, "step": 7470}, {"loss": 1.052, "grad_norm": 1.2116546630859375, "learning_rate": 0.0002, "epoch": 5.09710391822828, "step": 7480}, {"loss": 1.1041, "grad_norm": 1.3896098136901855, "learning_rate": 0.0002, "epoch": 5.103918228279387, "step": 7490}, {"loss": 1.0668, "grad_norm": 1.6265277862548828, "learning_rate": 0.0002, "epoch": 5.1107325383304945, "step": 7500}, {"loss": 1.028, "grad_norm": 1.1468392610549927, "learning_rate": 0.0002, "epoch": 5.117546848381601, "step": 7510}, {"loss": 0.9915, "grad_norm": 1.2649329900741577, "learning_rate": 0.0002, "epoch": 5.124361158432709, "step": 7520}, {"loss": 1.0251, "grad_norm": 1.1866015195846558, "learning_rate": 0.0002, "epoch": 5.131175468483816, "step": 7530}, {"loss": 1.0626, "grad_norm": 1.1517255306243896, "learning_rate": 0.0002, "epoch": 5.137989778534923, "step": 7540}, {"loss": 1.0303, "grad_norm": 1.3475146293640137, "learning_rate": 0.0002, "epoch": 5.144804088586031, "step": 7550}, {"loss": 1.0456, "grad_norm": 1.1167018413543701, "learning_rate": 0.0002, "epoch": 5.151618398637138, "step": 7560}, {"loss": 1.04, "grad_norm": 1.209572434425354, "learning_rate": 0.0002, "epoch": 5.158432708688245, "step": 7570}, {"loss": 1.0533, "grad_norm": 1.3578280210494995, "learning_rate": 0.0002, "epoch": 5.165247018739352, "step": 7580}, {"loss": 1.0958, "grad_norm": 1.2447012662887573, "learning_rate": 0.0002, "epoch": 5.17206132879046, "step": 7590}, {"loss": 1.0521, "grad_norm": 1.3715848922729492, "learning_rate": 0.0002, "epoch": 5.178875638841567, "step": 7600}, {"loss": 1.0556, "grad_norm": 1.435860276222229, "learning_rate": 0.0002, "epoch": 5.185689948892675, "step": 7610}, {"loss": 1.0504, "grad_norm": 1.4093858003616333, "learning_rate": 0.0002, "epoch": 5.1925042589437815, "step": 7620}, {"loss": 1.083, "grad_norm": 1.1747535467147827, "learning_rate": 0.0002, "epoch": 5.199318568994889, "step": 7630}, {"loss": 1.048, "grad_norm": 1.4704833030700684, "learning_rate": 0.0002, "epoch": 5.206132879045996, "step": 7640}, {"loss": 0.9991, "grad_norm": 1.2270972728729248, "learning_rate": 0.0002, "epoch": 5.212947189097104, "step": 7650}, {"loss": 1.0738, "grad_norm": 1.2215691804885864, "learning_rate": 0.0002, "epoch": 5.219761499148211, "step": 7660}, {"loss": 1.0628, "grad_norm": 1.3641486167907715, "learning_rate": 0.0002, "epoch": 5.226575809199319, "step": 7670}, {"loss": 1.1066, "grad_norm": 1.3532041311264038, "learning_rate": 0.0002, "epoch": 5.2333901192504255, "step": 7680}, {"loss": 1.0209, "grad_norm": 1.2243095636367798, "learning_rate": 0.0002, "epoch": 5.240204429301533, "step": 7690}, {"loss": 1.0503, "grad_norm": 1.3644746541976929, "learning_rate": 0.0002, "epoch": 5.24701873935264, "step": 7700}, {"loss": 1.0406, "grad_norm": 1.18478262424469, "learning_rate": 0.0002, "epoch": 5.253833049403748, "step": 7710}, {"loss": 1.1023, "grad_norm": 1.2146114110946655, "learning_rate": 0.0002, "epoch": 5.260647359454855, "step": 7720}, {"loss": 1.1528, "grad_norm": 1.233984112739563, "learning_rate": 0.0002, "epoch": 5.267461669505963, "step": 7730}, {"loss": 1.0681, "grad_norm": 1.3709665536880493, "learning_rate": 0.0002, "epoch": 5.2742759795570695, "step": 7740}, {"loss": 1.0195, "grad_norm": 1.36055326461792, "learning_rate": 0.0002, "epoch": 5.281090289608177, "step": 7750}, {"loss": 1.0447, "grad_norm": 1.6232351064682007, "learning_rate": 0.0002, "epoch": 5.287904599659284, "step": 7760}, {"loss": 1.0627, "grad_norm": 1.3359960317611694, "learning_rate": 0.0002, "epoch": 5.294718909710392, "step": 7770}, {"loss": 1.1082, "grad_norm": 1.3815656900405884, "learning_rate": 0.0002, "epoch": 5.301533219761499, "step": 7780}, {"loss": 1.0891, "grad_norm": 1.1392076015472412, "learning_rate": 0.0002, "epoch": 5.308347529812607, "step": 7790}, {"loss": 1.0364, "grad_norm": 1.3006905317306519, "learning_rate": 0.0002, "epoch": 5.3151618398637135, "step": 7800}, {"loss": 1.1005, "grad_norm": 1.503645896911621, "learning_rate": 0.0002, "epoch": 5.321976149914821, "step": 7810}, {"loss": 1.0075, "grad_norm": 1.141939640045166, "learning_rate": 0.0002, "epoch": 5.328790459965928, "step": 7820}, {"loss": 1.0284, "grad_norm": 1.4654004573822021, "learning_rate": 0.0002, "epoch": 5.335604770017036, "step": 7830}, {"loss": 1.1185, "grad_norm": 1.4195219278335571, "learning_rate": 0.0002, "epoch": 5.342419080068143, "step": 7840}, {"loss": 1.0535, "grad_norm": 1.2354168891906738, "learning_rate": 0.0002, "epoch": 5.349233390119251, "step": 7850}, {"loss": 1.0923, "grad_norm": 1.529862880706787, "learning_rate": 0.0002, "epoch": 5.3560477001703575, "step": 7860}, {"loss": 1.1005, "grad_norm": 1.364678978919983, "learning_rate": 0.0002, "epoch": 5.362862010221465, "step": 7870}, {"loss": 1.1084, "grad_norm": 1.1010444164276123, "learning_rate": 0.0002, "epoch": 5.369676320272572, "step": 7880}, {"loss": 1.1225, "grad_norm": 1.1949712038040161, "learning_rate": 0.0002, "epoch": 5.37649063032368, "step": 7890}, {"loss": 1.058, "grad_norm": 1.485922932624817, "learning_rate": 0.0002, "epoch": 5.383304940374787, "step": 7900}, {"loss": 0.9894, "grad_norm": 1.0844227075576782, "learning_rate": 0.0002, "epoch": 5.390119250425895, "step": 7910}, {"loss": 1.0418, "grad_norm": 1.3784468173980713, "learning_rate": 0.0002, "epoch": 5.3969335604770015, "step": 7920}, {"loss": 1.0542, "grad_norm": 1.4771490097045898, "learning_rate": 0.0002, "epoch": 5.403747870528109, "step": 7930}, {"loss": 1.1265, "grad_norm": 1.2460103034973145, "learning_rate": 0.0002, "epoch": 5.410562180579216, "step": 7940}, {"loss": 1.096, "grad_norm": 1.3047645092010498, "learning_rate": 0.0002, "epoch": 5.417376490630324, "step": 7950}, {"loss": 1.0956, "grad_norm": 1.1396620273590088, "learning_rate": 0.0002, "epoch": 5.424190800681431, "step": 7960}, {"loss": 1.0685, "grad_norm": 1.4193450212478638, "learning_rate": 0.0002, "epoch": 5.4310051107325386, "step": 7970}, {"loss": 1.1347, "grad_norm": 1.2085850238800049, "learning_rate": 0.0002, "epoch": 5.437819420783645, "step": 7980}, {"loss": 1.0277, "grad_norm": 1.2721607685089111, "learning_rate": 0.0002, "epoch": 5.444633730834753, "step": 7990}, {"loss": 1.1316, "grad_norm": 1.4134020805358887, "learning_rate": 0.0002, "epoch": 5.45144804088586, "step": 8000}, {"loss": 1.0576, "grad_norm": 1.4283325672149658, "learning_rate": 0.0002, "epoch": 5.458262350936968, "step": 8010}, {"loss": 1.0505, "grad_norm": 1.3127079010009766, "learning_rate": 0.0002, "epoch": 5.465076660988075, "step": 8020}, {"loss": 1.0812, "grad_norm": 1.2924352884292603, "learning_rate": 0.0002, "epoch": 5.4718909710391825, "step": 8030}, {"loss": 1.1178, "grad_norm": 1.8000653982162476, "learning_rate": 0.0002, "epoch": 5.478705281090289, "step": 8040}, {"loss": 1.1205, "grad_norm": 1.1538785696029663, "learning_rate": 0.0002, "epoch": 5.485519591141397, "step": 8050}, {"loss": 1.1015, "grad_norm": 1.1173290014266968, "learning_rate": 0.0002, "epoch": 5.492333901192504, "step": 8060}, {"loss": 1.1597, "grad_norm": 1.1501243114471436, "learning_rate": 0.0002, "epoch": 5.499148211243612, "step": 8070}, {"loss": 1.1465, "grad_norm": 1.1335760354995728, "learning_rate": 0.0002, "epoch": 5.505962521294719, "step": 8080}, {"loss": 1.1005, "grad_norm": 1.565274953842163, "learning_rate": 0.0002, "epoch": 5.5127768313458265, "step": 8090}, {"loss": 1.1085, "grad_norm": 1.3415014743804932, "learning_rate": 0.0002, "epoch": 5.519591141396933, "step": 8100}, {"loss": 1.1166, "grad_norm": 1.2377240657806396, "learning_rate": 0.0002, "epoch": 5.526405451448041, "step": 8110}, {"loss": 1.0766, "grad_norm": 1.3333637714385986, "learning_rate": 0.0002, "epoch": 5.533219761499148, "step": 8120}, {"loss": 1.1515, "grad_norm": 1.2620662450790405, "learning_rate": 0.0002, "epoch": 5.540034071550256, "step": 8130}, {"loss": 1.0839, "grad_norm": 1.2806652784347534, "learning_rate": 0.0002, "epoch": 5.546848381601363, "step": 8140}, {"loss": 1.1221, "grad_norm": 1.2057335376739502, "learning_rate": 0.0002, "epoch": 5.5536626916524705, "step": 8150}, {"loss": 1.1292, "grad_norm": 1.411726951599121, "learning_rate": 0.0002, "epoch": 5.560477001703577, "step": 8160}, {"loss": 1.0887, "grad_norm": 1.381104588508606, "learning_rate": 0.0002, "epoch": 5.567291311754685, "step": 8170}, {"loss": 1.1317, "grad_norm": 1.3449294567108154, "learning_rate": 0.0002, "epoch": 5.574105621805792, "step": 8180}, {"loss": 1.1392, "grad_norm": 1.2791016101837158, "learning_rate": 0.0002, "epoch": 5.5809199318569, "step": 8190}, {"loss": 1.0972, "grad_norm": 1.276891827583313, "learning_rate": 0.0002, "epoch": 5.587734241908007, "step": 8200}, {"loss": 1.1001, "grad_norm": 1.3951541185379028, "learning_rate": 0.0002, "epoch": 5.5945485519591145, "step": 8210}, {"loss": 1.0993, "grad_norm": 1.4167890548706055, "learning_rate": 0.0002, "epoch": 5.601362862010221, "step": 8220}, {"loss": 1.0826, "grad_norm": 1.4388375282287598, "learning_rate": 0.0002, "epoch": 5.608177172061329, "step": 8230}, {"loss": 1.1941, "grad_norm": 1.210157036781311, "learning_rate": 0.0002, "epoch": 5.614991482112436, "step": 8240}, {"loss": 1.0833, "grad_norm": 1.0557862520217896, "learning_rate": 0.0002, "epoch": 5.621805792163544, "step": 8250}, {"loss": 1.1197, "grad_norm": 1.2913990020751953, "learning_rate": 0.0002, "epoch": 5.628620102214651, "step": 8260}, {"loss": 1.0346, "grad_norm": 1.2204737663269043, "learning_rate": 0.0002, "epoch": 5.6354344122657585, "step": 8270}, {"loss": 1.1429, "grad_norm": 1.57016921043396, "learning_rate": 0.0002, "epoch": 5.642248722316865, "step": 8280}, {"loss": 1.0988, "grad_norm": 1.0117967128753662, "learning_rate": 0.0002, "epoch": 5.649063032367973, "step": 8290}, {"loss": 1.0786, "grad_norm": 1.3195525407791138, "learning_rate": 0.0002, "epoch": 5.65587734241908, "step": 8300}, {"loss": 1.0618, "grad_norm": 1.2566497325897217, "learning_rate": 0.0002, "epoch": 5.662691652470187, "step": 8310}, {"loss": 1.1635, "grad_norm": 1.1446818113327026, "learning_rate": 0.0002, "epoch": 5.669505962521295, "step": 8320}, {"loss": 1.2201, "grad_norm": 1.2928680181503296, "learning_rate": 0.0002, "epoch": 5.6763202725724025, "step": 8330}, {"loss": 1.1488, "grad_norm": 1.2823996543884277, "learning_rate": 0.0002, "epoch": 5.683134582623509, "step": 8340}, {"loss": 1.0686, "grad_norm": 1.1523874998092651, "learning_rate": 0.0002, "epoch": 5.689948892674616, "step": 8350}, {"loss": 1.0938, "grad_norm": 1.0819287300109863, "learning_rate": 0.0002, "epoch": 5.696763202725724, "step": 8360}, {"loss": 1.167, "grad_norm": 1.2384417057037354, "learning_rate": 0.0002, "epoch": 5.703577512776832, "step": 8370}, {"loss": 1.1136, "grad_norm": 1.1733224391937256, "learning_rate": 0.0002, "epoch": 5.710391822827939, "step": 8380}, {"loss": 1.1041, "grad_norm": 1.3173418045043945, "learning_rate": 0.0002, "epoch": 5.7172061328790456, "step": 8390}, {"loss": 1.1014, "grad_norm": 1.285880446434021, "learning_rate": 0.0002, "epoch": 5.724020442930153, "step": 8400}, {"loss": 1.1161, "grad_norm": 1.1404874324798584, "learning_rate": 0.0002, "epoch": 5.730834752981261, "step": 8410}, {"loss": 1.192, "grad_norm": 1.2432540655136108, "learning_rate": 0.0002, "epoch": 5.737649063032368, "step": 8420}, {"loss": 1.1702, "grad_norm": 1.2432233095169067, "learning_rate": 0.0002, "epoch": 5.744463373083475, "step": 8430}, {"loss": 1.1357, "grad_norm": 1.154496669769287, "learning_rate": 0.0002, "epoch": 5.751277683134583, "step": 8440}, {"loss": 1.1706, "grad_norm": 1.3301030397415161, "learning_rate": 0.0002, "epoch": 5.75809199318569, "step": 8450}, {"loss": 1.2052, "grad_norm": 1.243760347366333, "learning_rate": 0.0002, "epoch": 5.764906303236797, "step": 8460}, {"loss": 1.1035, "grad_norm": 1.4083361625671387, "learning_rate": 0.0002, "epoch": 5.771720613287904, "step": 8470}, {"loss": 1.1362, "grad_norm": 1.5662120580673218, "learning_rate": 0.0002, "epoch": 5.778534923339012, "step": 8480}, {"loss": 1.1578, "grad_norm": 1.2111139297485352, "learning_rate": 0.0002, "epoch": 5.78534923339012, "step": 8490}, {"loss": 1.1333, "grad_norm": 1.2776305675506592, "learning_rate": 0.0002, "epoch": 5.792163543441227, "step": 8500}, {"loss": 1.1439, "grad_norm": 1.1777727603912354, "learning_rate": 0.0002, "epoch": 5.7989778534923335, "step": 8510}, {"loss": 1.0859, "grad_norm": 1.1696112155914307, "learning_rate": 0.0002, "epoch": 5.805792163543441, "step": 8520}, {"loss": 1.162, "grad_norm": 1.137397289276123, "learning_rate": 0.0002, "epoch": 5.812606473594548, "step": 8530}, {"loss": 1.2099, "grad_norm": 1.3182098865509033, "learning_rate": 0.0002, "epoch": 5.819420783645656, "step": 8540}, {"loss": 1.1427, "grad_norm": 1.359756588935852, "learning_rate": 0.0002, "epoch": 5.826235093696763, "step": 8550}, {"loss": 1.1714, "grad_norm": 1.4118162393569946, "learning_rate": 0.0002, "epoch": 5.833049403747871, "step": 8560}, {"loss": 1.1758, "grad_norm": 1.1899290084838867, "learning_rate": 0.0002, "epoch": 5.8398637137989775, "step": 8570}, {"loss": 1.1511, "grad_norm": 1.1764532327651978, "learning_rate": 0.0002, "epoch": 5.846678023850085, "step": 8580}, {"loss": 1.1633, "grad_norm": 1.33274245262146, "learning_rate": 0.0002, "epoch": 5.853492333901192, "step": 8590}, {"loss": 1.1092, "grad_norm": 1.2571861743927002, "learning_rate": 0.0002, "epoch": 5.8603066439523, "step": 8600}, {"loss": 1.1137, "grad_norm": 1.3523616790771484, "learning_rate": 0.0002, "epoch": 5.867120954003407, "step": 8610}, {"loss": 1.2442, "grad_norm": 1.3556902408599854, "learning_rate": 0.0002, "epoch": 5.873935264054515, "step": 8620}, {"loss": 1.0967, "grad_norm": 1.2864879369735718, "learning_rate": 0.0002, "epoch": 5.8807495741056215, "step": 8630}, {"loss": 1.1491, "grad_norm": 1.2872768640518188, "learning_rate": 0.0002, "epoch": 5.887563884156729, "step": 8640}, {"loss": 1.1003, "grad_norm": 1.1446053981781006, "learning_rate": 0.0002, "epoch": 5.894378194207836, "step": 8650}, {"loss": 1.1095, "grad_norm": 1.292615532875061, "learning_rate": 0.0002, "epoch": 5.901192504258944, "step": 8660}, {"loss": 1.2009, "grad_norm": 1.190891981124878, "learning_rate": 0.0002, "epoch": 5.908006814310051, "step": 8670}, {"loss": 1.1386, "grad_norm": 1.330273985862732, "learning_rate": 0.0002, "epoch": 5.914821124361159, "step": 8680}, {"loss": 1.1874, "grad_norm": 1.41121244430542, "learning_rate": 0.0002, "epoch": 5.9216354344122655, "step": 8690}, {"loss": 1.1573, "grad_norm": 1.1360729932785034, "learning_rate": 0.0002, "epoch": 5.928449744463373, "step": 8700}, {"loss": 1.115, "grad_norm": 1.2220772504806519, "learning_rate": 0.0002, "epoch": 5.93526405451448, "step": 8710}, {"loss": 1.1696, "grad_norm": 1.1077110767364502, "learning_rate": 0.0002, "epoch": 5.942078364565588, "step": 8720}, {"loss": 1.1443, "grad_norm": 1.3632500171661377, "learning_rate": 0.0002, "epoch": 5.948892674616695, "step": 8730}, {"loss": 1.1474, "grad_norm": 1.4695830345153809, "learning_rate": 0.0002, "epoch": 5.955706984667803, "step": 8740}, {"loss": 1.1825, "grad_norm": 1.217741847038269, "learning_rate": 0.0002, "epoch": 5.9625212947189095, "step": 8750}, {"loss": 1.1495, "grad_norm": 1.0386874675750732, "learning_rate": 0.0002, "epoch": 5.969335604770017, "step": 8760}, {"loss": 1.1146, "grad_norm": 1.2067872285842896, "learning_rate": 0.0002, "epoch": 5.976149914821124, "step": 8770}, {"loss": 1.1987, "grad_norm": 1.3842018842697144, "learning_rate": 0.0002, "epoch": 5.982964224872232, "step": 8780}, {"loss": 1.2147, "grad_norm": 1.4584033489227295, "learning_rate": 0.0002, "epoch": 5.989778534923339, "step": 8790}, {"loss": 1.2078, "grad_norm": 1.1912888288497925, "learning_rate": 0.0002, "epoch": 5.996592844974447, "step": 8800}]} +{"epoch": 6.999659284497445, "step": 10272, "epoch_duration": 3076.5613317489624, "total_accumulated_duration": 18038.55364537239, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7887.97119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 28746.0}, "peak_memory_reserved": {"GPU_0": 28746.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0988, "grad_norm": 0.635574460029602, "learning_rate": 0.0002, "epoch": 0.0068143100511073255, "step": 10}, {"loss": 2.5594, "grad_norm": 1.0401769876480103, "learning_rate": 0.0002, "epoch": 0.013628620102214651, "step": 20}, {"loss": 2.3587, "grad_norm": 0.4514131247997284, "learning_rate": 0.0002, "epoch": 0.020442930153321975, "step": 30}, {"loss": 2.121, "grad_norm": 0.6303355693817139, "learning_rate": 0.0002, "epoch": 0.027257240204429302, "step": 40}, {"loss": 1.9833, "grad_norm": 0.4648270606994629, "learning_rate": 0.0002, "epoch": 0.034071550255536626, "step": 50}, {"loss": 1.9384, "grad_norm": 0.42953479290008545, "learning_rate": 0.0002, "epoch": 0.04088586030664395, "step": 60}, {"loss": 1.9202, "grad_norm": 0.433614581823349, "learning_rate": 0.0002, "epoch": 0.04770017035775128, "step": 70}, {"loss": 1.911, "grad_norm": 0.45318254828453064, "learning_rate": 0.0002, "epoch": 0.054514480408858604, "step": 80}, {"loss": 1.8895, "grad_norm": 0.4023568630218506, "learning_rate": 0.0002, "epoch": 0.06132879045996593, "step": 90}, {"loss": 1.9257, "grad_norm": 0.43260207772254944, "learning_rate": 0.0002, "epoch": 0.06814310051107325, "step": 100}, {"loss": 1.9178, "grad_norm": 0.43389809131622314, "learning_rate": 0.0002, "epoch": 0.07495741056218058, "step": 110}, {"loss": 1.9071, "grad_norm": 0.39307987689971924, "learning_rate": 0.0002, "epoch": 0.0817717206132879, "step": 120}, {"loss": 1.9482, "grad_norm": 0.7703037261962891, "learning_rate": 0.0002, "epoch": 0.08858603066439523, "step": 130}, {"loss": 1.8394, "grad_norm": 0.38547563552856445, "learning_rate": 0.0002, "epoch": 0.09540034071550256, "step": 140}, {"loss": 1.9037, "grad_norm": 0.37948688864707947, "learning_rate": 0.0002, "epoch": 0.10221465076660988, "step": 150}, {"loss": 1.8664, "grad_norm": 0.33022379875183105, "learning_rate": 0.0002, "epoch": 0.10902896081771721, "step": 160}, {"loss": 1.8659, "grad_norm": 0.33703792095184326, "learning_rate": 0.0002, "epoch": 0.11584327086882454, "step": 170}, {"loss": 1.8389, "grad_norm": 0.3259912431240082, "learning_rate": 0.0002, "epoch": 0.12265758091993186, "step": 180}, {"loss": 1.8424, "grad_norm": 0.3593858778476715, "learning_rate": 0.0002, "epoch": 0.12947189097103917, "step": 190}, {"loss": 1.8457, "grad_norm": 0.3401614725589752, "learning_rate": 0.0002, "epoch": 0.1362862010221465, "step": 200}, {"loss": 1.7721, "grad_norm": 0.3892078697681427, "learning_rate": 0.0002, "epoch": 0.14310051107325383, "step": 210}, {"loss": 1.8351, "grad_norm": 0.315374493598938, "learning_rate": 0.0002, "epoch": 0.14991482112436116, "step": 220}, {"loss": 1.8307, "grad_norm": 0.3679497539997101, "learning_rate": 0.0002, "epoch": 0.1567291311754685, "step": 230}, {"loss": 1.816, "grad_norm": 0.336730033159256, "learning_rate": 0.0002, "epoch": 0.1635434412265758, "step": 240}, {"loss": 1.7849, "grad_norm": 0.36472755670547485, "learning_rate": 0.0002, "epoch": 0.17035775127768313, "step": 250}, {"loss": 1.7766, "grad_norm": 0.34864023327827454, "learning_rate": 0.0002, "epoch": 0.17717206132879046, "step": 260}, {"loss": 1.896, "grad_norm": 0.3350819945335388, "learning_rate": 0.0002, "epoch": 0.1839863713798978, "step": 270}, {"loss": 1.8742, "grad_norm": 0.3844246566295624, "learning_rate": 0.0002, "epoch": 0.19080068143100512, "step": 280}, {"loss": 1.8531, "grad_norm": 0.38413065671920776, "learning_rate": 0.0002, "epoch": 0.19761499148211242, "step": 290}, {"loss": 1.8415, "grad_norm": 0.4080047607421875, "learning_rate": 0.0002, "epoch": 0.20442930153321975, "step": 300}, {"loss": 1.9009, "grad_norm": 0.3546800911426544, "learning_rate": 0.0002, "epoch": 0.21124361158432708, "step": 310}, {"loss": 1.8092, "grad_norm": 0.38577890396118164, "learning_rate": 0.0002, "epoch": 0.21805792163543442, "step": 320}, {"loss": 1.7113, "grad_norm": 0.38979098200798035, "learning_rate": 0.0002, "epoch": 0.22487223168654175, "step": 330}, {"loss": 1.8162, "grad_norm": 0.35105520486831665, "learning_rate": 0.0002, "epoch": 0.23168654173764908, "step": 340}, {"loss": 1.8408, "grad_norm": 0.31671223044395447, "learning_rate": 0.0002, "epoch": 0.23850085178875638, "step": 350}, {"loss": 1.8014, "grad_norm": 0.33034196496009827, "learning_rate": 0.0002, "epoch": 0.2453151618398637, "step": 360}, {"loss": 1.7132, "grad_norm": 0.2990533709526062, "learning_rate": 0.0002, "epoch": 0.252129471890971, "step": 370}, {"loss": 1.8612, "grad_norm": 0.362208753824234, "learning_rate": 0.0002, "epoch": 0.25894378194207834, "step": 380}, {"loss": 1.8001, "grad_norm": 0.3269096612930298, "learning_rate": 0.0002, "epoch": 0.2657580919931857, "step": 390}, {"loss": 1.786, "grad_norm": 0.30555954575538635, "learning_rate": 0.0002, "epoch": 0.272572402044293, "step": 400}, {"loss": 1.8018, "grad_norm": 0.332933247089386, "learning_rate": 0.0002, "epoch": 0.27938671209540034, "step": 410}, {"loss": 1.8157, "grad_norm": 0.39454060792922974, "learning_rate": 0.0002, "epoch": 0.28620102214650767, "step": 420}, {"loss": 1.7862, "grad_norm": 0.34589633345603943, "learning_rate": 0.0002, "epoch": 0.293015332197615, "step": 430}, {"loss": 1.7612, "grad_norm": 0.3747332990169525, "learning_rate": 0.0002, "epoch": 0.29982964224872233, "step": 440}, {"loss": 1.8476, "grad_norm": 0.34825369715690613, "learning_rate": 0.0002, "epoch": 0.30664395229982966, "step": 450}, {"loss": 1.775, "grad_norm": 0.32906976342201233, "learning_rate": 0.0002, "epoch": 0.313458262350937, "step": 460}, {"loss": 1.7764, "grad_norm": 0.33108609914779663, "learning_rate": 0.0002, "epoch": 0.3202725724020443, "step": 470}, {"loss": 1.8012, "grad_norm": 0.3170463442802429, "learning_rate": 0.0002, "epoch": 0.3270868824531516, "step": 480}, {"loss": 1.8264, "grad_norm": 0.30792081356048584, "learning_rate": 0.0002, "epoch": 0.3339011925042589, "step": 490}, {"loss": 1.7361, "grad_norm": 0.31772997975349426, "learning_rate": 0.0002, "epoch": 0.34071550255536626, "step": 500}, {"loss": 1.8147, "grad_norm": 0.32714012265205383, "learning_rate": 0.0002, "epoch": 0.3475298126064736, "step": 510}, {"loss": 1.8332, "grad_norm": 0.3021100163459778, "learning_rate": 0.0002, "epoch": 0.3543441226575809, "step": 520}, {"loss": 1.7337, "grad_norm": 0.6045835018157959, "learning_rate": 0.0002, "epoch": 0.36115843270868825, "step": 530}, {"loss": 1.8661, "grad_norm": 0.3003896474838257, "learning_rate": 0.0002, "epoch": 0.3679727427597956, "step": 540}, {"loss": 1.8359, "grad_norm": 0.3678470551967621, "learning_rate": 0.0002, "epoch": 0.3747870528109029, "step": 550}, {"loss": 1.7512, "grad_norm": 0.35787615180015564, "learning_rate": 0.0002, "epoch": 0.38160136286201024, "step": 560}, {"loss": 1.8048, "grad_norm": 0.31882143020629883, "learning_rate": 0.0002, "epoch": 0.38841567291311757, "step": 570}, {"loss": 1.8108, "grad_norm": 0.3186313509941101, "learning_rate": 0.0002, "epoch": 0.39522998296422485, "step": 580}, {"loss": 1.8012, "grad_norm": 0.41443702578544617, "learning_rate": 0.0002, "epoch": 0.4020442930153322, "step": 590}, {"loss": 1.7794, "grad_norm": 0.28773069381713867, "learning_rate": 0.0002, "epoch": 0.4088586030664395, "step": 600}, {"loss": 1.8231, "grad_norm": 0.35743263363838196, "learning_rate": 0.0002, "epoch": 0.41567291311754684, "step": 610}, {"loss": 1.7531, "grad_norm": 0.3360286355018616, "learning_rate": 0.0002, "epoch": 0.42248722316865417, "step": 620}, {"loss": 1.7654, "grad_norm": 0.32838866114616394, "learning_rate": 0.0002, "epoch": 0.4293015332197615, "step": 630}, {"loss": 1.8176, "grad_norm": 0.2994388937950134, "learning_rate": 0.0002, "epoch": 0.43611584327086883, "step": 640}, {"loss": 1.7241, "grad_norm": 0.3306307792663574, "learning_rate": 0.0002, "epoch": 0.44293015332197616, "step": 650}, {"loss": 1.8201, "grad_norm": 0.3129560351371765, "learning_rate": 0.0002, "epoch": 0.4497444633730835, "step": 660}, {"loss": 1.803, "grad_norm": 0.3244289457798004, "learning_rate": 0.0002, "epoch": 0.4565587734241908, "step": 670}, {"loss": 1.8399, "grad_norm": 0.3196892738342285, "learning_rate": 0.0002, "epoch": 0.46337308347529815, "step": 680}, {"loss": 1.8291, "grad_norm": 0.3135230243206024, "learning_rate": 0.0002, "epoch": 0.47018739352640543, "step": 690}, {"loss": 1.7423, "grad_norm": 0.28677991032600403, "learning_rate": 0.0002, "epoch": 0.47700170357751276, "step": 700}, {"loss": 1.6982, "grad_norm": 0.3074065148830414, "learning_rate": 0.0002, "epoch": 0.4838160136286201, "step": 710}, {"loss": 1.8078, "grad_norm": 0.3354290723800659, "learning_rate": 0.0002, "epoch": 0.4906303236797274, "step": 720}, {"loss": 1.7333, "grad_norm": 0.324370801448822, "learning_rate": 0.0002, "epoch": 0.49744463373083475, "step": 730}, {"loss": 1.8506, "grad_norm": 0.29496142268180847, "learning_rate": 0.0002, "epoch": 0.504258943781942, "step": 740}, {"loss": 1.7761, "grad_norm": 0.30694130063056946, "learning_rate": 0.0002, "epoch": 0.5110732538330494, "step": 750}, {"loss": 1.7854, "grad_norm": 0.36168408393859863, "learning_rate": 0.0002, "epoch": 0.5178875638841567, "step": 760}, {"loss": 1.7149, "grad_norm": 0.2930343449115753, "learning_rate": 0.0002, "epoch": 0.524701873935264, "step": 770}, {"loss": 1.7924, "grad_norm": 0.3023432493209839, "learning_rate": 0.0002, "epoch": 0.5315161839863713, "step": 780}, {"loss": 1.8467, "grad_norm": 0.3272720277309418, "learning_rate": 0.0002, "epoch": 0.5383304940374787, "step": 790}, {"loss": 1.8639, "grad_norm": 0.2907974421977997, "learning_rate": 0.0002, "epoch": 0.545144804088586, "step": 800}, {"loss": 1.9018, "grad_norm": 0.32267168164253235, "learning_rate": 0.0002, "epoch": 0.5519591141396933, "step": 810}, {"loss": 1.8311, "grad_norm": 0.32059940695762634, "learning_rate": 0.0002, "epoch": 0.5587734241908007, "step": 820}, {"loss": 1.7234, "grad_norm": 0.30951258540153503, "learning_rate": 0.0002, "epoch": 0.565587734241908, "step": 830}, {"loss": 1.8063, "grad_norm": 0.33976122736930847, "learning_rate": 0.0002, "epoch": 0.5724020442930153, "step": 840}, {"loss": 1.7021, "grad_norm": 0.3195820450782776, "learning_rate": 0.0002, "epoch": 0.5792163543441227, "step": 850}, {"loss": 1.8073, "grad_norm": 0.2828562557697296, "learning_rate": 0.0002, "epoch": 0.58603066439523, "step": 860}, {"loss": 1.695, "grad_norm": 0.29591670632362366, "learning_rate": 0.0002, "epoch": 0.5928449744463373, "step": 870}, {"loss": 1.8109, "grad_norm": 0.3086104393005371, "learning_rate": 0.0002, "epoch": 0.5996592844974447, "step": 880}, {"loss": 1.8207, "grad_norm": 0.3592929542064667, "learning_rate": 0.0002, "epoch": 0.606473594548552, "step": 890}, {"loss": 1.7662, "grad_norm": 0.2830186188220978, "learning_rate": 0.0002, "epoch": 0.6132879045996593, "step": 900}, {"loss": 1.8344, "grad_norm": 0.3128598630428314, "learning_rate": 0.0002, "epoch": 0.6201022146507666, "step": 910}, {"loss": 1.7953, "grad_norm": 0.31957563757896423, "learning_rate": 0.0002, "epoch": 0.626916524701874, "step": 920}, {"loss": 1.7578, "grad_norm": 0.30994319915771484, "learning_rate": 0.0002, "epoch": 0.6337308347529813, "step": 930}, {"loss": 1.8494, "grad_norm": 0.3352845013141632, "learning_rate": 0.0002, "epoch": 0.6405451448040886, "step": 940}, {"loss": 1.7054, "grad_norm": 0.2960077226161957, "learning_rate": 0.0002, "epoch": 0.6473594548551959, "step": 950}, {"loss": 1.7209, "grad_norm": 0.32675081491470337, "learning_rate": 0.0002, "epoch": 0.6541737649063032, "step": 960}, {"loss": 1.7776, "grad_norm": 0.30042028427124023, "learning_rate": 0.0002, "epoch": 0.6609880749574105, "step": 970}, {"loss": 1.7597, "grad_norm": 0.3288673758506775, "learning_rate": 0.0002, "epoch": 0.6678023850085179, "step": 980}, {"loss": 1.7962, "grad_norm": 0.323215126991272, "learning_rate": 0.0002, "epoch": 0.6746166950596252, "step": 990}, {"loss": 1.821, "grad_norm": 0.30041399598121643, "learning_rate": 0.0002, "epoch": 0.6814310051107325, "step": 1000}, {"loss": 1.7716, "grad_norm": 0.3076179623603821, "learning_rate": 0.0002, "epoch": 0.6882453151618398, "step": 1010}, {"loss": 1.7827, "grad_norm": 0.2941909730434418, "learning_rate": 0.0002, "epoch": 0.6950596252129472, "step": 1020}, {"loss": 1.7964, "grad_norm": 0.32220420241355896, "learning_rate": 0.0002, "epoch": 0.7018739352640545, "step": 1030}, {"loss": 1.7101, "grad_norm": 0.2989702820777893, "learning_rate": 0.0002, "epoch": 0.7086882453151618, "step": 1040}, {"loss": 1.8749, "grad_norm": 0.3049640357494354, "learning_rate": 0.0002, "epoch": 0.7155025553662692, "step": 1050}, {"loss": 1.782, "grad_norm": 0.3183284103870392, "learning_rate": 0.0002, "epoch": 0.7223168654173765, "step": 1060}, {"loss": 1.785, "grad_norm": 0.3070095181465149, "learning_rate": 0.0002, "epoch": 0.7291311754684838, "step": 1070}, {"loss": 1.7832, "grad_norm": 0.33263063430786133, "learning_rate": 0.0002, "epoch": 0.7359454855195912, "step": 1080}, {"loss": 1.7627, "grad_norm": 0.28774312138557434, "learning_rate": 0.0002, "epoch": 0.7427597955706985, "step": 1090}, {"loss": 1.7343, "grad_norm": 0.29304224252700806, "learning_rate": 0.0002, "epoch": 0.7495741056218058, "step": 1100}, {"loss": 1.8082, "grad_norm": 0.27529507875442505, "learning_rate": 0.0002, "epoch": 0.7563884156729132, "step": 1110}, {"loss": 1.8071, "grad_norm": 0.32319945096969604, "learning_rate": 0.0002, "epoch": 0.7632027257240205, "step": 1120}, {"loss": 1.6998, "grad_norm": 0.33777597546577454, "learning_rate": 0.0002, "epoch": 0.7700170357751278, "step": 1130}, {"loss": 1.8488, "grad_norm": 0.29847201704978943, "learning_rate": 0.0002, "epoch": 0.7768313458262351, "step": 1140}, {"loss": 1.7014, "grad_norm": 0.31370633840560913, "learning_rate": 0.0002, "epoch": 0.7836456558773425, "step": 1150}, {"loss": 1.8175, "grad_norm": 0.31633856892585754, "learning_rate": 0.0002, "epoch": 0.7904599659284497, "step": 1160}, {"loss": 1.7637, "grad_norm": 0.33849263191223145, "learning_rate": 0.0002, "epoch": 0.797274275979557, "step": 1170}, {"loss": 1.8264, "grad_norm": 0.3306333124637604, "learning_rate": 0.0002, "epoch": 0.8040885860306644, "step": 1180}, {"loss": 1.777, "grad_norm": 0.3249678313732147, "learning_rate": 0.0002, "epoch": 0.8109028960817717, "step": 1190}, {"loss": 1.7631, "grad_norm": 0.3252817690372467, "learning_rate": 0.0002, "epoch": 0.817717206132879, "step": 1200}, {"loss": 1.7877, "grad_norm": 0.31772559881210327, "learning_rate": 0.0002, "epoch": 0.8245315161839863, "step": 1210}, {"loss": 1.8068, "grad_norm": 0.2803371846675873, "learning_rate": 0.0002, "epoch": 0.8313458262350937, "step": 1220}, {"loss": 1.738, "grad_norm": 0.26700571179389954, "learning_rate": 0.0002, "epoch": 0.838160136286201, "step": 1230}, {"loss": 1.8301, "grad_norm": 0.3060092031955719, "learning_rate": 0.0002, "epoch": 0.8449744463373083, "step": 1240}, {"loss": 1.7993, "grad_norm": 0.28831684589385986, "learning_rate": 0.0002, "epoch": 0.8517887563884157, "step": 1250}, {"loss": 1.6909, "grad_norm": 0.30708742141723633, "learning_rate": 0.0002, "epoch": 0.858603066439523, "step": 1260}, {"loss": 1.8506, "grad_norm": 0.2915987968444824, "learning_rate": 0.0002, "epoch": 0.8654173764906303, "step": 1270}, {"loss": 1.7536, "grad_norm": 0.2893589735031128, "learning_rate": 0.0002, "epoch": 0.8722316865417377, "step": 1280}, {"loss": 1.7437, "grad_norm": 0.29545632004737854, "learning_rate": 0.0002, "epoch": 0.879045996592845, "step": 1290}, {"loss": 1.859, "grad_norm": 0.3354771137237549, "learning_rate": 0.0002, "epoch": 0.8858603066439523, "step": 1300}, {"loss": 1.7644, "grad_norm": 0.37715399265289307, "learning_rate": 0.0002, "epoch": 0.8926746166950597, "step": 1310}, {"loss": 1.7731, "grad_norm": 0.28847193717956543, "learning_rate": 0.0002, "epoch": 0.899488926746167, "step": 1320}, {"loss": 1.9077, "grad_norm": 0.2780889868736267, "learning_rate": 0.0002, "epoch": 0.9063032367972743, "step": 1330}, {"loss": 1.7861, "grad_norm": 0.2714342176914215, "learning_rate": 0.0002, "epoch": 0.9131175468483816, "step": 1340}, {"loss": 1.8, "grad_norm": 0.2950133979320526, "learning_rate": 0.0002, "epoch": 0.919931856899489, "step": 1350}, {"loss": 1.7368, "grad_norm": 0.29097145795822144, "learning_rate": 0.0002, "epoch": 0.9267461669505963, "step": 1360}, {"loss": 1.7864, "grad_norm": 0.32540133595466614, "learning_rate": 0.0002, "epoch": 0.9335604770017035, "step": 1370}, {"loss": 1.7571, "grad_norm": 0.3076636493206024, "learning_rate": 0.0002, "epoch": 0.9403747870528109, "step": 1380}, {"loss": 1.8312, "grad_norm": 0.2962130308151245, "learning_rate": 0.0002, "epoch": 0.9471890971039182, "step": 1390}, {"loss": 1.7858, "grad_norm": 0.30086860060691833, "learning_rate": 0.0002, "epoch": 0.9540034071550255, "step": 1400}, {"loss": 1.7991, "grad_norm": 0.28634947538375854, "learning_rate": 0.0002, "epoch": 0.9608177172061328, "step": 1410}, {"loss": 1.8385, "grad_norm": 0.35314416885375977, "learning_rate": 0.0002, "epoch": 0.9676320272572402, "step": 1420}, {"loss": 1.8054, "grad_norm": 0.2939317524433136, "learning_rate": 0.0002, "epoch": 0.9744463373083475, "step": 1430}, {"loss": 1.7582, "grad_norm": 0.3010196089744568, "learning_rate": 0.0002, "epoch": 0.9812606473594548, "step": 1440}, {"loss": 1.758, "grad_norm": 0.30816152691841125, "learning_rate": 0.0002, "epoch": 0.9880749574105622, "step": 1450}, {"loss": 1.8159, "grad_norm": 0.28152793645858765, "learning_rate": 0.0002, "epoch": 0.9948892674616695, "step": 1460}, {"eval_loss": 1.8034634590148926, "eval_runtime": 53.6017, "eval_samples_per_second": 9.459, "eval_steps_per_second": 1.194, "epoch": 0.9996592844974447, "step": 1467}, {"loss": 1.7072, "grad_norm": 0.29246416687965393, "learning_rate": 0.0002, "epoch": 1.0017035775127767, "step": 1470}, {"loss": 1.8159, "grad_norm": 0.2668602168560028, "learning_rate": 0.0002, "epoch": 1.008517887563884, "step": 1480}, {"loss": 1.6868, "grad_norm": 0.2998567819595337, "learning_rate": 0.0002, "epoch": 1.0153321976149914, "step": 1490}, {"loss": 1.7331, "grad_norm": 0.3284934461116791, "learning_rate": 0.0002, "epoch": 1.0221465076660987, "step": 1500}, {"loss": 1.689, "grad_norm": 0.3275827169418335, "learning_rate": 0.0002, "epoch": 1.028960817717206, "step": 1510}, {"loss": 1.7092, "grad_norm": 0.3382718563079834, "learning_rate": 0.0002, "epoch": 1.0357751277683134, "step": 1520}, {"loss": 1.7215, "grad_norm": 0.36737215518951416, "learning_rate": 0.0002, "epoch": 1.0425894378194207, "step": 1530}, {"loss": 1.6993, "grad_norm": 0.3442603647708893, "learning_rate": 0.0002, "epoch": 1.049403747870528, "step": 1540}, {"loss": 1.6996, "grad_norm": 0.3323381245136261, "learning_rate": 0.0002, "epoch": 1.0562180579216354, "step": 1550}, {"loss": 1.7444, "grad_norm": 0.341227650642395, "learning_rate": 0.0002, "epoch": 1.0630323679727427, "step": 1560}, {"loss": 1.7419, "grad_norm": 0.3361579477787018, "learning_rate": 0.0002, "epoch": 1.06984667802385, "step": 1570}, {"loss": 1.7337, "grad_norm": 0.3556230962276459, "learning_rate": 0.0002, "epoch": 1.0766609880749574, "step": 1580}, {"loss": 1.6978, "grad_norm": 0.27130424976348877, "learning_rate": 0.0002, "epoch": 1.0834752981260647, "step": 1590}, {"loss": 1.6087, "grad_norm": 0.29366323351860046, "learning_rate": 0.0002, "epoch": 1.090289608177172, "step": 1600}, {"loss": 1.6721, "grad_norm": 0.3581245541572571, "learning_rate": 0.0002, "epoch": 1.0971039182282794, "step": 1610}, {"loss": 1.7639, "grad_norm": 0.3021670877933502, "learning_rate": 0.0002, "epoch": 1.1039182282793867, "step": 1620}, {"loss": 1.7314, "grad_norm": 0.3145572543144226, "learning_rate": 0.0002, "epoch": 1.110732538330494, "step": 1630}, {"loss": 1.7408, "grad_norm": 0.35362154245376587, "learning_rate": 0.0002, "epoch": 1.1175468483816013, "step": 1640}, {"loss": 1.7071, "grad_norm": 0.5413113236427307, "learning_rate": 0.0002, "epoch": 1.1243611584327087, "step": 1650}, {"loss": 1.7615, "grad_norm": 0.6858654022216797, "learning_rate": 0.0002, "epoch": 1.131175468483816, "step": 1660}, {"loss": 1.7029, "grad_norm": 0.3781903386116028, "learning_rate": 0.0002, "epoch": 1.1379897785349233, "step": 1670}, {"loss": 1.659, "grad_norm": 0.404864102602005, "learning_rate": 0.0002, "epoch": 1.1448040885860307, "step": 1680}, {"loss": 1.7212, "grad_norm": 0.3595100939273834, "learning_rate": 0.0002, "epoch": 1.151618398637138, "step": 1690}, {"loss": 1.7023, "grad_norm": 0.33682283759117126, "learning_rate": 0.0002, "epoch": 1.1584327086882453, "step": 1700}, {"loss": 1.7336, "grad_norm": 0.3877373933792114, "learning_rate": 0.0002, "epoch": 1.1652470187393527, "step": 1710}, {"loss": 1.7676, "grad_norm": 0.34606897830963135, "learning_rate": 0.0002, "epoch": 1.17206132879046, "step": 1720}, {"loss": 1.6889, "grad_norm": 0.3122918903827667, "learning_rate": 0.0002, "epoch": 1.1788756388415673, "step": 1730}, {"loss": 1.6585, "grad_norm": 0.34081900119781494, "learning_rate": 0.0002, "epoch": 1.1856899488926746, "step": 1740}, {"loss": 1.7794, "grad_norm": 0.3418807089328766, "learning_rate": 0.0002, "epoch": 1.192504258943782, "step": 1750}, {"loss": 1.7773, "grad_norm": 0.3495500981807709, "learning_rate": 0.0002, "epoch": 1.1993185689948893, "step": 1760}, {"loss": 1.6578, "grad_norm": 0.557288408279419, "learning_rate": 0.0002, "epoch": 1.2061328790459966, "step": 1770}, {"loss": 1.7488, "grad_norm": 0.3193778693675995, "learning_rate": 0.0002, "epoch": 1.212947189097104, "step": 1780}, {"loss": 1.6815, "grad_norm": 0.3306216895580292, "learning_rate": 0.0002, "epoch": 1.2197614991482113, "step": 1790}, {"loss": 1.6772, "grad_norm": 0.37998732924461365, "learning_rate": 0.0002, "epoch": 1.2265758091993186, "step": 1800}, {"loss": 1.7094, "grad_norm": 0.3255669176578522, "learning_rate": 0.0002, "epoch": 1.233390119250426, "step": 1810}, {"loss": 1.7096, "grad_norm": 0.3741287291049957, "learning_rate": 0.0002, "epoch": 1.2402044293015333, "step": 1820}, {"loss": 1.7317, "grad_norm": 0.36727291345596313, "learning_rate": 0.0002, "epoch": 1.2470187393526406, "step": 1830}, {"loss": 1.7418, "grad_norm": 0.3479527235031128, "learning_rate": 0.0002, "epoch": 1.253833049403748, "step": 1840}, {"loss": 1.7062, "grad_norm": 0.3472636640071869, "learning_rate": 0.0002, "epoch": 1.2606473594548553, "step": 1850}, {"loss": 1.6263, "grad_norm": 0.3702869415283203, "learning_rate": 0.0002, "epoch": 1.2674616695059626, "step": 1860}, {"loss": 1.6222, "grad_norm": 0.3934040069580078, "learning_rate": 0.0002, "epoch": 1.27427597955707, "step": 1870}, {"loss": 1.7616, "grad_norm": 0.46887534856796265, "learning_rate": 0.0002, "epoch": 1.2810902896081773, "step": 1880}, {"loss": 1.6071, "grad_norm": 0.3191998600959778, "learning_rate": 0.0002, "epoch": 1.2879045996592846, "step": 1890}, {"loss": 1.7001, "grad_norm": 0.34032225608825684, "learning_rate": 0.0002, "epoch": 1.294718909710392, "step": 1900}, {"loss": 1.8316, "grad_norm": 0.33453696966171265, "learning_rate": 0.0002, "epoch": 1.3015332197614993, "step": 1910}, {"loss": 1.6567, "grad_norm": 0.3451494872570038, "learning_rate": 0.0002, "epoch": 1.3083475298126066, "step": 1920}, {"loss": 1.708, "grad_norm": 0.36203092336654663, "learning_rate": 0.0002, "epoch": 1.315161839863714, "step": 1930}, {"loss": 1.7095, "grad_norm": 0.43794456124305725, "learning_rate": 0.0002, "epoch": 1.321976149914821, "step": 1940}, {"loss": 1.7264, "grad_norm": 0.3630591034889221, "learning_rate": 0.0002, "epoch": 1.3287904599659284, "step": 1950}, {"loss": 1.6529, "grad_norm": 0.36951911449432373, "learning_rate": 0.0002, "epoch": 1.3356047700170357, "step": 1960}, {"loss": 1.651, "grad_norm": 0.4001159965991974, "learning_rate": 0.0002, "epoch": 1.342419080068143, "step": 1970}, {"loss": 1.677, "grad_norm": 0.3820836544036865, "learning_rate": 0.0002, "epoch": 1.3492333901192504, "step": 1980}, {"loss": 1.583, "grad_norm": 0.3705870807170868, "learning_rate": 0.0002, "epoch": 1.3560477001703577, "step": 1990}, {"loss": 1.6207, "grad_norm": 0.3557972013950348, "learning_rate": 0.0002, "epoch": 1.362862010221465, "step": 2000}, {"loss": 1.7656, "grad_norm": 0.38546398282051086, "learning_rate": 0.0002, "epoch": 1.3696763202725724, "step": 2010}, {"loss": 1.6881, "grad_norm": 0.3908020853996277, "learning_rate": 0.0002, "epoch": 1.3764906303236797, "step": 2020}, {"loss": 1.727, "grad_norm": 0.3822040855884552, "learning_rate": 0.0002, "epoch": 1.383304940374787, "step": 2030}, {"loss": 1.7923, "grad_norm": 0.425327867269516, "learning_rate": 0.0002, "epoch": 1.3901192504258943, "step": 2040}, {"loss": 1.7032, "grad_norm": 0.3436259329319, "learning_rate": 0.0002, "epoch": 1.3969335604770017, "step": 2050}, {"loss": 1.6398, "grad_norm": 0.33124062418937683, "learning_rate": 0.0002, "epoch": 1.403747870528109, "step": 2060}, {"loss": 1.6815, "grad_norm": 0.3662424683570862, "learning_rate": 0.0002, "epoch": 1.4105621805792163, "step": 2070}, {"loss": 1.7258, "grad_norm": 0.3720635175704956, "learning_rate": 0.0002, "epoch": 1.4173764906303237, "step": 2080}, {"loss": 1.7186, "grad_norm": 0.3361680805683136, "learning_rate": 0.0002, "epoch": 1.424190800681431, "step": 2090}, {"loss": 1.7606, "grad_norm": 0.32751724123954773, "learning_rate": 0.0002, "epoch": 1.4310051107325383, "step": 2100}, {"loss": 1.7051, "grad_norm": 0.34333378076553345, "learning_rate": 0.0002, "epoch": 1.4378194207836457, "step": 2110}, {"loss": 1.6979, "grad_norm": 0.37777671217918396, "learning_rate": 0.0002, "epoch": 1.444633730834753, "step": 2120}, {"loss": 1.7085, "grad_norm": 0.37126365303993225, "learning_rate": 0.0002, "epoch": 1.4514480408858603, "step": 2130}, {"loss": 1.721, "grad_norm": 0.3602267801761627, "learning_rate": 0.0002, "epoch": 1.4582623509369677, "step": 2140}, {"loss": 1.8148, "grad_norm": 0.3287110924720764, "learning_rate": 0.0002, "epoch": 1.465076660988075, "step": 2150}, {"loss": 1.6966, "grad_norm": 0.3562135100364685, "learning_rate": 0.0002, "epoch": 1.4718909710391823, "step": 2160}, {"loss": 1.713, "grad_norm": 0.38292962312698364, "learning_rate": 0.0002, "epoch": 1.4787052810902896, "step": 2170}, {"loss": 1.7036, "grad_norm": 0.38220319151878357, "learning_rate": 0.0002, "epoch": 1.485519591141397, "step": 2180}, {"loss": 1.7297, "grad_norm": 0.3570062220096588, "learning_rate": 0.0002, "epoch": 1.4923339011925043, "step": 2190}, {"loss": 1.5652, "grad_norm": 0.363146036863327, "learning_rate": 0.0002, "epoch": 1.4991482112436116, "step": 2200}, {"loss": 1.6716, "grad_norm": 0.37393274903297424, "learning_rate": 0.0002, "epoch": 1.5059625212947187, "step": 2210}, {"loss": 1.6839, "grad_norm": 0.3628501892089844, "learning_rate": 0.0002, "epoch": 1.512776831345826, "step": 2220}, {"loss": 1.6627, "grad_norm": 0.33430740237236023, "learning_rate": 0.0002, "epoch": 1.5195911413969334, "step": 2230}, {"loss": 1.7495, "grad_norm": 0.35363978147506714, "learning_rate": 0.0002, "epoch": 1.5264054514480407, "step": 2240}, {"loss": 1.6364, "grad_norm": 0.37220337986946106, "learning_rate": 0.0002, "epoch": 1.533219761499148, "step": 2250}, {"loss": 1.7773, "grad_norm": 0.35020262002944946, "learning_rate": 0.0002, "epoch": 1.5400340715502554, "step": 2260}, {"loss": 1.7453, "grad_norm": 0.35274937748908997, "learning_rate": 0.0002, "epoch": 1.5468483816013627, "step": 2270}, {"loss": 1.7162, "grad_norm": 0.3797738552093506, "learning_rate": 0.0002, "epoch": 1.55366269165247, "step": 2280}, {"loss": 1.6197, "grad_norm": 0.4160412847995758, "learning_rate": 0.0002, "epoch": 1.5604770017035774, "step": 2290}, {"loss": 1.7101, "grad_norm": 0.38779592514038086, "learning_rate": 0.0002, "epoch": 1.5672913117546847, "step": 2300}, {"loss": 1.7254, "grad_norm": 0.39171287417411804, "learning_rate": 0.0002, "epoch": 1.574105621805792, "step": 2310}, {"loss": 1.7087, "grad_norm": 0.3606826663017273, "learning_rate": 0.0002, "epoch": 1.5809199318568994, "step": 2320}, {"loss": 1.7269, "grad_norm": 0.3745017647743225, "learning_rate": 0.0002, "epoch": 1.5877342419080067, "step": 2330}, {"loss": 1.7068, "grad_norm": 0.34933462738990784, "learning_rate": 0.0002, "epoch": 1.594548551959114, "step": 2340}, {"loss": 1.7073, "grad_norm": 0.37268444895744324, "learning_rate": 0.0002, "epoch": 1.6013628620102214, "step": 2350}, {"loss": 1.728, "grad_norm": 0.4603484869003296, "learning_rate": 0.0002, "epoch": 1.6081771720613287, "step": 2360}, {"loss": 1.7621, "grad_norm": 0.35689088702201843, "learning_rate": 0.0002, "epoch": 1.614991482112436, "step": 2370}, {"loss": 1.6989, "grad_norm": 0.3392031490802765, "learning_rate": 0.0002, "epoch": 1.6218057921635434, "step": 2380}, {"loss": 1.7077, "grad_norm": 0.394653856754303, "learning_rate": 0.0002, "epoch": 1.6286201022146507, "step": 2390}, {"loss": 1.7448, "grad_norm": 0.33972012996673584, "learning_rate": 0.0002, "epoch": 1.635434412265758, "step": 2400}, {"loss": 1.7681, "grad_norm": 0.3854375183582306, "learning_rate": 0.0002, "epoch": 1.6422487223168654, "step": 2410}, {"loss": 1.7102, "grad_norm": 0.36143961548805237, "learning_rate": 0.0002, "epoch": 1.6490630323679727, "step": 2420}, {"loss": 1.7432, "grad_norm": 0.35816189646720886, "learning_rate": 0.0002, "epoch": 1.65587734241908, "step": 2430}, {"loss": 1.6407, "grad_norm": 0.36298736929893494, "learning_rate": 0.0002, "epoch": 1.6626916524701874, "step": 2440}, {"loss": 1.723, "grad_norm": 0.36756929755210876, "learning_rate": 0.0002, "epoch": 1.6695059625212947, "step": 2450}, {"loss": 1.6824, "grad_norm": 0.35969603061676025, "learning_rate": 0.0002, "epoch": 1.676320272572402, "step": 2460}, {"loss": 1.7014, "grad_norm": 0.38449376821517944, "learning_rate": 0.0002, "epoch": 1.6831345826235093, "step": 2470}, {"loss": 1.7261, "grad_norm": 0.44511452317237854, "learning_rate": 0.0002, "epoch": 1.6899488926746167, "step": 2480}, {"loss": 1.7397, "grad_norm": 0.3831416368484497, "learning_rate": 0.0002, "epoch": 1.696763202725724, "step": 2490}, {"loss": 1.7046, "grad_norm": 0.3795325756072998, "learning_rate": 0.0002, "epoch": 1.7035775127768313, "step": 2500}, {"loss": 1.6993, "grad_norm": 0.34978193044662476, "learning_rate": 0.0002, "epoch": 1.7103918228279387, "step": 2510}, {"loss": 1.8307, "grad_norm": 0.35923877358436584, "learning_rate": 0.0002, "epoch": 1.717206132879046, "step": 2520}, {"loss": 1.6253, "grad_norm": 0.352999746799469, "learning_rate": 0.0002, "epoch": 1.7240204429301533, "step": 2530}, {"loss": 1.6953, "grad_norm": 0.43673479557037354, "learning_rate": 0.0002, "epoch": 1.7308347529812607, "step": 2540}, {"loss": 1.7079, "grad_norm": 0.4153687357902527, "learning_rate": 0.0002, "epoch": 1.737649063032368, "step": 2550}, {"loss": 1.6714, "grad_norm": 0.35541167855262756, "learning_rate": 0.0002, "epoch": 1.7444633730834753, "step": 2560}, {"loss": 1.696, "grad_norm": 0.3288775086402893, "learning_rate": 0.0002, "epoch": 1.7512776831345827, "step": 2570}, {"loss": 1.7486, "grad_norm": 0.3991123139858246, "learning_rate": 0.0002, "epoch": 1.75809199318569, "step": 2580}, {"loss": 1.645, "grad_norm": 0.39967241883277893, "learning_rate": 0.0002, "epoch": 1.7649063032367973, "step": 2590}, {"loss": 1.6855, "grad_norm": 0.41104283928871155, "learning_rate": 0.0002, "epoch": 1.7717206132879046, "step": 2600}, {"loss": 1.6993, "grad_norm": 0.44885286688804626, "learning_rate": 0.0002, "epoch": 1.778534923339012, "step": 2610}, {"loss": 1.7224, "grad_norm": 0.38384467363357544, "learning_rate": 0.0002, "epoch": 1.7853492333901193, "step": 2620}, {"loss": 1.7213, "grad_norm": 0.35020917654037476, "learning_rate": 0.0002, "epoch": 1.7921635434412266, "step": 2630}, {"loss": 1.6706, "grad_norm": 0.3360341489315033, "learning_rate": 0.0002, "epoch": 1.798977853492334, "step": 2640}, {"loss": 1.7037, "grad_norm": 0.38875144720077515, "learning_rate": 0.0002, "epoch": 1.8057921635434413, "step": 2650}, {"loss": 1.693, "grad_norm": 0.34876883029937744, "learning_rate": 0.0002, "epoch": 1.8126064735945486, "step": 2660}, {"loss": 1.7743, "grad_norm": 0.419979989528656, "learning_rate": 0.0002, "epoch": 1.819420783645656, "step": 2670}, {"loss": 1.7089, "grad_norm": 0.3648919463157654, "learning_rate": 0.0002, "epoch": 1.8262350936967633, "step": 2680}, {"loss": 1.7828, "grad_norm": 0.3485383987426758, "learning_rate": 0.0002, "epoch": 1.8330494037478706, "step": 2690}, {"loss": 1.705, "grad_norm": 0.3647468686103821, "learning_rate": 0.0002, "epoch": 1.839863713798978, "step": 2700}, {"loss": 1.7318, "grad_norm": 0.37003210186958313, "learning_rate": 0.0002, "epoch": 1.8466780238500853, "step": 2710}, {"loss": 1.6647, "grad_norm": 0.37031617760658264, "learning_rate": 0.0002, "epoch": 1.8534923339011926, "step": 2720}, {"loss": 1.69, "grad_norm": 0.3438796103000641, "learning_rate": 0.0002, "epoch": 1.8603066439523, "step": 2730}, {"loss": 1.5995, "grad_norm": 0.41574627161026, "learning_rate": 0.0002, "epoch": 1.8671209540034073, "step": 2740}, {"loss": 1.6877, "grad_norm": 0.35049930214881897, "learning_rate": 0.0002, "epoch": 1.8739352640545146, "step": 2750}, {"loss": 1.7048, "grad_norm": 0.3943989872932434, "learning_rate": 0.0002, "epoch": 1.880749574105622, "step": 2760}, {"loss": 1.7047, "grad_norm": 0.3384978175163269, "learning_rate": 0.0002, "epoch": 1.8875638841567293, "step": 2770}, {"loss": 1.7848, "grad_norm": 0.3501328229904175, "learning_rate": 0.0002, "epoch": 1.8943781942078366, "step": 2780}, {"loss": 1.6986, "grad_norm": 0.37484532594680786, "learning_rate": 0.0002, "epoch": 1.901192504258944, "step": 2790}, {"loss": 1.6867, "grad_norm": 0.34497788548469543, "learning_rate": 0.0002, "epoch": 1.9080068143100513, "step": 2800}, {"loss": 1.6513, "grad_norm": 0.3530851900577545, "learning_rate": 0.0002, "epoch": 1.9148211243611586, "step": 2810}, {"loss": 1.6369, "grad_norm": 0.3879254162311554, "learning_rate": 0.0002, "epoch": 1.921635434412266, "step": 2820}, {"loss": 1.6786, "grad_norm": 0.3885590136051178, "learning_rate": 0.0002, "epoch": 1.9284497444633732, "step": 2830}, {"loss": 1.8049, "grad_norm": 0.3868715465068817, "learning_rate": 0.0002, "epoch": 1.9352640545144804, "step": 2840}, {"loss": 1.7099, "grad_norm": 0.4152422249317169, "learning_rate": 0.0002, "epoch": 1.9420783645655877, "step": 2850}, {"loss": 1.6696, "grad_norm": 0.401714563369751, "learning_rate": 0.0002, "epoch": 1.948892674616695, "step": 2860}, {"loss": 1.7182, "grad_norm": 0.34825265407562256, "learning_rate": 0.0002, "epoch": 1.9557069846678024, "step": 2870}, {"loss": 1.6612, "grad_norm": 0.3620675504207611, "learning_rate": 0.0002, "epoch": 1.9625212947189097, "step": 2880}, {"loss": 1.7451, "grad_norm": 0.3977806866168976, "learning_rate": 0.0002, "epoch": 1.969335604770017, "step": 2890}, {"loss": 1.6514, "grad_norm": 0.3687497079372406, "learning_rate": 0.0002, "epoch": 1.9761499148211243, "step": 2900}, {"loss": 1.7712, "grad_norm": 0.408640056848526, "learning_rate": 0.0002, "epoch": 1.9829642248722317, "step": 2910}, {"loss": 1.695, "grad_norm": 0.34510108828544617, "learning_rate": 0.0002, "epoch": 1.989778534923339, "step": 2920}, {"loss": 1.662, "grad_norm": 0.3596334755420685, "learning_rate": 0.0002, "epoch": 1.9965928449744463, "step": 2930}, {"eval_loss": 1.8056600093841553, "eval_runtime": 60.7049, "eval_samples_per_second": 8.352, "eval_steps_per_second": 1.054, "epoch": 2.0, "step": 2935}, {"loss": 1.6918, "grad_norm": 0.3460802137851715, "learning_rate": 0.0002, "epoch": 2.0034071550255534, "step": 2940}, {"loss": 1.6036, "grad_norm": 0.4038620591163635, "learning_rate": 0.0002, "epoch": 2.0102214650766608, "step": 2950}, {"loss": 1.5386, "grad_norm": 0.3950219750404358, "learning_rate": 0.0002, "epoch": 2.017035775127768, "step": 2960}, {"loss": 1.5649, "grad_norm": 0.519116997718811, "learning_rate": 0.0002, "epoch": 2.0238500851788754, "step": 2970}, {"loss": 1.5397, "grad_norm": 0.4097684919834137, "learning_rate": 0.0002, "epoch": 2.0306643952299828, "step": 2980}, {"loss": 1.6044, "grad_norm": 0.4153544306755066, "learning_rate": 0.0002, "epoch": 2.03747870528109, "step": 2990}, {"loss": 1.6006, "grad_norm": 0.4351160526275635, "learning_rate": 0.0002, "epoch": 2.0442930153321974, "step": 3000}, {"loss": 1.5091, "grad_norm": 0.42036259174346924, "learning_rate": 0.0002, "epoch": 2.0511073253833048, "step": 3010}, {"loss": 1.5686, "grad_norm": 0.4433218836784363, "learning_rate": 0.0002, "epoch": 2.057921635434412, "step": 3020}, {"loss": 1.5478, "grad_norm": 0.46511581540107727, "learning_rate": 0.0002, "epoch": 2.0647359454855194, "step": 3030}, {"loss": 1.5554, "grad_norm": 0.4567560851573944, "learning_rate": 0.0002, "epoch": 2.0715502555366268, "step": 3040}, {"loss": 1.5561, "grad_norm": 0.45671048760414124, "learning_rate": 0.0002, "epoch": 2.078364565587734, "step": 3050}, {"loss": 1.606, "grad_norm": 0.4598552882671356, "learning_rate": 0.0002, "epoch": 2.0851788756388414, "step": 3060}, {"loss": 1.6357, "grad_norm": 0.4582861661911011, "learning_rate": 0.0002, "epoch": 2.0919931856899487, "step": 3070}, {"loss": 1.5853, "grad_norm": 0.4366969168186188, "learning_rate": 0.0002, "epoch": 2.098807495741056, "step": 3080}, {"loss": 1.5574, "grad_norm": 0.495917409658432, "learning_rate": 0.0002, "epoch": 2.1056218057921634, "step": 3090}, {"loss": 1.5798, "grad_norm": 1.6846044063568115, "learning_rate": 0.0002, "epoch": 2.1124361158432707, "step": 3100}, {"loss": 1.5877, "grad_norm": 0.4765092134475708, "learning_rate": 0.0002, "epoch": 2.119250425894378, "step": 3110}, {"loss": 1.6106, "grad_norm": 0.45029810070991516, "learning_rate": 0.0002, "epoch": 2.1260647359454854, "step": 3120}, {"loss": 1.5707, "grad_norm": 0.5706973075866699, "learning_rate": 0.0002, "epoch": 2.1328790459965927, "step": 3130}, {"loss": 1.589, "grad_norm": 0.4606274366378784, "learning_rate": 0.0002, "epoch": 2.1396933560477, "step": 3140}, {"loss": 1.564, "grad_norm": 2.199115753173828, "learning_rate": 0.0002, "epoch": 2.1465076660988074, "step": 3150}, {"loss": 1.5808, "grad_norm": 0.6231027245521545, "learning_rate": 0.0002, "epoch": 2.1533219761499147, "step": 3160}, {"loss": 1.5397, "grad_norm": 0.46918219327926636, "learning_rate": 0.0002, "epoch": 2.160136286201022, "step": 3170}, {"loss": 1.6082, "grad_norm": 0.5006393194198608, "learning_rate": 0.0002, "epoch": 2.1669505962521294, "step": 3180}, {"loss": 1.591, "grad_norm": 0.4745093286037445, "learning_rate": 0.0002, "epoch": 2.1737649063032367, "step": 3190}, {"loss": 1.6459, "grad_norm": 0.511576771736145, "learning_rate": 0.0002, "epoch": 2.180579216354344, "step": 3200}, {"loss": 1.6019, "grad_norm": 0.38622918725013733, "learning_rate": 0.0002, "epoch": 2.1873935264054514, "step": 3210}, {"loss": 1.6187, "grad_norm": 0.4425644278526306, "learning_rate": 0.0002, "epoch": 2.1942078364565587, "step": 3220}, {"loss": 1.6114, "grad_norm": 0.45643091201782227, "learning_rate": 0.0002, "epoch": 2.201022146507666, "step": 3230}, {"loss": 1.6224, "grad_norm": 0.4975406229496002, "learning_rate": 0.0002, "epoch": 2.2078364565587734, "step": 3240}, {"loss": 1.6654, "grad_norm": 0.4673331081867218, "learning_rate": 0.0002, "epoch": 2.2146507666098807, "step": 3250}, {"loss": 1.6155, "grad_norm": 0.5081731081008911, "learning_rate": 0.0002, "epoch": 2.221465076660988, "step": 3260}, {"loss": 1.53, "grad_norm": 0.4790806770324707, "learning_rate": 0.0002, "epoch": 2.2282793867120954, "step": 3270}, {"loss": 1.6362, "grad_norm": 0.5184140801429749, "learning_rate": 0.0002, "epoch": 2.2350936967632027, "step": 3280}, {"loss": 1.5646, "grad_norm": 0.5159083604812622, "learning_rate": 0.0002, "epoch": 2.24190800681431, "step": 3290}, {"loss": 1.6577, "grad_norm": 0.4876042604446411, "learning_rate": 0.0002, "epoch": 2.2487223168654173, "step": 3300}, {"loss": 1.5718, "grad_norm": 0.5454957485198975, "learning_rate": 0.0002, "epoch": 2.2555366269165247, "step": 3310}, {"loss": 1.5885, "grad_norm": 0.49866822361946106, "learning_rate": 0.0002, "epoch": 2.262350936967632, "step": 3320}, {"loss": 1.5923, "grad_norm": 0.42674365639686584, "learning_rate": 0.0002, "epoch": 2.2691652470187393, "step": 3330}, {"loss": 1.5588, "grad_norm": 0.5202316641807556, "learning_rate": 0.0002, "epoch": 2.2759795570698467, "step": 3340}, {"loss": 1.6032, "grad_norm": 0.4849465489387512, "learning_rate": 0.0002, "epoch": 2.282793867120954, "step": 3350}, {"loss": 1.6853, "grad_norm": 0.47202569246292114, "learning_rate": 0.0002, "epoch": 2.2896081771720613, "step": 3360}, {"loss": 1.6164, "grad_norm": 0.5311620235443115, "learning_rate": 0.0002, "epoch": 2.2964224872231687, "step": 3370}, {"loss": 1.6004, "grad_norm": 0.49011409282684326, "learning_rate": 0.0002, "epoch": 2.303236797274276, "step": 3380}, {"loss": 1.5484, "grad_norm": 0.4789247512817383, "learning_rate": 0.0002, "epoch": 2.3100511073253833, "step": 3390}, {"loss": 1.5866, "grad_norm": 0.46646103262901306, "learning_rate": 0.0002, "epoch": 2.3168654173764907, "step": 3400}, {"loss": 1.5308, "grad_norm": 0.5552441477775574, "learning_rate": 0.0002, "epoch": 2.323679727427598, "step": 3410}, {"loss": 1.5761, "grad_norm": 0.4530351758003235, "learning_rate": 0.0002, "epoch": 2.3304940374787053, "step": 3420}, {"loss": 1.5919, "grad_norm": 0.4806232750415802, "learning_rate": 0.0002, "epoch": 2.3373083475298126, "step": 3430}, {"loss": 1.5569, "grad_norm": 0.5998363494873047, "learning_rate": 0.0002, "epoch": 2.34412265758092, "step": 3440}, {"loss": 1.513, "grad_norm": 0.4918554425239563, "learning_rate": 0.0002, "epoch": 2.3509369676320273, "step": 3450}, {"loss": 1.6323, "grad_norm": 0.5359559655189514, "learning_rate": 0.0002, "epoch": 2.3577512776831346, "step": 3460}, {"loss": 1.5973, "grad_norm": 0.5053277611732483, "learning_rate": 0.0002, "epoch": 2.364565587734242, "step": 3470}, {"loss": 1.5673, "grad_norm": 0.5058915019035339, "learning_rate": 0.0002, "epoch": 2.3713798977853493, "step": 3480}, {"loss": 1.5434, "grad_norm": 0.5314899682998657, "learning_rate": 0.0002, "epoch": 2.3781942078364566, "step": 3490}, {"loss": 1.5203, "grad_norm": 0.48035913705825806, "learning_rate": 0.0002, "epoch": 2.385008517887564, "step": 3500}, {"loss": 1.5936, "grad_norm": 0.45864903926849365, "learning_rate": 0.0002, "epoch": 2.3918228279386713, "step": 3510}, {"loss": 1.6285, "grad_norm": 0.4553050398826599, "learning_rate": 0.0002, "epoch": 2.3986371379897786, "step": 3520}, {"loss": 1.5444, "grad_norm": 0.4483442008495331, "learning_rate": 0.0002, "epoch": 2.405451448040886, "step": 3530}, {"loss": 1.587, "grad_norm": 0.5043742060661316, "learning_rate": 0.0002, "epoch": 2.4122657580919933, "step": 3540}, {"loss": 1.5692, "grad_norm": 0.44807168841362, "learning_rate": 0.0002, "epoch": 2.4190800681431006, "step": 3550}, {"loss": 1.6306, "grad_norm": 0.5065137147903442, "learning_rate": 0.0002, "epoch": 2.425894378194208, "step": 3560}, {"loss": 1.5842, "grad_norm": 0.5186443328857422, "learning_rate": 0.0002, "epoch": 2.4327086882453153, "step": 3570}, {"loss": 1.5956, "grad_norm": 0.49743232131004333, "learning_rate": 0.0002, "epoch": 2.4395229982964226, "step": 3580}, {"loss": 1.6021, "grad_norm": 0.524450421333313, "learning_rate": 0.0002, "epoch": 2.44633730834753, "step": 3590}, {"loss": 1.6283, "grad_norm": 0.5053797364234924, "learning_rate": 0.0002, "epoch": 2.4531516183986373, "step": 3600}, {"loss": 1.6335, "grad_norm": 0.5223091840744019, "learning_rate": 0.0002, "epoch": 2.4599659284497446, "step": 3610}, {"loss": 1.6315, "grad_norm": 0.4763810932636261, "learning_rate": 0.0002, "epoch": 2.466780238500852, "step": 3620}, {"loss": 1.5623, "grad_norm": 0.5097282528877258, "learning_rate": 0.0002, "epoch": 2.4735945485519593, "step": 3630}, {"loss": 1.605, "grad_norm": 0.5831942558288574, "learning_rate": 0.0002, "epoch": 2.4804088586030666, "step": 3640}, {"loss": 1.6074, "grad_norm": 0.47573572397232056, "learning_rate": 0.0002, "epoch": 2.487223168654174, "step": 3650}, {"loss": 1.6411, "grad_norm": 0.49602726101875305, "learning_rate": 0.0002, "epoch": 2.4940374787052813, "step": 3660}, {"loss": 1.571, "grad_norm": 0.5468524694442749, "learning_rate": 0.0002, "epoch": 2.500851788756388, "step": 3670}, {"loss": 1.5782, "grad_norm": 0.45899373292922974, "learning_rate": 0.0002, "epoch": 2.507666098807496, "step": 3680}, {"loss": 1.5114, "grad_norm": 0.5031567215919495, "learning_rate": 0.0002, "epoch": 2.514480408858603, "step": 3690}, {"loss": 1.538, "grad_norm": 0.5224900841712952, "learning_rate": 0.0002, "epoch": 2.5212947189097106, "step": 3700}, {"loss": 1.5269, "grad_norm": 0.504769504070282, "learning_rate": 0.0002, "epoch": 2.5281090289608175, "step": 3710}, {"loss": 1.5141, "grad_norm": 0.6120529770851135, "learning_rate": 0.0002, "epoch": 2.5349233390119252, "step": 3720}, {"loss": 1.5666, "grad_norm": 0.47930678725242615, "learning_rate": 0.0002, "epoch": 2.541737649063032, "step": 3730}, {"loss": 1.6156, "grad_norm": 0.5039092302322388, "learning_rate": 0.0002, "epoch": 2.54855195911414, "step": 3740}, {"loss": 1.5585, "grad_norm": 0.49758994579315186, "learning_rate": 0.0002, "epoch": 2.555366269165247, "step": 3750}, {"loss": 1.5351, "grad_norm": 0.44739171862602234, "learning_rate": 0.0002, "epoch": 2.5621805792163546, "step": 3760}, {"loss": 1.5099, "grad_norm": 0.47056373953819275, "learning_rate": 0.0002, "epoch": 2.5689948892674614, "step": 3770}, {"loss": 1.5524, "grad_norm": 0.5077595114707947, "learning_rate": 0.0002, "epoch": 2.575809199318569, "step": 3780}, {"loss": 1.5524, "grad_norm": 0.4981902837753296, "learning_rate": 0.0002, "epoch": 2.582623509369676, "step": 3790}, {"loss": 1.5381, "grad_norm": 0.5736238360404968, "learning_rate": 0.0002, "epoch": 2.589437819420784, "step": 3800}, {"loss": 1.67, "grad_norm": 0.4898384213447571, "learning_rate": 0.0002, "epoch": 2.5962521294718908, "step": 3810}, {"loss": 1.5411, "grad_norm": 0.4611325263977051, "learning_rate": 0.0002, "epoch": 2.6030664395229985, "step": 3820}, {"loss": 1.5662, "grad_norm": 0.5285341739654541, "learning_rate": 0.0002, "epoch": 2.6098807495741054, "step": 3830}, {"loss": 1.5875, "grad_norm": 0.5679430961608887, "learning_rate": 0.0002, "epoch": 2.616695059625213, "step": 3840}, {"loss": 1.5544, "grad_norm": 0.48532548546791077, "learning_rate": 0.0002, "epoch": 2.62350936967632, "step": 3850}, {"loss": 1.579, "grad_norm": 0.45506492257118225, "learning_rate": 0.0002, "epoch": 2.630323679727428, "step": 3860}, {"loss": 1.5775, "grad_norm": 0.6552556753158569, "learning_rate": 0.0002, "epoch": 2.6371379897785348, "step": 3870}, {"loss": 1.608, "grad_norm": 0.537874698638916, "learning_rate": 0.0002, "epoch": 2.643952299829642, "step": 3880}, {"loss": 1.5653, "grad_norm": 0.46102389693260193, "learning_rate": 0.0002, "epoch": 2.6507666098807494, "step": 3890}, {"loss": 1.5293, "grad_norm": 0.45531195402145386, "learning_rate": 0.0002, "epoch": 2.6575809199318567, "step": 3900}, {"loss": 1.5492, "grad_norm": 0.5327293872833252, "learning_rate": 0.0002, "epoch": 2.664395229982964, "step": 3910}, {"loss": 1.5921, "grad_norm": 0.4968956410884857, "learning_rate": 0.0002, "epoch": 2.6712095400340714, "step": 3920}, {"loss": 1.5823, "grad_norm": 0.4790082275867462, "learning_rate": 0.0002, "epoch": 2.6780238500851787, "step": 3930}, {"loss": 1.615, "grad_norm": 0.5392967462539673, "learning_rate": 0.0002, "epoch": 2.684838160136286, "step": 3940}, {"loss": 1.6218, "grad_norm": 0.5076649785041809, "learning_rate": 0.0002, "epoch": 2.6916524701873934, "step": 3950}, {"loss": 1.6478, "grad_norm": 0.5628064274787903, "learning_rate": 0.0002, "epoch": 2.6984667802385007, "step": 3960}, {"loss": 1.5417, "grad_norm": 0.5012659430503845, "learning_rate": 0.0002, "epoch": 2.705281090289608, "step": 3970}, {"loss": 1.5339, "grad_norm": 0.4947647452354431, "learning_rate": 0.0002, "epoch": 2.7120954003407154, "step": 3980}, {"loss": 1.5724, "grad_norm": 0.4890969693660736, "learning_rate": 0.0002, "epoch": 2.7189097103918227, "step": 3990}, {"loss": 1.5746, "grad_norm": 0.4471694231033325, "learning_rate": 0.0002, "epoch": 2.72572402044293, "step": 4000}, {"loss": 1.6669, "grad_norm": 0.5116439461708069, "learning_rate": 0.0002, "epoch": 2.7325383304940374, "step": 4010}, {"loss": 1.584, "grad_norm": 0.5720411539077759, "learning_rate": 0.0002, "epoch": 2.7393526405451447, "step": 4020}, {"loss": 1.6151, "grad_norm": 0.5529406070709229, "learning_rate": 0.0002, "epoch": 2.746166950596252, "step": 4030}, {"loss": 1.6296, "grad_norm": 0.5229396820068359, "learning_rate": 0.0002, "epoch": 2.7529812606473594, "step": 4040}, {"loss": 1.5363, "grad_norm": 0.5270276069641113, "learning_rate": 0.0002, "epoch": 2.7597955706984667, "step": 4050}, {"loss": 1.6305, "grad_norm": 0.48413026332855225, "learning_rate": 0.0002, "epoch": 2.766609880749574, "step": 4060}, {"loss": 1.5488, "grad_norm": 0.5145403742790222, "learning_rate": 0.0002, "epoch": 2.7734241908006814, "step": 4070}, {"loss": 1.6023, "grad_norm": 0.48626071214675903, "learning_rate": 0.0002, "epoch": 2.7802385008517887, "step": 4080}, {"loss": 1.6082, "grad_norm": 0.5018984079360962, "learning_rate": 0.0002, "epoch": 2.787052810902896, "step": 4090}, {"loss": 1.7166, "grad_norm": 0.4881938695907593, "learning_rate": 0.0002, "epoch": 2.7938671209540034, "step": 4100}, {"loss": 1.515, "grad_norm": 0.5151546001434326, "learning_rate": 0.0002, "epoch": 2.8006814310051107, "step": 4110}, {"loss": 1.6069, "grad_norm": 0.5109850764274597, "learning_rate": 0.0002, "epoch": 2.807495741056218, "step": 4120}, {"loss": 1.6153, "grad_norm": 0.5109251141548157, "learning_rate": 0.0002, "epoch": 2.8143100511073254, "step": 4130}, {"loss": 1.6365, "grad_norm": 0.5025496482849121, "learning_rate": 0.0002, "epoch": 2.8211243611584327, "step": 4140}, {"loss": 1.6292, "grad_norm": 0.49027004837989807, "learning_rate": 0.0002, "epoch": 2.82793867120954, "step": 4150}, {"loss": 1.5591, "grad_norm": 0.4957362413406372, "learning_rate": 0.0002, "epoch": 2.8347529812606473, "step": 4160}, {"loss": 1.6759, "grad_norm": 0.5159927606582642, "learning_rate": 0.0002, "epoch": 2.8415672913117547, "step": 4170}, {"loss": 1.577, "grad_norm": 0.6040670871734619, "learning_rate": 0.0002, "epoch": 2.848381601362862, "step": 4180}, {"loss": 1.5295, "grad_norm": 0.5489953756332397, "learning_rate": 0.0002, "epoch": 2.8551959114139693, "step": 4190}, {"loss": 1.5909, "grad_norm": 0.5416634678840637, "learning_rate": 0.0002, "epoch": 2.8620102214650767, "step": 4200}, {"loss": 1.6014, "grad_norm": 0.5278245210647583, "learning_rate": 0.0002, "epoch": 2.868824531516184, "step": 4210}, {"loss": 1.4848, "grad_norm": 0.43382319808006287, "learning_rate": 0.0002, "epoch": 2.8756388415672913, "step": 4220}, {"loss": 1.6092, "grad_norm": 0.4724387228488922, "learning_rate": 0.0002, "epoch": 2.8824531516183987, "step": 4230}, {"loss": 1.5748, "grad_norm": 0.49824780225753784, "learning_rate": 0.0002, "epoch": 2.889267461669506, "step": 4240}, {"loss": 1.6163, "grad_norm": 0.5360262989997864, "learning_rate": 0.0002, "epoch": 2.8960817717206133, "step": 4250}, {"loss": 1.5934, "grad_norm": 0.49090322852134705, "learning_rate": 0.0002, "epoch": 2.9028960817717206, "step": 4260}, {"loss": 1.624, "grad_norm": 0.5613328218460083, "learning_rate": 0.0002, "epoch": 2.909710391822828, "step": 4270}, {"loss": 1.5713, "grad_norm": 0.4611356258392334, "learning_rate": 0.0002, "epoch": 2.9165247018739353, "step": 4280}, {"loss": 1.5457, "grad_norm": 0.550897479057312, "learning_rate": 0.0002, "epoch": 2.9233390119250426, "step": 4290}, {"loss": 1.6225, "grad_norm": 0.5089612603187561, "learning_rate": 0.0002, "epoch": 2.93015332197615, "step": 4300}, {"loss": 1.5897, "grad_norm": 0.5210904479026794, "learning_rate": 0.0002, "epoch": 2.9369676320272573, "step": 4310}, {"loss": 1.6198, "grad_norm": 0.5506424903869629, "learning_rate": 0.0002, "epoch": 2.9437819420783646, "step": 4320}, {"loss": 1.6395, "grad_norm": 0.5118561387062073, "learning_rate": 0.0002, "epoch": 2.950596252129472, "step": 4330}, {"loss": 1.704, "grad_norm": 0.5034464597702026, "learning_rate": 0.0002, "epoch": 2.9574105621805793, "step": 4340}, {"loss": 1.6314, "grad_norm": 0.5019990801811218, "learning_rate": 0.0002, "epoch": 2.9642248722316866, "step": 4350}, {"loss": 1.6161, "grad_norm": 0.5423325300216675, "learning_rate": 0.0002, "epoch": 2.971039182282794, "step": 4360}, {"loss": 1.6144, "grad_norm": 0.5287469625473022, "learning_rate": 0.0002, "epoch": 2.9778534923339013, "step": 4370}, {"loss": 1.5227, "grad_norm": 0.5206913352012634, "learning_rate": 0.0002, "epoch": 2.9846678023850086, "step": 4380}, {"loss": 1.6026, "grad_norm": 0.5407394170761108, "learning_rate": 0.0002, "epoch": 2.991482112436116, "step": 4390}, {"loss": 1.5908, "grad_norm": 0.5244600176811218, "learning_rate": 0.0002, "epoch": 2.9982964224872233, "step": 4400}, {"eval_loss": 1.8412635326385498, "eval_runtime": 65.5583, "eval_samples_per_second": 7.734, "eval_steps_per_second": 0.976, "epoch": 2.9996592844974446, "step": 4402}, {"loss": 1.5157, "grad_norm": 0.5172150731086731, "learning_rate": 0.0002, "epoch": 3.0051107325383306, "step": 4410}, {"loss": 1.398, "grad_norm": 0.6882525086402893, "learning_rate": 0.0002, "epoch": 3.011925042589438, "step": 4420}, {"loss": 1.3884, "grad_norm": 0.6435003280639648, "learning_rate": 0.0002, "epoch": 3.0187393526405453, "step": 4430}, {"loss": 1.4493, "grad_norm": 0.7126057147979736, "learning_rate": 0.0002, "epoch": 3.0255536626916526, "step": 4440}, {"loss": 1.4397, "grad_norm": 0.6634385585784912, "learning_rate": 0.0002, "epoch": 3.03236797274276, "step": 4450}, {"loss": 1.3674, "grad_norm": 0.6468435525894165, "learning_rate": 0.0002, "epoch": 3.0391822827938673, "step": 4460}, {"loss": 1.4045, "grad_norm": 0.5690478086471558, "learning_rate": 0.0002, "epoch": 3.0459965928449746, "step": 4470}, {"loss": 1.3742, "grad_norm": 0.7323708534240723, "learning_rate": 0.0002, "epoch": 3.052810902896082, "step": 4480}, {"loss": 1.3281, "grad_norm": 0.6989302039146423, "learning_rate": 0.0002, "epoch": 3.0596252129471893, "step": 4490}, {"loss": 1.379, "grad_norm": 0.6704450845718384, "learning_rate": 0.0002, "epoch": 3.0664395229982966, "step": 4500}, {"loss": 1.4028, "grad_norm": 0.769137442111969, "learning_rate": 0.0002, "epoch": 3.073253833049404, "step": 4510}, {"loss": 1.4295, "grad_norm": 0.6556448936462402, "learning_rate": 0.0002, "epoch": 3.0800681431005112, "step": 4520}, {"loss": 1.2763, "grad_norm": 0.7143950462341309, "learning_rate": 0.0002, "epoch": 3.0868824531516186, "step": 4530}, {"loss": 1.4806, "grad_norm": 0.7060510516166687, "learning_rate": 0.0002, "epoch": 3.093696763202726, "step": 4540}, {"loss": 1.4097, "grad_norm": 0.6637526750564575, "learning_rate": 0.0002, "epoch": 3.1005110732538332, "step": 4550}, {"loss": 1.4752, "grad_norm": 0.822989284992218, "learning_rate": 0.0002, "epoch": 3.1073253833049406, "step": 4560}, {"loss": 1.4994, "grad_norm": 0.5542152523994446, "learning_rate": 0.0002, "epoch": 3.114139693356048, "step": 4570}, {"loss": 1.4306, "grad_norm": 0.7780306935310364, "learning_rate": 0.0002, "epoch": 3.1209540034071552, "step": 4580}, {"loss": 1.3909, "grad_norm": 0.7372637987136841, "learning_rate": 0.0002, "epoch": 3.1277683134582626, "step": 4590}, {"loss": 1.3989, "grad_norm": 0.6730087995529175, "learning_rate": 0.0002, "epoch": 3.1345826235093694, "step": 4600}, {"loss": 1.3591, "grad_norm": 0.6687398552894592, "learning_rate": 0.0002, "epoch": 3.1413969335604772, "step": 4610}, {"loss": 1.436, "grad_norm": 0.7645083665847778, "learning_rate": 0.0002, "epoch": 3.148211243611584, "step": 4620}, {"loss": 1.3681, "grad_norm": 0.6770380139350891, "learning_rate": 0.0002, "epoch": 3.155025553662692, "step": 4630}, {"loss": 1.405, "grad_norm": 0.7200576663017273, "learning_rate": 0.0002, "epoch": 3.1618398637137988, "step": 4640}, {"loss": 1.3752, "grad_norm": 0.6663638949394226, "learning_rate": 0.0002, "epoch": 3.168654173764906, "step": 4650}, {"loss": 1.4099, "grad_norm": 0.6602960228919983, "learning_rate": 0.0002, "epoch": 3.1754684838160134, "step": 4660}, {"loss": 1.4003, "grad_norm": 0.7838228344917297, "learning_rate": 0.0002, "epoch": 3.1822827938671208, "step": 4670}, {"loss": 1.3853, "grad_norm": 0.7559184432029724, "learning_rate": 0.0002, "epoch": 3.189097103918228, "step": 4680}, {"loss": 1.4516, "grad_norm": 0.6609814167022705, "learning_rate": 0.0002, "epoch": 3.1959114139693354, "step": 4690}, {"loss": 1.4464, "grad_norm": 0.8470419645309448, "learning_rate": 0.0002, "epoch": 3.2027257240204428, "step": 4700}, {"loss": 1.428, "grad_norm": 0.7282822728157043, "learning_rate": 0.0002, "epoch": 3.20954003407155, "step": 4710}, {"loss": 1.5261, "grad_norm": 0.6722773313522339, "learning_rate": 0.0002, "epoch": 3.2163543441226574, "step": 4720}, {"loss": 1.3809, "grad_norm": 0.7630265355110168, "learning_rate": 0.0002, "epoch": 3.2231686541737647, "step": 4730}, {"loss": 1.42, "grad_norm": 0.7102773785591125, "learning_rate": 0.0002, "epoch": 3.229982964224872, "step": 4740}, {"loss": 1.3529, "grad_norm": 0.7778299450874329, "learning_rate": 0.0002, "epoch": 3.2367972742759794, "step": 4750}, {"loss": 1.4715, "grad_norm": 0.7189921736717224, "learning_rate": 0.0002, "epoch": 3.2436115843270867, "step": 4760}, {"loss": 1.4328, "grad_norm": 0.7708092331886292, "learning_rate": 0.0002, "epoch": 3.250425894378194, "step": 4770}, {"loss": 1.3855, "grad_norm": 0.7208452224731445, "learning_rate": 0.0002, "epoch": 3.2572402044293014, "step": 4780}, {"loss": 1.3206, "grad_norm": 0.7220432758331299, "learning_rate": 0.0002, "epoch": 3.2640545144804087, "step": 4790}, {"loss": 1.463, "grad_norm": 0.7064954042434692, "learning_rate": 0.0002, "epoch": 3.270868824531516, "step": 4800}, {"loss": 1.4236, "grad_norm": 0.6618382334709167, "learning_rate": 0.0002, "epoch": 3.2776831345826234, "step": 4810}, {"loss": 1.3878, "grad_norm": 0.6854256391525269, "learning_rate": 0.0002, "epoch": 3.2844974446337307, "step": 4820}, {"loss": 1.4236, "grad_norm": 0.6036319136619568, "learning_rate": 0.0002, "epoch": 3.291311754684838, "step": 4830}, {"loss": 1.4796, "grad_norm": 0.714678943157196, "learning_rate": 0.0002, "epoch": 3.2981260647359454, "step": 4840}, {"loss": 1.4273, "grad_norm": 0.7218600511550903, "learning_rate": 0.0002, "epoch": 3.3049403747870527, "step": 4850}, {"loss": 1.3915, "grad_norm": 0.7243074774742126, "learning_rate": 0.0002, "epoch": 3.31175468483816, "step": 4860}, {"loss": 1.4088, "grad_norm": 0.7058630585670471, "learning_rate": 0.0002, "epoch": 3.3185689948892674, "step": 4870}, {"loss": 1.3837, "grad_norm": 0.7091076970100403, "learning_rate": 0.0002, "epoch": 3.3253833049403747, "step": 4880}, {"loss": 1.4745, "grad_norm": 0.7375147342681885, "learning_rate": 0.0002, "epoch": 3.332197614991482, "step": 4890}, {"loss": 1.4826, "grad_norm": 0.9426755309104919, "learning_rate": 0.0002, "epoch": 3.3390119250425894, "step": 4900}, {"loss": 1.369, "grad_norm": 0.6508213877677917, "learning_rate": 0.0002, "epoch": 3.3458262350936967, "step": 4910}, {"loss": 1.3839, "grad_norm": 0.6945043206214905, "learning_rate": 0.0002, "epoch": 3.352640545144804, "step": 4920}, {"loss": 1.3571, "grad_norm": 0.6335888504981995, "learning_rate": 0.0002, "epoch": 3.3594548551959114, "step": 4930}, {"loss": 1.4391, "grad_norm": 0.6947107911109924, "learning_rate": 0.0002, "epoch": 3.3662691652470187, "step": 4940}, {"loss": 1.3885, "grad_norm": 0.8204733729362488, "learning_rate": 0.0002, "epoch": 3.373083475298126, "step": 4950}, {"loss": 1.4886, "grad_norm": 0.7212244868278503, "learning_rate": 0.0002, "epoch": 3.3798977853492334, "step": 4960}, {"loss": 1.4581, "grad_norm": 0.6053042411804199, "learning_rate": 0.0002, "epoch": 3.3867120954003407, "step": 4970}, {"loss": 1.3863, "grad_norm": 0.7820029854774475, "learning_rate": 0.0002, "epoch": 3.393526405451448, "step": 4980}, {"loss": 1.4326, "grad_norm": 0.6866770386695862, "learning_rate": 0.0002, "epoch": 3.4003407155025553, "step": 4990}, {"loss": 1.4287, "grad_norm": 0.6652463674545288, "learning_rate": 0.0002, "epoch": 3.4071550255536627, "step": 5000}, {"loss": 1.3667, "grad_norm": 1.1209032535552979, "learning_rate": 0.0002, "epoch": 3.41396933560477, "step": 5010}, {"loss": 1.4461, "grad_norm": 0.8390814661979675, "learning_rate": 0.0002, "epoch": 3.4207836456558773, "step": 5020}, {"loss": 1.4556, "grad_norm": 0.7541858553886414, "learning_rate": 0.0002, "epoch": 3.4275979557069847, "step": 5030}, {"loss": 1.4245, "grad_norm": 0.6902772784233093, "learning_rate": 0.0002, "epoch": 3.434412265758092, "step": 5040}, {"loss": 1.3953, "grad_norm": 0.7070329785346985, "learning_rate": 0.0002, "epoch": 3.4412265758091993, "step": 5050}, {"loss": 1.3903, "grad_norm": 0.8075643181800842, "learning_rate": 0.0002, "epoch": 3.4480408858603067, "step": 5060}, {"loss": 1.3929, "grad_norm": 0.7133861780166626, "learning_rate": 0.0002, "epoch": 3.454855195911414, "step": 5070}, {"loss": 1.4632, "grad_norm": 0.6631823182106018, "learning_rate": 0.0002, "epoch": 3.4616695059625213, "step": 5080}, {"loss": 1.4162, "grad_norm": 0.673870325088501, "learning_rate": 0.0002, "epoch": 3.4684838160136287, "step": 5090}, {"loss": 1.4247, "grad_norm": 0.6438634395599365, "learning_rate": 0.0002, "epoch": 3.475298126064736, "step": 5100}, {"loss": 1.4421, "grad_norm": 0.7560495734214783, "learning_rate": 0.0002, "epoch": 3.4821124361158433, "step": 5110}, {"loss": 1.4125, "grad_norm": 0.6877814531326294, "learning_rate": 0.0002, "epoch": 3.4889267461669506, "step": 5120}, {"loss": 1.4308, "grad_norm": 0.7031328678131104, "learning_rate": 0.0002, "epoch": 3.495741056218058, "step": 5130}, {"loss": 1.3705, "grad_norm": 0.6797195672988892, "learning_rate": 0.0002, "epoch": 3.5025553662691653, "step": 5140}, {"loss": 1.4687, "grad_norm": 0.6766413450241089, "learning_rate": 0.0002, "epoch": 3.5093696763202726, "step": 5150}, {"loss": 1.4194, "grad_norm": 0.666656494140625, "learning_rate": 0.0002, "epoch": 3.51618398637138, "step": 5160}, {"loss": 1.469, "grad_norm": 0.74996417760849, "learning_rate": 0.0002, "epoch": 3.5229982964224873, "step": 5170}, {"loss": 1.4848, "grad_norm": 0.7370911836624146, "learning_rate": 0.0002, "epoch": 3.5298126064735946, "step": 5180}, {"loss": 1.4523, "grad_norm": 0.9063456654548645, "learning_rate": 0.0002, "epoch": 3.536626916524702, "step": 5190}, {"loss": 1.4726, "grad_norm": 0.6861422657966614, "learning_rate": 0.0002, "epoch": 3.5434412265758093, "step": 5200}, {"loss": 1.4803, "grad_norm": 0.7104039788246155, "learning_rate": 0.0002, "epoch": 3.5502555366269166, "step": 5210}, {"loss": 1.4313, "grad_norm": 0.6578653454780579, "learning_rate": 0.0002, "epoch": 3.557069846678024, "step": 5220}, {"loss": 1.4596, "grad_norm": 0.7336562275886536, "learning_rate": 0.0002, "epoch": 3.5638841567291313, "step": 5230}, {"loss": 1.4591, "grad_norm": 0.7163010835647583, "learning_rate": 0.0002, "epoch": 3.5706984667802386, "step": 5240}, {"loss": 1.3814, "grad_norm": 0.8112391233444214, "learning_rate": 0.0002, "epoch": 3.577512776831346, "step": 5250}, {"loss": 1.4249, "grad_norm": 0.7260391116142273, "learning_rate": 0.0002, "epoch": 3.5843270868824533, "step": 5260}, {"loss": 1.4249, "grad_norm": 0.7038731575012207, "learning_rate": 0.0002, "epoch": 3.5911413969335606, "step": 5270}, {"loss": 1.4172, "grad_norm": 0.7864376902580261, "learning_rate": 0.0002, "epoch": 3.597955706984668, "step": 5280}, {"loss": 1.4637, "grad_norm": 0.6968383193016052, "learning_rate": 0.0002, "epoch": 3.6047700170357753, "step": 5290}, {"loss": 1.5269, "grad_norm": 0.6726206541061401, "learning_rate": 0.0002, "epoch": 3.6115843270868826, "step": 5300}, {"loss": 1.4199, "grad_norm": 0.6716854572296143, "learning_rate": 0.0002, "epoch": 3.61839863713799, "step": 5310}, {"loss": 1.4686, "grad_norm": 0.7229742407798767, "learning_rate": 0.0002, "epoch": 3.6252129471890973, "step": 5320}, {"loss": 1.4441, "grad_norm": 0.7338683009147644, "learning_rate": 0.0002, "epoch": 3.6320272572402046, "step": 5330}, {"loss": 1.4116, "grad_norm": 0.771672785282135, "learning_rate": 0.0002, "epoch": 3.638841567291312, "step": 5340}, {"loss": 1.4007, "grad_norm": 0.7024078369140625, "learning_rate": 0.0002, "epoch": 3.645655877342419, "step": 5350}, {"loss": 1.4996, "grad_norm": 0.6847538352012634, "learning_rate": 0.0002, "epoch": 3.6524701873935266, "step": 5360}, {"loss": 1.4111, "grad_norm": 0.71802818775177, "learning_rate": 0.0002, "epoch": 3.6592844974446335, "step": 5370}, {"loss": 1.4224, "grad_norm": 0.78530353307724, "learning_rate": 0.0002, "epoch": 3.6660988074957412, "step": 5380}, {"loss": 1.4582, "grad_norm": 0.7262226939201355, "learning_rate": 0.0002, "epoch": 3.672913117546848, "step": 5390}, {"loss": 1.4704, "grad_norm": 0.7608316540718079, "learning_rate": 0.0002, "epoch": 3.679727427597956, "step": 5400}, {"loss": 1.3742, "grad_norm": 0.6994926333427429, "learning_rate": 0.0002, "epoch": 3.686541737649063, "step": 5410}, {"loss": 1.4738, "grad_norm": 0.7888479828834534, "learning_rate": 0.0002, "epoch": 3.6933560477001706, "step": 5420}, {"loss": 1.4213, "grad_norm": 0.7053858041763306, "learning_rate": 0.0002, "epoch": 3.7001703577512775, "step": 5430}, {"loss": 1.4988, "grad_norm": 0.7063165903091431, "learning_rate": 0.0002, "epoch": 3.7069846678023852, "step": 5440}, {"loss": 1.4386, "grad_norm": 0.6603744626045227, "learning_rate": 0.0002, "epoch": 3.713798977853492, "step": 5450}, {"loss": 1.4695, "grad_norm": 0.7043602466583252, "learning_rate": 0.0002, "epoch": 3.7206132879046, "step": 5460}, {"loss": 1.5051, "grad_norm": 0.7026081681251526, "learning_rate": 0.0002, "epoch": 3.7274275979557068, "step": 5470}, {"loss": 1.5613, "grad_norm": 0.7200090289115906, "learning_rate": 0.0002, "epoch": 3.7342419080068145, "step": 5480}, {"loss": 1.4182, "grad_norm": 0.7170904278755188, "learning_rate": 0.0002, "epoch": 3.7410562180579214, "step": 5490}, {"loss": 1.4344, "grad_norm": 0.7489104866981506, "learning_rate": 0.0002, "epoch": 3.747870528109029, "step": 5500}, {"loss": 1.4911, "grad_norm": 0.6540989875793457, "learning_rate": 0.0002, "epoch": 3.754684838160136, "step": 5510}, {"loss": 1.4955, "grad_norm": 0.6654048562049866, "learning_rate": 0.0002, "epoch": 3.761499148211244, "step": 5520}, {"loss": 1.4487, "grad_norm": 0.6577395796775818, "learning_rate": 0.0002, "epoch": 3.7683134582623508, "step": 5530}, {"loss": 1.4283, "grad_norm": 0.7762192487716675, "learning_rate": 0.0002, "epoch": 3.7751277683134585, "step": 5540}, {"loss": 1.4727, "grad_norm": 0.6336314678192139, "learning_rate": 0.0002, "epoch": 3.7819420783645654, "step": 5550}, {"loss": 1.4588, "grad_norm": 0.7098057866096497, "learning_rate": 0.0002, "epoch": 3.7887563884156727, "step": 5560}, {"loss": 1.4679, "grad_norm": 0.7379715442657471, "learning_rate": 0.0002, "epoch": 3.79557069846678, "step": 5570}, {"loss": 1.4633, "grad_norm": 0.6726924777030945, "learning_rate": 0.0002, "epoch": 3.8023850085178874, "step": 5580}, {"loss": 1.4751, "grad_norm": 1.1212009191513062, "learning_rate": 0.0002, "epoch": 3.8091993185689947, "step": 5590}, {"loss": 1.4503, "grad_norm": 0.6503795981407166, "learning_rate": 0.0002, "epoch": 3.816013628620102, "step": 5600}, {"loss": 1.4754, "grad_norm": 0.7041325569152832, "learning_rate": 0.0002, "epoch": 3.8228279386712094, "step": 5610}, {"loss": 1.4199, "grad_norm": 0.7962933778762817, "learning_rate": 0.0002, "epoch": 3.8296422487223167, "step": 5620}, {"loss": 1.4672, "grad_norm": 0.6613591909408569, "learning_rate": 0.0002, "epoch": 3.836456558773424, "step": 5630}, {"loss": 1.5688, "grad_norm": 0.7293516397476196, "learning_rate": 0.0002, "epoch": 3.8432708688245314, "step": 5640}, {"loss": 1.4149, "grad_norm": 0.7388607859611511, "learning_rate": 0.0002, "epoch": 3.8500851788756387, "step": 5650}, {"loss": 1.4743, "grad_norm": 0.6440677642822266, "learning_rate": 0.0002, "epoch": 3.856899488926746, "step": 5660}, {"loss": 1.5082, "grad_norm": 0.7729013562202454, "learning_rate": 0.0002, "epoch": 3.8637137989778534, "step": 5670}, {"loss": 1.4608, "grad_norm": 0.6696794033050537, "learning_rate": 0.0002, "epoch": 3.8705281090289607, "step": 5680}, {"loss": 1.472, "grad_norm": 0.7151781320571899, "learning_rate": 0.0002, "epoch": 3.877342419080068, "step": 5690}, {"loss": 1.4923, "grad_norm": 0.6736966371536255, "learning_rate": 0.0002, "epoch": 3.8841567291311754, "step": 5700}, {"loss": 1.4453, "grad_norm": 0.7444243431091309, "learning_rate": 0.0002, "epoch": 3.8909710391822827, "step": 5710}, {"loss": 1.4562, "grad_norm": 0.6701464653015137, "learning_rate": 0.0002, "epoch": 3.89778534923339, "step": 5720}, {"loss": 1.4478, "grad_norm": 0.7231952548027039, "learning_rate": 0.0002, "epoch": 3.9045996592844974, "step": 5730}, {"loss": 1.4539, "grad_norm": 0.831954300403595, "learning_rate": 0.0002, "epoch": 3.9114139693356047, "step": 5740}, {"loss": 1.5122, "grad_norm": 0.7697733640670776, "learning_rate": 0.0002, "epoch": 3.918228279386712, "step": 5750}, {"loss": 1.4552, "grad_norm": 0.6964395046234131, "learning_rate": 0.0002, "epoch": 3.9250425894378194, "step": 5760}, {"loss": 1.4688, "grad_norm": 0.6942925453186035, "learning_rate": 0.0002, "epoch": 3.9318568994889267, "step": 5770}, {"loss": 1.4668, "grad_norm": 0.6491202712059021, "learning_rate": 0.0002, "epoch": 3.938671209540034, "step": 5780}, {"loss": 1.4404, "grad_norm": 0.7004382610321045, "learning_rate": 0.0002, "epoch": 3.9454855195911414, "step": 5790}, {"loss": 1.5022, "grad_norm": 0.7337747812271118, "learning_rate": 0.0002, "epoch": 3.9522998296422487, "step": 5800}, {"loss": 1.5314, "grad_norm": 0.6923640966415405, "learning_rate": 0.0002, "epoch": 3.959114139693356, "step": 5810}, {"loss": 1.4811, "grad_norm": 0.6815266609191895, "learning_rate": 0.0002, "epoch": 3.9659284497444633, "step": 5820}, {"loss": 1.437, "grad_norm": 0.6755654811859131, "learning_rate": 0.0002, "epoch": 3.9727427597955707, "step": 5830}, {"loss": 1.4277, "grad_norm": 0.6912487149238586, "learning_rate": 0.0002, "epoch": 3.979557069846678, "step": 5840}, {"loss": 1.4654, "grad_norm": 0.6948044896125793, "learning_rate": 0.0002, "epoch": 3.9863713798977853, "step": 5850}, {"loss": 1.4779, "grad_norm": 0.6735455989837646, "learning_rate": 0.0002, "epoch": 3.9931856899488927, "step": 5860}, {"loss": 1.5102, "grad_norm": 0.7005048990249634, "learning_rate": 0.0002, "epoch": 4.0, "step": 5870}, {"eval_loss": 1.923058032989502, "eval_runtime": 58.9903, "eval_samples_per_second": 8.595, "eval_steps_per_second": 1.085, "epoch": 4.0, "step": 5870}, {"loss": 1.2417, "grad_norm": 0.809018075466156, "learning_rate": 0.0002, "epoch": 4.006814310051107, "step": 5880}, {"loss": 1.2874, "grad_norm": 0.9499403238296509, "learning_rate": 0.0002, "epoch": 4.013628620102215, "step": 5890}, {"loss": 1.2245, "grad_norm": 0.7944574356079102, "learning_rate": 0.0002, "epoch": 4.0204429301533215, "step": 5900}, {"loss": 1.2751, "grad_norm": 0.9501046538352966, "learning_rate": 0.0002, "epoch": 4.027257240204429, "step": 5910}, {"loss": 1.2706, "grad_norm": 0.8247923254966736, "learning_rate": 0.0002, "epoch": 4.034071550255536, "step": 5920}, {"loss": 1.2762, "grad_norm": 0.9358038902282715, "learning_rate": 0.0002, "epoch": 4.040885860306644, "step": 5930}, {"loss": 1.2953, "grad_norm": 1.0102452039718628, "learning_rate": 0.0002, "epoch": 4.047700170357751, "step": 5940}, {"loss": 1.216, "grad_norm": 1.0248252153396606, "learning_rate": 0.0002, "epoch": 4.054514480408859, "step": 5950}, {"loss": 1.2115, "grad_norm": 1.0438553094863892, "learning_rate": 0.0002, "epoch": 4.0613287904599655, "step": 5960}, {"loss": 1.2516, "grad_norm": 0.7964957356452942, "learning_rate": 0.0002, "epoch": 4.068143100511073, "step": 5970}, {"loss": 1.1555, "grad_norm": 0.9757015109062195, "learning_rate": 0.0002, "epoch": 4.07495741056218, "step": 5980}, {"loss": 1.2243, "grad_norm": 0.9157161116600037, "learning_rate": 0.0002, "epoch": 4.081771720613288, "step": 5990}, {"loss": 1.2481, "grad_norm": 0.9372851848602295, "learning_rate": 0.0002, "epoch": 4.088586030664395, "step": 6000}, {"loss": 1.2091, "grad_norm": 1.240779995918274, "learning_rate": 0.0002, "epoch": 4.095400340715503, "step": 6010}, {"loss": 1.1727, "grad_norm": 0.8394840359687805, "learning_rate": 0.0002, "epoch": 4.1022146507666095, "step": 6020}, {"loss": 1.2926, "grad_norm": 1.1081455945968628, "learning_rate": 0.0002, "epoch": 4.109028960817717, "step": 6030}, {"loss": 1.2417, "grad_norm": 0.9227745532989502, "learning_rate": 0.0002, "epoch": 4.115843270868824, "step": 6040}, {"loss": 1.1994, "grad_norm": 0.8487664461135864, "learning_rate": 0.0002, "epoch": 4.122657580919932, "step": 6050}, {"loss": 1.2378, "grad_norm": 0.9643339514732361, "learning_rate": 0.0002, "epoch": 4.129471890971039, "step": 6060}, {"loss": 1.2254, "grad_norm": 1.0296099185943604, "learning_rate": 0.0002, "epoch": 4.136286201022147, "step": 6070}, {"loss": 1.2419, "grad_norm": 0.9534215927124023, "learning_rate": 0.0002, "epoch": 4.1431005110732535, "step": 6080}, {"loss": 1.1849, "grad_norm": 0.9647086262702942, "learning_rate": 0.0002, "epoch": 4.149914821124361, "step": 6090}, {"loss": 1.2713, "grad_norm": 1.084836721420288, "learning_rate": 0.0002, "epoch": 4.156729131175468, "step": 6100}, {"loss": 1.1788, "grad_norm": 0.9315235614776611, "learning_rate": 0.0002, "epoch": 4.163543441226576, "step": 6110}, {"loss": 1.17, "grad_norm": 0.9541679620742798, "learning_rate": 0.0002, "epoch": 4.170357751277683, "step": 6120}, {"loss": 1.1407, "grad_norm": 0.9792100191116333, "learning_rate": 0.0002, "epoch": 4.177172061328791, "step": 6130}, {"loss": 1.2069, "grad_norm": 1.065783143043518, "learning_rate": 0.0002, "epoch": 4.1839863713798975, "step": 6140}, {"loss": 1.2512, "grad_norm": 1.036161184310913, "learning_rate": 0.0002, "epoch": 4.190800681431005, "step": 6150}, {"loss": 1.2371, "grad_norm": 0.8979679942131042, "learning_rate": 0.0002, "epoch": 4.197614991482112, "step": 6160}, {"loss": 1.2212, "grad_norm": 0.7584333419799805, "learning_rate": 0.0002, "epoch": 4.20442930153322, "step": 6170}, {"loss": 1.2128, "grad_norm": 1.1970131397247314, "learning_rate": 0.0002, "epoch": 4.211243611584327, "step": 6180}, {"loss": 1.1982, "grad_norm": 2.6447298526763916, "learning_rate": 0.0002, "epoch": 4.218057921635435, "step": 6190}, {"loss": 1.2465, "grad_norm": 0.9357487559318542, "learning_rate": 0.0002, "epoch": 4.2248722316865415, "step": 6200}, {"loss": 1.2963, "grad_norm": 0.9141183495521545, "learning_rate": 0.0002, "epoch": 4.231686541737649, "step": 6210}, {"loss": 1.1959, "grad_norm": 1.0606296062469482, "learning_rate": 0.0002, "epoch": 4.238500851788756, "step": 6220}, {"loss": 1.2629, "grad_norm": 0.9999088048934937, "learning_rate": 0.0002, "epoch": 4.245315161839864, "step": 6230}, {"loss": 1.1471, "grad_norm": 0.9469764232635498, "learning_rate": 0.0002, "epoch": 4.252129471890971, "step": 6240}, {"loss": 1.223, "grad_norm": 1.1508198976516724, "learning_rate": 0.0002, "epoch": 4.258943781942079, "step": 6250}, {"loss": 1.2677, "grad_norm": 1.2576130628585815, "learning_rate": 0.0002, "epoch": 4.2657580919931855, "step": 6260}, {"loss": 1.2216, "grad_norm": 0.9435968399047852, "learning_rate": 0.0002, "epoch": 4.272572402044293, "step": 6270}, {"loss": 1.2788, "grad_norm": 0.9290348887443542, "learning_rate": 0.0002, "epoch": 4.2793867120954, "step": 6280}, {"loss": 1.2631, "grad_norm": 0.9973701238632202, "learning_rate": 0.0002, "epoch": 4.286201022146508, "step": 6290}, {"loss": 1.2276, "grad_norm": 1.012855887413025, "learning_rate": 0.0002, "epoch": 4.293015332197615, "step": 6300}, {"loss": 1.2115, "grad_norm": 0.8371705412864685, "learning_rate": 0.0002, "epoch": 4.2998296422487225, "step": 6310}, {"loss": 1.2423, "grad_norm": 1.0867925882339478, "learning_rate": 0.0002, "epoch": 4.306643952299829, "step": 6320}, {"loss": 1.2262, "grad_norm": 0.9763767123222351, "learning_rate": 0.0002, "epoch": 4.313458262350937, "step": 6330}, {"loss": 1.2557, "grad_norm": 1.1844252347946167, "learning_rate": 0.0002, "epoch": 4.320272572402044, "step": 6340}, {"loss": 1.2635, "grad_norm": 0.8292830586433411, "learning_rate": 0.0002, "epoch": 4.327086882453152, "step": 6350}, {"loss": 1.262, "grad_norm": 0.9351436495780945, "learning_rate": 0.0002, "epoch": 4.333901192504259, "step": 6360}, {"loss": 1.2678, "grad_norm": 1.0425835847854614, "learning_rate": 0.0002, "epoch": 4.3407155025553665, "step": 6370}, {"loss": 1.2476, "grad_norm": 0.8894261121749878, "learning_rate": 0.0002, "epoch": 4.347529812606473, "step": 6380}, {"loss": 1.2965, "grad_norm": 0.9663366079330444, "learning_rate": 0.0002, "epoch": 4.354344122657581, "step": 6390}, {"loss": 1.2529, "grad_norm": 0.8915578126907349, "learning_rate": 0.0002, "epoch": 4.361158432708688, "step": 6400}, {"loss": 1.2573, "grad_norm": 1.0393000841140747, "learning_rate": 0.0002, "epoch": 4.367972742759796, "step": 6410}, {"loss": 1.2254, "grad_norm": 0.917398989200592, "learning_rate": 0.0002, "epoch": 4.374787052810903, "step": 6420}, {"loss": 1.3115, "grad_norm": 1.0496646165847778, "learning_rate": 0.0002, "epoch": 4.3816013628620105, "step": 6430}, {"loss": 1.2607, "grad_norm": 0.9349859356880188, "learning_rate": 0.0002, "epoch": 4.388415672913117, "step": 6440}, {"loss": 1.3414, "grad_norm": 1.0981004238128662, "learning_rate": 0.0002, "epoch": 4.395229982964225, "step": 6450}, {"loss": 1.2391, "grad_norm": 0.9794871807098389, "learning_rate": 0.0002, "epoch": 4.402044293015332, "step": 6460}, {"loss": 1.208, "grad_norm": 0.9321421384811401, "learning_rate": 0.0002, "epoch": 4.40885860306644, "step": 6470}, {"loss": 1.3398, "grad_norm": 0.9158342480659485, "learning_rate": 0.0002, "epoch": 4.415672913117547, "step": 6480}, {"loss": 1.1832, "grad_norm": 0.9462087750434875, "learning_rate": 0.0002, "epoch": 4.4224872231686545, "step": 6490}, {"loss": 1.2366, "grad_norm": 0.9740175604820251, "learning_rate": 0.0002, "epoch": 4.429301533219761, "step": 6500}, {"loss": 1.3074, "grad_norm": 0.8477463126182556, "learning_rate": 0.0002, "epoch": 4.436115843270869, "step": 6510}, {"loss": 1.2719, "grad_norm": 1.0296647548675537, "learning_rate": 0.0002, "epoch": 4.442930153321976, "step": 6520}, {"loss": 1.2647, "grad_norm": 0.9437751173973083, "learning_rate": 0.0002, "epoch": 4.449744463373084, "step": 6530}, {"loss": 1.2043, "grad_norm": 1.011192798614502, "learning_rate": 0.0002, "epoch": 4.456558773424191, "step": 6540}, {"loss": 1.3673, "grad_norm": 0.8836222290992737, "learning_rate": 0.0002, "epoch": 4.4633730834752985, "step": 6550}, {"loss": 1.3028, "grad_norm": 1.2799941301345825, "learning_rate": 0.0002, "epoch": 4.470187393526405, "step": 6560}, {"loss": 1.2789, "grad_norm": 0.925910472869873, "learning_rate": 0.0002, "epoch": 4.477001703577513, "step": 6570}, {"loss": 1.2723, "grad_norm": 0.957401692867279, "learning_rate": 0.0002, "epoch": 4.48381601362862, "step": 6580}, {"loss": 1.242, "grad_norm": 1.0789544582366943, "learning_rate": 0.0002, "epoch": 4.490630323679728, "step": 6590}, {"loss": 1.2553, "grad_norm": 0.8874586820602417, "learning_rate": 0.0002, "epoch": 4.497444633730835, "step": 6600}, {"loss": 1.2779, "grad_norm": 0.9394784569740295, "learning_rate": 0.0002, "epoch": 4.504258943781942, "step": 6610}, {"loss": 1.2744, "grad_norm": 1.029640793800354, "learning_rate": 0.0002, "epoch": 4.511073253833049, "step": 6620}, {"loss": 1.2634, "grad_norm": 0.9510841965675354, "learning_rate": 0.0002, "epoch": 4.517887563884157, "step": 6630}, {"loss": 1.2562, "grad_norm": 0.9992963671684265, "learning_rate": 0.0002, "epoch": 4.524701873935264, "step": 6640}, {"loss": 1.2942, "grad_norm": 0.9312878847122192, "learning_rate": 0.0002, "epoch": 4.531516183986371, "step": 6650}, {"loss": 1.2572, "grad_norm": 0.9406482577323914, "learning_rate": 0.0002, "epoch": 4.538330494037479, "step": 6660}, {"loss": 1.2283, "grad_norm": 1.1058286428451538, "learning_rate": 0.0002, "epoch": 4.5451448040885865, "step": 6670}, {"loss": 1.2391, "grad_norm": 0.9389635920524597, "learning_rate": 0.0002, "epoch": 4.551959114139693, "step": 6680}, {"loss": 1.2696, "grad_norm": 1.0356028079986572, "learning_rate": 0.0002, "epoch": 4.5587734241908, "step": 6690}, {"loss": 1.2935, "grad_norm": 0.9370909929275513, "learning_rate": 0.0002, "epoch": 4.565587734241908, "step": 6700}, {"loss": 1.2914, "grad_norm": 0.9917567372322083, "learning_rate": 0.0002, "epoch": 4.572402044293016, "step": 6710}, {"loss": 1.3318, "grad_norm": 0.9065384864807129, "learning_rate": 0.0002, "epoch": 4.579216354344123, "step": 6720}, {"loss": 1.2909, "grad_norm": 1.3347833156585693, "learning_rate": 0.0002, "epoch": 4.5860306643952296, "step": 6730}, {"loss": 1.3322, "grad_norm": 0.910632312297821, "learning_rate": 0.0002, "epoch": 4.592844974446337, "step": 6740}, {"loss": 1.2584, "grad_norm": 0.8874805569648743, "learning_rate": 0.0002, "epoch": 4.599659284497445, "step": 6750}, {"loss": 1.3173, "grad_norm": 0.9355664253234863, "learning_rate": 0.0002, "epoch": 4.606473594548552, "step": 6760}, {"loss": 1.3515, "grad_norm": 0.9360204339027405, "learning_rate": 0.0002, "epoch": 4.613287904599659, "step": 6770}, {"loss": 1.2326, "grad_norm": 0.9931750893592834, "learning_rate": 0.0002, "epoch": 4.620102214650767, "step": 6780}, {"loss": 1.2677, "grad_norm": 0.9195131063461304, "learning_rate": 0.0002, "epoch": 4.626916524701874, "step": 6790}, {"loss": 1.3417, "grad_norm": 0.9448373913764954, "learning_rate": 0.0002, "epoch": 4.633730834752981, "step": 6800}, {"loss": 1.2658, "grad_norm": 1.162890911102295, "learning_rate": 0.0002, "epoch": 4.640545144804088, "step": 6810}, {"loss": 1.2841, "grad_norm": 0.9739466905593872, "learning_rate": 0.0002, "epoch": 4.647359454855196, "step": 6820}, {"loss": 1.3068, "grad_norm": 0.9462909698486328, "learning_rate": 0.0002, "epoch": 4.654173764906303, "step": 6830}, {"loss": 1.284, "grad_norm": 1.042639970779419, "learning_rate": 0.0002, "epoch": 4.660988074957411, "step": 6840}, {"loss": 1.3337, "grad_norm": 0.8910539150238037, "learning_rate": 0.0002, "epoch": 4.6678023850085175, "step": 6850}, {"loss": 1.3025, "grad_norm": 1.0806447267532349, "learning_rate": 0.0002, "epoch": 4.674616695059625, "step": 6860}, {"loss": 1.2258, "grad_norm": 1.0054864883422852, "learning_rate": 0.0002, "epoch": 4.681431005110732, "step": 6870}, {"loss": 1.3261, "grad_norm": 0.7774158120155334, "learning_rate": 0.0002, "epoch": 4.68824531516184, "step": 6880}, {"loss": 1.2545, "grad_norm": 0.9729512333869934, "learning_rate": 0.0002, "epoch": 4.695059625212947, "step": 6890}, {"loss": 1.3251, "grad_norm": 1.2025411128997803, "learning_rate": 0.0002, "epoch": 4.701873935264055, "step": 6900}, {"loss": 1.3418, "grad_norm": 1.1654069423675537, "learning_rate": 0.0002, "epoch": 4.7086882453151615, "step": 6910}, {"loss": 1.3091, "grad_norm": 1.1501442193984985, "learning_rate": 0.0002, "epoch": 4.715502555366269, "step": 6920}, {"loss": 1.2627, "grad_norm": 1.1083979606628418, "learning_rate": 0.0002, "epoch": 4.722316865417376, "step": 6930}, {"loss": 1.2836, "grad_norm": 0.9431378841400146, "learning_rate": 0.0002, "epoch": 4.729131175468484, "step": 6940}, {"loss": 1.3381, "grad_norm": 0.9722502827644348, "learning_rate": 0.0002, "epoch": 4.735945485519591, "step": 6950}, {"loss": 1.3228, "grad_norm": 0.9094559550285339, "learning_rate": 0.0002, "epoch": 4.742759795570699, "step": 6960}, {"loss": 1.3474, "grad_norm": 0.9918473958969116, "learning_rate": 0.0002, "epoch": 4.7495741056218055, "step": 6970}, {"loss": 1.3352, "grad_norm": 0.9999690651893616, "learning_rate": 0.0002, "epoch": 4.756388415672913, "step": 6980}, {"loss": 1.3579, "grad_norm": 1.0453810691833496, "learning_rate": 0.0002, "epoch": 4.76320272572402, "step": 6990}, {"loss": 1.294, "grad_norm": 1.0167806148529053, "learning_rate": 0.0002, "epoch": 4.770017035775128, "step": 7000}, {"loss": 1.3247, "grad_norm": 0.8133894801139832, "learning_rate": 0.0002, "epoch": 4.776831345826235, "step": 7010}, {"loss": 1.2577, "grad_norm": 0.8000897765159607, "learning_rate": 0.0002, "epoch": 4.783645655877343, "step": 7020}, {"loss": 1.2802, "grad_norm": 0.992080569267273, "learning_rate": 0.0002, "epoch": 4.7904599659284495, "step": 7030}, {"loss": 1.3269, "grad_norm": 0.9824522137641907, "learning_rate": 0.0002, "epoch": 4.797274275979557, "step": 7040}, {"loss": 1.279, "grad_norm": 0.9808870553970337, "learning_rate": 0.0002, "epoch": 4.804088586030664, "step": 7050}, {"loss": 1.3342, "grad_norm": 0.9679701924324036, "learning_rate": 0.0002, "epoch": 4.810902896081772, "step": 7060}, {"loss": 1.2711, "grad_norm": 0.9895215034484863, "learning_rate": 0.0002, "epoch": 4.817717206132879, "step": 7070}, {"loss": 1.3008, "grad_norm": 1.052246332168579, "learning_rate": 0.0002, "epoch": 4.824531516183987, "step": 7080}, {"loss": 1.2874, "grad_norm": 0.9243564605712891, "learning_rate": 0.0002, "epoch": 4.8313458262350935, "step": 7090}, {"loss": 1.2835, "grad_norm": 0.9545369744300842, "learning_rate": 0.0002, "epoch": 4.838160136286201, "step": 7100}, {"loss": 1.31, "grad_norm": 0.9655884504318237, "learning_rate": 0.0002, "epoch": 4.844974446337308, "step": 7110}, {"loss": 1.2862, "grad_norm": 0.9708049893379211, "learning_rate": 0.0002, "epoch": 4.851788756388416, "step": 7120}, {"loss": 1.3425, "grad_norm": 1.0064880847930908, "learning_rate": 0.0002, "epoch": 4.858603066439523, "step": 7130}, {"loss": 1.2899, "grad_norm": 0.939943790435791, "learning_rate": 0.0002, "epoch": 4.8654173764906306, "step": 7140}, {"loss": 1.2887, "grad_norm": 1.0750784873962402, "learning_rate": 0.0002, "epoch": 4.872231686541737, "step": 7150}, {"loss": 1.3367, "grad_norm": 0.9708989262580872, "learning_rate": 0.0002, "epoch": 4.879045996592845, "step": 7160}, {"loss": 1.2797, "grad_norm": 1.0228253602981567, "learning_rate": 0.0002, "epoch": 4.885860306643952, "step": 7170}, {"loss": 1.2695, "grad_norm": 0.8963132500648499, "learning_rate": 0.0002, "epoch": 4.89267461669506, "step": 7180}, {"loss": 1.3473, "grad_norm": 0.9198015928268433, "learning_rate": 0.0002, "epoch": 4.899488926746167, "step": 7190}, {"loss": 1.2541, "grad_norm": 1.099906086921692, "learning_rate": 0.0002, "epoch": 4.9063032367972745, "step": 7200}, {"loss": 1.3188, "grad_norm": 1.0624815225601196, "learning_rate": 0.0002, "epoch": 4.913117546848381, "step": 7210}, {"loss": 1.3026, "grad_norm": 0.9688444137573242, "learning_rate": 0.0002, "epoch": 4.919931856899489, "step": 7220}, {"loss": 1.3379, "grad_norm": 0.867011547088623, "learning_rate": 0.0002, "epoch": 4.926746166950596, "step": 7230}, {"loss": 1.289, "grad_norm": 0.9600282311439514, "learning_rate": 0.0002, "epoch": 4.933560477001704, "step": 7240}, {"loss": 1.2751, "grad_norm": 0.8979372978210449, "learning_rate": 0.0002, "epoch": 4.940374787052811, "step": 7250}, {"loss": 1.3426, "grad_norm": 0.951474130153656, "learning_rate": 0.0002, "epoch": 4.9471890971039185, "step": 7260}, {"loss": 1.2726, "grad_norm": 0.824851393699646, "learning_rate": 0.0002, "epoch": 4.954003407155025, "step": 7270}, {"loss": 1.2679, "grad_norm": 1.2926591634750366, "learning_rate": 0.0002, "epoch": 4.960817717206133, "step": 7280}, {"loss": 1.2974, "grad_norm": 1.1057835817337036, "learning_rate": 0.0002, "epoch": 4.96763202725724, "step": 7290}, {"loss": 1.2275, "grad_norm": 0.9814816117286682, "learning_rate": 0.0002, "epoch": 4.974446337308348, "step": 7300}, {"loss": 1.3001, "grad_norm": 1.0251333713531494, "learning_rate": 0.0002, "epoch": 4.981260647359455, "step": 7310}, {"loss": 1.3113, "grad_norm": 0.9748668074607849, "learning_rate": 0.0002, "epoch": 4.9880749574105625, "step": 7320}, {"loss": 1.3595, "grad_norm": 0.8552228808403015, "learning_rate": 0.0002, "epoch": 4.994889267461669, "step": 7330}, {"eval_loss": 2.03971004486084, "eval_runtime": 67.4144, "eval_samples_per_second": 7.521, "eval_steps_per_second": 0.949, "epoch": 4.999659284497445, "step": 7337}, {"loss": 1.2464, "grad_norm": 0.8210785388946533, "learning_rate": 0.0002, "epoch": 5.001703577512777, "step": 7340}, {"loss": 1.0356, "grad_norm": 1.2577511072158813, "learning_rate": 0.0002, "epoch": 5.008517887563884, "step": 7350}, {"loss": 0.9944, "grad_norm": 1.280604362487793, "learning_rate": 0.0002, "epoch": 5.015332197614992, "step": 7360}, {"loss": 1.0858, "grad_norm": 1.3985474109649658, "learning_rate": 0.0002, "epoch": 5.022146507666099, "step": 7370}, {"loss": 1.0122, "grad_norm": 1.1621310710906982, "learning_rate": 0.0002, "epoch": 5.0289608177172065, "step": 7380}, {"loss": 1.05, "grad_norm": 1.3278541564941406, "learning_rate": 0.0002, "epoch": 5.035775127768313, "step": 7390}, {"loss": 1.0237, "grad_norm": 1.1166491508483887, "learning_rate": 0.0002, "epoch": 5.042589437819421, "step": 7400}, {"loss": 1.0397, "grad_norm": 1.8087667226791382, "learning_rate": 0.0002, "epoch": 5.049403747870528, "step": 7410}, {"loss": 1.0191, "grad_norm": 1.1517921686172485, "learning_rate": 0.0002, "epoch": 5.056218057921636, "step": 7420}, {"loss": 1.025, "grad_norm": 1.2875889539718628, "learning_rate": 0.0002, "epoch": 5.063032367972743, "step": 7430}, {"loss": 1.043, "grad_norm": 1.199702262878418, "learning_rate": 0.0002, "epoch": 5.0698466780238505, "step": 7440}, {"loss": 1.0176, "grad_norm": 1.2912452220916748, "learning_rate": 0.0002, "epoch": 5.076660988074957, "step": 7450}, {"loss": 1.0042, "grad_norm": 1.1446452140808105, "learning_rate": 0.0002, "epoch": 5.083475298126065, "step": 7460}, {"loss": 1.047, "grad_norm": 1.3625746965408325, "learning_rate": 0.0002, "epoch": 5.090289608177172, "step": 7470}, {"loss": 1.052, "grad_norm": 1.2116546630859375, "learning_rate": 0.0002, "epoch": 5.09710391822828, "step": 7480}, {"loss": 1.1041, "grad_norm": 1.3896098136901855, "learning_rate": 0.0002, "epoch": 5.103918228279387, "step": 7490}, {"loss": 1.0668, "grad_norm": 1.6265277862548828, "learning_rate": 0.0002, "epoch": 5.1107325383304945, "step": 7500}, {"loss": 1.028, "grad_norm": 1.1468392610549927, "learning_rate": 0.0002, "epoch": 5.117546848381601, "step": 7510}, {"loss": 0.9915, "grad_norm": 1.2649329900741577, "learning_rate": 0.0002, "epoch": 5.124361158432709, "step": 7520}, {"loss": 1.0251, "grad_norm": 1.1866015195846558, "learning_rate": 0.0002, "epoch": 5.131175468483816, "step": 7530}, {"loss": 1.0626, "grad_norm": 1.1517255306243896, "learning_rate": 0.0002, "epoch": 5.137989778534923, "step": 7540}, {"loss": 1.0303, "grad_norm": 1.3475146293640137, "learning_rate": 0.0002, "epoch": 5.144804088586031, "step": 7550}, {"loss": 1.0456, "grad_norm": 1.1167018413543701, "learning_rate": 0.0002, "epoch": 5.151618398637138, "step": 7560}, {"loss": 1.04, "grad_norm": 1.209572434425354, "learning_rate": 0.0002, "epoch": 5.158432708688245, "step": 7570}, {"loss": 1.0533, "grad_norm": 1.3578280210494995, "learning_rate": 0.0002, "epoch": 5.165247018739352, "step": 7580}, {"loss": 1.0958, "grad_norm": 1.2447012662887573, "learning_rate": 0.0002, "epoch": 5.17206132879046, "step": 7590}, {"loss": 1.0521, "grad_norm": 1.3715848922729492, "learning_rate": 0.0002, "epoch": 5.178875638841567, "step": 7600}, {"loss": 1.0556, "grad_norm": 1.435860276222229, "learning_rate": 0.0002, "epoch": 5.185689948892675, "step": 7610}, {"loss": 1.0504, "grad_norm": 1.4093858003616333, "learning_rate": 0.0002, "epoch": 5.1925042589437815, "step": 7620}, {"loss": 1.083, "grad_norm": 1.1747535467147827, "learning_rate": 0.0002, "epoch": 5.199318568994889, "step": 7630}, {"loss": 1.048, "grad_norm": 1.4704833030700684, "learning_rate": 0.0002, "epoch": 5.206132879045996, "step": 7640}, {"loss": 0.9991, "grad_norm": 1.2270972728729248, "learning_rate": 0.0002, "epoch": 5.212947189097104, "step": 7650}, {"loss": 1.0738, "grad_norm": 1.2215691804885864, "learning_rate": 0.0002, "epoch": 5.219761499148211, "step": 7660}, {"loss": 1.0628, "grad_norm": 1.3641486167907715, "learning_rate": 0.0002, "epoch": 5.226575809199319, "step": 7670}, {"loss": 1.1066, "grad_norm": 1.3532041311264038, "learning_rate": 0.0002, "epoch": 5.2333901192504255, "step": 7680}, {"loss": 1.0209, "grad_norm": 1.2243095636367798, "learning_rate": 0.0002, "epoch": 5.240204429301533, "step": 7690}, {"loss": 1.0503, "grad_norm": 1.3644746541976929, "learning_rate": 0.0002, "epoch": 5.24701873935264, "step": 7700}, {"loss": 1.0406, "grad_norm": 1.18478262424469, "learning_rate": 0.0002, "epoch": 5.253833049403748, "step": 7710}, {"loss": 1.1023, "grad_norm": 1.2146114110946655, "learning_rate": 0.0002, "epoch": 5.260647359454855, "step": 7720}, {"loss": 1.1528, "grad_norm": 1.233984112739563, "learning_rate": 0.0002, "epoch": 5.267461669505963, "step": 7730}, {"loss": 1.0681, "grad_norm": 1.3709665536880493, "learning_rate": 0.0002, "epoch": 5.2742759795570695, "step": 7740}, {"loss": 1.0195, "grad_norm": 1.36055326461792, "learning_rate": 0.0002, "epoch": 5.281090289608177, "step": 7750}, {"loss": 1.0447, "grad_norm": 1.6232351064682007, "learning_rate": 0.0002, "epoch": 5.287904599659284, "step": 7760}, {"loss": 1.0627, "grad_norm": 1.3359960317611694, "learning_rate": 0.0002, "epoch": 5.294718909710392, "step": 7770}, {"loss": 1.1082, "grad_norm": 1.3815656900405884, "learning_rate": 0.0002, "epoch": 5.301533219761499, "step": 7780}, {"loss": 1.0891, "grad_norm": 1.1392076015472412, "learning_rate": 0.0002, "epoch": 5.308347529812607, "step": 7790}, {"loss": 1.0364, "grad_norm": 1.3006905317306519, "learning_rate": 0.0002, "epoch": 5.3151618398637135, "step": 7800}, {"loss": 1.1005, "grad_norm": 1.503645896911621, "learning_rate": 0.0002, "epoch": 5.321976149914821, "step": 7810}, {"loss": 1.0075, "grad_norm": 1.141939640045166, "learning_rate": 0.0002, "epoch": 5.328790459965928, "step": 7820}, {"loss": 1.0284, "grad_norm": 1.4654004573822021, "learning_rate": 0.0002, "epoch": 5.335604770017036, "step": 7830}, {"loss": 1.1185, "grad_norm": 1.4195219278335571, "learning_rate": 0.0002, "epoch": 5.342419080068143, "step": 7840}, {"loss": 1.0535, "grad_norm": 1.2354168891906738, "learning_rate": 0.0002, "epoch": 5.349233390119251, "step": 7850}, {"loss": 1.0923, "grad_norm": 1.529862880706787, "learning_rate": 0.0002, "epoch": 5.3560477001703575, "step": 7860}, {"loss": 1.1005, "grad_norm": 1.364678978919983, "learning_rate": 0.0002, "epoch": 5.362862010221465, "step": 7870}, {"loss": 1.1084, "grad_norm": 1.1010444164276123, "learning_rate": 0.0002, "epoch": 5.369676320272572, "step": 7880}, {"loss": 1.1225, "grad_norm": 1.1949712038040161, "learning_rate": 0.0002, "epoch": 5.37649063032368, "step": 7890}, {"loss": 1.058, "grad_norm": 1.485922932624817, "learning_rate": 0.0002, "epoch": 5.383304940374787, "step": 7900}, {"loss": 0.9894, "grad_norm": 1.0844227075576782, "learning_rate": 0.0002, "epoch": 5.390119250425895, "step": 7910}, {"loss": 1.0418, "grad_norm": 1.3784468173980713, "learning_rate": 0.0002, "epoch": 5.3969335604770015, "step": 7920}, {"loss": 1.0542, "grad_norm": 1.4771490097045898, "learning_rate": 0.0002, "epoch": 5.403747870528109, "step": 7930}, {"loss": 1.1265, "grad_norm": 1.2460103034973145, "learning_rate": 0.0002, "epoch": 5.410562180579216, "step": 7940}, {"loss": 1.096, "grad_norm": 1.3047645092010498, "learning_rate": 0.0002, "epoch": 5.417376490630324, "step": 7950}, {"loss": 1.0956, "grad_norm": 1.1396620273590088, "learning_rate": 0.0002, "epoch": 5.424190800681431, "step": 7960}, {"loss": 1.0685, "grad_norm": 1.4193450212478638, "learning_rate": 0.0002, "epoch": 5.4310051107325386, "step": 7970}, {"loss": 1.1347, "grad_norm": 1.2085850238800049, "learning_rate": 0.0002, "epoch": 5.437819420783645, "step": 7980}, {"loss": 1.0277, "grad_norm": 1.2721607685089111, "learning_rate": 0.0002, "epoch": 5.444633730834753, "step": 7990}, {"loss": 1.1316, "grad_norm": 1.4134020805358887, "learning_rate": 0.0002, "epoch": 5.45144804088586, "step": 8000}, {"loss": 1.0576, "grad_norm": 1.4283325672149658, "learning_rate": 0.0002, "epoch": 5.458262350936968, "step": 8010}, {"loss": 1.0505, "grad_norm": 1.3127079010009766, "learning_rate": 0.0002, "epoch": 5.465076660988075, "step": 8020}, {"loss": 1.0812, "grad_norm": 1.2924352884292603, "learning_rate": 0.0002, "epoch": 5.4718909710391825, "step": 8030}, {"loss": 1.1178, "grad_norm": 1.8000653982162476, "learning_rate": 0.0002, "epoch": 5.478705281090289, "step": 8040}, {"loss": 1.1205, "grad_norm": 1.1538785696029663, "learning_rate": 0.0002, "epoch": 5.485519591141397, "step": 8050}, {"loss": 1.1015, "grad_norm": 1.1173290014266968, "learning_rate": 0.0002, "epoch": 5.492333901192504, "step": 8060}, {"loss": 1.1597, "grad_norm": 1.1501243114471436, "learning_rate": 0.0002, "epoch": 5.499148211243612, "step": 8070}, {"loss": 1.1465, "grad_norm": 1.1335760354995728, "learning_rate": 0.0002, "epoch": 5.505962521294719, "step": 8080}, {"loss": 1.1005, "grad_norm": 1.565274953842163, "learning_rate": 0.0002, "epoch": 5.5127768313458265, "step": 8090}, {"loss": 1.1085, "grad_norm": 1.3415014743804932, "learning_rate": 0.0002, "epoch": 5.519591141396933, "step": 8100}, {"loss": 1.1166, "grad_norm": 1.2377240657806396, "learning_rate": 0.0002, "epoch": 5.526405451448041, "step": 8110}, {"loss": 1.0766, "grad_norm": 1.3333637714385986, "learning_rate": 0.0002, "epoch": 5.533219761499148, "step": 8120}, {"loss": 1.1515, "grad_norm": 1.2620662450790405, "learning_rate": 0.0002, "epoch": 5.540034071550256, "step": 8130}, {"loss": 1.0839, "grad_norm": 1.2806652784347534, "learning_rate": 0.0002, "epoch": 5.546848381601363, "step": 8140}, {"loss": 1.1221, "grad_norm": 1.2057335376739502, "learning_rate": 0.0002, "epoch": 5.5536626916524705, "step": 8150}, {"loss": 1.1292, "grad_norm": 1.411726951599121, "learning_rate": 0.0002, "epoch": 5.560477001703577, "step": 8160}, {"loss": 1.0887, "grad_norm": 1.381104588508606, "learning_rate": 0.0002, "epoch": 5.567291311754685, "step": 8170}, {"loss": 1.1317, "grad_norm": 1.3449294567108154, "learning_rate": 0.0002, "epoch": 5.574105621805792, "step": 8180}, {"loss": 1.1392, "grad_norm": 1.2791016101837158, "learning_rate": 0.0002, "epoch": 5.5809199318569, "step": 8190}, {"loss": 1.0972, "grad_norm": 1.276891827583313, "learning_rate": 0.0002, "epoch": 5.587734241908007, "step": 8200}, {"loss": 1.1001, "grad_norm": 1.3951541185379028, "learning_rate": 0.0002, "epoch": 5.5945485519591145, "step": 8210}, {"loss": 1.0993, "grad_norm": 1.4167890548706055, "learning_rate": 0.0002, "epoch": 5.601362862010221, "step": 8220}, {"loss": 1.0826, "grad_norm": 1.4388375282287598, "learning_rate": 0.0002, "epoch": 5.608177172061329, "step": 8230}, {"loss": 1.1941, "grad_norm": 1.210157036781311, "learning_rate": 0.0002, "epoch": 5.614991482112436, "step": 8240}, {"loss": 1.0833, "grad_norm": 1.0557862520217896, "learning_rate": 0.0002, "epoch": 5.621805792163544, "step": 8250}, {"loss": 1.1197, "grad_norm": 1.2913990020751953, "learning_rate": 0.0002, "epoch": 5.628620102214651, "step": 8260}, {"loss": 1.0346, "grad_norm": 1.2204737663269043, "learning_rate": 0.0002, "epoch": 5.6354344122657585, "step": 8270}, {"loss": 1.1429, "grad_norm": 1.57016921043396, "learning_rate": 0.0002, "epoch": 5.642248722316865, "step": 8280}, {"loss": 1.0988, "grad_norm": 1.0117967128753662, "learning_rate": 0.0002, "epoch": 5.649063032367973, "step": 8290}, {"loss": 1.0786, "grad_norm": 1.3195525407791138, "learning_rate": 0.0002, "epoch": 5.65587734241908, "step": 8300}, {"loss": 1.0618, "grad_norm": 1.2566497325897217, "learning_rate": 0.0002, "epoch": 5.662691652470187, "step": 8310}, {"loss": 1.1635, "grad_norm": 1.1446818113327026, "learning_rate": 0.0002, "epoch": 5.669505962521295, "step": 8320}, {"loss": 1.2201, "grad_norm": 1.2928680181503296, "learning_rate": 0.0002, "epoch": 5.6763202725724025, "step": 8330}, {"loss": 1.1488, "grad_norm": 1.2823996543884277, "learning_rate": 0.0002, "epoch": 5.683134582623509, "step": 8340}, {"loss": 1.0686, "grad_norm": 1.1523874998092651, "learning_rate": 0.0002, "epoch": 5.689948892674616, "step": 8350}, {"loss": 1.0938, "grad_norm": 1.0819287300109863, "learning_rate": 0.0002, "epoch": 5.696763202725724, "step": 8360}, {"loss": 1.167, "grad_norm": 1.2384417057037354, "learning_rate": 0.0002, "epoch": 5.703577512776832, "step": 8370}, {"loss": 1.1136, "grad_norm": 1.1733224391937256, "learning_rate": 0.0002, "epoch": 5.710391822827939, "step": 8380}, {"loss": 1.1041, "grad_norm": 1.3173418045043945, "learning_rate": 0.0002, "epoch": 5.7172061328790456, "step": 8390}, {"loss": 1.1014, "grad_norm": 1.285880446434021, "learning_rate": 0.0002, "epoch": 5.724020442930153, "step": 8400}, {"loss": 1.1161, "grad_norm": 1.1404874324798584, "learning_rate": 0.0002, "epoch": 5.730834752981261, "step": 8410}, {"loss": 1.192, "grad_norm": 1.2432540655136108, "learning_rate": 0.0002, "epoch": 5.737649063032368, "step": 8420}, {"loss": 1.1702, "grad_norm": 1.2432233095169067, "learning_rate": 0.0002, "epoch": 5.744463373083475, "step": 8430}, {"loss": 1.1357, "grad_norm": 1.154496669769287, "learning_rate": 0.0002, "epoch": 5.751277683134583, "step": 8440}, {"loss": 1.1706, "grad_norm": 1.3301030397415161, "learning_rate": 0.0002, "epoch": 5.75809199318569, "step": 8450}, {"loss": 1.2052, "grad_norm": 1.243760347366333, "learning_rate": 0.0002, "epoch": 5.764906303236797, "step": 8460}, {"loss": 1.1035, "grad_norm": 1.4083361625671387, "learning_rate": 0.0002, "epoch": 5.771720613287904, "step": 8470}, {"loss": 1.1362, "grad_norm": 1.5662120580673218, "learning_rate": 0.0002, "epoch": 5.778534923339012, "step": 8480}, {"loss": 1.1578, "grad_norm": 1.2111139297485352, "learning_rate": 0.0002, "epoch": 5.78534923339012, "step": 8490}, {"loss": 1.1333, "grad_norm": 1.2776305675506592, "learning_rate": 0.0002, "epoch": 5.792163543441227, "step": 8500}, {"loss": 1.1439, "grad_norm": 1.1777727603912354, "learning_rate": 0.0002, "epoch": 5.7989778534923335, "step": 8510}, {"loss": 1.0859, "grad_norm": 1.1696112155914307, "learning_rate": 0.0002, "epoch": 5.805792163543441, "step": 8520}, {"loss": 1.162, "grad_norm": 1.137397289276123, "learning_rate": 0.0002, "epoch": 5.812606473594548, "step": 8530}, {"loss": 1.2099, "grad_norm": 1.3182098865509033, "learning_rate": 0.0002, "epoch": 5.819420783645656, "step": 8540}, {"loss": 1.1427, "grad_norm": 1.359756588935852, "learning_rate": 0.0002, "epoch": 5.826235093696763, "step": 8550}, {"loss": 1.1714, "grad_norm": 1.4118162393569946, "learning_rate": 0.0002, "epoch": 5.833049403747871, "step": 8560}, {"loss": 1.1758, "grad_norm": 1.1899290084838867, "learning_rate": 0.0002, "epoch": 5.8398637137989775, "step": 8570}, {"loss": 1.1511, "grad_norm": 1.1764532327651978, "learning_rate": 0.0002, "epoch": 5.846678023850085, "step": 8580}, {"loss": 1.1633, "grad_norm": 1.33274245262146, "learning_rate": 0.0002, "epoch": 5.853492333901192, "step": 8590}, {"loss": 1.1092, "grad_norm": 1.2571861743927002, "learning_rate": 0.0002, "epoch": 5.8603066439523, "step": 8600}, {"loss": 1.1137, "grad_norm": 1.3523616790771484, "learning_rate": 0.0002, "epoch": 5.867120954003407, "step": 8610}, {"loss": 1.2442, "grad_norm": 1.3556902408599854, "learning_rate": 0.0002, "epoch": 5.873935264054515, "step": 8620}, {"loss": 1.0967, "grad_norm": 1.2864879369735718, "learning_rate": 0.0002, "epoch": 5.8807495741056215, "step": 8630}, {"loss": 1.1491, "grad_norm": 1.2872768640518188, "learning_rate": 0.0002, "epoch": 5.887563884156729, "step": 8640}, {"loss": 1.1003, "grad_norm": 1.1446053981781006, "learning_rate": 0.0002, "epoch": 5.894378194207836, "step": 8650}, {"loss": 1.1095, "grad_norm": 1.292615532875061, "learning_rate": 0.0002, "epoch": 5.901192504258944, "step": 8660}, {"loss": 1.2009, "grad_norm": 1.190891981124878, "learning_rate": 0.0002, "epoch": 5.908006814310051, "step": 8670}, {"loss": 1.1386, "grad_norm": 1.330273985862732, "learning_rate": 0.0002, "epoch": 5.914821124361159, "step": 8680}, {"loss": 1.1874, "grad_norm": 1.41121244430542, "learning_rate": 0.0002, "epoch": 5.9216354344122655, "step": 8690}, {"loss": 1.1573, "grad_norm": 1.1360729932785034, "learning_rate": 0.0002, "epoch": 5.928449744463373, "step": 8700}, {"loss": 1.115, "grad_norm": 1.2220772504806519, "learning_rate": 0.0002, "epoch": 5.93526405451448, "step": 8710}, {"loss": 1.1696, "grad_norm": 1.1077110767364502, "learning_rate": 0.0002, "epoch": 5.942078364565588, "step": 8720}, {"loss": 1.1443, "grad_norm": 1.3632500171661377, "learning_rate": 0.0002, "epoch": 5.948892674616695, "step": 8730}, {"loss": 1.1474, "grad_norm": 1.4695830345153809, "learning_rate": 0.0002, "epoch": 5.955706984667803, "step": 8740}, {"loss": 1.1825, "grad_norm": 1.217741847038269, "learning_rate": 0.0002, "epoch": 5.9625212947189095, "step": 8750}, {"loss": 1.1495, "grad_norm": 1.0386874675750732, "learning_rate": 0.0002, "epoch": 5.969335604770017, "step": 8760}, {"loss": 1.1146, "grad_norm": 1.2067872285842896, "learning_rate": 0.0002, "epoch": 5.976149914821124, "step": 8770}, {"loss": 1.1987, "grad_norm": 1.3842018842697144, "learning_rate": 0.0002, "epoch": 5.982964224872232, "step": 8780}, {"loss": 1.2147, "grad_norm": 1.4584033489227295, "learning_rate": 0.0002, "epoch": 5.989778534923339, "step": 8790}, {"loss": 1.2078, "grad_norm": 1.1912888288497925, "learning_rate": 0.0002, "epoch": 5.996592844974447, "step": 8800}, {"eval_loss": 2.261807441711426, "eval_runtime": 68.1125, "eval_samples_per_second": 7.444, "eval_steps_per_second": 0.94, "epoch": 6.0, "step": 8805}, {"loss": 1.004, "grad_norm": 1.1715940237045288, "learning_rate": 0.0002, "epoch": 6.003407155025553, "step": 8810}, {"loss": 0.8665, "grad_norm": 1.6573960781097412, "learning_rate": 0.0002, "epoch": 6.010221465076661, "step": 8820}, {"loss": 0.8866, "grad_norm": 1.2845953702926636, "learning_rate": 0.0002, "epoch": 6.017035775127768, "step": 8830}, {"loss": 0.8528, "grad_norm": 1.526754379272461, "learning_rate": 0.0002, "epoch": 6.023850085178876, "step": 8840}, {"loss": 0.8555, "grad_norm": 1.4536073207855225, "learning_rate": 0.0002, "epoch": 6.030664395229983, "step": 8850}, {"loss": 0.8839, "grad_norm": 1.68099045753479, "learning_rate": 0.0002, "epoch": 6.0374787052810905, "step": 8860}, {"loss": 0.8538, "grad_norm": 1.485777497291565, "learning_rate": 0.0002, "epoch": 6.044293015332197, "step": 8870}, {"loss": 0.8534, "grad_norm": 1.5084402561187744, "learning_rate": 0.0002, "epoch": 6.051107325383305, "step": 8880}, {"loss": 0.8587, "grad_norm": 1.3901145458221436, "learning_rate": 0.0002, "epoch": 6.057921635434412, "step": 8890}, {"loss": 0.8625, "grad_norm": 1.528954267501831, "learning_rate": 0.0002, "epoch": 6.06473594548552, "step": 8900}, {"loss": 0.9115, "grad_norm": 1.6869531869888306, "learning_rate": 0.0002, "epoch": 6.071550255536627, "step": 8910}, {"loss": 0.8817, "grad_norm": 1.4149913787841797, "learning_rate": 0.0002, "epoch": 6.0783645655877345, "step": 8920}, {"loss": 0.8734, "grad_norm": 1.6853618621826172, "learning_rate": 0.0002, "epoch": 6.085178875638841, "step": 8930}, {"loss": 0.8836, "grad_norm": 1.694443702697754, "learning_rate": 0.0002, "epoch": 6.091993185689949, "step": 8940}, {"loss": 0.9144, "grad_norm": 2.1037111282348633, "learning_rate": 0.0002, "epoch": 6.098807495741056, "step": 8950}, {"loss": 0.8296, "grad_norm": 2.1236703395843506, "learning_rate": 0.0002, "epoch": 6.105621805792164, "step": 8960}, {"loss": 0.8451, "grad_norm": 1.6621695756912231, "learning_rate": 0.0002, "epoch": 6.112436115843271, "step": 8970}, {"loss": 0.8423, "grad_norm": 1.5390307903289795, "learning_rate": 0.0002, "epoch": 6.1192504258943785, "step": 8980}, {"loss": 0.8829, "grad_norm": 1.7841306924819946, "learning_rate": 0.0002, "epoch": 6.126064735945485, "step": 8990}, {"loss": 0.8872, "grad_norm": 1.8420580625534058, "learning_rate": 0.0002, "epoch": 6.132879045996593, "step": 9000}, {"loss": 0.9411, "grad_norm": 1.8198356628417969, "learning_rate": 0.0002, "epoch": 6.1396933560477, "step": 9010}, {"loss": 0.8921, "grad_norm": 1.6955933570861816, "learning_rate": 0.0002, "epoch": 6.146507666098808, "step": 9020}, {"loss": 0.9241, "grad_norm": 1.5072602033615112, "learning_rate": 0.0002, "epoch": 6.153321976149915, "step": 9030}, {"loss": 0.8643, "grad_norm": 1.63434898853302, "learning_rate": 0.0002, "epoch": 6.1601362862010225, "step": 9040}, {"loss": 0.8317, "grad_norm": 1.3761866092681885, "learning_rate": 0.0002, "epoch": 6.166950596252129, "step": 9050}, {"loss": 0.8136, "grad_norm": 1.7027268409729004, "learning_rate": 0.0002, "epoch": 6.173764906303237, "step": 9060}, {"loss": 0.8333, "grad_norm": 1.3534049987792969, "learning_rate": 0.0002, "epoch": 6.180579216354344, "step": 9070}, {"loss": 0.847, "grad_norm": 1.4437154531478882, "learning_rate": 0.0002, "epoch": 6.187393526405452, "step": 9080}, {"loss": 0.9169, "grad_norm": 1.4449656009674072, "learning_rate": 0.0002, "epoch": 6.194207836456559, "step": 9090}, {"loss": 0.846, "grad_norm": 1.5854601860046387, "learning_rate": 0.0002, "epoch": 6.2010221465076665, "step": 9100}, {"loss": 0.8801, "grad_norm": 1.5987509489059448, "learning_rate": 0.0002, "epoch": 6.207836456558773, "step": 9110}, {"loss": 0.9077, "grad_norm": 1.6309672594070435, "learning_rate": 0.0002, "epoch": 6.214650766609881, "step": 9120}, {"loss": 0.8802, "grad_norm": 1.526936411857605, "learning_rate": 0.0002, "epoch": 6.221465076660988, "step": 9130}, {"loss": 0.8858, "grad_norm": 1.4649606943130493, "learning_rate": 0.0002, "epoch": 6.228279386712096, "step": 9140}, {"loss": 0.9414, "grad_norm": 1.589350700378418, "learning_rate": 0.0002, "epoch": 6.235093696763203, "step": 9150}, {"loss": 0.9001, "grad_norm": 1.655668020248413, "learning_rate": 0.0002, "epoch": 6.2419080068143105, "step": 9160}, {"loss": 0.9879, "grad_norm": 1.5296401977539062, "learning_rate": 0.0002, "epoch": 6.248722316865417, "step": 9170}, {"loss": 0.8908, "grad_norm": 1.5857278108596802, "learning_rate": 0.0002, "epoch": 6.255536626916525, "step": 9180}, {"loss": 0.9329, "grad_norm": 1.7779686450958252, "learning_rate": 0.0002, "epoch": 6.262350936967632, "step": 9190}, {"loss": 0.9683, "grad_norm": 1.588886022567749, "learning_rate": 0.0002, "epoch": 6.269165247018739, "step": 9200}, {"loss": 0.9091, "grad_norm": 1.3818320035934448, "learning_rate": 0.0002, "epoch": 6.275979557069847, "step": 9210}, {"loss": 0.9003, "grad_norm": 1.6675978899002075, "learning_rate": 0.0002, "epoch": 6.2827938671209544, "step": 9220}, {"loss": 0.9125, "grad_norm": 1.5672610998153687, "learning_rate": 0.0002, "epoch": 6.289608177172061, "step": 9230}, {"loss": 0.9083, "grad_norm": 1.4558004140853882, "learning_rate": 0.0002, "epoch": 6.296422487223168, "step": 9240}, {"loss": 0.9362, "grad_norm": 1.5393446683883667, "learning_rate": 0.0002, "epoch": 6.303236797274276, "step": 9250}, {"loss": 0.8807, "grad_norm": 1.4367083311080933, "learning_rate": 0.0002, "epoch": 6.310051107325384, "step": 9260}, {"loss": 0.9203, "grad_norm": 1.5045381784439087, "learning_rate": 0.0002, "epoch": 6.316865417376491, "step": 9270}, {"loss": 0.9239, "grad_norm": 1.8604016304016113, "learning_rate": 0.0002, "epoch": 6.3236797274275975, "step": 9280}, {"loss": 0.9644, "grad_norm": 1.4863131046295166, "learning_rate": 0.0002, "epoch": 6.330494037478705, "step": 9290}, {"loss": 0.9052, "grad_norm": 1.511121392250061, "learning_rate": 0.0002, "epoch": 6.337308347529812, "step": 9300}, {"loss": 0.8609, "grad_norm": 1.6979162693023682, "learning_rate": 0.0002, "epoch": 6.34412265758092, "step": 9310}, {"loss": 0.953, "grad_norm": 1.6060494184494019, "learning_rate": 0.0002, "epoch": 6.350936967632027, "step": 9320}, {"loss": 0.9552, "grad_norm": 1.6572561264038086, "learning_rate": 0.0002, "epoch": 6.357751277683135, "step": 9330}, {"loss": 0.9201, "grad_norm": 1.6706757545471191, "learning_rate": 0.0002, "epoch": 6.3645655877342415, "step": 9340}, {"loss": 0.8693, "grad_norm": 1.620836615562439, "learning_rate": 0.0002, "epoch": 6.371379897785349, "step": 9350}, {"loss": 0.9281, "grad_norm": 1.482940673828125, "learning_rate": 0.0002, "epoch": 6.378194207836456, "step": 9360}, {"loss": 0.9026, "grad_norm": 1.3969961404800415, "learning_rate": 0.0002, "epoch": 6.385008517887564, "step": 9370}, {"loss": 0.8909, "grad_norm": 1.611212134361267, "learning_rate": 0.0002, "epoch": 6.391822827938671, "step": 9380}, {"loss": 0.9137, "grad_norm": 1.5586223602294922, "learning_rate": 0.0002, "epoch": 6.398637137989779, "step": 9390}, {"loss": 0.9254, "grad_norm": 1.394761562347412, "learning_rate": 0.0002, "epoch": 6.4054514480408855, "step": 9400}, {"loss": 0.8935, "grad_norm": 1.559618592262268, "learning_rate": 0.0002, "epoch": 6.412265758091993, "step": 9410}, {"loss": 0.9585, "grad_norm": 1.462173581123352, "learning_rate": 0.0002, "epoch": 6.4190800681431, "step": 9420}, {"loss": 0.9492, "grad_norm": 1.5655437707901, "learning_rate": 0.0002, "epoch": 6.425894378194208, "step": 9430}, {"loss": 0.9371, "grad_norm": 1.4344340562820435, "learning_rate": 0.0002, "epoch": 6.432708688245315, "step": 9440}, {"loss": 0.9396, "grad_norm": 1.5132373571395874, "learning_rate": 0.0002, "epoch": 6.439522998296423, "step": 9450}, {"loss": 0.9229, "grad_norm": 1.68776535987854, "learning_rate": 0.0002, "epoch": 6.4463373083475295, "step": 9460}, {"loss": 0.9524, "grad_norm": 1.556823968887329, "learning_rate": 0.0002, "epoch": 6.453151618398637, "step": 9470}, {"loss": 0.94, "grad_norm": 1.4254260063171387, "learning_rate": 0.0002, "epoch": 6.459965928449744, "step": 9480}, {"loss": 0.9689, "grad_norm": 1.7901203632354736, "learning_rate": 0.0002, "epoch": 6.466780238500852, "step": 9490}, {"loss": 0.9267, "grad_norm": 1.5098410844802856, "learning_rate": 0.0002, "epoch": 6.473594548551959, "step": 9500}, {"loss": 0.9159, "grad_norm": 1.6036792993545532, "learning_rate": 0.0002, "epoch": 6.480408858603067, "step": 9510}, {"loss": 0.9253, "grad_norm": 1.5011411905288696, "learning_rate": 0.0002, "epoch": 6.4872231686541735, "step": 9520}, {"loss": 0.9527, "grad_norm": 1.410780906677246, "learning_rate": 0.0002, "epoch": 6.494037478705281, "step": 9530}, {"loss": 0.8927, "grad_norm": 1.7451791763305664, "learning_rate": 0.0002, "epoch": 6.500851788756388, "step": 9540}, {"loss": 0.9566, "grad_norm": 1.5888725519180298, "learning_rate": 0.0002, "epoch": 6.507666098807496, "step": 9550}, {"loss": 0.9324, "grad_norm": 1.3016585111618042, "learning_rate": 0.0002, "epoch": 6.514480408858603, "step": 9560}, {"loss": 0.9576, "grad_norm": 1.629522442817688, "learning_rate": 0.0002, "epoch": 6.521294718909711, "step": 9570}, {"loss": 0.92, "grad_norm": 1.494436264038086, "learning_rate": 0.0002, "epoch": 6.5281090289608175, "step": 9580}, {"loss": 0.9154, "grad_norm": 1.323195219039917, "learning_rate": 0.0002, "epoch": 6.534923339011925, "step": 9590}, {"loss": 0.9891, "grad_norm": 1.4904460906982422, "learning_rate": 0.0002, "epoch": 6.541737649063032, "step": 9600}, {"loss": 0.9316, "grad_norm": 1.6079169511795044, "learning_rate": 0.0002, "epoch": 6.54855195911414, "step": 9610}, {"loss": 1.0105, "grad_norm": 1.5113396644592285, "learning_rate": 0.0002, "epoch": 6.555366269165247, "step": 9620}, {"loss": 0.9618, "grad_norm": 1.7113087177276611, "learning_rate": 0.0002, "epoch": 6.562180579216355, "step": 9630}, {"loss": 0.9699, "grad_norm": 1.359394907951355, "learning_rate": 0.0002, "epoch": 6.5689948892674614, "step": 9640}, {"loss": 1.0267, "grad_norm": 1.7701337337493896, "learning_rate": 0.0002, "epoch": 6.575809199318569, "step": 9650}, {"loss": 0.9639, "grad_norm": 1.6381222009658813, "learning_rate": 0.0002, "epoch": 6.582623509369676, "step": 9660}, {"loss": 0.9292, "grad_norm": 1.781891942024231, "learning_rate": 0.0002, "epoch": 6.589437819420784, "step": 9670}, {"loss": 1.0078, "grad_norm": 1.47724449634552, "learning_rate": 0.0002, "epoch": 6.596252129471891, "step": 9680}, {"loss": 1.0268, "grad_norm": 1.5498195886611938, "learning_rate": 0.0002, "epoch": 6.6030664395229985, "step": 9690}, {"loss": 0.9794, "grad_norm": 1.5682368278503418, "learning_rate": 0.0002, "epoch": 6.609880749574105, "step": 9700}, {"loss": 0.9298, "grad_norm": 1.6106981039047241, "learning_rate": 0.0002, "epoch": 6.616695059625213, "step": 9710}, {"loss": 0.9644, "grad_norm": 1.5388364791870117, "learning_rate": 0.0002, "epoch": 6.62350936967632, "step": 9720}, {"loss": 0.9385, "grad_norm": 1.5432790517807007, "learning_rate": 0.0002, "epoch": 6.630323679727428, "step": 9730}, {"loss": 0.9995, "grad_norm": 1.4929786920547485, "learning_rate": 0.0002, "epoch": 6.637137989778535, "step": 9740}, {"loss": 0.932, "grad_norm": 1.6959431171417236, "learning_rate": 0.0002, "epoch": 6.6439522998296425, "step": 9750}, {"loss": 0.9397, "grad_norm": 1.4990962743759155, "learning_rate": 0.0002, "epoch": 6.650766609880749, "step": 9760}, {"loss": 0.9808, "grad_norm": 1.5235223770141602, "learning_rate": 0.0002, "epoch": 6.657580919931857, "step": 9770}, {"loss": 0.9522, "grad_norm": 1.8264366388320923, "learning_rate": 0.0002, "epoch": 6.664395229982964, "step": 9780}, {"loss": 0.9751, "grad_norm": 1.4298417568206787, "learning_rate": 0.0002, "epoch": 6.671209540034072, "step": 9790}, {"loss": 0.9607, "grad_norm": 1.5926862955093384, "learning_rate": 0.0002, "epoch": 6.678023850085179, "step": 9800}, {"loss": 0.9681, "grad_norm": 1.4592483043670654, "learning_rate": 0.0002, "epoch": 6.6848381601362865, "step": 9810}, {"loss": 0.9385, "grad_norm": 1.375799536705017, "learning_rate": 0.0002, "epoch": 6.691652470187393, "step": 9820}, {"loss": 0.9684, "grad_norm": 1.5767531394958496, "learning_rate": 0.0002, "epoch": 6.698466780238501, "step": 9830}, {"loss": 0.9313, "grad_norm": 1.6452189683914185, "learning_rate": 0.0002, "epoch": 6.705281090289608, "step": 9840}, {"loss": 0.9781, "grad_norm": 1.3874469995498657, "learning_rate": 0.0002, "epoch": 6.712095400340716, "step": 9850}, {"loss": 0.9803, "grad_norm": 1.5470930337905884, "learning_rate": 0.0002, "epoch": 6.718909710391823, "step": 9860}, {"loss": 0.9335, "grad_norm": 1.499840259552002, "learning_rate": 0.0002, "epoch": 6.7257240204429305, "step": 9870}, {"loss": 0.9209, "grad_norm": 1.4733195304870605, "learning_rate": 0.0002, "epoch": 6.732538330494037, "step": 9880}, {"loss": 0.9124, "grad_norm": 1.921722173690796, "learning_rate": 0.0002, "epoch": 6.739352640545145, "step": 9890}, {"loss": 0.9311, "grad_norm": 1.848003625869751, "learning_rate": 0.0002, "epoch": 6.746166950596252, "step": 9900}, {"loss": 0.9601, "grad_norm": 1.6050934791564941, "learning_rate": 0.0002, "epoch": 6.75298126064736, "step": 9910}, {"loss": 0.941, "grad_norm": 1.716424822807312, "learning_rate": 0.0002, "epoch": 6.759795570698467, "step": 9920}, {"loss": 0.9592, "grad_norm": 1.5647642612457275, "learning_rate": 0.0002, "epoch": 6.7666098807495745, "step": 9930}, {"loss": 0.927, "grad_norm": 1.5500049591064453, "learning_rate": 0.0002, "epoch": 6.773424190800681, "step": 9940}, {"loss": 0.9921, "grad_norm": 1.5384467840194702, "learning_rate": 0.0002, "epoch": 6.780238500851789, "step": 9950}, {"loss": 0.9673, "grad_norm": 1.8312339782714844, "learning_rate": 0.0002, "epoch": 6.787052810902896, "step": 9960}, {"loss": 0.9647, "grad_norm": 1.3505569696426392, "learning_rate": 0.0002, "epoch": 6.793867120954003, "step": 9970}, {"loss": 0.9553, "grad_norm": 1.6717044115066528, "learning_rate": 0.0002, "epoch": 6.800681431005111, "step": 9980}, {"loss": 0.9688, "grad_norm": 1.7072664499282837, "learning_rate": 0.0002, "epoch": 6.8074957410562185, "step": 9990}, {"loss": 0.951, "grad_norm": 1.3609364032745361, "learning_rate": 0.0002, "epoch": 6.814310051107325, "step": 10000}, {"loss": 0.9638, "grad_norm": 1.4862881898880005, "learning_rate": 0.0002, "epoch": 6.821124361158432, "step": 10010}, {"loss": 1.016, "grad_norm": 1.4808303117752075, "learning_rate": 0.0002, "epoch": 6.82793867120954, "step": 10020}, {"loss": 0.9233, "grad_norm": 1.6531925201416016, "learning_rate": 0.0002, "epoch": 6.834752981260648, "step": 10030}, {"loss": 0.9435, "grad_norm": 1.5090917348861694, "learning_rate": 0.0002, "epoch": 6.841567291311755, "step": 10040}, {"loss": 0.9395, "grad_norm": 1.5361953973770142, "learning_rate": 0.0002, "epoch": 6.848381601362862, "step": 10050}, {"loss": 1.0095, "grad_norm": 1.7302757501602173, "learning_rate": 0.0002, "epoch": 6.855195911413969, "step": 10060}, {"loss": 0.9796, "grad_norm": 1.5626600980758667, "learning_rate": 0.0002, "epoch": 6.862010221465077, "step": 10070}, {"loss": 1.0244, "grad_norm": 1.4168927669525146, "learning_rate": 0.0002, "epoch": 6.868824531516184, "step": 10080}, {"loss": 0.9253, "grad_norm": 1.3921427726745605, "learning_rate": 0.0002, "epoch": 6.875638841567291, "step": 10090}, {"loss": 1.0037, "grad_norm": 1.6304726600646973, "learning_rate": 0.0002, "epoch": 6.882453151618399, "step": 10100}, {"loss": 1.0088, "grad_norm": 1.5463745594024658, "learning_rate": 0.0002, "epoch": 6.889267461669506, "step": 10110}, {"loss": 1.0276, "grad_norm": 1.4989547729492188, "learning_rate": 0.0002, "epoch": 6.896081771720613, "step": 10120}, {"loss": 1.0352, "grad_norm": 1.7281252145767212, "learning_rate": 0.0002, "epoch": 6.90289608177172, "step": 10130}, {"loss": 1.031, "grad_norm": 1.469348669052124, "learning_rate": 0.0002, "epoch": 6.909710391822828, "step": 10140}, {"loss": 1.0301, "grad_norm": 1.3762892484664917, "learning_rate": 0.0002, "epoch": 6.916524701873936, "step": 10150}, {"loss": 1.0032, "grad_norm": 1.489425539970398, "learning_rate": 0.0002, "epoch": 6.923339011925043, "step": 10160}, {"loss": 0.9487, "grad_norm": 1.4514580965042114, "learning_rate": 0.0002, "epoch": 6.9301533219761495, "step": 10170}, {"loss": 0.9898, "grad_norm": 1.6008871793746948, "learning_rate": 0.0002, "epoch": 6.936967632027257, "step": 10180}, {"loss": 1.0577, "grad_norm": 1.6893450021743774, "learning_rate": 0.0002, "epoch": 6.943781942078364, "step": 10190}, {"loss": 0.9699, "grad_norm": 1.66379976272583, "learning_rate": 0.0002, "epoch": 6.950596252129472, "step": 10200}, {"loss": 1.0159, "grad_norm": 1.501943588256836, "learning_rate": 0.0002, "epoch": 6.957410562180579, "step": 10210}, {"loss": 1.0414, "grad_norm": 1.6803759336471558, "learning_rate": 0.0002, "epoch": 6.964224872231687, "step": 10220}, {"loss": 1.0413, "grad_norm": 1.4512689113616943, "learning_rate": 0.0002, "epoch": 6.9710391822827935, "step": 10230}, {"loss": 0.9791, "grad_norm": 1.6071290969848633, "learning_rate": 0.0002, "epoch": 6.977853492333901, "step": 10240}, {"loss": 1.0574, "grad_norm": 1.598915696144104, "learning_rate": 0.0002, "epoch": 6.984667802385008, "step": 10250}, {"loss": 1.0379, "grad_norm": 1.7178512811660767, "learning_rate": 0.0002, "epoch": 6.991482112436116, "step": 10260}, {"loss": 1.0082, "grad_norm": 1.4407050609588623, "learning_rate": 0.0002, "epoch": 6.998296422487223, "step": 10270}]} +{"epoch": 7.997274275979557, "step": 11736, "epoch_duration": 2921.997413635254, "total_accumulated_duration": 20960.551059007645, "gpu_info": {"GPU_0": "NVIDIA A100-PCIE-40GB"}, "memory_usage": {"avg_memory_usage": {"GPU_0": 7751.47119140625}, "peak_memory_usage": {"GPU_0": 19996.724609375}, "avg_memory_reserved": {"GPU_0": 28746.0}, "peak_memory_reserved": {"GPU_0": 28746.0}, "total_memory": {"GPU_0": 40444.375}}, "best_checkpoint_path": "outputs-001/gemma-2-9b-it_int4_hellaswag-routerbench-0shot_lr-0.0002_e-8_seq-512_lora-a-32-d-0.05-r-64_bs-1_gas-2_tf32-True_tunedata-portion-p-1.0-num-7029-sd-1/checkpoint-1467", "params": {"epochs": 8, "batch_size": 1, "learning_rate": 0.0002, "gradient_accumulation_steps": 2, "warmup_ratio": 0.03, "max_grad_norm": 0.3, "lora_alpha": 32, "lora_dropout": 0.05, "lora_r": 64, "tf32": true, "seed": 42}, "log_history": [{"loss": 3.0988, "grad_norm": 0.635574460029602, "learning_rate": 0.0002, "epoch": 0.0068143100511073255, "step": 10}, {"loss": 2.5594, "grad_norm": 1.0401769876480103, "learning_rate": 0.0002, "epoch": 0.013628620102214651, "step": 20}, {"loss": 2.3587, "grad_norm": 0.4514131247997284, "learning_rate": 0.0002, "epoch": 0.020442930153321975, "step": 30}, {"loss": 2.121, "grad_norm": 0.6303355693817139, "learning_rate": 0.0002, "epoch": 0.027257240204429302, "step": 40}, {"loss": 1.9833, "grad_norm": 0.4648270606994629, "learning_rate": 0.0002, "epoch": 0.034071550255536626, "step": 50}, {"loss": 1.9384, "grad_norm": 0.42953479290008545, "learning_rate": 0.0002, "epoch": 0.04088586030664395, "step": 60}, {"loss": 1.9202, "grad_norm": 0.433614581823349, "learning_rate": 0.0002, "epoch": 0.04770017035775128, "step": 70}, {"loss": 1.911, "grad_norm": 0.45318254828453064, "learning_rate": 0.0002, "epoch": 0.054514480408858604, "step": 80}, {"loss": 1.8895, "grad_norm": 0.4023568630218506, "learning_rate": 0.0002, "epoch": 0.06132879045996593, "step": 90}, {"loss": 1.9257, "grad_norm": 0.43260207772254944, "learning_rate": 0.0002, "epoch": 0.06814310051107325, "step": 100}, {"loss": 1.9178, "grad_norm": 0.43389809131622314, "learning_rate": 0.0002, "epoch": 0.07495741056218058, "step": 110}, {"loss": 1.9071, "grad_norm": 0.39307987689971924, "learning_rate": 0.0002, "epoch": 0.0817717206132879, "step": 120}, {"loss": 1.9482, "grad_norm": 0.7703037261962891, "learning_rate": 0.0002, "epoch": 0.08858603066439523, "step": 130}, {"loss": 1.8394, "grad_norm": 0.38547563552856445, "learning_rate": 0.0002, "epoch": 0.09540034071550256, "step": 140}, {"loss": 1.9037, "grad_norm": 0.37948688864707947, "learning_rate": 0.0002, "epoch": 0.10221465076660988, "step": 150}, {"loss": 1.8664, "grad_norm": 0.33022379875183105, "learning_rate": 0.0002, "epoch": 0.10902896081771721, "step": 160}, {"loss": 1.8659, "grad_norm": 0.33703792095184326, "learning_rate": 0.0002, "epoch": 0.11584327086882454, "step": 170}, {"loss": 1.8389, "grad_norm": 0.3259912431240082, "learning_rate": 0.0002, "epoch": 0.12265758091993186, "step": 180}, {"loss": 1.8424, "grad_norm": 0.3593858778476715, "learning_rate": 0.0002, "epoch": 0.12947189097103917, "step": 190}, {"loss": 1.8457, "grad_norm": 0.3401614725589752, "learning_rate": 0.0002, "epoch": 0.1362862010221465, "step": 200}, {"loss": 1.7721, "grad_norm": 0.3892078697681427, "learning_rate": 0.0002, "epoch": 0.14310051107325383, "step": 210}, {"loss": 1.8351, "grad_norm": 0.315374493598938, "learning_rate": 0.0002, "epoch": 0.14991482112436116, "step": 220}, {"loss": 1.8307, "grad_norm": 0.3679497539997101, "learning_rate": 0.0002, "epoch": 0.1567291311754685, "step": 230}, {"loss": 1.816, "grad_norm": 0.336730033159256, "learning_rate": 0.0002, "epoch": 0.1635434412265758, "step": 240}, {"loss": 1.7849, "grad_norm": 0.36472755670547485, "learning_rate": 0.0002, "epoch": 0.17035775127768313, "step": 250}, {"loss": 1.7766, "grad_norm": 0.34864023327827454, "learning_rate": 0.0002, "epoch": 0.17717206132879046, "step": 260}, {"loss": 1.896, "grad_norm": 0.3350819945335388, "learning_rate": 0.0002, "epoch": 0.1839863713798978, "step": 270}, {"loss": 1.8742, "grad_norm": 0.3844246566295624, "learning_rate": 0.0002, "epoch": 0.19080068143100512, "step": 280}, {"loss": 1.8531, "grad_norm": 0.38413065671920776, "learning_rate": 0.0002, "epoch": 0.19761499148211242, "step": 290}, {"loss": 1.8415, "grad_norm": 0.4080047607421875, "learning_rate": 0.0002, "epoch": 0.20442930153321975, "step": 300}, {"loss": 1.9009, "grad_norm": 0.3546800911426544, "learning_rate": 0.0002, "epoch": 0.21124361158432708, "step": 310}, {"loss": 1.8092, "grad_norm": 0.38577890396118164, "learning_rate": 0.0002, "epoch": 0.21805792163543442, "step": 320}, {"loss": 1.7113, "grad_norm": 0.38979098200798035, "learning_rate": 0.0002, "epoch": 0.22487223168654175, "step": 330}, {"loss": 1.8162, "grad_norm": 0.35105520486831665, "learning_rate": 0.0002, "epoch": 0.23168654173764908, "step": 340}, {"loss": 1.8408, "grad_norm": 0.31671223044395447, "learning_rate": 0.0002, "epoch": 0.23850085178875638, "step": 350}, {"loss": 1.8014, "grad_norm": 0.33034196496009827, "learning_rate": 0.0002, "epoch": 0.2453151618398637, "step": 360}, {"loss": 1.7132, "grad_norm": 0.2990533709526062, "learning_rate": 0.0002, "epoch": 0.252129471890971, "step": 370}, {"loss": 1.8612, "grad_norm": 0.362208753824234, "learning_rate": 0.0002, "epoch": 0.25894378194207834, "step": 380}, {"loss": 1.8001, "grad_norm": 0.3269096612930298, "learning_rate": 0.0002, "epoch": 0.2657580919931857, "step": 390}, {"loss": 1.786, "grad_norm": 0.30555954575538635, "learning_rate": 0.0002, "epoch": 0.272572402044293, "step": 400}, {"loss": 1.8018, "grad_norm": 0.332933247089386, "learning_rate": 0.0002, "epoch": 0.27938671209540034, "step": 410}, {"loss": 1.8157, "grad_norm": 0.39454060792922974, "learning_rate": 0.0002, "epoch": 0.28620102214650767, "step": 420}, {"loss": 1.7862, "grad_norm": 0.34589633345603943, "learning_rate": 0.0002, "epoch": 0.293015332197615, "step": 430}, {"loss": 1.7612, "grad_norm": 0.3747332990169525, "learning_rate": 0.0002, "epoch": 0.29982964224872233, "step": 440}, {"loss": 1.8476, "grad_norm": 0.34825369715690613, "learning_rate": 0.0002, "epoch": 0.30664395229982966, "step": 450}, {"loss": 1.775, "grad_norm": 0.32906976342201233, "learning_rate": 0.0002, "epoch": 0.313458262350937, "step": 460}, {"loss": 1.7764, "grad_norm": 0.33108609914779663, "learning_rate": 0.0002, "epoch": 0.3202725724020443, "step": 470}, {"loss": 1.8012, "grad_norm": 0.3170463442802429, "learning_rate": 0.0002, "epoch": 0.3270868824531516, "step": 480}, {"loss": 1.8264, "grad_norm": 0.30792081356048584, "learning_rate": 0.0002, "epoch": 0.3339011925042589, "step": 490}, {"loss": 1.7361, "grad_norm": 0.31772997975349426, "learning_rate": 0.0002, "epoch": 0.34071550255536626, "step": 500}, {"loss": 1.8147, "grad_norm": 0.32714012265205383, "learning_rate": 0.0002, "epoch": 0.3475298126064736, "step": 510}, {"loss": 1.8332, "grad_norm": 0.3021100163459778, "learning_rate": 0.0002, "epoch": 0.3543441226575809, "step": 520}, {"loss": 1.7337, "grad_norm": 0.6045835018157959, "learning_rate": 0.0002, "epoch": 0.36115843270868825, "step": 530}, {"loss": 1.8661, "grad_norm": 0.3003896474838257, "learning_rate": 0.0002, "epoch": 0.3679727427597956, "step": 540}, {"loss": 1.8359, "grad_norm": 0.3678470551967621, "learning_rate": 0.0002, "epoch": 0.3747870528109029, "step": 550}, {"loss": 1.7512, "grad_norm": 0.35787615180015564, "learning_rate": 0.0002, "epoch": 0.38160136286201024, "step": 560}, {"loss": 1.8048, "grad_norm": 0.31882143020629883, "learning_rate": 0.0002, "epoch": 0.38841567291311757, "step": 570}, {"loss": 1.8108, "grad_norm": 0.3186313509941101, "learning_rate": 0.0002, "epoch": 0.39522998296422485, "step": 580}, {"loss": 1.8012, "grad_norm": 0.41443702578544617, "learning_rate": 0.0002, "epoch": 0.4020442930153322, "step": 590}, {"loss": 1.7794, "grad_norm": 0.28773069381713867, "learning_rate": 0.0002, "epoch": 0.4088586030664395, "step": 600}, {"loss": 1.8231, "grad_norm": 0.35743263363838196, "learning_rate": 0.0002, "epoch": 0.41567291311754684, "step": 610}, {"loss": 1.7531, "grad_norm": 0.3360286355018616, "learning_rate": 0.0002, "epoch": 0.42248722316865417, "step": 620}, {"loss": 1.7654, "grad_norm": 0.32838866114616394, "learning_rate": 0.0002, "epoch": 0.4293015332197615, "step": 630}, {"loss": 1.8176, "grad_norm": 0.2994388937950134, "learning_rate": 0.0002, "epoch": 0.43611584327086883, "step": 640}, {"loss": 1.7241, "grad_norm": 0.3306307792663574, "learning_rate": 0.0002, "epoch": 0.44293015332197616, "step": 650}, {"loss": 1.8201, "grad_norm": 0.3129560351371765, "learning_rate": 0.0002, "epoch": 0.4497444633730835, "step": 660}, {"loss": 1.803, "grad_norm": 0.3244289457798004, "learning_rate": 0.0002, "epoch": 0.4565587734241908, "step": 670}, {"loss": 1.8399, "grad_norm": 0.3196892738342285, "learning_rate": 0.0002, "epoch": 0.46337308347529815, "step": 680}, {"loss": 1.8291, "grad_norm": 0.3135230243206024, "learning_rate": 0.0002, "epoch": 0.47018739352640543, "step": 690}, {"loss": 1.7423, "grad_norm": 0.28677991032600403, "learning_rate": 0.0002, "epoch": 0.47700170357751276, "step": 700}, {"loss": 1.6982, "grad_norm": 0.3074065148830414, "learning_rate": 0.0002, "epoch": 0.4838160136286201, "step": 710}, {"loss": 1.8078, "grad_norm": 0.3354290723800659, "learning_rate": 0.0002, "epoch": 0.4906303236797274, "step": 720}, {"loss": 1.7333, "grad_norm": 0.324370801448822, "learning_rate": 0.0002, "epoch": 0.49744463373083475, "step": 730}, {"loss": 1.8506, "grad_norm": 0.29496142268180847, "learning_rate": 0.0002, "epoch": 0.504258943781942, "step": 740}, {"loss": 1.7761, "grad_norm": 0.30694130063056946, "learning_rate": 0.0002, "epoch": 0.5110732538330494, "step": 750}, {"loss": 1.7854, "grad_norm": 0.36168408393859863, "learning_rate": 0.0002, "epoch": 0.5178875638841567, "step": 760}, {"loss": 1.7149, "grad_norm": 0.2930343449115753, "learning_rate": 0.0002, "epoch": 0.524701873935264, "step": 770}, {"loss": 1.7924, "grad_norm": 0.3023432493209839, "learning_rate": 0.0002, "epoch": 0.5315161839863713, "step": 780}, {"loss": 1.8467, "grad_norm": 0.3272720277309418, "learning_rate": 0.0002, "epoch": 0.5383304940374787, "step": 790}, {"loss": 1.8639, "grad_norm": 0.2907974421977997, "learning_rate": 0.0002, "epoch": 0.545144804088586, "step": 800}, {"loss": 1.9018, "grad_norm": 0.32267168164253235, "learning_rate": 0.0002, "epoch": 0.5519591141396933, "step": 810}, {"loss": 1.8311, "grad_norm": 0.32059940695762634, "learning_rate": 0.0002, "epoch": 0.5587734241908007, "step": 820}, {"loss": 1.7234, "grad_norm": 0.30951258540153503, "learning_rate": 0.0002, "epoch": 0.565587734241908, "step": 830}, {"loss": 1.8063, "grad_norm": 0.33976122736930847, "learning_rate": 0.0002, "epoch": 0.5724020442930153, "step": 840}, {"loss": 1.7021, "grad_norm": 0.3195820450782776, "learning_rate": 0.0002, "epoch": 0.5792163543441227, "step": 850}, {"loss": 1.8073, "grad_norm": 0.2828562557697296, "learning_rate": 0.0002, "epoch": 0.58603066439523, "step": 860}, {"loss": 1.695, "grad_norm": 0.29591670632362366, "learning_rate": 0.0002, "epoch": 0.5928449744463373, "step": 870}, {"loss": 1.8109, "grad_norm": 0.3086104393005371, "learning_rate": 0.0002, "epoch": 0.5996592844974447, "step": 880}, {"loss": 1.8207, "grad_norm": 0.3592929542064667, "learning_rate": 0.0002, "epoch": 0.606473594548552, "step": 890}, {"loss": 1.7662, "grad_norm": 0.2830186188220978, "learning_rate": 0.0002, "epoch": 0.6132879045996593, "step": 900}, {"loss": 1.8344, "grad_norm": 0.3128598630428314, "learning_rate": 0.0002, "epoch": 0.6201022146507666, "step": 910}, {"loss": 1.7953, "grad_norm": 0.31957563757896423, "learning_rate": 0.0002, "epoch": 0.626916524701874, "step": 920}, {"loss": 1.7578, "grad_norm": 0.30994319915771484, "learning_rate": 0.0002, "epoch": 0.6337308347529813, "step": 930}, {"loss": 1.8494, "grad_norm": 0.3352845013141632, "learning_rate": 0.0002, "epoch": 0.6405451448040886, "step": 940}, {"loss": 1.7054, "grad_norm": 0.2960077226161957, "learning_rate": 0.0002, "epoch": 0.6473594548551959, "step": 950}, {"loss": 1.7209, "grad_norm": 0.32675081491470337, "learning_rate": 0.0002, "epoch": 0.6541737649063032, "step": 960}, {"loss": 1.7776, "grad_norm": 0.30042028427124023, "learning_rate": 0.0002, "epoch": 0.6609880749574105, "step": 970}, {"loss": 1.7597, "grad_norm": 0.3288673758506775, "learning_rate": 0.0002, "epoch": 0.6678023850085179, "step": 980}, {"loss": 1.7962, "grad_norm": 0.323215126991272, "learning_rate": 0.0002, "epoch": 0.6746166950596252, "step": 990}, {"loss": 1.821, "grad_norm": 0.30041399598121643, "learning_rate": 0.0002, "epoch": 0.6814310051107325, "step": 1000}, {"loss": 1.7716, "grad_norm": 0.3076179623603821, "learning_rate": 0.0002, "epoch": 0.6882453151618398, "step": 1010}, {"loss": 1.7827, "grad_norm": 0.2941909730434418, "learning_rate": 0.0002, "epoch": 0.6950596252129472, "step": 1020}, {"loss": 1.7964, "grad_norm": 0.32220420241355896, "learning_rate": 0.0002, "epoch": 0.7018739352640545, "step": 1030}, {"loss": 1.7101, "grad_norm": 0.2989702820777893, "learning_rate": 0.0002, "epoch": 0.7086882453151618, "step": 1040}, {"loss": 1.8749, "grad_norm": 0.3049640357494354, "learning_rate": 0.0002, "epoch": 0.7155025553662692, "step": 1050}, {"loss": 1.782, "grad_norm": 0.3183284103870392, "learning_rate": 0.0002, "epoch": 0.7223168654173765, "step": 1060}, {"loss": 1.785, "grad_norm": 0.3070095181465149, "learning_rate": 0.0002, "epoch": 0.7291311754684838, "step": 1070}, {"loss": 1.7832, "grad_norm": 0.33263063430786133, "learning_rate": 0.0002, "epoch": 0.7359454855195912, "step": 1080}, {"loss": 1.7627, "grad_norm": 0.28774312138557434, "learning_rate": 0.0002, "epoch": 0.7427597955706985, "step": 1090}, {"loss": 1.7343, "grad_norm": 0.29304224252700806, "learning_rate": 0.0002, "epoch": 0.7495741056218058, "step": 1100}, {"loss": 1.8082, "grad_norm": 0.27529507875442505, "learning_rate": 0.0002, "epoch": 0.7563884156729132, "step": 1110}, {"loss": 1.8071, "grad_norm": 0.32319945096969604, "learning_rate": 0.0002, "epoch": 0.7632027257240205, "step": 1120}, {"loss": 1.6998, "grad_norm": 0.33777597546577454, "learning_rate": 0.0002, "epoch": 0.7700170357751278, "step": 1130}, {"loss": 1.8488, "grad_norm": 0.29847201704978943, "learning_rate": 0.0002, "epoch": 0.7768313458262351, "step": 1140}, {"loss": 1.7014, "grad_norm": 0.31370633840560913, "learning_rate": 0.0002, "epoch": 0.7836456558773425, "step": 1150}, {"loss": 1.8175, "grad_norm": 0.31633856892585754, "learning_rate": 0.0002, "epoch": 0.7904599659284497, "step": 1160}, {"loss": 1.7637, "grad_norm": 0.33849263191223145, "learning_rate": 0.0002, "epoch": 0.797274275979557, "step": 1170}, {"loss": 1.8264, "grad_norm": 0.3306333124637604, "learning_rate": 0.0002, "epoch": 0.8040885860306644, "step": 1180}, {"loss": 1.777, "grad_norm": 0.3249678313732147, "learning_rate": 0.0002, "epoch": 0.8109028960817717, "step": 1190}, {"loss": 1.7631, "grad_norm": 0.3252817690372467, "learning_rate": 0.0002, "epoch": 0.817717206132879, "step": 1200}, {"loss": 1.7877, "grad_norm": 0.31772559881210327, "learning_rate": 0.0002, "epoch": 0.8245315161839863, "step": 1210}, {"loss": 1.8068, "grad_norm": 0.2803371846675873, "learning_rate": 0.0002, "epoch": 0.8313458262350937, "step": 1220}, {"loss": 1.738, "grad_norm": 0.26700571179389954, "learning_rate": 0.0002, "epoch": 0.838160136286201, "step": 1230}, {"loss": 1.8301, "grad_norm": 0.3060092031955719, "learning_rate": 0.0002, "epoch": 0.8449744463373083, "step": 1240}, {"loss": 1.7993, "grad_norm": 0.28831684589385986, "learning_rate": 0.0002, "epoch": 0.8517887563884157, "step": 1250}, {"loss": 1.6909, "grad_norm": 0.30708742141723633, "learning_rate": 0.0002, "epoch": 0.858603066439523, "step": 1260}, {"loss": 1.8506, "grad_norm": 0.2915987968444824, "learning_rate": 0.0002, "epoch": 0.8654173764906303, "step": 1270}, {"loss": 1.7536, "grad_norm": 0.2893589735031128, "learning_rate": 0.0002, "epoch": 0.8722316865417377, "step": 1280}, {"loss": 1.7437, "grad_norm": 0.29545632004737854, "learning_rate": 0.0002, "epoch": 0.879045996592845, "step": 1290}, {"loss": 1.859, "grad_norm": 0.3354771137237549, "learning_rate": 0.0002, "epoch": 0.8858603066439523, "step": 1300}, {"loss": 1.7644, "grad_norm": 0.37715399265289307, "learning_rate": 0.0002, "epoch": 0.8926746166950597, "step": 1310}, {"loss": 1.7731, "grad_norm": 0.28847193717956543, "learning_rate": 0.0002, "epoch": 0.899488926746167, "step": 1320}, {"loss": 1.9077, "grad_norm": 0.2780889868736267, "learning_rate": 0.0002, "epoch": 0.9063032367972743, "step": 1330}, {"loss": 1.7861, "grad_norm": 0.2714342176914215, "learning_rate": 0.0002, "epoch": 0.9131175468483816, "step": 1340}, {"loss": 1.8, "grad_norm": 0.2950133979320526, "learning_rate": 0.0002, "epoch": 0.919931856899489, "step": 1350}, {"loss": 1.7368, "grad_norm": 0.29097145795822144, "learning_rate": 0.0002, "epoch": 0.9267461669505963, "step": 1360}, {"loss": 1.7864, "grad_norm": 0.32540133595466614, "learning_rate": 0.0002, "epoch": 0.9335604770017035, "step": 1370}, {"loss": 1.7571, "grad_norm": 0.3076636493206024, "learning_rate": 0.0002, "epoch": 0.9403747870528109, "step": 1380}, {"loss": 1.8312, "grad_norm": 0.2962130308151245, "learning_rate": 0.0002, "epoch": 0.9471890971039182, "step": 1390}, {"loss": 1.7858, "grad_norm": 0.30086860060691833, "learning_rate": 0.0002, "epoch": 0.9540034071550255, "step": 1400}, {"loss": 1.7991, "grad_norm": 0.28634947538375854, "learning_rate": 0.0002, "epoch": 0.9608177172061328, "step": 1410}, {"loss": 1.8385, "grad_norm": 0.35314416885375977, "learning_rate": 0.0002, "epoch": 0.9676320272572402, "step": 1420}, {"loss": 1.8054, "grad_norm": 0.2939317524433136, "learning_rate": 0.0002, "epoch": 0.9744463373083475, "step": 1430}, {"loss": 1.7582, "grad_norm": 0.3010196089744568, "learning_rate": 0.0002, "epoch": 0.9812606473594548, "step": 1440}, {"loss": 1.758, "grad_norm": 0.30816152691841125, "learning_rate": 0.0002, "epoch": 0.9880749574105622, "step": 1450}, {"loss": 1.8159, "grad_norm": 0.28152793645858765, "learning_rate": 0.0002, "epoch": 0.9948892674616695, "step": 1460}, {"eval_loss": 1.8034634590148926, "eval_runtime": 53.6017, "eval_samples_per_second": 9.459, "eval_steps_per_second": 1.194, "epoch": 0.9996592844974447, "step": 1467}, {"loss": 1.7072, "grad_norm": 0.29246416687965393, "learning_rate": 0.0002, "epoch": 1.0017035775127767, "step": 1470}, {"loss": 1.8159, "grad_norm": 0.2668602168560028, "learning_rate": 0.0002, "epoch": 1.008517887563884, "step": 1480}, {"loss": 1.6868, "grad_norm": 0.2998567819595337, "learning_rate": 0.0002, "epoch": 1.0153321976149914, "step": 1490}, {"loss": 1.7331, "grad_norm": 0.3284934461116791, "learning_rate": 0.0002, "epoch": 1.0221465076660987, "step": 1500}, {"loss": 1.689, "grad_norm": 0.3275827169418335, "learning_rate": 0.0002, "epoch": 1.028960817717206, "step": 1510}, {"loss": 1.7092, "grad_norm": 0.3382718563079834, "learning_rate": 0.0002, "epoch": 1.0357751277683134, "step": 1520}, {"loss": 1.7215, "grad_norm": 0.36737215518951416, "learning_rate": 0.0002, "epoch": 1.0425894378194207, "step": 1530}, {"loss": 1.6993, "grad_norm": 0.3442603647708893, "learning_rate": 0.0002, "epoch": 1.049403747870528, "step": 1540}, {"loss": 1.6996, "grad_norm": 0.3323381245136261, "learning_rate": 0.0002, "epoch": 1.0562180579216354, "step": 1550}, {"loss": 1.7444, "grad_norm": 0.341227650642395, "learning_rate": 0.0002, "epoch": 1.0630323679727427, "step": 1560}, {"loss": 1.7419, "grad_norm": 0.3361579477787018, "learning_rate": 0.0002, "epoch": 1.06984667802385, "step": 1570}, {"loss": 1.7337, "grad_norm": 0.3556230962276459, "learning_rate": 0.0002, "epoch": 1.0766609880749574, "step": 1580}, {"loss": 1.6978, "grad_norm": 0.27130424976348877, "learning_rate": 0.0002, "epoch": 1.0834752981260647, "step": 1590}, {"loss": 1.6087, "grad_norm": 0.29366323351860046, "learning_rate": 0.0002, "epoch": 1.090289608177172, "step": 1600}, {"loss": 1.6721, "grad_norm": 0.3581245541572571, "learning_rate": 0.0002, "epoch": 1.0971039182282794, "step": 1610}, {"loss": 1.7639, "grad_norm": 0.3021670877933502, "learning_rate": 0.0002, "epoch": 1.1039182282793867, "step": 1620}, {"loss": 1.7314, "grad_norm": 0.3145572543144226, "learning_rate": 0.0002, "epoch": 1.110732538330494, "step": 1630}, {"loss": 1.7408, "grad_norm": 0.35362154245376587, "learning_rate": 0.0002, "epoch": 1.1175468483816013, "step": 1640}, {"loss": 1.7071, "grad_norm": 0.5413113236427307, "learning_rate": 0.0002, "epoch": 1.1243611584327087, "step": 1650}, {"loss": 1.7615, "grad_norm": 0.6858654022216797, "learning_rate": 0.0002, "epoch": 1.131175468483816, "step": 1660}, {"loss": 1.7029, "grad_norm": 0.3781903386116028, "learning_rate": 0.0002, "epoch": 1.1379897785349233, "step": 1670}, {"loss": 1.659, "grad_norm": 0.404864102602005, "learning_rate": 0.0002, "epoch": 1.1448040885860307, "step": 1680}, {"loss": 1.7212, "grad_norm": 0.3595100939273834, "learning_rate": 0.0002, "epoch": 1.151618398637138, "step": 1690}, {"loss": 1.7023, "grad_norm": 0.33682283759117126, "learning_rate": 0.0002, "epoch": 1.1584327086882453, "step": 1700}, {"loss": 1.7336, "grad_norm": 0.3877373933792114, "learning_rate": 0.0002, "epoch": 1.1652470187393527, "step": 1710}, {"loss": 1.7676, "grad_norm": 0.34606897830963135, "learning_rate": 0.0002, "epoch": 1.17206132879046, "step": 1720}, {"loss": 1.6889, "grad_norm": 0.3122918903827667, "learning_rate": 0.0002, "epoch": 1.1788756388415673, "step": 1730}, {"loss": 1.6585, "grad_norm": 0.34081900119781494, "learning_rate": 0.0002, "epoch": 1.1856899488926746, "step": 1740}, {"loss": 1.7794, "grad_norm": 0.3418807089328766, "learning_rate": 0.0002, "epoch": 1.192504258943782, "step": 1750}, {"loss": 1.7773, "grad_norm": 0.3495500981807709, "learning_rate": 0.0002, "epoch": 1.1993185689948893, "step": 1760}, {"loss": 1.6578, "grad_norm": 0.557288408279419, "learning_rate": 0.0002, "epoch": 1.2061328790459966, "step": 1770}, {"loss": 1.7488, "grad_norm": 0.3193778693675995, "learning_rate": 0.0002, "epoch": 1.212947189097104, "step": 1780}, {"loss": 1.6815, "grad_norm": 0.3306216895580292, "learning_rate": 0.0002, "epoch": 1.2197614991482113, "step": 1790}, {"loss": 1.6772, "grad_norm": 0.37998732924461365, "learning_rate": 0.0002, "epoch": 1.2265758091993186, "step": 1800}, {"loss": 1.7094, "grad_norm": 0.3255669176578522, "learning_rate": 0.0002, "epoch": 1.233390119250426, "step": 1810}, {"loss": 1.7096, "grad_norm": 0.3741287291049957, "learning_rate": 0.0002, "epoch": 1.2402044293015333, "step": 1820}, {"loss": 1.7317, "grad_norm": 0.36727291345596313, "learning_rate": 0.0002, "epoch": 1.2470187393526406, "step": 1830}, {"loss": 1.7418, "grad_norm": 0.3479527235031128, "learning_rate": 0.0002, "epoch": 1.253833049403748, "step": 1840}, {"loss": 1.7062, "grad_norm": 0.3472636640071869, "learning_rate": 0.0002, "epoch": 1.2606473594548553, "step": 1850}, {"loss": 1.6263, "grad_norm": 0.3702869415283203, "learning_rate": 0.0002, "epoch": 1.2674616695059626, "step": 1860}, {"loss": 1.6222, "grad_norm": 0.3934040069580078, "learning_rate": 0.0002, "epoch": 1.27427597955707, "step": 1870}, {"loss": 1.7616, "grad_norm": 0.46887534856796265, "learning_rate": 0.0002, "epoch": 1.2810902896081773, "step": 1880}, {"loss": 1.6071, "grad_norm": 0.3191998600959778, "learning_rate": 0.0002, "epoch": 1.2879045996592846, "step": 1890}, {"loss": 1.7001, "grad_norm": 0.34032225608825684, "learning_rate": 0.0002, "epoch": 1.294718909710392, "step": 1900}, {"loss": 1.8316, "grad_norm": 0.33453696966171265, "learning_rate": 0.0002, "epoch": 1.3015332197614993, "step": 1910}, {"loss": 1.6567, "grad_norm": 0.3451494872570038, "learning_rate": 0.0002, "epoch": 1.3083475298126066, "step": 1920}, {"loss": 1.708, "grad_norm": 0.36203092336654663, "learning_rate": 0.0002, "epoch": 1.315161839863714, "step": 1930}, {"loss": 1.7095, "grad_norm": 0.43794456124305725, "learning_rate": 0.0002, "epoch": 1.321976149914821, "step": 1940}, {"loss": 1.7264, "grad_norm": 0.3630591034889221, "learning_rate": 0.0002, "epoch": 1.3287904599659284, "step": 1950}, {"loss": 1.6529, "grad_norm": 0.36951911449432373, "learning_rate": 0.0002, "epoch": 1.3356047700170357, "step": 1960}, {"loss": 1.651, "grad_norm": 0.4001159965991974, "learning_rate": 0.0002, "epoch": 1.342419080068143, "step": 1970}, {"loss": 1.677, "grad_norm": 0.3820836544036865, "learning_rate": 0.0002, "epoch": 1.3492333901192504, "step": 1980}, {"loss": 1.583, "grad_norm": 0.3705870807170868, "learning_rate": 0.0002, "epoch": 1.3560477001703577, "step": 1990}, {"loss": 1.6207, "grad_norm": 0.3557972013950348, "learning_rate": 0.0002, "epoch": 1.362862010221465, "step": 2000}, {"loss": 1.7656, "grad_norm": 0.38546398282051086, "learning_rate": 0.0002, "epoch": 1.3696763202725724, "step": 2010}, {"loss": 1.6881, "grad_norm": 0.3908020853996277, "learning_rate": 0.0002, "epoch": 1.3764906303236797, "step": 2020}, {"loss": 1.727, "grad_norm": 0.3822040855884552, "learning_rate": 0.0002, "epoch": 1.383304940374787, "step": 2030}, {"loss": 1.7923, "grad_norm": 0.425327867269516, "learning_rate": 0.0002, "epoch": 1.3901192504258943, "step": 2040}, {"loss": 1.7032, "grad_norm": 0.3436259329319, "learning_rate": 0.0002, "epoch": 1.3969335604770017, "step": 2050}, {"loss": 1.6398, "grad_norm": 0.33124062418937683, "learning_rate": 0.0002, "epoch": 1.403747870528109, "step": 2060}, {"loss": 1.6815, "grad_norm": 0.3662424683570862, "learning_rate": 0.0002, "epoch": 1.4105621805792163, "step": 2070}, {"loss": 1.7258, "grad_norm": 0.3720635175704956, "learning_rate": 0.0002, "epoch": 1.4173764906303237, "step": 2080}, {"loss": 1.7186, "grad_norm": 0.3361680805683136, "learning_rate": 0.0002, "epoch": 1.424190800681431, "step": 2090}, {"loss": 1.7606, "grad_norm": 0.32751724123954773, "learning_rate": 0.0002, "epoch": 1.4310051107325383, "step": 2100}, {"loss": 1.7051, "grad_norm": 0.34333378076553345, "learning_rate": 0.0002, "epoch": 1.4378194207836457, "step": 2110}, {"loss": 1.6979, "grad_norm": 0.37777671217918396, "learning_rate": 0.0002, "epoch": 1.444633730834753, "step": 2120}, {"loss": 1.7085, "grad_norm": 0.37126365303993225, "learning_rate": 0.0002, "epoch": 1.4514480408858603, "step": 2130}, {"loss": 1.721, "grad_norm": 0.3602267801761627, "learning_rate": 0.0002, "epoch": 1.4582623509369677, "step": 2140}, {"loss": 1.8148, "grad_norm": 0.3287110924720764, "learning_rate": 0.0002, "epoch": 1.465076660988075, "step": 2150}, {"loss": 1.6966, "grad_norm": 0.3562135100364685, "learning_rate": 0.0002, "epoch": 1.4718909710391823, "step": 2160}, {"loss": 1.713, "grad_norm": 0.38292962312698364, "learning_rate": 0.0002, "epoch": 1.4787052810902896, "step": 2170}, {"loss": 1.7036, "grad_norm": 0.38220319151878357, "learning_rate": 0.0002, "epoch": 1.485519591141397, "step": 2180}, {"loss": 1.7297, "grad_norm": 0.3570062220096588, "learning_rate": 0.0002, "epoch": 1.4923339011925043, "step": 2190}, {"loss": 1.5652, "grad_norm": 0.363146036863327, "learning_rate": 0.0002, "epoch": 1.4991482112436116, "step": 2200}, {"loss": 1.6716, "grad_norm": 0.37393274903297424, "learning_rate": 0.0002, "epoch": 1.5059625212947187, "step": 2210}, {"loss": 1.6839, "grad_norm": 0.3628501892089844, "learning_rate": 0.0002, "epoch": 1.512776831345826, "step": 2220}, {"loss": 1.6627, "grad_norm": 0.33430740237236023, "learning_rate": 0.0002, "epoch": 1.5195911413969334, "step": 2230}, {"loss": 1.7495, "grad_norm": 0.35363978147506714, "learning_rate": 0.0002, "epoch": 1.5264054514480407, "step": 2240}, {"loss": 1.6364, "grad_norm": 0.37220337986946106, "learning_rate": 0.0002, "epoch": 1.533219761499148, "step": 2250}, {"loss": 1.7773, "grad_norm": 0.35020262002944946, "learning_rate": 0.0002, "epoch": 1.5400340715502554, "step": 2260}, {"loss": 1.7453, "grad_norm": 0.35274937748908997, "learning_rate": 0.0002, "epoch": 1.5468483816013627, "step": 2270}, {"loss": 1.7162, "grad_norm": 0.3797738552093506, "learning_rate": 0.0002, "epoch": 1.55366269165247, "step": 2280}, {"loss": 1.6197, "grad_norm": 0.4160412847995758, "learning_rate": 0.0002, "epoch": 1.5604770017035774, "step": 2290}, {"loss": 1.7101, "grad_norm": 0.38779592514038086, "learning_rate": 0.0002, "epoch": 1.5672913117546847, "step": 2300}, {"loss": 1.7254, "grad_norm": 0.39171287417411804, "learning_rate": 0.0002, "epoch": 1.574105621805792, "step": 2310}, {"loss": 1.7087, "grad_norm": 0.3606826663017273, "learning_rate": 0.0002, "epoch": 1.5809199318568994, "step": 2320}, {"loss": 1.7269, "grad_norm": 0.3745017647743225, "learning_rate": 0.0002, "epoch": 1.5877342419080067, "step": 2330}, {"loss": 1.7068, "grad_norm": 0.34933462738990784, "learning_rate": 0.0002, "epoch": 1.594548551959114, "step": 2340}, {"loss": 1.7073, "grad_norm": 0.37268444895744324, "learning_rate": 0.0002, "epoch": 1.6013628620102214, "step": 2350}, {"loss": 1.728, "grad_norm": 0.4603484869003296, "learning_rate": 0.0002, "epoch": 1.6081771720613287, "step": 2360}, {"loss": 1.7621, "grad_norm": 0.35689088702201843, "learning_rate": 0.0002, "epoch": 1.614991482112436, "step": 2370}, {"loss": 1.6989, "grad_norm": 0.3392031490802765, "learning_rate": 0.0002, "epoch": 1.6218057921635434, "step": 2380}, {"loss": 1.7077, "grad_norm": 0.394653856754303, "learning_rate": 0.0002, "epoch": 1.6286201022146507, "step": 2390}, {"loss": 1.7448, "grad_norm": 0.33972012996673584, "learning_rate": 0.0002, "epoch": 1.635434412265758, "step": 2400}, {"loss": 1.7681, "grad_norm": 0.3854375183582306, "learning_rate": 0.0002, "epoch": 1.6422487223168654, "step": 2410}, {"loss": 1.7102, "grad_norm": 0.36143961548805237, "learning_rate": 0.0002, "epoch": 1.6490630323679727, "step": 2420}, {"loss": 1.7432, "grad_norm": 0.35816189646720886, "learning_rate": 0.0002, "epoch": 1.65587734241908, "step": 2430}, {"loss": 1.6407, "grad_norm": 0.36298736929893494, "learning_rate": 0.0002, "epoch": 1.6626916524701874, "step": 2440}, {"loss": 1.723, "grad_norm": 0.36756929755210876, "learning_rate": 0.0002, "epoch": 1.6695059625212947, "step": 2450}, {"loss": 1.6824, "grad_norm": 0.35969603061676025, "learning_rate": 0.0002, "epoch": 1.676320272572402, "step": 2460}, {"loss": 1.7014, "grad_norm": 0.38449376821517944, "learning_rate": 0.0002, "epoch": 1.6831345826235093, "step": 2470}, {"loss": 1.7261, "grad_norm": 0.44511452317237854, "learning_rate": 0.0002, "epoch": 1.6899488926746167, "step": 2480}, {"loss": 1.7397, "grad_norm": 0.3831416368484497, "learning_rate": 0.0002, "epoch": 1.696763202725724, "step": 2490}, {"loss": 1.7046, "grad_norm": 0.3795325756072998, "learning_rate": 0.0002, "epoch": 1.7035775127768313, "step": 2500}, {"loss": 1.6993, "grad_norm": 0.34978193044662476, "learning_rate": 0.0002, "epoch": 1.7103918228279387, "step": 2510}, {"loss": 1.8307, "grad_norm": 0.35923877358436584, "learning_rate": 0.0002, "epoch": 1.717206132879046, "step": 2520}, {"loss": 1.6253, "grad_norm": 0.352999746799469, "learning_rate": 0.0002, "epoch": 1.7240204429301533, "step": 2530}, {"loss": 1.6953, "grad_norm": 0.43673479557037354, "learning_rate": 0.0002, "epoch": 1.7308347529812607, "step": 2540}, {"loss": 1.7079, "grad_norm": 0.4153687357902527, "learning_rate": 0.0002, "epoch": 1.737649063032368, "step": 2550}, {"loss": 1.6714, "grad_norm": 0.35541167855262756, "learning_rate": 0.0002, "epoch": 1.7444633730834753, "step": 2560}, {"loss": 1.696, "grad_norm": 0.3288775086402893, "learning_rate": 0.0002, "epoch": 1.7512776831345827, "step": 2570}, {"loss": 1.7486, "grad_norm": 0.3991123139858246, "learning_rate": 0.0002, "epoch": 1.75809199318569, "step": 2580}, {"loss": 1.645, "grad_norm": 0.39967241883277893, "learning_rate": 0.0002, "epoch": 1.7649063032367973, "step": 2590}, {"loss": 1.6855, "grad_norm": 0.41104283928871155, "learning_rate": 0.0002, "epoch": 1.7717206132879046, "step": 2600}, {"loss": 1.6993, "grad_norm": 0.44885286688804626, "learning_rate": 0.0002, "epoch": 1.778534923339012, "step": 2610}, {"loss": 1.7224, "grad_norm": 0.38384467363357544, "learning_rate": 0.0002, "epoch": 1.7853492333901193, "step": 2620}, {"loss": 1.7213, "grad_norm": 0.35020917654037476, "learning_rate": 0.0002, "epoch": 1.7921635434412266, "step": 2630}, {"loss": 1.6706, "grad_norm": 0.3360341489315033, "learning_rate": 0.0002, "epoch": 1.798977853492334, "step": 2640}, {"loss": 1.7037, "grad_norm": 0.38875144720077515, "learning_rate": 0.0002, "epoch": 1.8057921635434413, "step": 2650}, {"loss": 1.693, "grad_norm": 0.34876883029937744, "learning_rate": 0.0002, "epoch": 1.8126064735945486, "step": 2660}, {"loss": 1.7743, "grad_norm": 0.419979989528656, "learning_rate": 0.0002, "epoch": 1.819420783645656, "step": 2670}, {"loss": 1.7089, "grad_norm": 0.3648919463157654, "learning_rate": 0.0002, "epoch": 1.8262350936967633, "step": 2680}, {"loss": 1.7828, "grad_norm": 0.3485383987426758, "learning_rate": 0.0002, "epoch": 1.8330494037478706, "step": 2690}, {"loss": 1.705, "grad_norm": 0.3647468686103821, "learning_rate": 0.0002, "epoch": 1.839863713798978, "step": 2700}, {"loss": 1.7318, "grad_norm": 0.37003210186958313, "learning_rate": 0.0002, "epoch": 1.8466780238500853, "step": 2710}, {"loss": 1.6647, "grad_norm": 0.37031617760658264, "learning_rate": 0.0002, "epoch": 1.8534923339011926, "step": 2720}, {"loss": 1.69, "grad_norm": 0.3438796103000641, "learning_rate": 0.0002, "epoch": 1.8603066439523, "step": 2730}, {"loss": 1.5995, "grad_norm": 0.41574627161026, "learning_rate": 0.0002, "epoch": 1.8671209540034073, "step": 2740}, {"loss": 1.6877, "grad_norm": 0.35049930214881897, "learning_rate": 0.0002, "epoch": 1.8739352640545146, "step": 2750}, {"loss": 1.7048, "grad_norm": 0.3943989872932434, "learning_rate": 0.0002, "epoch": 1.880749574105622, "step": 2760}, {"loss": 1.7047, "grad_norm": 0.3384978175163269, "learning_rate": 0.0002, "epoch": 1.8875638841567293, "step": 2770}, {"loss": 1.7848, "grad_norm": 0.3501328229904175, "learning_rate": 0.0002, "epoch": 1.8943781942078366, "step": 2780}, {"loss": 1.6986, "grad_norm": 0.37484532594680786, "learning_rate": 0.0002, "epoch": 1.901192504258944, "step": 2790}, {"loss": 1.6867, "grad_norm": 0.34497788548469543, "learning_rate": 0.0002, "epoch": 1.9080068143100513, "step": 2800}, {"loss": 1.6513, "grad_norm": 0.3530851900577545, "learning_rate": 0.0002, "epoch": 1.9148211243611586, "step": 2810}, {"loss": 1.6369, "grad_norm": 0.3879254162311554, "learning_rate": 0.0002, "epoch": 1.921635434412266, "step": 2820}, {"loss": 1.6786, "grad_norm": 0.3885590136051178, "learning_rate": 0.0002, "epoch": 1.9284497444633732, "step": 2830}, {"loss": 1.8049, "grad_norm": 0.3868715465068817, "learning_rate": 0.0002, "epoch": 1.9352640545144804, "step": 2840}, {"loss": 1.7099, "grad_norm": 0.4152422249317169, "learning_rate": 0.0002, "epoch": 1.9420783645655877, "step": 2850}, {"loss": 1.6696, "grad_norm": 0.401714563369751, "learning_rate": 0.0002, "epoch": 1.948892674616695, "step": 2860}, {"loss": 1.7182, "grad_norm": 0.34825265407562256, "learning_rate": 0.0002, "epoch": 1.9557069846678024, "step": 2870}, {"loss": 1.6612, "grad_norm": 0.3620675504207611, "learning_rate": 0.0002, "epoch": 1.9625212947189097, "step": 2880}, {"loss": 1.7451, "grad_norm": 0.3977806866168976, "learning_rate": 0.0002, "epoch": 1.969335604770017, "step": 2890}, {"loss": 1.6514, "grad_norm": 0.3687497079372406, "learning_rate": 0.0002, "epoch": 1.9761499148211243, "step": 2900}, {"loss": 1.7712, "grad_norm": 0.408640056848526, "learning_rate": 0.0002, "epoch": 1.9829642248722317, "step": 2910}, {"loss": 1.695, "grad_norm": 0.34510108828544617, "learning_rate": 0.0002, "epoch": 1.989778534923339, "step": 2920}, {"loss": 1.662, "grad_norm": 0.3596334755420685, "learning_rate": 0.0002, "epoch": 1.9965928449744463, "step": 2930}, {"eval_loss": 1.8056600093841553, "eval_runtime": 60.7049, "eval_samples_per_second": 8.352, "eval_steps_per_second": 1.054, "epoch": 2.0, "step": 2935}, {"loss": 1.6918, "grad_norm": 0.3460802137851715, "learning_rate": 0.0002, "epoch": 2.0034071550255534, "step": 2940}, {"loss": 1.6036, "grad_norm": 0.4038620591163635, "learning_rate": 0.0002, "epoch": 2.0102214650766608, "step": 2950}, {"loss": 1.5386, "grad_norm": 0.3950219750404358, "learning_rate": 0.0002, "epoch": 2.017035775127768, "step": 2960}, {"loss": 1.5649, "grad_norm": 0.519116997718811, "learning_rate": 0.0002, "epoch": 2.0238500851788754, "step": 2970}, {"loss": 1.5397, "grad_norm": 0.4097684919834137, "learning_rate": 0.0002, "epoch": 2.0306643952299828, "step": 2980}, {"loss": 1.6044, "grad_norm": 0.4153544306755066, "learning_rate": 0.0002, "epoch": 2.03747870528109, "step": 2990}, {"loss": 1.6006, "grad_norm": 0.4351160526275635, "learning_rate": 0.0002, "epoch": 2.0442930153321974, "step": 3000}, {"loss": 1.5091, "grad_norm": 0.42036259174346924, "learning_rate": 0.0002, "epoch": 2.0511073253833048, "step": 3010}, {"loss": 1.5686, "grad_norm": 0.4433218836784363, "learning_rate": 0.0002, "epoch": 2.057921635434412, "step": 3020}, {"loss": 1.5478, "grad_norm": 0.46511581540107727, "learning_rate": 0.0002, "epoch": 2.0647359454855194, "step": 3030}, {"loss": 1.5554, "grad_norm": 0.4567560851573944, "learning_rate": 0.0002, "epoch": 2.0715502555366268, "step": 3040}, {"loss": 1.5561, "grad_norm": 0.45671048760414124, "learning_rate": 0.0002, "epoch": 2.078364565587734, "step": 3050}, {"loss": 1.606, "grad_norm": 0.4598552882671356, "learning_rate": 0.0002, "epoch": 2.0851788756388414, "step": 3060}, {"loss": 1.6357, "grad_norm": 0.4582861661911011, "learning_rate": 0.0002, "epoch": 2.0919931856899487, "step": 3070}, {"loss": 1.5853, "grad_norm": 0.4366969168186188, "learning_rate": 0.0002, "epoch": 2.098807495741056, "step": 3080}, {"loss": 1.5574, "grad_norm": 0.495917409658432, "learning_rate": 0.0002, "epoch": 2.1056218057921634, "step": 3090}, {"loss": 1.5798, "grad_norm": 1.6846044063568115, "learning_rate": 0.0002, "epoch": 2.1124361158432707, "step": 3100}, {"loss": 1.5877, "grad_norm": 0.4765092134475708, "learning_rate": 0.0002, "epoch": 2.119250425894378, "step": 3110}, {"loss": 1.6106, "grad_norm": 0.45029810070991516, "learning_rate": 0.0002, "epoch": 2.1260647359454854, "step": 3120}, {"loss": 1.5707, "grad_norm": 0.5706973075866699, "learning_rate": 0.0002, "epoch": 2.1328790459965927, "step": 3130}, {"loss": 1.589, "grad_norm": 0.4606274366378784, "learning_rate": 0.0002, "epoch": 2.1396933560477, "step": 3140}, {"loss": 1.564, "grad_norm": 2.199115753173828, "learning_rate": 0.0002, "epoch": 2.1465076660988074, "step": 3150}, {"loss": 1.5808, "grad_norm": 0.6231027245521545, "learning_rate": 0.0002, "epoch": 2.1533219761499147, "step": 3160}, {"loss": 1.5397, "grad_norm": 0.46918219327926636, "learning_rate": 0.0002, "epoch": 2.160136286201022, "step": 3170}, {"loss": 1.6082, "grad_norm": 0.5006393194198608, "learning_rate": 0.0002, "epoch": 2.1669505962521294, "step": 3180}, {"loss": 1.591, "grad_norm": 0.4745093286037445, "learning_rate": 0.0002, "epoch": 2.1737649063032367, "step": 3190}, {"loss": 1.6459, "grad_norm": 0.511576771736145, "learning_rate": 0.0002, "epoch": 2.180579216354344, "step": 3200}, {"loss": 1.6019, "grad_norm": 0.38622918725013733, "learning_rate": 0.0002, "epoch": 2.1873935264054514, "step": 3210}, {"loss": 1.6187, "grad_norm": 0.4425644278526306, "learning_rate": 0.0002, "epoch": 2.1942078364565587, "step": 3220}, {"loss": 1.6114, "grad_norm": 0.45643091201782227, "learning_rate": 0.0002, "epoch": 2.201022146507666, "step": 3230}, {"loss": 1.6224, "grad_norm": 0.4975406229496002, "learning_rate": 0.0002, "epoch": 2.2078364565587734, "step": 3240}, {"loss": 1.6654, "grad_norm": 0.4673331081867218, "learning_rate": 0.0002, "epoch": 2.2146507666098807, "step": 3250}, {"loss": 1.6155, "grad_norm": 0.5081731081008911, "learning_rate": 0.0002, "epoch": 2.221465076660988, "step": 3260}, {"loss": 1.53, "grad_norm": 0.4790806770324707, "learning_rate": 0.0002, "epoch": 2.2282793867120954, "step": 3270}, {"loss": 1.6362, "grad_norm": 0.5184140801429749, "learning_rate": 0.0002, "epoch": 2.2350936967632027, "step": 3280}, {"loss": 1.5646, "grad_norm": 0.5159083604812622, "learning_rate": 0.0002, "epoch": 2.24190800681431, "step": 3290}, {"loss": 1.6577, "grad_norm": 0.4876042604446411, "learning_rate": 0.0002, "epoch": 2.2487223168654173, "step": 3300}, {"loss": 1.5718, "grad_norm": 0.5454957485198975, "learning_rate": 0.0002, "epoch": 2.2555366269165247, "step": 3310}, {"loss": 1.5885, "grad_norm": 0.49866822361946106, "learning_rate": 0.0002, "epoch": 2.262350936967632, "step": 3320}, {"loss": 1.5923, "grad_norm": 0.42674365639686584, "learning_rate": 0.0002, "epoch": 2.2691652470187393, "step": 3330}, {"loss": 1.5588, "grad_norm": 0.5202316641807556, "learning_rate": 0.0002, "epoch": 2.2759795570698467, "step": 3340}, {"loss": 1.6032, "grad_norm": 0.4849465489387512, "learning_rate": 0.0002, "epoch": 2.282793867120954, "step": 3350}, {"loss": 1.6853, "grad_norm": 0.47202569246292114, "learning_rate": 0.0002, "epoch": 2.2896081771720613, "step": 3360}, {"loss": 1.6164, "grad_norm": 0.5311620235443115, "learning_rate": 0.0002, "epoch": 2.2964224872231687, "step": 3370}, {"loss": 1.6004, "grad_norm": 0.49011409282684326, "learning_rate": 0.0002, "epoch": 2.303236797274276, "step": 3380}, {"loss": 1.5484, "grad_norm": 0.4789247512817383, "learning_rate": 0.0002, "epoch": 2.3100511073253833, "step": 3390}, {"loss": 1.5866, "grad_norm": 0.46646103262901306, "learning_rate": 0.0002, "epoch": 2.3168654173764907, "step": 3400}, {"loss": 1.5308, "grad_norm": 0.5552441477775574, "learning_rate": 0.0002, "epoch": 2.323679727427598, "step": 3410}, {"loss": 1.5761, "grad_norm": 0.4530351758003235, "learning_rate": 0.0002, "epoch": 2.3304940374787053, "step": 3420}, {"loss": 1.5919, "grad_norm": 0.4806232750415802, "learning_rate": 0.0002, "epoch": 2.3373083475298126, "step": 3430}, {"loss": 1.5569, "grad_norm": 0.5998363494873047, "learning_rate": 0.0002, "epoch": 2.34412265758092, "step": 3440}, {"loss": 1.513, "grad_norm": 0.4918554425239563, "learning_rate": 0.0002, "epoch": 2.3509369676320273, "step": 3450}, {"loss": 1.6323, "grad_norm": 0.5359559655189514, "learning_rate": 0.0002, "epoch": 2.3577512776831346, "step": 3460}, {"loss": 1.5973, "grad_norm": 0.5053277611732483, "learning_rate": 0.0002, "epoch": 2.364565587734242, "step": 3470}, {"loss": 1.5673, "grad_norm": 0.5058915019035339, "learning_rate": 0.0002, "epoch": 2.3713798977853493, "step": 3480}, {"loss": 1.5434, "grad_norm": 0.5314899682998657, "learning_rate": 0.0002, "epoch": 2.3781942078364566, "step": 3490}, {"loss": 1.5203, "grad_norm": 0.48035913705825806, "learning_rate": 0.0002, "epoch": 2.385008517887564, "step": 3500}, {"loss": 1.5936, "grad_norm": 0.45864903926849365, "learning_rate": 0.0002, "epoch": 2.3918228279386713, "step": 3510}, {"loss": 1.6285, "grad_norm": 0.4553050398826599, "learning_rate": 0.0002, "epoch": 2.3986371379897786, "step": 3520}, {"loss": 1.5444, "grad_norm": 0.4483442008495331, "learning_rate": 0.0002, "epoch": 2.405451448040886, "step": 3530}, {"loss": 1.587, "grad_norm": 0.5043742060661316, "learning_rate": 0.0002, "epoch": 2.4122657580919933, "step": 3540}, {"loss": 1.5692, "grad_norm": 0.44807168841362, "learning_rate": 0.0002, "epoch": 2.4190800681431006, "step": 3550}, {"loss": 1.6306, "grad_norm": 0.5065137147903442, "learning_rate": 0.0002, "epoch": 2.425894378194208, "step": 3560}, {"loss": 1.5842, "grad_norm": 0.5186443328857422, "learning_rate": 0.0002, "epoch": 2.4327086882453153, "step": 3570}, {"loss": 1.5956, "grad_norm": 0.49743232131004333, "learning_rate": 0.0002, "epoch": 2.4395229982964226, "step": 3580}, {"loss": 1.6021, "grad_norm": 0.524450421333313, "learning_rate": 0.0002, "epoch": 2.44633730834753, "step": 3590}, {"loss": 1.6283, "grad_norm": 0.5053797364234924, "learning_rate": 0.0002, "epoch": 2.4531516183986373, "step": 3600}, {"loss": 1.6335, "grad_norm": 0.5223091840744019, "learning_rate": 0.0002, "epoch": 2.4599659284497446, "step": 3610}, {"loss": 1.6315, "grad_norm": 0.4763810932636261, "learning_rate": 0.0002, "epoch": 2.466780238500852, "step": 3620}, {"loss": 1.5623, "grad_norm": 0.5097282528877258, "learning_rate": 0.0002, "epoch": 2.4735945485519593, "step": 3630}, {"loss": 1.605, "grad_norm": 0.5831942558288574, "learning_rate": 0.0002, "epoch": 2.4804088586030666, "step": 3640}, {"loss": 1.6074, "grad_norm": 0.47573572397232056, "learning_rate": 0.0002, "epoch": 2.487223168654174, "step": 3650}, {"loss": 1.6411, "grad_norm": 0.49602726101875305, "learning_rate": 0.0002, "epoch": 2.4940374787052813, "step": 3660}, {"loss": 1.571, "grad_norm": 0.5468524694442749, "learning_rate": 0.0002, "epoch": 2.500851788756388, "step": 3670}, {"loss": 1.5782, "grad_norm": 0.45899373292922974, "learning_rate": 0.0002, "epoch": 2.507666098807496, "step": 3680}, {"loss": 1.5114, "grad_norm": 0.5031567215919495, "learning_rate": 0.0002, "epoch": 2.514480408858603, "step": 3690}, {"loss": 1.538, "grad_norm": 0.5224900841712952, "learning_rate": 0.0002, "epoch": 2.5212947189097106, "step": 3700}, {"loss": 1.5269, "grad_norm": 0.504769504070282, "learning_rate": 0.0002, "epoch": 2.5281090289608175, "step": 3710}, {"loss": 1.5141, "grad_norm": 0.6120529770851135, "learning_rate": 0.0002, "epoch": 2.5349233390119252, "step": 3720}, {"loss": 1.5666, "grad_norm": 0.47930678725242615, "learning_rate": 0.0002, "epoch": 2.541737649063032, "step": 3730}, {"loss": 1.6156, "grad_norm": 0.5039092302322388, "learning_rate": 0.0002, "epoch": 2.54855195911414, "step": 3740}, {"loss": 1.5585, "grad_norm": 0.49758994579315186, "learning_rate": 0.0002, "epoch": 2.555366269165247, "step": 3750}, {"loss": 1.5351, "grad_norm": 0.44739171862602234, "learning_rate": 0.0002, "epoch": 2.5621805792163546, "step": 3760}, {"loss": 1.5099, "grad_norm": 0.47056373953819275, "learning_rate": 0.0002, "epoch": 2.5689948892674614, "step": 3770}, {"loss": 1.5524, "grad_norm": 0.5077595114707947, "learning_rate": 0.0002, "epoch": 2.575809199318569, "step": 3780}, {"loss": 1.5524, "grad_norm": 0.4981902837753296, "learning_rate": 0.0002, "epoch": 2.582623509369676, "step": 3790}, {"loss": 1.5381, "grad_norm": 0.5736238360404968, "learning_rate": 0.0002, "epoch": 2.589437819420784, "step": 3800}, {"loss": 1.67, "grad_norm": 0.4898384213447571, "learning_rate": 0.0002, "epoch": 2.5962521294718908, "step": 3810}, {"loss": 1.5411, "grad_norm": 0.4611325263977051, "learning_rate": 0.0002, "epoch": 2.6030664395229985, "step": 3820}, {"loss": 1.5662, "grad_norm": 0.5285341739654541, "learning_rate": 0.0002, "epoch": 2.6098807495741054, "step": 3830}, {"loss": 1.5875, "grad_norm": 0.5679430961608887, "learning_rate": 0.0002, "epoch": 2.616695059625213, "step": 3840}, {"loss": 1.5544, "grad_norm": 0.48532548546791077, "learning_rate": 0.0002, "epoch": 2.62350936967632, "step": 3850}, {"loss": 1.579, "grad_norm": 0.45506492257118225, "learning_rate": 0.0002, "epoch": 2.630323679727428, "step": 3860}, {"loss": 1.5775, "grad_norm": 0.6552556753158569, "learning_rate": 0.0002, "epoch": 2.6371379897785348, "step": 3870}, {"loss": 1.608, "grad_norm": 0.537874698638916, "learning_rate": 0.0002, "epoch": 2.643952299829642, "step": 3880}, {"loss": 1.5653, "grad_norm": 0.46102389693260193, "learning_rate": 0.0002, "epoch": 2.6507666098807494, "step": 3890}, {"loss": 1.5293, "grad_norm": 0.45531195402145386, "learning_rate": 0.0002, "epoch": 2.6575809199318567, "step": 3900}, {"loss": 1.5492, "grad_norm": 0.5327293872833252, "learning_rate": 0.0002, "epoch": 2.664395229982964, "step": 3910}, {"loss": 1.5921, "grad_norm": 0.4968956410884857, "learning_rate": 0.0002, "epoch": 2.6712095400340714, "step": 3920}, {"loss": 1.5823, "grad_norm": 0.4790082275867462, "learning_rate": 0.0002, "epoch": 2.6780238500851787, "step": 3930}, {"loss": 1.615, "grad_norm": 0.5392967462539673, "learning_rate": 0.0002, "epoch": 2.684838160136286, "step": 3940}, {"loss": 1.6218, "grad_norm": 0.5076649785041809, "learning_rate": 0.0002, "epoch": 2.6916524701873934, "step": 3950}, {"loss": 1.6478, "grad_norm": 0.5628064274787903, "learning_rate": 0.0002, "epoch": 2.6984667802385007, "step": 3960}, {"loss": 1.5417, "grad_norm": 0.5012659430503845, "learning_rate": 0.0002, "epoch": 2.705281090289608, "step": 3970}, {"loss": 1.5339, "grad_norm": 0.4947647452354431, "learning_rate": 0.0002, "epoch": 2.7120954003407154, "step": 3980}, {"loss": 1.5724, "grad_norm": 0.4890969693660736, "learning_rate": 0.0002, "epoch": 2.7189097103918227, "step": 3990}, {"loss": 1.5746, "grad_norm": 0.4471694231033325, "learning_rate": 0.0002, "epoch": 2.72572402044293, "step": 4000}, {"loss": 1.6669, "grad_norm": 0.5116439461708069, "learning_rate": 0.0002, "epoch": 2.7325383304940374, "step": 4010}, {"loss": 1.584, "grad_norm": 0.5720411539077759, "learning_rate": 0.0002, "epoch": 2.7393526405451447, "step": 4020}, {"loss": 1.6151, "grad_norm": 0.5529406070709229, "learning_rate": 0.0002, "epoch": 2.746166950596252, "step": 4030}, {"loss": 1.6296, "grad_norm": 0.5229396820068359, "learning_rate": 0.0002, "epoch": 2.7529812606473594, "step": 4040}, {"loss": 1.5363, "grad_norm": 0.5270276069641113, "learning_rate": 0.0002, "epoch": 2.7597955706984667, "step": 4050}, {"loss": 1.6305, "grad_norm": 0.48413026332855225, "learning_rate": 0.0002, "epoch": 2.766609880749574, "step": 4060}, {"loss": 1.5488, "grad_norm": 0.5145403742790222, "learning_rate": 0.0002, "epoch": 2.7734241908006814, "step": 4070}, {"loss": 1.6023, "grad_norm": 0.48626071214675903, "learning_rate": 0.0002, "epoch": 2.7802385008517887, "step": 4080}, {"loss": 1.6082, "grad_norm": 0.5018984079360962, "learning_rate": 0.0002, "epoch": 2.787052810902896, "step": 4090}, {"loss": 1.7166, "grad_norm": 0.4881938695907593, "learning_rate": 0.0002, "epoch": 2.7938671209540034, "step": 4100}, {"loss": 1.515, "grad_norm": 0.5151546001434326, "learning_rate": 0.0002, "epoch": 2.8006814310051107, "step": 4110}, {"loss": 1.6069, "grad_norm": 0.5109850764274597, "learning_rate": 0.0002, "epoch": 2.807495741056218, "step": 4120}, {"loss": 1.6153, "grad_norm": 0.5109251141548157, "learning_rate": 0.0002, "epoch": 2.8143100511073254, "step": 4130}, {"loss": 1.6365, "grad_norm": 0.5025496482849121, "learning_rate": 0.0002, "epoch": 2.8211243611584327, "step": 4140}, {"loss": 1.6292, "grad_norm": 0.49027004837989807, "learning_rate": 0.0002, "epoch": 2.82793867120954, "step": 4150}, {"loss": 1.5591, "grad_norm": 0.4957362413406372, "learning_rate": 0.0002, "epoch": 2.8347529812606473, "step": 4160}, {"loss": 1.6759, "grad_norm": 0.5159927606582642, "learning_rate": 0.0002, "epoch": 2.8415672913117547, "step": 4170}, {"loss": 1.577, "grad_norm": 0.6040670871734619, "learning_rate": 0.0002, "epoch": 2.848381601362862, "step": 4180}, {"loss": 1.5295, "grad_norm": 0.5489953756332397, "learning_rate": 0.0002, "epoch": 2.8551959114139693, "step": 4190}, {"loss": 1.5909, "grad_norm": 0.5416634678840637, "learning_rate": 0.0002, "epoch": 2.8620102214650767, "step": 4200}, {"loss": 1.6014, "grad_norm": 0.5278245210647583, "learning_rate": 0.0002, "epoch": 2.868824531516184, "step": 4210}, {"loss": 1.4848, "grad_norm": 0.43382319808006287, "learning_rate": 0.0002, "epoch": 2.8756388415672913, "step": 4220}, {"loss": 1.6092, "grad_norm": 0.4724387228488922, "learning_rate": 0.0002, "epoch": 2.8824531516183987, "step": 4230}, {"loss": 1.5748, "grad_norm": 0.49824780225753784, "learning_rate": 0.0002, "epoch": 2.889267461669506, "step": 4240}, {"loss": 1.6163, "grad_norm": 0.5360262989997864, "learning_rate": 0.0002, "epoch": 2.8960817717206133, "step": 4250}, {"loss": 1.5934, "grad_norm": 0.49090322852134705, "learning_rate": 0.0002, "epoch": 2.9028960817717206, "step": 4260}, {"loss": 1.624, "grad_norm": 0.5613328218460083, "learning_rate": 0.0002, "epoch": 2.909710391822828, "step": 4270}, {"loss": 1.5713, "grad_norm": 0.4611356258392334, "learning_rate": 0.0002, "epoch": 2.9165247018739353, "step": 4280}, {"loss": 1.5457, "grad_norm": 0.550897479057312, "learning_rate": 0.0002, "epoch": 2.9233390119250426, "step": 4290}, {"loss": 1.6225, "grad_norm": 0.5089612603187561, "learning_rate": 0.0002, "epoch": 2.93015332197615, "step": 4300}, {"loss": 1.5897, "grad_norm": 0.5210904479026794, "learning_rate": 0.0002, "epoch": 2.9369676320272573, "step": 4310}, {"loss": 1.6198, "grad_norm": 0.5506424903869629, "learning_rate": 0.0002, "epoch": 2.9437819420783646, "step": 4320}, {"loss": 1.6395, "grad_norm": 0.5118561387062073, "learning_rate": 0.0002, "epoch": 2.950596252129472, "step": 4330}, {"loss": 1.704, "grad_norm": 0.5034464597702026, "learning_rate": 0.0002, "epoch": 2.9574105621805793, "step": 4340}, {"loss": 1.6314, "grad_norm": 0.5019990801811218, "learning_rate": 0.0002, "epoch": 2.9642248722316866, "step": 4350}, {"loss": 1.6161, "grad_norm": 0.5423325300216675, "learning_rate": 0.0002, "epoch": 2.971039182282794, "step": 4360}, {"loss": 1.6144, "grad_norm": 0.5287469625473022, "learning_rate": 0.0002, "epoch": 2.9778534923339013, "step": 4370}, {"loss": 1.5227, "grad_norm": 0.5206913352012634, "learning_rate": 0.0002, "epoch": 2.9846678023850086, "step": 4380}, {"loss": 1.6026, "grad_norm": 0.5407394170761108, "learning_rate": 0.0002, "epoch": 2.991482112436116, "step": 4390}, {"loss": 1.5908, "grad_norm": 0.5244600176811218, "learning_rate": 0.0002, "epoch": 2.9982964224872233, "step": 4400}, {"eval_loss": 1.8412635326385498, "eval_runtime": 65.5583, "eval_samples_per_second": 7.734, "eval_steps_per_second": 0.976, "epoch": 2.9996592844974446, "step": 4402}, {"loss": 1.5157, "grad_norm": 0.5172150731086731, "learning_rate": 0.0002, "epoch": 3.0051107325383306, "step": 4410}, {"loss": 1.398, "grad_norm": 0.6882525086402893, "learning_rate": 0.0002, "epoch": 3.011925042589438, "step": 4420}, {"loss": 1.3884, "grad_norm": 0.6435003280639648, "learning_rate": 0.0002, "epoch": 3.0187393526405453, "step": 4430}, {"loss": 1.4493, "grad_norm": 0.7126057147979736, "learning_rate": 0.0002, "epoch": 3.0255536626916526, "step": 4440}, {"loss": 1.4397, "grad_norm": 0.6634385585784912, "learning_rate": 0.0002, "epoch": 3.03236797274276, "step": 4450}, {"loss": 1.3674, "grad_norm": 0.6468435525894165, "learning_rate": 0.0002, "epoch": 3.0391822827938673, "step": 4460}, {"loss": 1.4045, "grad_norm": 0.5690478086471558, "learning_rate": 0.0002, "epoch": 3.0459965928449746, "step": 4470}, {"loss": 1.3742, "grad_norm": 0.7323708534240723, "learning_rate": 0.0002, "epoch": 3.052810902896082, "step": 4480}, {"loss": 1.3281, "grad_norm": 0.6989302039146423, "learning_rate": 0.0002, "epoch": 3.0596252129471893, "step": 4490}, {"loss": 1.379, "grad_norm": 0.6704450845718384, "learning_rate": 0.0002, "epoch": 3.0664395229982966, "step": 4500}, {"loss": 1.4028, "grad_norm": 0.769137442111969, "learning_rate": 0.0002, "epoch": 3.073253833049404, "step": 4510}, {"loss": 1.4295, "grad_norm": 0.6556448936462402, "learning_rate": 0.0002, "epoch": 3.0800681431005112, "step": 4520}, {"loss": 1.2763, "grad_norm": 0.7143950462341309, "learning_rate": 0.0002, "epoch": 3.0868824531516186, "step": 4530}, {"loss": 1.4806, "grad_norm": 0.7060510516166687, "learning_rate": 0.0002, "epoch": 3.093696763202726, "step": 4540}, {"loss": 1.4097, "grad_norm": 0.6637526750564575, "learning_rate": 0.0002, "epoch": 3.1005110732538332, "step": 4550}, {"loss": 1.4752, "grad_norm": 0.822989284992218, "learning_rate": 0.0002, "epoch": 3.1073253833049406, "step": 4560}, {"loss": 1.4994, "grad_norm": 0.5542152523994446, "learning_rate": 0.0002, "epoch": 3.114139693356048, "step": 4570}, {"loss": 1.4306, "grad_norm": 0.7780306935310364, "learning_rate": 0.0002, "epoch": 3.1209540034071552, "step": 4580}, {"loss": 1.3909, "grad_norm": 0.7372637987136841, "learning_rate": 0.0002, "epoch": 3.1277683134582626, "step": 4590}, {"loss": 1.3989, "grad_norm": 0.6730087995529175, "learning_rate": 0.0002, "epoch": 3.1345826235093694, "step": 4600}, {"loss": 1.3591, "grad_norm": 0.6687398552894592, "learning_rate": 0.0002, "epoch": 3.1413969335604772, "step": 4610}, {"loss": 1.436, "grad_norm": 0.7645083665847778, "learning_rate": 0.0002, "epoch": 3.148211243611584, "step": 4620}, {"loss": 1.3681, "grad_norm": 0.6770380139350891, "learning_rate": 0.0002, "epoch": 3.155025553662692, "step": 4630}, {"loss": 1.405, "grad_norm": 0.7200576663017273, "learning_rate": 0.0002, "epoch": 3.1618398637137988, "step": 4640}, {"loss": 1.3752, "grad_norm": 0.6663638949394226, "learning_rate": 0.0002, "epoch": 3.168654173764906, "step": 4650}, {"loss": 1.4099, "grad_norm": 0.6602960228919983, "learning_rate": 0.0002, "epoch": 3.1754684838160134, "step": 4660}, {"loss": 1.4003, "grad_norm": 0.7838228344917297, "learning_rate": 0.0002, "epoch": 3.1822827938671208, "step": 4670}, {"loss": 1.3853, "grad_norm": 0.7559184432029724, "learning_rate": 0.0002, "epoch": 3.189097103918228, "step": 4680}, {"loss": 1.4516, "grad_norm": 0.6609814167022705, "learning_rate": 0.0002, "epoch": 3.1959114139693354, "step": 4690}, {"loss": 1.4464, "grad_norm": 0.8470419645309448, "learning_rate": 0.0002, "epoch": 3.2027257240204428, "step": 4700}, {"loss": 1.428, "grad_norm": 0.7282822728157043, "learning_rate": 0.0002, "epoch": 3.20954003407155, "step": 4710}, {"loss": 1.5261, "grad_norm": 0.6722773313522339, "learning_rate": 0.0002, "epoch": 3.2163543441226574, "step": 4720}, {"loss": 1.3809, "grad_norm": 0.7630265355110168, "learning_rate": 0.0002, "epoch": 3.2231686541737647, "step": 4730}, {"loss": 1.42, "grad_norm": 0.7102773785591125, "learning_rate": 0.0002, "epoch": 3.229982964224872, "step": 4740}, {"loss": 1.3529, "grad_norm": 0.7778299450874329, "learning_rate": 0.0002, "epoch": 3.2367972742759794, "step": 4750}, {"loss": 1.4715, "grad_norm": 0.7189921736717224, "learning_rate": 0.0002, "epoch": 3.2436115843270867, "step": 4760}, {"loss": 1.4328, "grad_norm": 0.7708092331886292, "learning_rate": 0.0002, "epoch": 3.250425894378194, "step": 4770}, {"loss": 1.3855, "grad_norm": 0.7208452224731445, "learning_rate": 0.0002, "epoch": 3.2572402044293014, "step": 4780}, {"loss": 1.3206, "grad_norm": 0.7220432758331299, "learning_rate": 0.0002, "epoch": 3.2640545144804087, "step": 4790}, {"loss": 1.463, "grad_norm": 0.7064954042434692, "learning_rate": 0.0002, "epoch": 3.270868824531516, "step": 4800}, {"loss": 1.4236, "grad_norm": 0.6618382334709167, "learning_rate": 0.0002, "epoch": 3.2776831345826234, "step": 4810}, {"loss": 1.3878, "grad_norm": 0.6854256391525269, "learning_rate": 0.0002, "epoch": 3.2844974446337307, "step": 4820}, {"loss": 1.4236, "grad_norm": 0.6036319136619568, "learning_rate": 0.0002, "epoch": 3.291311754684838, "step": 4830}, {"loss": 1.4796, "grad_norm": 0.714678943157196, "learning_rate": 0.0002, "epoch": 3.2981260647359454, "step": 4840}, {"loss": 1.4273, "grad_norm": 0.7218600511550903, "learning_rate": 0.0002, "epoch": 3.3049403747870527, "step": 4850}, {"loss": 1.3915, "grad_norm": 0.7243074774742126, "learning_rate": 0.0002, "epoch": 3.31175468483816, "step": 4860}, {"loss": 1.4088, "grad_norm": 0.7058630585670471, "learning_rate": 0.0002, "epoch": 3.3185689948892674, "step": 4870}, {"loss": 1.3837, "grad_norm": 0.7091076970100403, "learning_rate": 0.0002, "epoch": 3.3253833049403747, "step": 4880}, {"loss": 1.4745, "grad_norm": 0.7375147342681885, "learning_rate": 0.0002, "epoch": 3.332197614991482, "step": 4890}, {"loss": 1.4826, "grad_norm": 0.9426755309104919, "learning_rate": 0.0002, "epoch": 3.3390119250425894, "step": 4900}, {"loss": 1.369, "grad_norm": 0.6508213877677917, "learning_rate": 0.0002, "epoch": 3.3458262350936967, "step": 4910}, {"loss": 1.3839, "grad_norm": 0.6945043206214905, "learning_rate": 0.0002, "epoch": 3.352640545144804, "step": 4920}, {"loss": 1.3571, "grad_norm": 0.6335888504981995, "learning_rate": 0.0002, "epoch": 3.3594548551959114, "step": 4930}, {"loss": 1.4391, "grad_norm": 0.6947107911109924, "learning_rate": 0.0002, "epoch": 3.3662691652470187, "step": 4940}, {"loss": 1.3885, "grad_norm": 0.8204733729362488, "learning_rate": 0.0002, "epoch": 3.373083475298126, "step": 4950}, {"loss": 1.4886, "grad_norm": 0.7212244868278503, "learning_rate": 0.0002, "epoch": 3.3798977853492334, "step": 4960}, {"loss": 1.4581, "grad_norm": 0.6053042411804199, "learning_rate": 0.0002, "epoch": 3.3867120954003407, "step": 4970}, {"loss": 1.3863, "grad_norm": 0.7820029854774475, "learning_rate": 0.0002, "epoch": 3.393526405451448, "step": 4980}, {"loss": 1.4326, "grad_norm": 0.6866770386695862, "learning_rate": 0.0002, "epoch": 3.4003407155025553, "step": 4990}, {"loss": 1.4287, "grad_norm": 0.6652463674545288, "learning_rate": 0.0002, "epoch": 3.4071550255536627, "step": 5000}, {"loss": 1.3667, "grad_norm": 1.1209032535552979, "learning_rate": 0.0002, "epoch": 3.41396933560477, "step": 5010}, {"loss": 1.4461, "grad_norm": 0.8390814661979675, "learning_rate": 0.0002, "epoch": 3.4207836456558773, "step": 5020}, {"loss": 1.4556, "grad_norm": 0.7541858553886414, "learning_rate": 0.0002, "epoch": 3.4275979557069847, "step": 5030}, {"loss": 1.4245, "grad_norm": 0.6902772784233093, "learning_rate": 0.0002, "epoch": 3.434412265758092, "step": 5040}, {"loss": 1.3953, "grad_norm": 0.7070329785346985, "learning_rate": 0.0002, "epoch": 3.4412265758091993, "step": 5050}, {"loss": 1.3903, "grad_norm": 0.8075643181800842, "learning_rate": 0.0002, "epoch": 3.4480408858603067, "step": 5060}, {"loss": 1.3929, "grad_norm": 0.7133861780166626, "learning_rate": 0.0002, "epoch": 3.454855195911414, "step": 5070}, {"loss": 1.4632, "grad_norm": 0.6631823182106018, "learning_rate": 0.0002, "epoch": 3.4616695059625213, "step": 5080}, {"loss": 1.4162, "grad_norm": 0.673870325088501, "learning_rate": 0.0002, "epoch": 3.4684838160136287, "step": 5090}, {"loss": 1.4247, "grad_norm": 0.6438634395599365, "learning_rate": 0.0002, "epoch": 3.475298126064736, "step": 5100}, {"loss": 1.4421, "grad_norm": 0.7560495734214783, "learning_rate": 0.0002, "epoch": 3.4821124361158433, "step": 5110}, {"loss": 1.4125, "grad_norm": 0.6877814531326294, "learning_rate": 0.0002, "epoch": 3.4889267461669506, "step": 5120}, {"loss": 1.4308, "grad_norm": 0.7031328678131104, "learning_rate": 0.0002, "epoch": 3.495741056218058, "step": 5130}, {"loss": 1.3705, "grad_norm": 0.6797195672988892, "learning_rate": 0.0002, "epoch": 3.5025553662691653, "step": 5140}, {"loss": 1.4687, "grad_norm": 0.6766413450241089, "learning_rate": 0.0002, "epoch": 3.5093696763202726, "step": 5150}, {"loss": 1.4194, "grad_norm": 0.666656494140625, "learning_rate": 0.0002, "epoch": 3.51618398637138, "step": 5160}, {"loss": 1.469, "grad_norm": 0.74996417760849, "learning_rate": 0.0002, "epoch": 3.5229982964224873, "step": 5170}, {"loss": 1.4848, "grad_norm": 0.7370911836624146, "learning_rate": 0.0002, "epoch": 3.5298126064735946, "step": 5180}, {"loss": 1.4523, "grad_norm": 0.9063456654548645, "learning_rate": 0.0002, "epoch": 3.536626916524702, "step": 5190}, {"loss": 1.4726, "grad_norm": 0.6861422657966614, "learning_rate": 0.0002, "epoch": 3.5434412265758093, "step": 5200}, {"loss": 1.4803, "grad_norm": 0.7104039788246155, "learning_rate": 0.0002, "epoch": 3.5502555366269166, "step": 5210}, {"loss": 1.4313, "grad_norm": 0.6578653454780579, "learning_rate": 0.0002, "epoch": 3.557069846678024, "step": 5220}, {"loss": 1.4596, "grad_norm": 0.7336562275886536, "learning_rate": 0.0002, "epoch": 3.5638841567291313, "step": 5230}, {"loss": 1.4591, "grad_norm": 0.7163010835647583, "learning_rate": 0.0002, "epoch": 3.5706984667802386, "step": 5240}, {"loss": 1.3814, "grad_norm": 0.8112391233444214, "learning_rate": 0.0002, "epoch": 3.577512776831346, "step": 5250}, {"loss": 1.4249, "grad_norm": 0.7260391116142273, "learning_rate": 0.0002, "epoch": 3.5843270868824533, "step": 5260}, {"loss": 1.4249, "grad_norm": 0.7038731575012207, "learning_rate": 0.0002, "epoch": 3.5911413969335606, "step": 5270}, {"loss": 1.4172, "grad_norm": 0.7864376902580261, "learning_rate": 0.0002, "epoch": 3.597955706984668, "step": 5280}, {"loss": 1.4637, "grad_norm": 0.6968383193016052, "learning_rate": 0.0002, "epoch": 3.6047700170357753, "step": 5290}, {"loss": 1.5269, "grad_norm": 0.6726206541061401, "learning_rate": 0.0002, "epoch": 3.6115843270868826, "step": 5300}, {"loss": 1.4199, "grad_norm": 0.6716854572296143, "learning_rate": 0.0002, "epoch": 3.61839863713799, "step": 5310}, {"loss": 1.4686, "grad_norm": 0.7229742407798767, "learning_rate": 0.0002, "epoch": 3.6252129471890973, "step": 5320}, {"loss": 1.4441, "grad_norm": 0.7338683009147644, "learning_rate": 0.0002, "epoch": 3.6320272572402046, "step": 5330}, {"loss": 1.4116, "grad_norm": 0.771672785282135, "learning_rate": 0.0002, "epoch": 3.638841567291312, "step": 5340}, {"loss": 1.4007, "grad_norm": 0.7024078369140625, "learning_rate": 0.0002, "epoch": 3.645655877342419, "step": 5350}, {"loss": 1.4996, "grad_norm": 0.6847538352012634, "learning_rate": 0.0002, "epoch": 3.6524701873935266, "step": 5360}, {"loss": 1.4111, "grad_norm": 0.71802818775177, "learning_rate": 0.0002, "epoch": 3.6592844974446335, "step": 5370}, {"loss": 1.4224, "grad_norm": 0.78530353307724, "learning_rate": 0.0002, "epoch": 3.6660988074957412, "step": 5380}, {"loss": 1.4582, "grad_norm": 0.7262226939201355, "learning_rate": 0.0002, "epoch": 3.672913117546848, "step": 5390}, {"loss": 1.4704, "grad_norm": 0.7608316540718079, "learning_rate": 0.0002, "epoch": 3.679727427597956, "step": 5400}, {"loss": 1.3742, "grad_norm": 0.6994926333427429, "learning_rate": 0.0002, "epoch": 3.686541737649063, "step": 5410}, {"loss": 1.4738, "grad_norm": 0.7888479828834534, "learning_rate": 0.0002, "epoch": 3.6933560477001706, "step": 5420}, {"loss": 1.4213, "grad_norm": 0.7053858041763306, "learning_rate": 0.0002, "epoch": 3.7001703577512775, "step": 5430}, {"loss": 1.4988, "grad_norm": 0.7063165903091431, "learning_rate": 0.0002, "epoch": 3.7069846678023852, "step": 5440}, {"loss": 1.4386, "grad_norm": 0.6603744626045227, "learning_rate": 0.0002, "epoch": 3.713798977853492, "step": 5450}, {"loss": 1.4695, "grad_norm": 0.7043602466583252, "learning_rate": 0.0002, "epoch": 3.7206132879046, "step": 5460}, {"loss": 1.5051, "grad_norm": 0.7026081681251526, "learning_rate": 0.0002, "epoch": 3.7274275979557068, "step": 5470}, {"loss": 1.5613, "grad_norm": 0.7200090289115906, "learning_rate": 0.0002, "epoch": 3.7342419080068145, "step": 5480}, {"loss": 1.4182, "grad_norm": 0.7170904278755188, "learning_rate": 0.0002, "epoch": 3.7410562180579214, "step": 5490}, {"loss": 1.4344, "grad_norm": 0.7489104866981506, "learning_rate": 0.0002, "epoch": 3.747870528109029, "step": 5500}, {"loss": 1.4911, "grad_norm": 0.6540989875793457, "learning_rate": 0.0002, "epoch": 3.754684838160136, "step": 5510}, {"loss": 1.4955, "grad_norm": 0.6654048562049866, "learning_rate": 0.0002, "epoch": 3.761499148211244, "step": 5520}, {"loss": 1.4487, "grad_norm": 0.6577395796775818, "learning_rate": 0.0002, "epoch": 3.7683134582623508, "step": 5530}, {"loss": 1.4283, "grad_norm": 0.7762192487716675, "learning_rate": 0.0002, "epoch": 3.7751277683134585, "step": 5540}, {"loss": 1.4727, "grad_norm": 0.6336314678192139, "learning_rate": 0.0002, "epoch": 3.7819420783645654, "step": 5550}, {"loss": 1.4588, "grad_norm": 0.7098057866096497, "learning_rate": 0.0002, "epoch": 3.7887563884156727, "step": 5560}, {"loss": 1.4679, "grad_norm": 0.7379715442657471, "learning_rate": 0.0002, "epoch": 3.79557069846678, "step": 5570}, {"loss": 1.4633, "grad_norm": 0.6726924777030945, "learning_rate": 0.0002, "epoch": 3.8023850085178874, "step": 5580}, {"loss": 1.4751, "grad_norm": 1.1212009191513062, "learning_rate": 0.0002, "epoch": 3.8091993185689947, "step": 5590}, {"loss": 1.4503, "grad_norm": 0.6503795981407166, "learning_rate": 0.0002, "epoch": 3.816013628620102, "step": 5600}, {"loss": 1.4754, "grad_norm": 0.7041325569152832, "learning_rate": 0.0002, "epoch": 3.8228279386712094, "step": 5610}, {"loss": 1.4199, "grad_norm": 0.7962933778762817, "learning_rate": 0.0002, "epoch": 3.8296422487223167, "step": 5620}, {"loss": 1.4672, "grad_norm": 0.6613591909408569, "learning_rate": 0.0002, "epoch": 3.836456558773424, "step": 5630}, {"loss": 1.5688, "grad_norm": 0.7293516397476196, "learning_rate": 0.0002, "epoch": 3.8432708688245314, "step": 5640}, {"loss": 1.4149, "grad_norm": 0.7388607859611511, "learning_rate": 0.0002, "epoch": 3.8500851788756387, "step": 5650}, {"loss": 1.4743, "grad_norm": 0.6440677642822266, "learning_rate": 0.0002, "epoch": 3.856899488926746, "step": 5660}, {"loss": 1.5082, "grad_norm": 0.7729013562202454, "learning_rate": 0.0002, "epoch": 3.8637137989778534, "step": 5670}, {"loss": 1.4608, "grad_norm": 0.6696794033050537, "learning_rate": 0.0002, "epoch": 3.8705281090289607, "step": 5680}, {"loss": 1.472, "grad_norm": 0.7151781320571899, "learning_rate": 0.0002, "epoch": 3.877342419080068, "step": 5690}, {"loss": 1.4923, "grad_norm": 0.6736966371536255, "learning_rate": 0.0002, "epoch": 3.8841567291311754, "step": 5700}, {"loss": 1.4453, "grad_norm": 0.7444243431091309, "learning_rate": 0.0002, "epoch": 3.8909710391822827, "step": 5710}, {"loss": 1.4562, "grad_norm": 0.6701464653015137, "learning_rate": 0.0002, "epoch": 3.89778534923339, "step": 5720}, {"loss": 1.4478, "grad_norm": 0.7231952548027039, "learning_rate": 0.0002, "epoch": 3.9045996592844974, "step": 5730}, {"loss": 1.4539, "grad_norm": 0.831954300403595, "learning_rate": 0.0002, "epoch": 3.9114139693356047, "step": 5740}, {"loss": 1.5122, "grad_norm": 0.7697733640670776, "learning_rate": 0.0002, "epoch": 3.918228279386712, "step": 5750}, {"loss": 1.4552, "grad_norm": 0.6964395046234131, "learning_rate": 0.0002, "epoch": 3.9250425894378194, "step": 5760}, {"loss": 1.4688, "grad_norm": 0.6942925453186035, "learning_rate": 0.0002, "epoch": 3.9318568994889267, "step": 5770}, {"loss": 1.4668, "grad_norm": 0.6491202712059021, "learning_rate": 0.0002, "epoch": 3.938671209540034, "step": 5780}, {"loss": 1.4404, "grad_norm": 0.7004382610321045, "learning_rate": 0.0002, "epoch": 3.9454855195911414, "step": 5790}, {"loss": 1.5022, "grad_norm": 0.7337747812271118, "learning_rate": 0.0002, "epoch": 3.9522998296422487, "step": 5800}, {"loss": 1.5314, "grad_norm": 0.6923640966415405, "learning_rate": 0.0002, "epoch": 3.959114139693356, "step": 5810}, {"loss": 1.4811, "grad_norm": 0.6815266609191895, "learning_rate": 0.0002, "epoch": 3.9659284497444633, "step": 5820}, {"loss": 1.437, "grad_norm": 0.6755654811859131, "learning_rate": 0.0002, "epoch": 3.9727427597955707, "step": 5830}, {"loss": 1.4277, "grad_norm": 0.6912487149238586, "learning_rate": 0.0002, "epoch": 3.979557069846678, "step": 5840}, {"loss": 1.4654, "grad_norm": 0.6948044896125793, "learning_rate": 0.0002, "epoch": 3.9863713798977853, "step": 5850}, {"loss": 1.4779, "grad_norm": 0.6735455989837646, "learning_rate": 0.0002, "epoch": 3.9931856899488927, "step": 5860}, {"loss": 1.5102, "grad_norm": 0.7005048990249634, "learning_rate": 0.0002, "epoch": 4.0, "step": 5870}, {"eval_loss": 1.923058032989502, "eval_runtime": 58.9903, "eval_samples_per_second": 8.595, "eval_steps_per_second": 1.085, "epoch": 4.0, "step": 5870}, {"loss": 1.2417, "grad_norm": 0.809018075466156, "learning_rate": 0.0002, "epoch": 4.006814310051107, "step": 5880}, {"loss": 1.2874, "grad_norm": 0.9499403238296509, "learning_rate": 0.0002, "epoch": 4.013628620102215, "step": 5890}, {"loss": 1.2245, "grad_norm": 0.7944574356079102, "learning_rate": 0.0002, "epoch": 4.0204429301533215, "step": 5900}, {"loss": 1.2751, "grad_norm": 0.9501046538352966, "learning_rate": 0.0002, "epoch": 4.027257240204429, "step": 5910}, {"loss": 1.2706, "grad_norm": 0.8247923254966736, "learning_rate": 0.0002, "epoch": 4.034071550255536, "step": 5920}, {"loss": 1.2762, "grad_norm": 0.9358038902282715, "learning_rate": 0.0002, "epoch": 4.040885860306644, "step": 5930}, {"loss": 1.2953, "grad_norm": 1.0102452039718628, "learning_rate": 0.0002, "epoch": 4.047700170357751, "step": 5940}, {"loss": 1.216, "grad_norm": 1.0248252153396606, "learning_rate": 0.0002, "epoch": 4.054514480408859, "step": 5950}, {"loss": 1.2115, "grad_norm": 1.0438553094863892, "learning_rate": 0.0002, "epoch": 4.0613287904599655, "step": 5960}, {"loss": 1.2516, "grad_norm": 0.7964957356452942, "learning_rate": 0.0002, "epoch": 4.068143100511073, "step": 5970}, {"loss": 1.1555, "grad_norm": 0.9757015109062195, "learning_rate": 0.0002, "epoch": 4.07495741056218, "step": 5980}, {"loss": 1.2243, "grad_norm": 0.9157161116600037, "learning_rate": 0.0002, "epoch": 4.081771720613288, "step": 5990}, {"loss": 1.2481, "grad_norm": 0.9372851848602295, "learning_rate": 0.0002, "epoch": 4.088586030664395, "step": 6000}, {"loss": 1.2091, "grad_norm": 1.240779995918274, "learning_rate": 0.0002, "epoch": 4.095400340715503, "step": 6010}, {"loss": 1.1727, "grad_norm": 0.8394840359687805, "learning_rate": 0.0002, "epoch": 4.1022146507666095, "step": 6020}, {"loss": 1.2926, "grad_norm": 1.1081455945968628, "learning_rate": 0.0002, "epoch": 4.109028960817717, "step": 6030}, {"loss": 1.2417, "grad_norm": 0.9227745532989502, "learning_rate": 0.0002, "epoch": 4.115843270868824, "step": 6040}, {"loss": 1.1994, "grad_norm": 0.8487664461135864, "learning_rate": 0.0002, "epoch": 4.122657580919932, "step": 6050}, {"loss": 1.2378, "grad_norm": 0.9643339514732361, "learning_rate": 0.0002, "epoch": 4.129471890971039, "step": 6060}, {"loss": 1.2254, "grad_norm": 1.0296099185943604, "learning_rate": 0.0002, "epoch": 4.136286201022147, "step": 6070}, {"loss": 1.2419, "grad_norm": 0.9534215927124023, "learning_rate": 0.0002, "epoch": 4.1431005110732535, "step": 6080}, {"loss": 1.1849, "grad_norm": 0.9647086262702942, "learning_rate": 0.0002, "epoch": 4.149914821124361, "step": 6090}, {"loss": 1.2713, "grad_norm": 1.084836721420288, "learning_rate": 0.0002, "epoch": 4.156729131175468, "step": 6100}, {"loss": 1.1788, "grad_norm": 0.9315235614776611, "learning_rate": 0.0002, "epoch": 4.163543441226576, "step": 6110}, {"loss": 1.17, "grad_norm": 0.9541679620742798, "learning_rate": 0.0002, "epoch": 4.170357751277683, "step": 6120}, {"loss": 1.1407, "grad_norm": 0.9792100191116333, "learning_rate": 0.0002, "epoch": 4.177172061328791, "step": 6130}, {"loss": 1.2069, "grad_norm": 1.065783143043518, "learning_rate": 0.0002, "epoch": 4.1839863713798975, "step": 6140}, {"loss": 1.2512, "grad_norm": 1.036161184310913, "learning_rate": 0.0002, "epoch": 4.190800681431005, "step": 6150}, {"loss": 1.2371, "grad_norm": 0.8979679942131042, "learning_rate": 0.0002, "epoch": 4.197614991482112, "step": 6160}, {"loss": 1.2212, "grad_norm": 0.7584333419799805, "learning_rate": 0.0002, "epoch": 4.20442930153322, "step": 6170}, {"loss": 1.2128, "grad_norm": 1.1970131397247314, "learning_rate": 0.0002, "epoch": 4.211243611584327, "step": 6180}, {"loss": 1.1982, "grad_norm": 2.6447298526763916, "learning_rate": 0.0002, "epoch": 4.218057921635435, "step": 6190}, {"loss": 1.2465, "grad_norm": 0.9357487559318542, "learning_rate": 0.0002, "epoch": 4.2248722316865415, "step": 6200}, {"loss": 1.2963, "grad_norm": 0.9141183495521545, "learning_rate": 0.0002, "epoch": 4.231686541737649, "step": 6210}, {"loss": 1.1959, "grad_norm": 1.0606296062469482, "learning_rate": 0.0002, "epoch": 4.238500851788756, "step": 6220}, {"loss": 1.2629, "grad_norm": 0.9999088048934937, "learning_rate": 0.0002, "epoch": 4.245315161839864, "step": 6230}, {"loss": 1.1471, "grad_norm": 0.9469764232635498, "learning_rate": 0.0002, "epoch": 4.252129471890971, "step": 6240}, {"loss": 1.223, "grad_norm": 1.1508198976516724, "learning_rate": 0.0002, "epoch": 4.258943781942079, "step": 6250}, {"loss": 1.2677, "grad_norm": 1.2576130628585815, "learning_rate": 0.0002, "epoch": 4.2657580919931855, "step": 6260}, {"loss": 1.2216, "grad_norm": 0.9435968399047852, "learning_rate": 0.0002, "epoch": 4.272572402044293, "step": 6270}, {"loss": 1.2788, "grad_norm": 0.9290348887443542, "learning_rate": 0.0002, "epoch": 4.2793867120954, "step": 6280}, {"loss": 1.2631, "grad_norm": 0.9973701238632202, "learning_rate": 0.0002, "epoch": 4.286201022146508, "step": 6290}, {"loss": 1.2276, "grad_norm": 1.012855887413025, "learning_rate": 0.0002, "epoch": 4.293015332197615, "step": 6300}, {"loss": 1.2115, "grad_norm": 0.8371705412864685, "learning_rate": 0.0002, "epoch": 4.2998296422487225, "step": 6310}, {"loss": 1.2423, "grad_norm": 1.0867925882339478, "learning_rate": 0.0002, "epoch": 4.306643952299829, "step": 6320}, {"loss": 1.2262, "grad_norm": 0.9763767123222351, "learning_rate": 0.0002, "epoch": 4.313458262350937, "step": 6330}, {"loss": 1.2557, "grad_norm": 1.1844252347946167, "learning_rate": 0.0002, "epoch": 4.320272572402044, "step": 6340}, {"loss": 1.2635, "grad_norm": 0.8292830586433411, "learning_rate": 0.0002, "epoch": 4.327086882453152, "step": 6350}, {"loss": 1.262, "grad_norm": 0.9351436495780945, "learning_rate": 0.0002, "epoch": 4.333901192504259, "step": 6360}, {"loss": 1.2678, "grad_norm": 1.0425835847854614, "learning_rate": 0.0002, "epoch": 4.3407155025553665, "step": 6370}, {"loss": 1.2476, "grad_norm": 0.8894261121749878, "learning_rate": 0.0002, "epoch": 4.347529812606473, "step": 6380}, {"loss": 1.2965, "grad_norm": 0.9663366079330444, "learning_rate": 0.0002, "epoch": 4.354344122657581, "step": 6390}, {"loss": 1.2529, "grad_norm": 0.8915578126907349, "learning_rate": 0.0002, "epoch": 4.361158432708688, "step": 6400}, {"loss": 1.2573, "grad_norm": 1.0393000841140747, "learning_rate": 0.0002, "epoch": 4.367972742759796, "step": 6410}, {"loss": 1.2254, "grad_norm": 0.917398989200592, "learning_rate": 0.0002, "epoch": 4.374787052810903, "step": 6420}, {"loss": 1.3115, "grad_norm": 1.0496646165847778, "learning_rate": 0.0002, "epoch": 4.3816013628620105, "step": 6430}, {"loss": 1.2607, "grad_norm": 0.9349859356880188, "learning_rate": 0.0002, "epoch": 4.388415672913117, "step": 6440}, {"loss": 1.3414, "grad_norm": 1.0981004238128662, "learning_rate": 0.0002, "epoch": 4.395229982964225, "step": 6450}, {"loss": 1.2391, "grad_norm": 0.9794871807098389, "learning_rate": 0.0002, "epoch": 4.402044293015332, "step": 6460}, {"loss": 1.208, "grad_norm": 0.9321421384811401, "learning_rate": 0.0002, "epoch": 4.40885860306644, "step": 6470}, {"loss": 1.3398, "grad_norm": 0.9158342480659485, "learning_rate": 0.0002, "epoch": 4.415672913117547, "step": 6480}, {"loss": 1.1832, "grad_norm": 0.9462087750434875, "learning_rate": 0.0002, "epoch": 4.4224872231686545, "step": 6490}, {"loss": 1.2366, "grad_norm": 0.9740175604820251, "learning_rate": 0.0002, "epoch": 4.429301533219761, "step": 6500}, {"loss": 1.3074, "grad_norm": 0.8477463126182556, "learning_rate": 0.0002, "epoch": 4.436115843270869, "step": 6510}, {"loss": 1.2719, "grad_norm": 1.0296647548675537, "learning_rate": 0.0002, "epoch": 4.442930153321976, "step": 6520}, {"loss": 1.2647, "grad_norm": 0.9437751173973083, "learning_rate": 0.0002, "epoch": 4.449744463373084, "step": 6530}, {"loss": 1.2043, "grad_norm": 1.011192798614502, "learning_rate": 0.0002, "epoch": 4.456558773424191, "step": 6540}, {"loss": 1.3673, "grad_norm": 0.8836222290992737, "learning_rate": 0.0002, "epoch": 4.4633730834752985, "step": 6550}, {"loss": 1.3028, "grad_norm": 1.2799941301345825, "learning_rate": 0.0002, "epoch": 4.470187393526405, "step": 6560}, {"loss": 1.2789, "grad_norm": 0.925910472869873, "learning_rate": 0.0002, "epoch": 4.477001703577513, "step": 6570}, {"loss": 1.2723, "grad_norm": 0.957401692867279, "learning_rate": 0.0002, "epoch": 4.48381601362862, "step": 6580}, {"loss": 1.242, "grad_norm": 1.0789544582366943, "learning_rate": 0.0002, "epoch": 4.490630323679728, "step": 6590}, {"loss": 1.2553, "grad_norm": 0.8874586820602417, "learning_rate": 0.0002, "epoch": 4.497444633730835, "step": 6600}, {"loss": 1.2779, "grad_norm": 0.9394784569740295, "learning_rate": 0.0002, "epoch": 4.504258943781942, "step": 6610}, {"loss": 1.2744, "grad_norm": 1.029640793800354, "learning_rate": 0.0002, "epoch": 4.511073253833049, "step": 6620}, {"loss": 1.2634, "grad_norm": 0.9510841965675354, "learning_rate": 0.0002, "epoch": 4.517887563884157, "step": 6630}, {"loss": 1.2562, "grad_norm": 0.9992963671684265, "learning_rate": 0.0002, "epoch": 4.524701873935264, "step": 6640}, {"loss": 1.2942, "grad_norm": 0.9312878847122192, "learning_rate": 0.0002, "epoch": 4.531516183986371, "step": 6650}, {"loss": 1.2572, "grad_norm": 0.9406482577323914, "learning_rate": 0.0002, "epoch": 4.538330494037479, "step": 6660}, {"loss": 1.2283, "grad_norm": 1.1058286428451538, "learning_rate": 0.0002, "epoch": 4.5451448040885865, "step": 6670}, {"loss": 1.2391, "grad_norm": 0.9389635920524597, "learning_rate": 0.0002, "epoch": 4.551959114139693, "step": 6680}, {"loss": 1.2696, "grad_norm": 1.0356028079986572, "learning_rate": 0.0002, "epoch": 4.5587734241908, "step": 6690}, {"loss": 1.2935, "grad_norm": 0.9370909929275513, "learning_rate": 0.0002, "epoch": 4.565587734241908, "step": 6700}, {"loss": 1.2914, "grad_norm": 0.9917567372322083, "learning_rate": 0.0002, "epoch": 4.572402044293016, "step": 6710}, {"loss": 1.3318, "grad_norm": 0.9065384864807129, "learning_rate": 0.0002, "epoch": 4.579216354344123, "step": 6720}, {"loss": 1.2909, "grad_norm": 1.3347833156585693, "learning_rate": 0.0002, "epoch": 4.5860306643952296, "step": 6730}, {"loss": 1.3322, "grad_norm": 0.910632312297821, "learning_rate": 0.0002, "epoch": 4.592844974446337, "step": 6740}, {"loss": 1.2584, "grad_norm": 0.8874805569648743, "learning_rate": 0.0002, "epoch": 4.599659284497445, "step": 6750}, {"loss": 1.3173, "grad_norm": 0.9355664253234863, "learning_rate": 0.0002, "epoch": 4.606473594548552, "step": 6760}, {"loss": 1.3515, "grad_norm": 0.9360204339027405, "learning_rate": 0.0002, "epoch": 4.613287904599659, "step": 6770}, {"loss": 1.2326, "grad_norm": 0.9931750893592834, "learning_rate": 0.0002, "epoch": 4.620102214650767, "step": 6780}, {"loss": 1.2677, "grad_norm": 0.9195131063461304, "learning_rate": 0.0002, "epoch": 4.626916524701874, "step": 6790}, {"loss": 1.3417, "grad_norm": 0.9448373913764954, "learning_rate": 0.0002, "epoch": 4.633730834752981, "step": 6800}, {"loss": 1.2658, "grad_norm": 1.162890911102295, "learning_rate": 0.0002, "epoch": 4.640545144804088, "step": 6810}, {"loss": 1.2841, "grad_norm": 0.9739466905593872, "learning_rate": 0.0002, "epoch": 4.647359454855196, "step": 6820}, {"loss": 1.3068, "grad_norm": 0.9462909698486328, "learning_rate": 0.0002, "epoch": 4.654173764906303, "step": 6830}, {"loss": 1.284, "grad_norm": 1.042639970779419, "learning_rate": 0.0002, "epoch": 4.660988074957411, "step": 6840}, {"loss": 1.3337, "grad_norm": 0.8910539150238037, "learning_rate": 0.0002, "epoch": 4.6678023850085175, "step": 6850}, {"loss": 1.3025, "grad_norm": 1.0806447267532349, "learning_rate": 0.0002, "epoch": 4.674616695059625, "step": 6860}, {"loss": 1.2258, "grad_norm": 1.0054864883422852, "learning_rate": 0.0002, "epoch": 4.681431005110732, "step": 6870}, {"loss": 1.3261, "grad_norm": 0.7774158120155334, "learning_rate": 0.0002, "epoch": 4.68824531516184, "step": 6880}, {"loss": 1.2545, "grad_norm": 0.9729512333869934, "learning_rate": 0.0002, "epoch": 4.695059625212947, "step": 6890}, {"loss": 1.3251, "grad_norm": 1.2025411128997803, "learning_rate": 0.0002, "epoch": 4.701873935264055, "step": 6900}, {"loss": 1.3418, "grad_norm": 1.1654069423675537, "learning_rate": 0.0002, "epoch": 4.7086882453151615, "step": 6910}, {"loss": 1.3091, "grad_norm": 1.1501442193984985, "learning_rate": 0.0002, "epoch": 4.715502555366269, "step": 6920}, {"loss": 1.2627, "grad_norm": 1.1083979606628418, "learning_rate": 0.0002, "epoch": 4.722316865417376, "step": 6930}, {"loss": 1.2836, "grad_norm": 0.9431378841400146, "learning_rate": 0.0002, "epoch": 4.729131175468484, "step": 6940}, {"loss": 1.3381, "grad_norm": 0.9722502827644348, "learning_rate": 0.0002, "epoch": 4.735945485519591, "step": 6950}, {"loss": 1.3228, "grad_norm": 0.9094559550285339, "learning_rate": 0.0002, "epoch": 4.742759795570699, "step": 6960}, {"loss": 1.3474, "grad_norm": 0.9918473958969116, "learning_rate": 0.0002, "epoch": 4.7495741056218055, "step": 6970}, {"loss": 1.3352, "grad_norm": 0.9999690651893616, "learning_rate": 0.0002, "epoch": 4.756388415672913, "step": 6980}, {"loss": 1.3579, "grad_norm": 1.0453810691833496, "learning_rate": 0.0002, "epoch": 4.76320272572402, "step": 6990}, {"loss": 1.294, "grad_norm": 1.0167806148529053, "learning_rate": 0.0002, "epoch": 4.770017035775128, "step": 7000}, {"loss": 1.3247, "grad_norm": 0.8133894801139832, "learning_rate": 0.0002, "epoch": 4.776831345826235, "step": 7010}, {"loss": 1.2577, "grad_norm": 0.8000897765159607, "learning_rate": 0.0002, "epoch": 4.783645655877343, "step": 7020}, {"loss": 1.2802, "grad_norm": 0.992080569267273, "learning_rate": 0.0002, "epoch": 4.7904599659284495, "step": 7030}, {"loss": 1.3269, "grad_norm": 0.9824522137641907, "learning_rate": 0.0002, "epoch": 4.797274275979557, "step": 7040}, {"loss": 1.279, "grad_norm": 0.9808870553970337, "learning_rate": 0.0002, "epoch": 4.804088586030664, "step": 7050}, {"loss": 1.3342, "grad_norm": 0.9679701924324036, "learning_rate": 0.0002, "epoch": 4.810902896081772, "step": 7060}, {"loss": 1.2711, "grad_norm": 0.9895215034484863, "learning_rate": 0.0002, "epoch": 4.817717206132879, "step": 7070}, {"loss": 1.3008, "grad_norm": 1.052246332168579, "learning_rate": 0.0002, "epoch": 4.824531516183987, "step": 7080}, {"loss": 1.2874, "grad_norm": 0.9243564605712891, "learning_rate": 0.0002, "epoch": 4.8313458262350935, "step": 7090}, {"loss": 1.2835, "grad_norm": 0.9545369744300842, "learning_rate": 0.0002, "epoch": 4.838160136286201, "step": 7100}, {"loss": 1.31, "grad_norm": 0.9655884504318237, "learning_rate": 0.0002, "epoch": 4.844974446337308, "step": 7110}, {"loss": 1.2862, "grad_norm": 0.9708049893379211, "learning_rate": 0.0002, "epoch": 4.851788756388416, "step": 7120}, {"loss": 1.3425, "grad_norm": 1.0064880847930908, "learning_rate": 0.0002, "epoch": 4.858603066439523, "step": 7130}, {"loss": 1.2899, "grad_norm": 0.939943790435791, "learning_rate": 0.0002, "epoch": 4.8654173764906306, "step": 7140}, {"loss": 1.2887, "grad_norm": 1.0750784873962402, "learning_rate": 0.0002, "epoch": 4.872231686541737, "step": 7150}, {"loss": 1.3367, "grad_norm": 0.9708989262580872, "learning_rate": 0.0002, "epoch": 4.879045996592845, "step": 7160}, {"loss": 1.2797, "grad_norm": 1.0228253602981567, "learning_rate": 0.0002, "epoch": 4.885860306643952, "step": 7170}, {"loss": 1.2695, "grad_norm": 0.8963132500648499, "learning_rate": 0.0002, "epoch": 4.89267461669506, "step": 7180}, {"loss": 1.3473, "grad_norm": 0.9198015928268433, "learning_rate": 0.0002, "epoch": 4.899488926746167, "step": 7190}, {"loss": 1.2541, "grad_norm": 1.099906086921692, "learning_rate": 0.0002, "epoch": 4.9063032367972745, "step": 7200}, {"loss": 1.3188, "grad_norm": 1.0624815225601196, "learning_rate": 0.0002, "epoch": 4.913117546848381, "step": 7210}, {"loss": 1.3026, "grad_norm": 0.9688444137573242, "learning_rate": 0.0002, "epoch": 4.919931856899489, "step": 7220}, {"loss": 1.3379, "grad_norm": 0.867011547088623, "learning_rate": 0.0002, "epoch": 4.926746166950596, "step": 7230}, {"loss": 1.289, "grad_norm": 0.9600282311439514, "learning_rate": 0.0002, "epoch": 4.933560477001704, "step": 7240}, {"loss": 1.2751, "grad_norm": 0.8979372978210449, "learning_rate": 0.0002, "epoch": 4.940374787052811, "step": 7250}, {"loss": 1.3426, "grad_norm": 0.951474130153656, "learning_rate": 0.0002, "epoch": 4.9471890971039185, "step": 7260}, {"loss": 1.2726, "grad_norm": 0.824851393699646, "learning_rate": 0.0002, "epoch": 4.954003407155025, "step": 7270}, {"loss": 1.2679, "grad_norm": 1.2926591634750366, "learning_rate": 0.0002, "epoch": 4.960817717206133, "step": 7280}, {"loss": 1.2974, "grad_norm": 1.1057835817337036, "learning_rate": 0.0002, "epoch": 4.96763202725724, "step": 7290}, {"loss": 1.2275, "grad_norm": 0.9814816117286682, "learning_rate": 0.0002, "epoch": 4.974446337308348, "step": 7300}, {"loss": 1.3001, "grad_norm": 1.0251333713531494, "learning_rate": 0.0002, "epoch": 4.981260647359455, "step": 7310}, {"loss": 1.3113, "grad_norm": 0.9748668074607849, "learning_rate": 0.0002, "epoch": 4.9880749574105625, "step": 7320}, {"loss": 1.3595, "grad_norm": 0.8552228808403015, "learning_rate": 0.0002, "epoch": 4.994889267461669, "step": 7330}, {"eval_loss": 2.03971004486084, "eval_runtime": 67.4144, "eval_samples_per_second": 7.521, "eval_steps_per_second": 0.949, "epoch": 4.999659284497445, "step": 7337}, {"loss": 1.2464, "grad_norm": 0.8210785388946533, "learning_rate": 0.0002, "epoch": 5.001703577512777, "step": 7340}, {"loss": 1.0356, "grad_norm": 1.2577511072158813, "learning_rate": 0.0002, "epoch": 5.008517887563884, "step": 7350}, {"loss": 0.9944, "grad_norm": 1.280604362487793, "learning_rate": 0.0002, "epoch": 5.015332197614992, "step": 7360}, {"loss": 1.0858, "grad_norm": 1.3985474109649658, "learning_rate": 0.0002, "epoch": 5.022146507666099, "step": 7370}, {"loss": 1.0122, "grad_norm": 1.1621310710906982, "learning_rate": 0.0002, "epoch": 5.0289608177172065, "step": 7380}, {"loss": 1.05, "grad_norm": 1.3278541564941406, "learning_rate": 0.0002, "epoch": 5.035775127768313, "step": 7390}, {"loss": 1.0237, "grad_norm": 1.1166491508483887, "learning_rate": 0.0002, "epoch": 5.042589437819421, "step": 7400}, {"loss": 1.0397, "grad_norm": 1.8087667226791382, "learning_rate": 0.0002, "epoch": 5.049403747870528, "step": 7410}, {"loss": 1.0191, "grad_norm": 1.1517921686172485, "learning_rate": 0.0002, "epoch": 5.056218057921636, "step": 7420}, {"loss": 1.025, "grad_norm": 1.2875889539718628, "learning_rate": 0.0002, "epoch": 5.063032367972743, "step": 7430}, {"loss": 1.043, "grad_norm": 1.199702262878418, "learning_rate": 0.0002, "epoch": 5.0698466780238505, "step": 7440}, {"loss": 1.0176, "grad_norm": 1.2912452220916748, "learning_rate": 0.0002, "epoch": 5.076660988074957, "step": 7450}, {"loss": 1.0042, "grad_norm": 1.1446452140808105, "learning_rate": 0.0002, "epoch": 5.083475298126065, "step": 7460}, {"loss": 1.047, "grad_norm": 1.3625746965408325, "learning_rate": 0.0002, "epoch": 5.090289608177172, "step": 7470}, {"loss": 1.052, "grad_norm": 1.2116546630859375, "learning_rate": 0.0002, "epoch": 5.09710391822828, "step": 7480}, {"loss": 1.1041, "grad_norm": 1.3896098136901855, "learning_rate": 0.0002, "epoch": 5.103918228279387, "step": 7490}, {"loss": 1.0668, "grad_norm": 1.6265277862548828, "learning_rate": 0.0002, "epoch": 5.1107325383304945, "step": 7500}, {"loss": 1.028, "grad_norm": 1.1468392610549927, "learning_rate": 0.0002, "epoch": 5.117546848381601, "step": 7510}, {"loss": 0.9915, "grad_norm": 1.2649329900741577, "learning_rate": 0.0002, "epoch": 5.124361158432709, "step": 7520}, {"loss": 1.0251, "grad_norm": 1.1866015195846558, "learning_rate": 0.0002, "epoch": 5.131175468483816, "step": 7530}, {"loss": 1.0626, "grad_norm": 1.1517255306243896, "learning_rate": 0.0002, "epoch": 5.137989778534923, "step": 7540}, {"loss": 1.0303, "grad_norm": 1.3475146293640137, "learning_rate": 0.0002, "epoch": 5.144804088586031, "step": 7550}, {"loss": 1.0456, "grad_norm": 1.1167018413543701, "learning_rate": 0.0002, "epoch": 5.151618398637138, "step": 7560}, {"loss": 1.04, "grad_norm": 1.209572434425354, "learning_rate": 0.0002, "epoch": 5.158432708688245, "step": 7570}, {"loss": 1.0533, "grad_norm": 1.3578280210494995, "learning_rate": 0.0002, "epoch": 5.165247018739352, "step": 7580}, {"loss": 1.0958, "grad_norm": 1.2447012662887573, "learning_rate": 0.0002, "epoch": 5.17206132879046, "step": 7590}, {"loss": 1.0521, "grad_norm": 1.3715848922729492, "learning_rate": 0.0002, "epoch": 5.178875638841567, "step": 7600}, {"loss": 1.0556, "grad_norm": 1.435860276222229, "learning_rate": 0.0002, "epoch": 5.185689948892675, "step": 7610}, {"loss": 1.0504, "grad_norm": 1.4093858003616333, "learning_rate": 0.0002, "epoch": 5.1925042589437815, "step": 7620}, {"loss": 1.083, "grad_norm": 1.1747535467147827, "learning_rate": 0.0002, "epoch": 5.199318568994889, "step": 7630}, {"loss": 1.048, "grad_norm": 1.4704833030700684, "learning_rate": 0.0002, "epoch": 5.206132879045996, "step": 7640}, {"loss": 0.9991, "grad_norm": 1.2270972728729248, "learning_rate": 0.0002, "epoch": 5.212947189097104, "step": 7650}, {"loss": 1.0738, "grad_norm": 1.2215691804885864, "learning_rate": 0.0002, "epoch": 5.219761499148211, "step": 7660}, {"loss": 1.0628, "grad_norm": 1.3641486167907715, "learning_rate": 0.0002, "epoch": 5.226575809199319, "step": 7670}, {"loss": 1.1066, "grad_norm": 1.3532041311264038, "learning_rate": 0.0002, "epoch": 5.2333901192504255, "step": 7680}, {"loss": 1.0209, "grad_norm": 1.2243095636367798, "learning_rate": 0.0002, "epoch": 5.240204429301533, "step": 7690}, {"loss": 1.0503, "grad_norm": 1.3644746541976929, "learning_rate": 0.0002, "epoch": 5.24701873935264, "step": 7700}, {"loss": 1.0406, "grad_norm": 1.18478262424469, "learning_rate": 0.0002, "epoch": 5.253833049403748, "step": 7710}, {"loss": 1.1023, "grad_norm": 1.2146114110946655, "learning_rate": 0.0002, "epoch": 5.260647359454855, "step": 7720}, {"loss": 1.1528, "grad_norm": 1.233984112739563, "learning_rate": 0.0002, "epoch": 5.267461669505963, "step": 7730}, {"loss": 1.0681, "grad_norm": 1.3709665536880493, "learning_rate": 0.0002, "epoch": 5.2742759795570695, "step": 7740}, {"loss": 1.0195, "grad_norm": 1.36055326461792, "learning_rate": 0.0002, "epoch": 5.281090289608177, "step": 7750}, {"loss": 1.0447, "grad_norm": 1.6232351064682007, "learning_rate": 0.0002, "epoch": 5.287904599659284, "step": 7760}, {"loss": 1.0627, "grad_norm": 1.3359960317611694, "learning_rate": 0.0002, "epoch": 5.294718909710392, "step": 7770}, {"loss": 1.1082, "grad_norm": 1.3815656900405884, "learning_rate": 0.0002, "epoch": 5.301533219761499, "step": 7780}, {"loss": 1.0891, "grad_norm": 1.1392076015472412, "learning_rate": 0.0002, "epoch": 5.308347529812607, "step": 7790}, {"loss": 1.0364, "grad_norm": 1.3006905317306519, "learning_rate": 0.0002, "epoch": 5.3151618398637135, "step": 7800}, {"loss": 1.1005, "grad_norm": 1.503645896911621, "learning_rate": 0.0002, "epoch": 5.321976149914821, "step": 7810}, {"loss": 1.0075, "grad_norm": 1.141939640045166, "learning_rate": 0.0002, "epoch": 5.328790459965928, "step": 7820}, {"loss": 1.0284, "grad_norm": 1.4654004573822021, "learning_rate": 0.0002, "epoch": 5.335604770017036, "step": 7830}, {"loss": 1.1185, "grad_norm": 1.4195219278335571, "learning_rate": 0.0002, "epoch": 5.342419080068143, "step": 7840}, {"loss": 1.0535, "grad_norm": 1.2354168891906738, "learning_rate": 0.0002, "epoch": 5.349233390119251, "step": 7850}, {"loss": 1.0923, "grad_norm": 1.529862880706787, "learning_rate": 0.0002, "epoch": 5.3560477001703575, "step": 7860}, {"loss": 1.1005, "grad_norm": 1.364678978919983, "learning_rate": 0.0002, "epoch": 5.362862010221465, "step": 7870}, {"loss": 1.1084, "grad_norm": 1.1010444164276123, "learning_rate": 0.0002, "epoch": 5.369676320272572, "step": 7880}, {"loss": 1.1225, "grad_norm": 1.1949712038040161, "learning_rate": 0.0002, "epoch": 5.37649063032368, "step": 7890}, {"loss": 1.058, "grad_norm": 1.485922932624817, "learning_rate": 0.0002, "epoch": 5.383304940374787, "step": 7900}, {"loss": 0.9894, "grad_norm": 1.0844227075576782, "learning_rate": 0.0002, "epoch": 5.390119250425895, "step": 7910}, {"loss": 1.0418, "grad_norm": 1.3784468173980713, "learning_rate": 0.0002, "epoch": 5.3969335604770015, "step": 7920}, {"loss": 1.0542, "grad_norm": 1.4771490097045898, "learning_rate": 0.0002, "epoch": 5.403747870528109, "step": 7930}, {"loss": 1.1265, "grad_norm": 1.2460103034973145, "learning_rate": 0.0002, "epoch": 5.410562180579216, "step": 7940}, {"loss": 1.096, "grad_norm": 1.3047645092010498, "learning_rate": 0.0002, "epoch": 5.417376490630324, "step": 7950}, {"loss": 1.0956, "grad_norm": 1.1396620273590088, "learning_rate": 0.0002, "epoch": 5.424190800681431, "step": 7960}, {"loss": 1.0685, "grad_norm": 1.4193450212478638, "learning_rate": 0.0002, "epoch": 5.4310051107325386, "step": 7970}, {"loss": 1.1347, "grad_norm": 1.2085850238800049, "learning_rate": 0.0002, "epoch": 5.437819420783645, "step": 7980}, {"loss": 1.0277, "grad_norm": 1.2721607685089111, "learning_rate": 0.0002, "epoch": 5.444633730834753, "step": 7990}, {"loss": 1.1316, "grad_norm": 1.4134020805358887, "learning_rate": 0.0002, "epoch": 5.45144804088586, "step": 8000}, {"loss": 1.0576, "grad_norm": 1.4283325672149658, "learning_rate": 0.0002, "epoch": 5.458262350936968, "step": 8010}, {"loss": 1.0505, "grad_norm": 1.3127079010009766, "learning_rate": 0.0002, "epoch": 5.465076660988075, "step": 8020}, {"loss": 1.0812, "grad_norm": 1.2924352884292603, "learning_rate": 0.0002, "epoch": 5.4718909710391825, "step": 8030}, {"loss": 1.1178, "grad_norm": 1.8000653982162476, "learning_rate": 0.0002, "epoch": 5.478705281090289, "step": 8040}, {"loss": 1.1205, "grad_norm": 1.1538785696029663, "learning_rate": 0.0002, "epoch": 5.485519591141397, "step": 8050}, {"loss": 1.1015, "grad_norm": 1.1173290014266968, "learning_rate": 0.0002, "epoch": 5.492333901192504, "step": 8060}, {"loss": 1.1597, "grad_norm": 1.1501243114471436, "learning_rate": 0.0002, "epoch": 5.499148211243612, "step": 8070}, {"loss": 1.1465, "grad_norm": 1.1335760354995728, "learning_rate": 0.0002, "epoch": 5.505962521294719, "step": 8080}, {"loss": 1.1005, "grad_norm": 1.565274953842163, "learning_rate": 0.0002, "epoch": 5.5127768313458265, "step": 8090}, {"loss": 1.1085, "grad_norm": 1.3415014743804932, "learning_rate": 0.0002, "epoch": 5.519591141396933, "step": 8100}, {"loss": 1.1166, "grad_norm": 1.2377240657806396, "learning_rate": 0.0002, "epoch": 5.526405451448041, "step": 8110}, {"loss": 1.0766, "grad_norm": 1.3333637714385986, "learning_rate": 0.0002, "epoch": 5.533219761499148, "step": 8120}, {"loss": 1.1515, "grad_norm": 1.2620662450790405, "learning_rate": 0.0002, "epoch": 5.540034071550256, "step": 8130}, {"loss": 1.0839, "grad_norm": 1.2806652784347534, "learning_rate": 0.0002, "epoch": 5.546848381601363, "step": 8140}, {"loss": 1.1221, "grad_norm": 1.2057335376739502, "learning_rate": 0.0002, "epoch": 5.5536626916524705, "step": 8150}, {"loss": 1.1292, "grad_norm": 1.411726951599121, "learning_rate": 0.0002, "epoch": 5.560477001703577, "step": 8160}, {"loss": 1.0887, "grad_norm": 1.381104588508606, "learning_rate": 0.0002, "epoch": 5.567291311754685, "step": 8170}, {"loss": 1.1317, "grad_norm": 1.3449294567108154, "learning_rate": 0.0002, "epoch": 5.574105621805792, "step": 8180}, {"loss": 1.1392, "grad_norm": 1.2791016101837158, "learning_rate": 0.0002, "epoch": 5.5809199318569, "step": 8190}, {"loss": 1.0972, "grad_norm": 1.276891827583313, "learning_rate": 0.0002, "epoch": 5.587734241908007, "step": 8200}, {"loss": 1.1001, "grad_norm": 1.3951541185379028, "learning_rate": 0.0002, "epoch": 5.5945485519591145, "step": 8210}, {"loss": 1.0993, "grad_norm": 1.4167890548706055, "learning_rate": 0.0002, "epoch": 5.601362862010221, "step": 8220}, {"loss": 1.0826, "grad_norm": 1.4388375282287598, "learning_rate": 0.0002, "epoch": 5.608177172061329, "step": 8230}, {"loss": 1.1941, "grad_norm": 1.210157036781311, "learning_rate": 0.0002, "epoch": 5.614991482112436, "step": 8240}, {"loss": 1.0833, "grad_norm": 1.0557862520217896, "learning_rate": 0.0002, "epoch": 5.621805792163544, "step": 8250}, {"loss": 1.1197, "grad_norm": 1.2913990020751953, "learning_rate": 0.0002, "epoch": 5.628620102214651, "step": 8260}, {"loss": 1.0346, "grad_norm": 1.2204737663269043, "learning_rate": 0.0002, "epoch": 5.6354344122657585, "step": 8270}, {"loss": 1.1429, "grad_norm": 1.57016921043396, "learning_rate": 0.0002, "epoch": 5.642248722316865, "step": 8280}, {"loss": 1.0988, "grad_norm": 1.0117967128753662, "learning_rate": 0.0002, "epoch": 5.649063032367973, "step": 8290}, {"loss": 1.0786, "grad_norm": 1.3195525407791138, "learning_rate": 0.0002, "epoch": 5.65587734241908, "step": 8300}, {"loss": 1.0618, "grad_norm": 1.2566497325897217, "learning_rate": 0.0002, "epoch": 5.662691652470187, "step": 8310}, {"loss": 1.1635, "grad_norm": 1.1446818113327026, "learning_rate": 0.0002, "epoch": 5.669505962521295, "step": 8320}, {"loss": 1.2201, "grad_norm": 1.2928680181503296, "learning_rate": 0.0002, "epoch": 5.6763202725724025, "step": 8330}, {"loss": 1.1488, "grad_norm": 1.2823996543884277, "learning_rate": 0.0002, "epoch": 5.683134582623509, "step": 8340}, {"loss": 1.0686, "grad_norm": 1.1523874998092651, "learning_rate": 0.0002, "epoch": 5.689948892674616, "step": 8350}, {"loss": 1.0938, "grad_norm": 1.0819287300109863, "learning_rate": 0.0002, "epoch": 5.696763202725724, "step": 8360}, {"loss": 1.167, "grad_norm": 1.2384417057037354, "learning_rate": 0.0002, "epoch": 5.703577512776832, "step": 8370}, {"loss": 1.1136, "grad_norm": 1.1733224391937256, "learning_rate": 0.0002, "epoch": 5.710391822827939, "step": 8380}, {"loss": 1.1041, "grad_norm": 1.3173418045043945, "learning_rate": 0.0002, "epoch": 5.7172061328790456, "step": 8390}, {"loss": 1.1014, "grad_norm": 1.285880446434021, "learning_rate": 0.0002, "epoch": 5.724020442930153, "step": 8400}, {"loss": 1.1161, "grad_norm": 1.1404874324798584, "learning_rate": 0.0002, "epoch": 5.730834752981261, "step": 8410}, {"loss": 1.192, "grad_norm": 1.2432540655136108, "learning_rate": 0.0002, "epoch": 5.737649063032368, "step": 8420}, {"loss": 1.1702, "grad_norm": 1.2432233095169067, "learning_rate": 0.0002, "epoch": 5.744463373083475, "step": 8430}, {"loss": 1.1357, "grad_norm": 1.154496669769287, "learning_rate": 0.0002, "epoch": 5.751277683134583, "step": 8440}, {"loss": 1.1706, "grad_norm": 1.3301030397415161, "learning_rate": 0.0002, "epoch": 5.75809199318569, "step": 8450}, {"loss": 1.2052, "grad_norm": 1.243760347366333, "learning_rate": 0.0002, "epoch": 5.764906303236797, "step": 8460}, {"loss": 1.1035, "grad_norm": 1.4083361625671387, "learning_rate": 0.0002, "epoch": 5.771720613287904, "step": 8470}, {"loss": 1.1362, "grad_norm": 1.5662120580673218, "learning_rate": 0.0002, "epoch": 5.778534923339012, "step": 8480}, {"loss": 1.1578, "grad_norm": 1.2111139297485352, "learning_rate": 0.0002, "epoch": 5.78534923339012, "step": 8490}, {"loss": 1.1333, "grad_norm": 1.2776305675506592, "learning_rate": 0.0002, "epoch": 5.792163543441227, "step": 8500}, {"loss": 1.1439, "grad_norm": 1.1777727603912354, "learning_rate": 0.0002, "epoch": 5.7989778534923335, "step": 8510}, {"loss": 1.0859, "grad_norm": 1.1696112155914307, "learning_rate": 0.0002, "epoch": 5.805792163543441, "step": 8520}, {"loss": 1.162, "grad_norm": 1.137397289276123, "learning_rate": 0.0002, "epoch": 5.812606473594548, "step": 8530}, {"loss": 1.2099, "grad_norm": 1.3182098865509033, "learning_rate": 0.0002, "epoch": 5.819420783645656, "step": 8540}, {"loss": 1.1427, "grad_norm": 1.359756588935852, "learning_rate": 0.0002, "epoch": 5.826235093696763, "step": 8550}, {"loss": 1.1714, "grad_norm": 1.4118162393569946, "learning_rate": 0.0002, "epoch": 5.833049403747871, "step": 8560}, {"loss": 1.1758, "grad_norm": 1.1899290084838867, "learning_rate": 0.0002, "epoch": 5.8398637137989775, "step": 8570}, {"loss": 1.1511, "grad_norm": 1.1764532327651978, "learning_rate": 0.0002, "epoch": 5.846678023850085, "step": 8580}, {"loss": 1.1633, "grad_norm": 1.33274245262146, "learning_rate": 0.0002, "epoch": 5.853492333901192, "step": 8590}, {"loss": 1.1092, "grad_norm": 1.2571861743927002, "learning_rate": 0.0002, "epoch": 5.8603066439523, "step": 8600}, {"loss": 1.1137, "grad_norm": 1.3523616790771484, "learning_rate": 0.0002, "epoch": 5.867120954003407, "step": 8610}, {"loss": 1.2442, "grad_norm": 1.3556902408599854, "learning_rate": 0.0002, "epoch": 5.873935264054515, "step": 8620}, {"loss": 1.0967, "grad_norm": 1.2864879369735718, "learning_rate": 0.0002, "epoch": 5.8807495741056215, "step": 8630}, {"loss": 1.1491, "grad_norm": 1.2872768640518188, "learning_rate": 0.0002, "epoch": 5.887563884156729, "step": 8640}, {"loss": 1.1003, "grad_norm": 1.1446053981781006, "learning_rate": 0.0002, "epoch": 5.894378194207836, "step": 8650}, {"loss": 1.1095, "grad_norm": 1.292615532875061, "learning_rate": 0.0002, "epoch": 5.901192504258944, "step": 8660}, {"loss": 1.2009, "grad_norm": 1.190891981124878, "learning_rate": 0.0002, "epoch": 5.908006814310051, "step": 8670}, {"loss": 1.1386, "grad_norm": 1.330273985862732, "learning_rate": 0.0002, "epoch": 5.914821124361159, "step": 8680}, {"loss": 1.1874, "grad_norm": 1.41121244430542, "learning_rate": 0.0002, "epoch": 5.9216354344122655, "step": 8690}, {"loss": 1.1573, "grad_norm": 1.1360729932785034, "learning_rate": 0.0002, "epoch": 5.928449744463373, "step": 8700}, {"loss": 1.115, "grad_norm": 1.2220772504806519, "learning_rate": 0.0002, "epoch": 5.93526405451448, "step": 8710}, {"loss": 1.1696, "grad_norm": 1.1077110767364502, "learning_rate": 0.0002, "epoch": 5.942078364565588, "step": 8720}, {"loss": 1.1443, "grad_norm": 1.3632500171661377, "learning_rate": 0.0002, "epoch": 5.948892674616695, "step": 8730}, {"loss": 1.1474, "grad_norm": 1.4695830345153809, "learning_rate": 0.0002, "epoch": 5.955706984667803, "step": 8740}, {"loss": 1.1825, "grad_norm": 1.217741847038269, "learning_rate": 0.0002, "epoch": 5.9625212947189095, "step": 8750}, {"loss": 1.1495, "grad_norm": 1.0386874675750732, "learning_rate": 0.0002, "epoch": 5.969335604770017, "step": 8760}, {"loss": 1.1146, "grad_norm": 1.2067872285842896, "learning_rate": 0.0002, "epoch": 5.976149914821124, "step": 8770}, {"loss": 1.1987, "grad_norm": 1.3842018842697144, "learning_rate": 0.0002, "epoch": 5.982964224872232, "step": 8780}, {"loss": 1.2147, "grad_norm": 1.4584033489227295, "learning_rate": 0.0002, "epoch": 5.989778534923339, "step": 8790}, {"loss": 1.2078, "grad_norm": 1.1912888288497925, "learning_rate": 0.0002, "epoch": 5.996592844974447, "step": 8800}, {"eval_loss": 2.261807441711426, "eval_runtime": 68.1125, "eval_samples_per_second": 7.444, "eval_steps_per_second": 0.94, "epoch": 6.0, "step": 8805}, {"loss": 1.004, "grad_norm": 1.1715940237045288, "learning_rate": 0.0002, "epoch": 6.003407155025553, "step": 8810}, {"loss": 0.8665, "grad_norm": 1.6573960781097412, "learning_rate": 0.0002, "epoch": 6.010221465076661, "step": 8820}, {"loss": 0.8866, "grad_norm": 1.2845953702926636, "learning_rate": 0.0002, "epoch": 6.017035775127768, "step": 8830}, {"loss": 0.8528, "grad_norm": 1.526754379272461, "learning_rate": 0.0002, "epoch": 6.023850085178876, "step": 8840}, {"loss": 0.8555, "grad_norm": 1.4536073207855225, "learning_rate": 0.0002, "epoch": 6.030664395229983, "step": 8850}, {"loss": 0.8839, "grad_norm": 1.68099045753479, "learning_rate": 0.0002, "epoch": 6.0374787052810905, "step": 8860}, {"loss": 0.8538, "grad_norm": 1.485777497291565, "learning_rate": 0.0002, "epoch": 6.044293015332197, "step": 8870}, {"loss": 0.8534, "grad_norm": 1.5084402561187744, "learning_rate": 0.0002, "epoch": 6.051107325383305, "step": 8880}, {"loss": 0.8587, "grad_norm": 1.3901145458221436, "learning_rate": 0.0002, "epoch": 6.057921635434412, "step": 8890}, {"loss": 0.8625, "grad_norm": 1.528954267501831, "learning_rate": 0.0002, "epoch": 6.06473594548552, "step": 8900}, {"loss": 0.9115, "grad_norm": 1.6869531869888306, "learning_rate": 0.0002, "epoch": 6.071550255536627, "step": 8910}, {"loss": 0.8817, "grad_norm": 1.4149913787841797, "learning_rate": 0.0002, "epoch": 6.0783645655877345, "step": 8920}, {"loss": 0.8734, "grad_norm": 1.6853618621826172, "learning_rate": 0.0002, "epoch": 6.085178875638841, "step": 8930}, {"loss": 0.8836, "grad_norm": 1.694443702697754, "learning_rate": 0.0002, "epoch": 6.091993185689949, "step": 8940}, {"loss": 0.9144, "grad_norm": 2.1037111282348633, "learning_rate": 0.0002, "epoch": 6.098807495741056, "step": 8950}, {"loss": 0.8296, "grad_norm": 2.1236703395843506, "learning_rate": 0.0002, "epoch": 6.105621805792164, "step": 8960}, {"loss": 0.8451, "grad_norm": 1.6621695756912231, "learning_rate": 0.0002, "epoch": 6.112436115843271, "step": 8970}, {"loss": 0.8423, "grad_norm": 1.5390307903289795, "learning_rate": 0.0002, "epoch": 6.1192504258943785, "step": 8980}, {"loss": 0.8829, "grad_norm": 1.7841306924819946, "learning_rate": 0.0002, "epoch": 6.126064735945485, "step": 8990}, {"loss": 0.8872, "grad_norm": 1.8420580625534058, "learning_rate": 0.0002, "epoch": 6.132879045996593, "step": 9000}, {"loss": 0.9411, "grad_norm": 1.8198356628417969, "learning_rate": 0.0002, "epoch": 6.1396933560477, "step": 9010}, {"loss": 0.8921, "grad_norm": 1.6955933570861816, "learning_rate": 0.0002, "epoch": 6.146507666098808, "step": 9020}, {"loss": 0.9241, "grad_norm": 1.5072602033615112, "learning_rate": 0.0002, "epoch": 6.153321976149915, "step": 9030}, {"loss": 0.8643, "grad_norm": 1.63434898853302, "learning_rate": 0.0002, "epoch": 6.1601362862010225, "step": 9040}, {"loss": 0.8317, "grad_norm": 1.3761866092681885, "learning_rate": 0.0002, "epoch": 6.166950596252129, "step": 9050}, {"loss": 0.8136, "grad_norm": 1.7027268409729004, "learning_rate": 0.0002, "epoch": 6.173764906303237, "step": 9060}, {"loss": 0.8333, "grad_norm": 1.3534049987792969, "learning_rate": 0.0002, "epoch": 6.180579216354344, "step": 9070}, {"loss": 0.847, "grad_norm": 1.4437154531478882, "learning_rate": 0.0002, "epoch": 6.187393526405452, "step": 9080}, {"loss": 0.9169, "grad_norm": 1.4449656009674072, "learning_rate": 0.0002, "epoch": 6.194207836456559, "step": 9090}, {"loss": 0.846, "grad_norm": 1.5854601860046387, "learning_rate": 0.0002, "epoch": 6.2010221465076665, "step": 9100}, {"loss": 0.8801, "grad_norm": 1.5987509489059448, "learning_rate": 0.0002, "epoch": 6.207836456558773, "step": 9110}, {"loss": 0.9077, "grad_norm": 1.6309672594070435, "learning_rate": 0.0002, "epoch": 6.214650766609881, "step": 9120}, {"loss": 0.8802, "grad_norm": 1.526936411857605, "learning_rate": 0.0002, "epoch": 6.221465076660988, "step": 9130}, {"loss": 0.8858, "grad_norm": 1.4649606943130493, "learning_rate": 0.0002, "epoch": 6.228279386712096, "step": 9140}, {"loss": 0.9414, "grad_norm": 1.589350700378418, "learning_rate": 0.0002, "epoch": 6.235093696763203, "step": 9150}, {"loss": 0.9001, "grad_norm": 1.655668020248413, "learning_rate": 0.0002, "epoch": 6.2419080068143105, "step": 9160}, {"loss": 0.9879, "grad_norm": 1.5296401977539062, "learning_rate": 0.0002, "epoch": 6.248722316865417, "step": 9170}, {"loss": 0.8908, "grad_norm": 1.5857278108596802, "learning_rate": 0.0002, "epoch": 6.255536626916525, "step": 9180}, {"loss": 0.9329, "grad_norm": 1.7779686450958252, "learning_rate": 0.0002, "epoch": 6.262350936967632, "step": 9190}, {"loss": 0.9683, "grad_norm": 1.588886022567749, "learning_rate": 0.0002, "epoch": 6.269165247018739, "step": 9200}, {"loss": 0.9091, "grad_norm": 1.3818320035934448, "learning_rate": 0.0002, "epoch": 6.275979557069847, "step": 9210}, {"loss": 0.9003, "grad_norm": 1.6675978899002075, "learning_rate": 0.0002, "epoch": 6.2827938671209544, "step": 9220}, {"loss": 0.9125, "grad_norm": 1.5672610998153687, "learning_rate": 0.0002, "epoch": 6.289608177172061, "step": 9230}, {"loss": 0.9083, "grad_norm": 1.4558004140853882, "learning_rate": 0.0002, "epoch": 6.296422487223168, "step": 9240}, {"loss": 0.9362, "grad_norm": 1.5393446683883667, "learning_rate": 0.0002, "epoch": 6.303236797274276, "step": 9250}, {"loss": 0.8807, "grad_norm": 1.4367083311080933, "learning_rate": 0.0002, "epoch": 6.310051107325384, "step": 9260}, {"loss": 0.9203, "grad_norm": 1.5045381784439087, "learning_rate": 0.0002, "epoch": 6.316865417376491, "step": 9270}, {"loss": 0.9239, "grad_norm": 1.8604016304016113, "learning_rate": 0.0002, "epoch": 6.3236797274275975, "step": 9280}, {"loss": 0.9644, "grad_norm": 1.4863131046295166, "learning_rate": 0.0002, "epoch": 6.330494037478705, "step": 9290}, {"loss": 0.9052, "grad_norm": 1.511121392250061, "learning_rate": 0.0002, "epoch": 6.337308347529812, "step": 9300}, {"loss": 0.8609, "grad_norm": 1.6979162693023682, "learning_rate": 0.0002, "epoch": 6.34412265758092, "step": 9310}, {"loss": 0.953, "grad_norm": 1.6060494184494019, "learning_rate": 0.0002, "epoch": 6.350936967632027, "step": 9320}, {"loss": 0.9552, "grad_norm": 1.6572561264038086, "learning_rate": 0.0002, "epoch": 6.357751277683135, "step": 9330}, {"loss": 0.9201, "grad_norm": 1.6706757545471191, "learning_rate": 0.0002, "epoch": 6.3645655877342415, "step": 9340}, {"loss": 0.8693, "grad_norm": 1.620836615562439, "learning_rate": 0.0002, "epoch": 6.371379897785349, "step": 9350}, {"loss": 0.9281, "grad_norm": 1.482940673828125, "learning_rate": 0.0002, "epoch": 6.378194207836456, "step": 9360}, {"loss": 0.9026, "grad_norm": 1.3969961404800415, "learning_rate": 0.0002, "epoch": 6.385008517887564, "step": 9370}, {"loss": 0.8909, "grad_norm": 1.611212134361267, "learning_rate": 0.0002, "epoch": 6.391822827938671, "step": 9380}, {"loss": 0.9137, "grad_norm": 1.5586223602294922, "learning_rate": 0.0002, "epoch": 6.398637137989779, "step": 9390}, {"loss": 0.9254, "grad_norm": 1.394761562347412, "learning_rate": 0.0002, "epoch": 6.4054514480408855, "step": 9400}, {"loss": 0.8935, "grad_norm": 1.559618592262268, "learning_rate": 0.0002, "epoch": 6.412265758091993, "step": 9410}, {"loss": 0.9585, "grad_norm": 1.462173581123352, "learning_rate": 0.0002, "epoch": 6.4190800681431, "step": 9420}, {"loss": 0.9492, "grad_norm": 1.5655437707901, "learning_rate": 0.0002, "epoch": 6.425894378194208, "step": 9430}, {"loss": 0.9371, "grad_norm": 1.4344340562820435, "learning_rate": 0.0002, "epoch": 6.432708688245315, "step": 9440}, {"loss": 0.9396, "grad_norm": 1.5132373571395874, "learning_rate": 0.0002, "epoch": 6.439522998296423, "step": 9450}, {"loss": 0.9229, "grad_norm": 1.68776535987854, "learning_rate": 0.0002, "epoch": 6.4463373083475295, "step": 9460}, {"loss": 0.9524, "grad_norm": 1.556823968887329, "learning_rate": 0.0002, "epoch": 6.453151618398637, "step": 9470}, {"loss": 0.94, "grad_norm": 1.4254260063171387, "learning_rate": 0.0002, "epoch": 6.459965928449744, "step": 9480}, {"loss": 0.9689, "grad_norm": 1.7901203632354736, "learning_rate": 0.0002, "epoch": 6.466780238500852, "step": 9490}, {"loss": 0.9267, "grad_norm": 1.5098410844802856, "learning_rate": 0.0002, "epoch": 6.473594548551959, "step": 9500}, {"loss": 0.9159, "grad_norm": 1.6036792993545532, "learning_rate": 0.0002, "epoch": 6.480408858603067, "step": 9510}, {"loss": 0.9253, "grad_norm": 1.5011411905288696, "learning_rate": 0.0002, "epoch": 6.4872231686541735, "step": 9520}, {"loss": 0.9527, "grad_norm": 1.410780906677246, "learning_rate": 0.0002, "epoch": 6.494037478705281, "step": 9530}, {"loss": 0.8927, "grad_norm": 1.7451791763305664, "learning_rate": 0.0002, "epoch": 6.500851788756388, "step": 9540}, {"loss": 0.9566, "grad_norm": 1.5888725519180298, "learning_rate": 0.0002, "epoch": 6.507666098807496, "step": 9550}, {"loss": 0.9324, "grad_norm": 1.3016585111618042, "learning_rate": 0.0002, "epoch": 6.514480408858603, "step": 9560}, {"loss": 0.9576, "grad_norm": 1.629522442817688, "learning_rate": 0.0002, "epoch": 6.521294718909711, "step": 9570}, {"loss": 0.92, "grad_norm": 1.494436264038086, "learning_rate": 0.0002, "epoch": 6.5281090289608175, "step": 9580}, {"loss": 0.9154, "grad_norm": 1.323195219039917, "learning_rate": 0.0002, "epoch": 6.534923339011925, "step": 9590}, {"loss": 0.9891, "grad_norm": 1.4904460906982422, "learning_rate": 0.0002, "epoch": 6.541737649063032, "step": 9600}, {"loss": 0.9316, "grad_norm": 1.6079169511795044, "learning_rate": 0.0002, "epoch": 6.54855195911414, "step": 9610}, {"loss": 1.0105, "grad_norm": 1.5113396644592285, "learning_rate": 0.0002, "epoch": 6.555366269165247, "step": 9620}, {"loss": 0.9618, "grad_norm": 1.7113087177276611, "learning_rate": 0.0002, "epoch": 6.562180579216355, "step": 9630}, {"loss": 0.9699, "grad_norm": 1.359394907951355, "learning_rate": 0.0002, "epoch": 6.5689948892674614, "step": 9640}, {"loss": 1.0267, "grad_norm": 1.7701337337493896, "learning_rate": 0.0002, "epoch": 6.575809199318569, "step": 9650}, {"loss": 0.9639, "grad_norm": 1.6381222009658813, "learning_rate": 0.0002, "epoch": 6.582623509369676, "step": 9660}, {"loss": 0.9292, "grad_norm": 1.781891942024231, "learning_rate": 0.0002, "epoch": 6.589437819420784, "step": 9670}, {"loss": 1.0078, "grad_norm": 1.47724449634552, "learning_rate": 0.0002, "epoch": 6.596252129471891, "step": 9680}, {"loss": 1.0268, "grad_norm": 1.5498195886611938, "learning_rate": 0.0002, "epoch": 6.6030664395229985, "step": 9690}, {"loss": 0.9794, "grad_norm": 1.5682368278503418, "learning_rate": 0.0002, "epoch": 6.609880749574105, "step": 9700}, {"loss": 0.9298, "grad_norm": 1.6106981039047241, "learning_rate": 0.0002, "epoch": 6.616695059625213, "step": 9710}, {"loss": 0.9644, "grad_norm": 1.5388364791870117, "learning_rate": 0.0002, "epoch": 6.62350936967632, "step": 9720}, {"loss": 0.9385, "grad_norm": 1.5432790517807007, "learning_rate": 0.0002, "epoch": 6.630323679727428, "step": 9730}, {"loss": 0.9995, "grad_norm": 1.4929786920547485, "learning_rate": 0.0002, "epoch": 6.637137989778535, "step": 9740}, {"loss": 0.932, "grad_norm": 1.6959431171417236, "learning_rate": 0.0002, "epoch": 6.6439522998296425, "step": 9750}, {"loss": 0.9397, "grad_norm": 1.4990962743759155, "learning_rate": 0.0002, "epoch": 6.650766609880749, "step": 9760}, {"loss": 0.9808, "grad_norm": 1.5235223770141602, "learning_rate": 0.0002, "epoch": 6.657580919931857, "step": 9770}, {"loss": 0.9522, "grad_norm": 1.8264366388320923, "learning_rate": 0.0002, "epoch": 6.664395229982964, "step": 9780}, {"loss": 0.9751, "grad_norm": 1.4298417568206787, "learning_rate": 0.0002, "epoch": 6.671209540034072, "step": 9790}, {"loss": 0.9607, "grad_norm": 1.5926862955093384, "learning_rate": 0.0002, "epoch": 6.678023850085179, "step": 9800}, {"loss": 0.9681, "grad_norm": 1.4592483043670654, "learning_rate": 0.0002, "epoch": 6.6848381601362865, "step": 9810}, {"loss": 0.9385, "grad_norm": 1.375799536705017, "learning_rate": 0.0002, "epoch": 6.691652470187393, "step": 9820}, {"loss": 0.9684, "grad_norm": 1.5767531394958496, "learning_rate": 0.0002, "epoch": 6.698466780238501, "step": 9830}, {"loss": 0.9313, "grad_norm": 1.6452189683914185, "learning_rate": 0.0002, "epoch": 6.705281090289608, "step": 9840}, {"loss": 0.9781, "grad_norm": 1.3874469995498657, "learning_rate": 0.0002, "epoch": 6.712095400340716, "step": 9850}, {"loss": 0.9803, "grad_norm": 1.5470930337905884, "learning_rate": 0.0002, "epoch": 6.718909710391823, "step": 9860}, {"loss": 0.9335, "grad_norm": 1.499840259552002, "learning_rate": 0.0002, "epoch": 6.7257240204429305, "step": 9870}, {"loss": 0.9209, "grad_norm": 1.4733195304870605, "learning_rate": 0.0002, "epoch": 6.732538330494037, "step": 9880}, {"loss": 0.9124, "grad_norm": 1.921722173690796, "learning_rate": 0.0002, "epoch": 6.739352640545145, "step": 9890}, {"loss": 0.9311, "grad_norm": 1.848003625869751, "learning_rate": 0.0002, "epoch": 6.746166950596252, "step": 9900}, {"loss": 0.9601, "grad_norm": 1.6050934791564941, "learning_rate": 0.0002, "epoch": 6.75298126064736, "step": 9910}, {"loss": 0.941, "grad_norm": 1.716424822807312, "learning_rate": 0.0002, "epoch": 6.759795570698467, "step": 9920}, {"loss": 0.9592, "grad_norm": 1.5647642612457275, "learning_rate": 0.0002, "epoch": 6.7666098807495745, "step": 9930}, {"loss": 0.927, "grad_norm": 1.5500049591064453, "learning_rate": 0.0002, "epoch": 6.773424190800681, "step": 9940}, {"loss": 0.9921, "grad_norm": 1.5384467840194702, "learning_rate": 0.0002, "epoch": 6.780238500851789, "step": 9950}, {"loss": 0.9673, "grad_norm": 1.8312339782714844, "learning_rate": 0.0002, "epoch": 6.787052810902896, "step": 9960}, {"loss": 0.9647, "grad_norm": 1.3505569696426392, "learning_rate": 0.0002, "epoch": 6.793867120954003, "step": 9970}, {"loss": 0.9553, "grad_norm": 1.6717044115066528, "learning_rate": 0.0002, "epoch": 6.800681431005111, "step": 9980}, {"loss": 0.9688, "grad_norm": 1.7072664499282837, "learning_rate": 0.0002, "epoch": 6.8074957410562185, "step": 9990}, {"loss": 0.951, "grad_norm": 1.3609364032745361, "learning_rate": 0.0002, "epoch": 6.814310051107325, "step": 10000}, {"loss": 0.9638, "grad_norm": 1.4862881898880005, "learning_rate": 0.0002, "epoch": 6.821124361158432, "step": 10010}, {"loss": 1.016, "grad_norm": 1.4808303117752075, "learning_rate": 0.0002, "epoch": 6.82793867120954, "step": 10020}, {"loss": 0.9233, "grad_norm": 1.6531925201416016, "learning_rate": 0.0002, "epoch": 6.834752981260648, "step": 10030}, {"loss": 0.9435, "grad_norm": 1.5090917348861694, "learning_rate": 0.0002, "epoch": 6.841567291311755, "step": 10040}, {"loss": 0.9395, "grad_norm": 1.5361953973770142, "learning_rate": 0.0002, "epoch": 6.848381601362862, "step": 10050}, {"loss": 1.0095, "grad_norm": 1.7302757501602173, "learning_rate": 0.0002, "epoch": 6.855195911413969, "step": 10060}, {"loss": 0.9796, "grad_norm": 1.5626600980758667, "learning_rate": 0.0002, "epoch": 6.862010221465077, "step": 10070}, {"loss": 1.0244, "grad_norm": 1.4168927669525146, "learning_rate": 0.0002, "epoch": 6.868824531516184, "step": 10080}, {"loss": 0.9253, "grad_norm": 1.3921427726745605, "learning_rate": 0.0002, "epoch": 6.875638841567291, "step": 10090}, {"loss": 1.0037, "grad_norm": 1.6304726600646973, "learning_rate": 0.0002, "epoch": 6.882453151618399, "step": 10100}, {"loss": 1.0088, "grad_norm": 1.5463745594024658, "learning_rate": 0.0002, "epoch": 6.889267461669506, "step": 10110}, {"loss": 1.0276, "grad_norm": 1.4989547729492188, "learning_rate": 0.0002, "epoch": 6.896081771720613, "step": 10120}, {"loss": 1.0352, "grad_norm": 1.7281252145767212, "learning_rate": 0.0002, "epoch": 6.90289608177172, "step": 10130}, {"loss": 1.031, "grad_norm": 1.469348669052124, "learning_rate": 0.0002, "epoch": 6.909710391822828, "step": 10140}, {"loss": 1.0301, "grad_norm": 1.3762892484664917, "learning_rate": 0.0002, "epoch": 6.916524701873936, "step": 10150}, {"loss": 1.0032, "grad_norm": 1.489425539970398, "learning_rate": 0.0002, "epoch": 6.923339011925043, "step": 10160}, {"loss": 0.9487, "grad_norm": 1.4514580965042114, "learning_rate": 0.0002, "epoch": 6.9301533219761495, "step": 10170}, {"loss": 0.9898, "grad_norm": 1.6008871793746948, "learning_rate": 0.0002, "epoch": 6.936967632027257, "step": 10180}, {"loss": 1.0577, "grad_norm": 1.6893450021743774, "learning_rate": 0.0002, "epoch": 6.943781942078364, "step": 10190}, {"loss": 0.9699, "grad_norm": 1.66379976272583, "learning_rate": 0.0002, "epoch": 6.950596252129472, "step": 10200}, {"loss": 1.0159, "grad_norm": 1.501943588256836, "learning_rate": 0.0002, "epoch": 6.957410562180579, "step": 10210}, {"loss": 1.0414, "grad_norm": 1.6803759336471558, "learning_rate": 0.0002, "epoch": 6.964224872231687, "step": 10220}, {"loss": 1.0413, "grad_norm": 1.4512689113616943, "learning_rate": 0.0002, "epoch": 6.9710391822827935, "step": 10230}, {"loss": 0.9791, "grad_norm": 1.6071290969848633, "learning_rate": 0.0002, "epoch": 6.977853492333901, "step": 10240}, {"loss": 1.0574, "grad_norm": 1.598915696144104, "learning_rate": 0.0002, "epoch": 6.984667802385008, "step": 10250}, {"loss": 1.0379, "grad_norm": 1.7178512811660767, "learning_rate": 0.0002, "epoch": 6.991482112436116, "step": 10260}, {"loss": 1.0082, "grad_norm": 1.4407050609588623, "learning_rate": 0.0002, "epoch": 6.998296422487223, "step": 10270}, {"eval_loss": 2.4567856788635254, "eval_runtime": 69.5742, "eval_samples_per_second": 7.287, "eval_steps_per_second": 0.92, "epoch": 6.999659284497445, "step": 10272}, {"loss": 0.7802, "grad_norm": 1.6635409593582153, "learning_rate": 0.0002, "epoch": 7.005110732538331, "step": 10280}, {"loss": 0.6558, "grad_norm": 1.8180204629898071, "learning_rate": 0.0002, "epoch": 7.0119250425894375, "step": 10290}, {"loss": 0.7228, "grad_norm": 1.7982863187789917, "learning_rate": 0.0002, "epoch": 7.018739352640545, "step": 10300}, {"loss": 0.7028, "grad_norm": 2.1364097595214844, "learning_rate": 0.0002, "epoch": 7.025553662691652, "step": 10310}, {"loss": 0.7317, "grad_norm": 1.9538214206695557, "learning_rate": 0.0002, "epoch": 7.03236797274276, "step": 10320}, {"loss": 0.6774, "grad_norm": 1.7746129035949707, "learning_rate": 0.0002, "epoch": 7.039182282793867, "step": 10330}, {"loss": 0.7045, "grad_norm": 1.5186023712158203, "learning_rate": 0.0002, "epoch": 7.045996592844975, "step": 10340}, {"loss": 0.7208, "grad_norm": 1.9523893594741821, "learning_rate": 0.0002, "epoch": 7.0528109028960815, "step": 10350}, {"loss": 0.6842, "grad_norm": 1.9791967868804932, "learning_rate": 0.0002, "epoch": 7.059625212947189, "step": 10360}, {"loss": 0.67, "grad_norm": 1.4577405452728271, "learning_rate": 0.0002, "epoch": 7.066439522998296, "step": 10370}, {"loss": 0.7209, "grad_norm": 1.7670400142669678, "learning_rate": 0.0002, "epoch": 7.073253833049404, "step": 10380}, {"loss": 0.7416, "grad_norm": 1.9858429431915283, "learning_rate": 0.0002, "epoch": 7.080068143100511, "step": 10390}, {"loss": 0.6793, "grad_norm": 1.4968500137329102, "learning_rate": 0.0002, "epoch": 7.086882453151619, "step": 10400}, {"loss": 0.6852, "grad_norm": 2.2092909812927246, "learning_rate": 0.0002, "epoch": 7.0936967632027255, "step": 10410}, {"loss": 0.7256, "grad_norm": 1.944272756576538, "learning_rate": 0.0002, "epoch": 7.100511073253833, "step": 10420}, {"loss": 0.6841, "grad_norm": 1.7232941389083862, "learning_rate": 0.0002, "epoch": 7.10732538330494, "step": 10430}, {"loss": 0.7197, "grad_norm": 2.098334312438965, "learning_rate": 0.0002, "epoch": 7.114139693356048, "step": 10440}, {"loss": 0.672, "grad_norm": 1.7802670001983643, "learning_rate": 0.0002, "epoch": 7.120954003407155, "step": 10450}, {"loss": 0.7054, "grad_norm": 1.7171560525894165, "learning_rate": 0.0002, "epoch": 7.127768313458263, "step": 10460}, {"loss": 0.7414, "grad_norm": 1.7227827310562134, "learning_rate": 0.0002, "epoch": 7.1345826235093694, "step": 10470}, {"loss": 0.7164, "grad_norm": 2.0002410411834717, "learning_rate": 0.0002, "epoch": 7.141396933560477, "step": 10480}, {"loss": 0.7246, "grad_norm": 2.0559451580047607, "learning_rate": 0.0002, "epoch": 7.148211243611584, "step": 10490}, {"loss": 0.7178, "grad_norm": 1.6929457187652588, "learning_rate": 0.0002, "epoch": 7.155025553662692, "step": 10500}, {"loss": 0.6839, "grad_norm": 1.8747141361236572, "learning_rate": 0.0002, "epoch": 7.161839863713799, "step": 10510}, {"loss": 0.7672, "grad_norm": 2.1793057918548584, "learning_rate": 0.0002, "epoch": 7.1686541737649065, "step": 10520}, {"loss": 0.7485, "grad_norm": 1.8422093391418457, "learning_rate": 0.0002, "epoch": 7.175468483816013, "step": 10530}, {"loss": 0.7678, "grad_norm": 1.4060566425323486, "learning_rate": 0.0002, "epoch": 7.182282793867121, "step": 10540}, {"loss": 0.6677, "grad_norm": 1.8884180784225464, "learning_rate": 0.0002, "epoch": 7.189097103918228, "step": 10550}, {"loss": 0.7458, "grad_norm": 1.523154854774475, "learning_rate": 0.0002, "epoch": 7.195911413969336, "step": 10560}, {"loss": 0.7462, "grad_norm": 1.8293776512145996, "learning_rate": 0.0002, "epoch": 7.202725724020443, "step": 10570}, {"loss": 0.7172, "grad_norm": 1.8931537866592407, "learning_rate": 0.0002, "epoch": 7.2095400340715505, "step": 10580}, {"loss": 0.7183, "grad_norm": 1.7758889198303223, "learning_rate": 0.0002, "epoch": 7.216354344122657, "step": 10590}, {"loss": 0.6953, "grad_norm": 1.9986528158187866, "learning_rate": 0.0002, "epoch": 7.223168654173765, "step": 10600}, {"loss": 0.8125, "grad_norm": 3.0123329162597656, "learning_rate": 0.0002, "epoch": 7.229982964224872, "step": 10610}, {"loss": 0.7435, "grad_norm": 2.203801155090332, "learning_rate": 0.0002, "epoch": 7.23679727427598, "step": 10620}, {"loss": 0.7492, "grad_norm": 1.756627082824707, "learning_rate": 0.0002, "epoch": 7.243611584327087, "step": 10630}, {"loss": 0.7664, "grad_norm": 1.6657848358154297, "learning_rate": 0.0002, "epoch": 7.2504258943781945, "step": 10640}, {"loss": 0.7611, "grad_norm": 1.8871530294418335, "learning_rate": 0.0002, "epoch": 7.257240204429301, "step": 10650}, {"loss": 0.7579, "grad_norm": 1.8031877279281616, "learning_rate": 0.0002, "epoch": 7.264054514480409, "step": 10660}, {"loss": 0.7928, "grad_norm": 1.8694801330566406, "learning_rate": 0.0002, "epoch": 7.270868824531516, "step": 10670}, {"loss": 0.7609, "grad_norm": 1.6305289268493652, "learning_rate": 0.0002, "epoch": 7.277683134582624, "step": 10680}, {"loss": 0.7178, "grad_norm": 1.8838950395584106, "learning_rate": 0.0002, "epoch": 7.284497444633731, "step": 10690}, {"loss": 0.7397, "grad_norm": 1.6298766136169434, "learning_rate": 0.0002, "epoch": 7.2913117546848385, "step": 10700}, {"loss": 0.805, "grad_norm": 1.6832125186920166, "learning_rate": 0.0002, "epoch": 7.298126064735945, "step": 10710}, {"loss": 0.7388, "grad_norm": 1.9299124479293823, "learning_rate": 0.0002, "epoch": 7.304940374787053, "step": 10720}, {"loss": 0.7219, "grad_norm": 1.6476620435714722, "learning_rate": 0.0002, "epoch": 7.31175468483816, "step": 10730}, {"loss": 0.7623, "grad_norm": 2.046297788619995, "learning_rate": 0.0002, "epoch": 7.318568994889268, "step": 10740}, {"loss": 0.7824, "grad_norm": 1.9311174154281616, "learning_rate": 0.0002, "epoch": 7.325383304940375, "step": 10750}, {"loss": 0.7469, "grad_norm": 1.8964996337890625, "learning_rate": 0.0002, "epoch": 7.3321976149914825, "step": 10760}, {"loss": 0.7361, "grad_norm": 1.8085095882415771, "learning_rate": 0.0002, "epoch": 7.339011925042589, "step": 10770}, {"loss": 0.7753, "grad_norm": 1.6951984167099, "learning_rate": 0.0002, "epoch": 7.345826235093697, "step": 10780}, {"loss": 0.7856, "grad_norm": 1.6665486097335815, "learning_rate": 0.0002, "epoch": 7.352640545144804, "step": 10790}, {"loss": 0.7632, "grad_norm": 1.4161039590835571, "learning_rate": 0.0002, "epoch": 7.359454855195912, "step": 10800}, {"loss": 0.7377, "grad_norm": 1.8640085458755493, "learning_rate": 0.0002, "epoch": 7.366269165247019, "step": 10810}, {"loss": 0.7791, "grad_norm": 1.8302277326583862, "learning_rate": 0.0002, "epoch": 7.3730834752981265, "step": 10820}, {"loss": 0.8338, "grad_norm": 1.6959542036056519, "learning_rate": 0.0002, "epoch": 7.379897785349233, "step": 10830}, {"loss": 0.7657, "grad_norm": 2.171138286590576, "learning_rate": 0.0002, "epoch": 7.386712095400341, "step": 10840}, {"loss": 0.7897, "grad_norm": 1.9314014911651611, "learning_rate": 0.0002, "epoch": 7.393526405451448, "step": 10850}, {"loss": 0.7314, "grad_norm": 1.8977826833724976, "learning_rate": 0.0002, "epoch": 7.400340715502555, "step": 10860}, {"loss": 0.7459, "grad_norm": 2.024486541748047, "learning_rate": 0.0002, "epoch": 7.407155025553663, "step": 10870}, {"loss": 0.771, "grad_norm": 1.8545196056365967, "learning_rate": 0.0002, "epoch": 7.4139693356047704, "step": 10880}, {"loss": 0.7558, "grad_norm": 1.9366614818572998, "learning_rate": 0.0002, "epoch": 7.420783645655877, "step": 10890}, {"loss": 0.7121, "grad_norm": 2.051706075668335, "learning_rate": 0.0002, "epoch": 7.427597955706984, "step": 10900}, {"loss": 0.7618, "grad_norm": 1.624997615814209, "learning_rate": 0.0002, "epoch": 7.434412265758092, "step": 10910}, {"loss": 0.7886, "grad_norm": 1.8717564344406128, "learning_rate": 0.0002, "epoch": 7.4412265758092, "step": 10920}, {"loss": 0.7614, "grad_norm": 2.0878796577453613, "learning_rate": 0.0002, "epoch": 7.448040885860307, "step": 10930}, {"loss": 0.8, "grad_norm": 1.7073718309402466, "learning_rate": 0.0002, "epoch": 7.4548551959114135, "step": 10940}, {"loss": 0.7897, "grad_norm": 1.6618555784225464, "learning_rate": 0.0002, "epoch": 7.461669505962521, "step": 10950}, {"loss": 0.8101, "grad_norm": 1.8428804874420166, "learning_rate": 0.0002, "epoch": 7.468483816013628, "step": 10960}, {"loss": 0.777, "grad_norm": 1.8749566078186035, "learning_rate": 0.0002, "epoch": 7.475298126064736, "step": 10970}, {"loss": 0.8125, "grad_norm": 1.846954107284546, "learning_rate": 0.0002, "epoch": 7.482112436115843, "step": 10980}, {"loss": 0.8334, "grad_norm": 1.878496527671814, "learning_rate": 0.0002, "epoch": 7.488926746166951, "step": 10990}, {"loss": 0.7796, "grad_norm": 2.039119005203247, "learning_rate": 0.0002, "epoch": 7.4957410562180575, "step": 11000}, {"loss": 0.7788, "grad_norm": 1.677701473236084, "learning_rate": 0.0002, "epoch": 7.502555366269165, "step": 11010}, {"loss": 0.7649, "grad_norm": 1.7645316123962402, "learning_rate": 0.0002, "epoch": 7.509369676320272, "step": 11020}, {"loss": 0.8325, "grad_norm": 1.7873706817626953, "learning_rate": 0.0002, "epoch": 7.51618398637138, "step": 11030}, {"loss": 0.823, "grad_norm": 1.880903959274292, "learning_rate": 0.0002, "epoch": 7.522998296422487, "step": 11040}, {"loss": 0.8542, "grad_norm": 1.4965842962265015, "learning_rate": 0.0002, "epoch": 7.529812606473595, "step": 11050}, {"loss": 0.814, "grad_norm": 1.9609076976776123, "learning_rate": 0.0002, "epoch": 7.5366269165247015, "step": 11060}, {"loss": 0.8063, "grad_norm": 1.8582744598388672, "learning_rate": 0.0002, "epoch": 7.543441226575809, "step": 11070}, {"loss": 0.7882, "grad_norm": 1.7395402193069458, "learning_rate": 0.0002, "epoch": 7.550255536626916, "step": 11080}, {"loss": 0.8347, "grad_norm": 1.8297388553619385, "learning_rate": 0.0002, "epoch": 7.557069846678024, "step": 11090}, {"loss": 0.7666, "grad_norm": 1.9110262393951416, "learning_rate": 0.0002, "epoch": 7.563884156729131, "step": 11100}, {"loss": 0.8264, "grad_norm": 1.873039722442627, "learning_rate": 0.0002, "epoch": 7.570698466780239, "step": 11110}, {"loss": 0.8212, "grad_norm": 1.8473812341690063, "learning_rate": 0.0002, "epoch": 7.5775127768313455, "step": 11120}, {"loss": 0.8532, "grad_norm": 1.9834227561950684, "learning_rate": 0.0002, "epoch": 7.584327086882453, "step": 11130}, {"loss": 0.8256, "grad_norm": 1.7381705045700073, "learning_rate": 0.0002, "epoch": 7.59114139693356, "step": 11140}, {"loss": 0.815, "grad_norm": 1.619881272315979, "learning_rate": 0.0002, "epoch": 7.597955706984668, "step": 11150}, {"loss": 0.8182, "grad_norm": 1.773484706878662, "learning_rate": 0.0002, "epoch": 7.604770017035775, "step": 11160}, {"loss": 0.848, "grad_norm": 1.8400499820709229, "learning_rate": 0.0002, "epoch": 7.611584327086883, "step": 11170}, {"loss": 0.8356, "grad_norm": 1.936593770980835, "learning_rate": 0.0002, "epoch": 7.6183986371379895, "step": 11180}, {"loss": 0.8384, "grad_norm": 2.037844181060791, "learning_rate": 0.0002, "epoch": 7.625212947189097, "step": 11190}, {"loss": 0.807, "grad_norm": 1.6165574789047241, "learning_rate": 0.0002, "epoch": 7.632027257240204, "step": 11200}, {"loss": 0.7791, "grad_norm": 1.886804461479187, "learning_rate": 0.0002, "epoch": 7.638841567291312, "step": 11210}, {"loss": 0.7953, "grad_norm": 1.8130316734313965, "learning_rate": 0.0002, "epoch": 7.645655877342419, "step": 11220}, {"loss": 0.7991, "grad_norm": 1.7955272197723389, "learning_rate": 0.0002, "epoch": 7.652470187393527, "step": 11230}, {"loss": 0.8104, "grad_norm": 1.6500684022903442, "learning_rate": 0.0002, "epoch": 7.6592844974446335, "step": 11240}, {"loss": 0.8156, "grad_norm": 1.782709002494812, "learning_rate": 0.0002, "epoch": 7.666098807495741, "step": 11250}, {"loss": 0.831, "grad_norm": 1.8072985410690308, "learning_rate": 0.0002, "epoch": 7.672913117546848, "step": 11260}, {"loss": 0.8852, "grad_norm": 1.8962644338607788, "learning_rate": 0.0002, "epoch": 7.679727427597956, "step": 11270}, {"loss": 0.8586, "grad_norm": 1.794803261756897, "learning_rate": 0.0002, "epoch": 7.686541737649063, "step": 11280}, {"loss": 0.8727, "grad_norm": 1.8621071577072144, "learning_rate": 0.0002, "epoch": 7.693356047700171, "step": 11290}, {"loss": 0.8411, "grad_norm": 2.1268274784088135, "learning_rate": 0.0002, "epoch": 7.7001703577512775, "step": 11300}, {"loss": 0.8529, "grad_norm": 1.776221513748169, "learning_rate": 0.0002, "epoch": 7.706984667802385, "step": 11310}, {"loss": 0.8108, "grad_norm": 2.5115597248077393, "learning_rate": 0.0002, "epoch": 7.713798977853492, "step": 11320}, {"loss": 0.8334, "grad_norm": 1.9946764707565308, "learning_rate": 0.0002, "epoch": 7.7206132879046, "step": 11330}, {"loss": 0.7893, "grad_norm": 1.7262247800827026, "learning_rate": 0.0002, "epoch": 7.727427597955707, "step": 11340}, {"loss": 0.8513, "grad_norm": 1.971244215965271, "learning_rate": 0.0002, "epoch": 7.7342419080068145, "step": 11350}, {"loss": 0.8064, "grad_norm": 1.8255480527877808, "learning_rate": 0.0002, "epoch": 7.741056218057921, "step": 11360}, {"loss": 0.7787, "grad_norm": 1.6721539497375488, "learning_rate": 0.0002, "epoch": 7.747870528109029, "step": 11370}, {"loss": 0.8256, "grad_norm": 1.9740724563598633, "learning_rate": 0.0002, "epoch": 7.754684838160136, "step": 11380}, {"loss": 0.8778, "grad_norm": 1.9174233675003052, "learning_rate": 0.0002, "epoch": 7.761499148211244, "step": 11390}, {"loss": 0.8024, "grad_norm": 1.927493691444397, "learning_rate": 0.0002, "epoch": 7.768313458262351, "step": 11400}, {"loss": 0.8443, "grad_norm": 1.6313871145248413, "learning_rate": 0.0002, "epoch": 7.7751277683134585, "step": 11410}, {"loss": 0.8771, "grad_norm": 2.0635557174682617, "learning_rate": 0.0002, "epoch": 7.781942078364565, "step": 11420}, {"loss": 0.7838, "grad_norm": 1.597979187965393, "learning_rate": 0.0002, "epoch": 7.788756388415673, "step": 11430}, {"loss": 0.8391, "grad_norm": 1.8125237226486206, "learning_rate": 0.0002, "epoch": 7.79557069846678, "step": 11440}, {"loss": 0.8462, "grad_norm": 1.6833277940750122, "learning_rate": 0.0002, "epoch": 7.802385008517888, "step": 11450}, {"loss": 0.9158, "grad_norm": 1.9060336351394653, "learning_rate": 0.0002, "epoch": 7.809199318568995, "step": 11460}, {"loss": 0.8473, "grad_norm": 1.6847437620162964, "learning_rate": 0.0002, "epoch": 7.8160136286201025, "step": 11470}, {"loss": 0.851, "grad_norm": 1.8693677186965942, "learning_rate": 0.0002, "epoch": 7.822827938671209, "step": 11480}, {"loss": 0.7793, "grad_norm": 1.7141996622085571, "learning_rate": 0.0002, "epoch": 7.829642248722317, "step": 11490}, {"loss": 0.8254, "grad_norm": 1.7096906900405884, "learning_rate": 0.0002, "epoch": 7.836456558773424, "step": 11500}, {"loss": 0.8372, "grad_norm": 1.7270509004592896, "learning_rate": 0.0002, "epoch": 7.843270868824532, "step": 11510}, {"loss": 0.8513, "grad_norm": 1.6399152278900146, "learning_rate": 0.0002, "epoch": 7.850085178875639, "step": 11520}, {"loss": 0.7867, "grad_norm": 1.7190455198287964, "learning_rate": 0.0002, "epoch": 7.8568994889267465, "step": 11530}, {"loss": 0.803, "grad_norm": 1.7967315912246704, "learning_rate": 0.0002, "epoch": 7.863713798977853, "step": 11540}, {"loss": 0.7835, "grad_norm": 1.904163122177124, "learning_rate": 0.0002, "epoch": 7.870528109028961, "step": 11550}, {"loss": 0.8699, "grad_norm": 1.898577094078064, "learning_rate": 0.0002, "epoch": 7.877342419080068, "step": 11560}, {"loss": 0.8596, "grad_norm": 1.9581187963485718, "learning_rate": 0.0002, "epoch": 7.884156729131176, "step": 11570}, {"loss": 0.8564, "grad_norm": 1.756208062171936, "learning_rate": 0.0002, "epoch": 7.890971039182283, "step": 11580}, {"loss": 0.9012, "grad_norm": 2.020146608352661, "learning_rate": 0.0002, "epoch": 7.8977853492333905, "step": 11590}, {"loss": 0.8174, "grad_norm": 1.647647738456726, "learning_rate": 0.0002, "epoch": 7.904599659284497, "step": 11600}, {"loss": 0.848, "grad_norm": 1.8647202253341675, "learning_rate": 0.0002, "epoch": 7.911413969335605, "step": 11610}, {"loss": 0.8489, "grad_norm": 1.72721266746521, "learning_rate": 0.0002, "epoch": 7.918228279386712, "step": 11620}, {"loss": 0.8407, "grad_norm": 1.9360839128494263, "learning_rate": 0.0002, "epoch": 7.92504258943782, "step": 11630}, {"loss": 0.8777, "grad_norm": 1.7773231267929077, "learning_rate": 0.0002, "epoch": 7.931856899488927, "step": 11640}, {"loss": 0.8201, "grad_norm": 1.762197494506836, "learning_rate": 0.0002, "epoch": 7.9386712095400345, "step": 11650}, {"loss": 0.8083, "grad_norm": 1.8185408115386963, "learning_rate": 0.0002, "epoch": 7.945485519591141, "step": 11660}, {"loss": 0.8979, "grad_norm": 1.9808121919631958, "learning_rate": 0.0002, "epoch": 7.952299829642248, "step": 11670}, {"loss": 0.8176, "grad_norm": 1.888456106185913, "learning_rate": 0.0002, "epoch": 7.959114139693356, "step": 11680}, {"loss": 0.8445, "grad_norm": 1.860640525817871, "learning_rate": 0.0002, "epoch": 7.965928449744464, "step": 11690}, {"loss": 0.8665, "grad_norm": 1.7443981170654297, "learning_rate": 0.0002, "epoch": 7.972742759795571, "step": 11700}, {"loss": 0.8394, "grad_norm": 1.6821815967559814, "learning_rate": 0.0002, "epoch": 7.979557069846678, "step": 11710}, {"loss": 0.8809, "grad_norm": 1.6265391111373901, "learning_rate": 0.0002, "epoch": 7.986371379897785, "step": 11720}, {"loss": 0.9274, "grad_norm": 1.8354634046554565, "learning_rate": 0.0002, "epoch": 7.993185689948893, "step": 11730}]}