End of training

Browse files

Files changed (7) hide show

README.md +3 -2
all_results.json +12 -0
eval_results.json +7 -0
train_results.json +8 -0
trainer_state.json +1530 -0
training_eval_loss.png +0 -0
training_loss.png +0 -0

README.md CHANGED Viewed

@@ -4,6 +4,7 @@ license: llama3.1
 base_model: meta-llama/Llama-3.1-8B-Instruct
 tags:
 - llama-factory
 - generated_from_trainer
 model-index:
 - name: openmathinstruct2-llama-3.1-8B-Instruct-lr5-ep2
@@ -15,9 +16,9 @@ should probably proofread and complete it, then remove this comment. -->
 # openmathinstruct2-llama-3.1-8B-Instruct-lr5-ep2
-This model is a fine-tuned version of [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) on an unknown dataset.
 It achieves the following results on the evaluation set:
-- Loss: 0.7637
 ## Model description

 base_model: meta-llama/Llama-3.1-8B-Instruct
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: openmathinstruct2-llama-3.1-8B-Instruct-lr5-ep2
 # openmathinstruct2-llama-3.1-8B-Instruct-lr5-ep2
+This model is a fine-tuned version of [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) on the openmathinstruct2_cot_20k_train dataset.
 It achieves the following results on the evaluation set:
+- Loss: 0.7634
 ## Model description

all_results.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+    "epoch": 2.0,
+    "eval_loss": 0.7634065747261047,
+    "eval_runtime": 31.2005,
+    "eval_samples_per_second": 59.262,
+    "eval_steps_per_second": 7.436,
+    "total_flos": 34318199685120.0,
+    "train_loss": 0.6641153321816371,
+    "train_runtime": 2465.8736,
+    "train_samples_per_second": 13.491,
+    "train_steps_per_second": 0.844
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+    "epoch": 2.0,
+    "eval_loss": 0.7634065747261047,
+    "eval_runtime": 31.2005,
+    "eval_samples_per_second": 59.262,
+    "eval_steps_per_second": 7.436
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 2.0,
+    "total_flos": 34318199685120.0,
+    "train_loss": 0.6641153321816371,
+    "train_runtime": 2465.8736,
+    "train_samples_per_second": 13.491,
+    "train_steps_per_second": 0.844
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1530 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.0,
+  "eval_steps": 500,
+  "global_step": 2080,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.009615384615384616,
+      "grad_norm": 9.013119895723426,
+      "learning_rate": 4.807692307692308e-07,
+      "loss": 1.3011,
+      "step": 10
+    },
+    {
+      "epoch": 0.019230769230769232,
+      "grad_norm": 8.170199017296364,
+      "learning_rate": 9.615384615384617e-07,
+      "loss": 1.1715,
+      "step": 20
+    },
+    {
+      "epoch": 0.028846153846153848,
+      "grad_norm": 5.232892874296514,
+      "learning_rate": 1.4423076923076922e-06,
+      "loss": 0.9015,
+      "step": 30
+    },
+    {
+      "epoch": 0.038461538461538464,
+      "grad_norm": 2.862322298571592,
+      "learning_rate": 1.9230769230769234e-06,
+      "loss": 0.7978,
+      "step": 40
+    },
+    {
+      "epoch": 0.04807692307692308,
+      "grad_norm": 2.7249762448136132,
+      "learning_rate": 2.403846153846154e-06,
+      "loss": 0.7609,
+      "step": 50
+    },
+    {
+      "epoch": 0.057692307692307696,
+      "grad_norm": 3.169444209352466,
+      "learning_rate": 2.8846153846153845e-06,
+      "loss": 0.8146,
+      "step": 60
+    },
+    {
+      "epoch": 0.0673076923076923,
+      "grad_norm": 2.6189394700073807,
+      "learning_rate": 3.365384615384616e-06,
+      "loss": 0.777,
+      "step": 70
+    },
+    {
+      "epoch": 0.07692307692307693,
+      "grad_norm": 2.4397555428298077,
+      "learning_rate": 3.846153846153847e-06,
+      "loss": 0.78,
+      "step": 80
+    },
+    {
+      "epoch": 0.08653846153846154,
+      "grad_norm": 2.812934479310922,
+      "learning_rate": 4.326923076923077e-06,
+      "loss": 0.7721,
+      "step": 90
+    },
+    {
+      "epoch": 0.09615384615384616,
+      "grad_norm": 2.5334288416130746,
+      "learning_rate": 4.807692307692308e-06,
+      "loss": 0.7669,
+      "step": 100
+    },
+    {
+      "epoch": 0.10576923076923077,
+      "grad_norm": 2.342690607930004,
+      "learning_rate": 5.288461538461539e-06,
+      "loss": 0.7459,
+      "step": 110
+    },
+    {
+      "epoch": 0.11538461538461539,
+      "grad_norm": 2.3269599095205264,
+      "learning_rate": 5.769230769230769e-06,
+      "loss": 0.7851,
+      "step": 120
+    },
+    {
+      "epoch": 0.125,
+      "grad_norm": 2.42709508438667,
+      "learning_rate": 6.25e-06,
+      "loss": 0.7703,
+      "step": 130
+    },
+    {
+      "epoch": 0.1346153846153846,
+      "grad_norm": 2.6377643494039273,
+      "learning_rate": 6.730769230769232e-06,
+      "loss": 0.7496,
+      "step": 140
+    },
+    {
+      "epoch": 0.14423076923076922,
+      "grad_norm": 2.875686037935072,
+      "learning_rate": 7.211538461538462e-06,
+      "loss": 0.784,
+      "step": 150
+    },
+    {
+      "epoch": 0.15384615384615385,
+      "grad_norm": 2.1768155795174917,
+      "learning_rate": 7.692307692307694e-06,
+      "loss": 0.8022,
+      "step": 160
+    },
+    {
+      "epoch": 0.16346153846153846,
+      "grad_norm": 2.46654983618583,
+      "learning_rate": 8.173076923076923e-06,
+      "loss": 0.7741,
+      "step": 170
+    },
+    {
+      "epoch": 0.17307692307692307,
+      "grad_norm": 2.4450582546281554,
+      "learning_rate": 8.653846153846155e-06,
+      "loss": 0.7827,
+      "step": 180
+    },
+    {
+      "epoch": 0.18269230769230768,
+      "grad_norm": 2.4657718631331567,
+      "learning_rate": 9.134615384615384e-06,
+      "loss": 0.7894,
+      "step": 190
+    },
+    {
+      "epoch": 0.19230769230769232,
+      "grad_norm": 2.54311809351163,
+      "learning_rate": 9.615384615384616e-06,
+      "loss": 0.8164,
+      "step": 200
+    },
+    {
+      "epoch": 0.20192307692307693,
+      "grad_norm": 2.097681181559205,
+      "learning_rate": 9.999971836433636e-06,
+      "loss": 0.8139,
+      "step": 210
+    },
+    {
+      "epoch": 0.21153846153846154,
+      "grad_norm": 2.2579921721680516,
+      "learning_rate": 9.998986144924253e-06,
+      "loss": 0.7919,
+      "step": 220
+    },
+    {
+      "epoch": 0.22115384615384615,
+      "grad_norm": 2.1097886183574466,
+      "learning_rate": 9.996592592355083e-06,
+      "loss": 0.8056,
+      "step": 230
+    },
+    {
+      "epoch": 0.23076923076923078,
+      "grad_norm": 2.22507095955906,
+      "learning_rate": 9.992791852820709e-06,
+      "loss": 0.8108,
+      "step": 240
+    },
+    {
+      "epoch": 0.2403846153846154,
+      "grad_norm": 2.1931221071365425,
+      "learning_rate": 9.987584996720813e-06,
+      "loss": 0.8041,
+      "step": 250
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 2.1220375928827044,
+      "learning_rate": 9.980973490458728e-06,
+      "loss": 0.8149,
+      "step": 260
+    },
+    {
+      "epoch": 0.25961538461538464,
+      "grad_norm": 2.1068957326041904,
+      "learning_rate": 9.972959196028456e-06,
+      "loss": 0.7914,
+      "step": 270
+    },
+    {
+      "epoch": 0.2692307692307692,
+      "grad_norm": 1.9372196057401814,
+      "learning_rate": 9.96354437049027e-06,
+      "loss": 0.8033,
+      "step": 280
+    },
+    {
+      "epoch": 0.27884615384615385,
+      "grad_norm": 2.2606013253960846,
+      "learning_rate": 9.952731665335071e-06,
+      "loss": 0.8039,
+      "step": 290
+    },
+    {
+      "epoch": 0.28846153846153844,
+      "grad_norm": 2.1614476516877406,
+      "learning_rate": 9.940524125737641e-06,
+      "loss": 0.7647,
+      "step": 300
+    },
+    {
+      "epoch": 0.2980769230769231,
+      "grad_norm": 2.220222508508704,
+      "learning_rate": 9.92692518969903e-06,
+      "loss": 0.7901,
+      "step": 310
+    },
+    {
+      "epoch": 0.3076923076923077,
+      "grad_norm": 2.201097779810873,
+      "learning_rate": 9.911938687078324e-06,
+      "loss": 0.7964,
+      "step": 320
+    },
+    {
+      "epoch": 0.3173076923076923,
+      "grad_norm": 2.029609328977134,
+      "learning_rate": 9.895568838514042e-06,
+      "loss": 0.7764,
+      "step": 330
+    },
+    {
+      "epoch": 0.3269230769230769,
+      "grad_norm": 2.2142291176229834,
+      "learning_rate": 9.87782025423547e-06,
+      "loss": 0.8005,
+      "step": 340
+    },
+    {
+      "epoch": 0.33653846153846156,
+      "grad_norm": 2.084315605131572,
+      "learning_rate": 9.8586979327643e-06,
+      "loss": 0.7795,
+      "step": 350
+    },
+    {
+      "epoch": 0.34615384615384615,
+      "grad_norm": 2.3024285135825457,
+      "learning_rate": 9.838207259506891e-06,
+      "loss": 0.7935,
+      "step": 360
+    },
+    {
+      "epoch": 0.3557692307692308,
+      "grad_norm": 2.5288625621043326,
+      "learning_rate": 9.816354005237583e-06,
+      "loss": 0.7749,
+      "step": 370
+    },
+    {
+      "epoch": 0.36538461538461536,
+      "grad_norm": 2.0010687704613743,
+      "learning_rate": 9.793144324473473e-06,
+      "loss": 0.7988,
+      "step": 380
+    },
+    {
+      "epoch": 0.375,
+      "grad_norm": 2.1536447186627496,
+      "learning_rate": 9.768584753741134e-06,
+      "loss": 0.7955,
+      "step": 390
+    },
+    {
+      "epoch": 0.38461538461538464,
+      "grad_norm": 2.025116331685143,
+      "learning_rate": 9.742682209735727e-06,
+      "loss": 0.8014,
+      "step": 400
+    },
+    {
+      "epoch": 0.3942307692307692,
+      "grad_norm": 2.027340612157234,
+      "learning_rate": 9.715443987373062e-06,
+      "loss": 0.8341,
+      "step": 410
+    },
+    {
+      "epoch": 0.40384615384615385,
+      "grad_norm": 1.968065863088579,
+      "learning_rate": 9.686877757735126e-06,
+      "loss": 0.7922,
+      "step": 420
+    },
+    {
+      "epoch": 0.41346153846153844,
+      "grad_norm": 2.016993760012052,
+      "learning_rate": 9.656991565909703e-06,
+      "loss": 0.8328,
+      "step": 430
+    },
+    {
+      "epoch": 0.4230769230769231,
+      "grad_norm": 2.1902508822456213,
+      "learning_rate": 9.62579382872462e-06,
+      "loss": 0.8029,
+      "step": 440
+    },
+    {
+      "epoch": 0.4326923076923077,
+      "grad_norm": 2.0187820173893534,
+      "learning_rate": 9.593293332377325e-06,
+      "loss": 0.7634,
+      "step": 450
+    },
+    {
+      "epoch": 0.4423076923076923,
+      "grad_norm": 2.1364408496498437,
+      "learning_rate": 9.55949922996045e-06,
+      "loss": 0.774,
+      "step": 460
+    },
+    {
+      "epoch": 0.4519230769230769,
+      "grad_norm": 2.0116381073985945,
+      "learning_rate": 9.52442103888402e-06,
+      "loss": 0.8331,
+      "step": 470
+    },
+    {
+      "epoch": 0.46153846153846156,
+      "grad_norm": 2.100311438402205,
+      "learning_rate": 9.488068638195072e-06,
+      "loss": 0.7478,
+      "step": 480
+    },
+    {
+      "epoch": 0.47115384615384615,
+      "grad_norm": 2.059472131869095,
+      "learning_rate": 9.450452265795423e-06,
+      "loss": 0.8202,
+      "step": 490
+    },
+    {
+      "epoch": 0.4807692307692308,
+      "grad_norm": 2.074026027178722,
+      "learning_rate": 9.411582515558391e-06,
+      "loss": 0.8177,
+      "step": 500
+    },
+    {
+      "epoch": 0.4807692307692308,
+      "eval_loss": 0.7822802066802979,
+      "eval_runtime": 31.3783,
+      "eval_samples_per_second": 58.926,
+      "eval_steps_per_second": 7.394,
+      "step": 500
+    },
+    {
+      "epoch": 0.49038461538461536,
+      "grad_norm": 1.9938002782487565,
+      "learning_rate": 9.371470334345232e-06,
+      "loss": 0.804,
+      "step": 510
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 1.9596865504803214,
+      "learning_rate": 9.330127018922195e-06,
+      "loss": 0.7844,
+      "step": 520
+    },
+    {
+      "epoch": 0.5096153846153846,
+      "grad_norm": 1.8726235477694055,
+      "learning_rate": 9.287564212779012e-06,
+      "loss": 0.756,
+      "step": 530
+    },
+    {
+      "epoch": 0.5192307692307693,
+      "grad_norm": 2.091468778530183,
+      "learning_rate": 9.243793902849764e-06,
+      "loss": 0.8057,
+      "step": 540
+    },
+    {
+      "epoch": 0.5288461538461539,
+      "grad_norm": 2.032448528673805,
+      "learning_rate": 9.198828416136991e-06,
+      "loss": 0.8065,
+      "step": 550
+    },
+    {
+      "epoch": 0.5384615384615384,
+      "grad_norm": 2.0648947345620137,
+      "learning_rate": 9.152680416240059e-06,
+      "loss": 0.8101,
+      "step": 560
+    },
+    {
+      "epoch": 0.5480769230769231,
+      "grad_norm": 1.864661141720549,
+      "learning_rate": 9.10536289978872e-06,
+      "loss": 0.7597,
+      "step": 570
+    },
+    {
+      "epoch": 0.5576923076923077,
+      "grad_norm": 1.8389259729678664,
+      "learning_rate": 9.056889192782865e-06,
+      "loss": 0.806,
+      "step": 580
+    },
+    {
+      "epoch": 0.5673076923076923,
+      "grad_norm": 1.997501225318945,
+      "learning_rate": 9.007272946839559e-06,
+      "loss": 0.7696,
+      "step": 590
+    },
+    {
+      "epoch": 0.5769230769230769,
+      "grad_norm": 1.9510249081212931,
+      "learning_rate": 8.95652813534831e-06,
+      "loss": 0.7779,
+      "step": 600
+    },
+    {
+      "epoch": 0.5865384615384616,
+      "grad_norm": 2.043423090422744,
+      "learning_rate": 8.90466904953579e-06,
+      "loss": 0.8142,
+      "step": 610
+    },
+    {
+      "epoch": 0.5961538461538461,
+      "grad_norm": 2.1520676063407,
+      "learning_rate": 8.851710294440974e-06,
+      "loss": 0.804,
+      "step": 620
+    },
+    {
+      "epoch": 0.6057692307692307,
+      "grad_norm": 1.8353482991425896,
+      "learning_rate": 8.797666784801954e-06,
+      "loss": 0.7701,
+      "step": 630
+    },
+    {
+      "epoch": 0.6153846153846154,
+      "grad_norm": 1.8647174071179153,
+      "learning_rate": 8.742553740855507e-06,
+      "loss": 0.7641,
+      "step": 640
+    },
+    {
+      "epoch": 0.625,
+      "grad_norm": 1.859731623435647,
+      "learning_rate": 8.68638668405062e-06,
+      "loss": 0.8293,
+      "step": 650
+    },
+    {
+      "epoch": 0.6346153846153846,
+      "grad_norm": 2.035763511676626,
+      "learning_rate": 8.629181432677213e-06,
+      "loss": 0.8386,
+      "step": 660
+    },
+    {
+      "epoch": 0.6442307692307693,
+      "grad_norm": 1.9498309927555528,
+      "learning_rate": 8.570954097411224e-06,
+      "loss": 0.826,
+      "step": 670
+    },
+    {
+      "epoch": 0.6538461538461539,
+      "grad_norm": 2.157631582471241,
+      "learning_rate": 8.511721076777388e-06,
+      "loss": 0.7933,
+      "step": 680
+    },
+    {
+      "epoch": 0.6634615384615384,
+      "grad_norm": 1.817540210529388,
+      "learning_rate": 8.451499052530923e-06,
+      "loss": 0.794,
+      "step": 690
+    },
+    {
+      "epoch": 0.6730769230769231,
+      "grad_norm": 2.003699550055361,
+      "learning_rate": 8.390304984959455e-06,
+      "loss": 0.8152,
+      "step": 700
+    },
+    {
+      "epoch": 0.6826923076923077,
+      "grad_norm": 1.7853207622424534,
+      "learning_rate": 8.328156108106518e-06,
+      "loss": 0.7869,
+      "step": 710
+    },
+    {
+      "epoch": 0.6923076923076923,
+      "grad_norm": 1.951815011854146,
+      "learning_rate": 8.265069924917925e-06,
+      "loss": 0.8086,
+      "step": 720
+    },
+    {
+      "epoch": 0.7019230769230769,
+      "grad_norm": 1.904892870037223,
+      "learning_rate": 8.20106420231244e-06,
+      "loss": 0.7959,
+      "step": 730
+    },
+    {
+      "epoch": 0.7115384615384616,
+      "grad_norm": 1.9417662866059235,
+      "learning_rate": 8.136156966178082e-06,
+      "loss": 0.8072,
+      "step": 740
+    },
+    {
+      "epoch": 0.7211538461538461,
+      "grad_norm": 1.8480817149859605,
+      "learning_rate": 8.070366496295505e-06,
+      "loss": 0.7814,
+      "step": 750
+    },
+    {
+      "epoch": 0.7307692307692307,
+      "grad_norm": 1.8327729681591678,
+      "learning_rate": 8.003711321189895e-06,
+      "loss": 0.7786,
+      "step": 760
+    },
+    {
+      "epoch": 0.7403846153846154,
+      "grad_norm": 2.1198206040027494,
+      "learning_rate": 7.93621021291277e-06,
+      "loss": 0.7485,
+      "step": 770
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 1.9784614334378419,
+      "learning_rate": 7.86788218175523e-06,
+      "loss": 0.7736,
+      "step": 780
+    },
+    {
+      "epoch": 0.7596153846153846,
+      "grad_norm": 1.733613545143569,
+      "learning_rate": 7.798746470894113e-06,
+      "loss": 0.7613,
+      "step": 790
+    },
+    {
+      "epoch": 0.7692307692307693,
+      "grad_norm": 1.7905885212318795,
+      "learning_rate": 7.728822550972523e-06,
+      "loss": 0.7902,
+      "step": 800
+    },
+    {
+      "epoch": 0.7788461538461539,
+      "grad_norm": 2.02868421572116,
+      "learning_rate": 7.658130114616364e-06,
+      "loss": 0.7962,
+      "step": 810
+    },
+    {
+      "epoch": 0.7884615384615384,
+      "grad_norm": 1.8163329758157731,
+      "learning_rate": 7.586689070888284e-06,
+      "loss": 0.7625,
+      "step": 820
+    },
+    {
+      "epoch": 0.7980769230769231,
+      "grad_norm": 2.0352374441953542,
+      "learning_rate": 7.5145195396807244e-06,
+      "loss": 0.7836,
+      "step": 830
+    },
+    {
+      "epoch": 0.8076923076923077,
+      "grad_norm": 2.0684855751019144,
+      "learning_rate": 7.441641846049557e-06,
+      "loss": 0.8016,
+      "step": 840
+    },
+    {
+      "epoch": 0.8173076923076923,
+      "grad_norm": 2.042151868500934,
+      "learning_rate": 7.368076514489947e-06,
+      "loss": 0.8012,
+      "step": 850
+    },
+    {
+      "epoch": 0.8269230769230769,
+      "grad_norm": 1.9253750246288859,
+      "learning_rate": 7.2938442631560714e-06,
+      "loss": 0.799,
+      "step": 860
+    },
+    {
+      "epoch": 0.8365384615384616,
+      "grad_norm": 1.8896143274467088,
+      "learning_rate": 7.218965998026259e-06,
+      "loss": 0.8099,
+      "step": 870
+    },
+    {
+      "epoch": 0.8461538461538461,
+      "grad_norm": 1.7590979198633765,
+      "learning_rate": 7.143462807015271e-06,
+      "loss": 0.7726,
+      "step": 880
+    },
+    {
+      "epoch": 0.8557692307692307,
+      "grad_norm": 1.9332789610600787,
+      "learning_rate": 7.067355954035316e-06,
+      "loss": 0.808,
+      "step": 890
+    },
+    {
+      "epoch": 0.8653846153846154,
+      "grad_norm": 1.8265190820741775,
+      "learning_rate": 6.990666873007506e-06,
+      "loss": 0.7604,
+      "step": 900
+    },
+    {
+      "epoch": 0.875,
+      "grad_norm": 1.8486951872238018,
+      "learning_rate": 6.913417161825449e-06,
+      "loss": 0.7861,
+      "step": 910
+    },
+    {
+      "epoch": 0.8846153846153846,
+      "grad_norm": 2.0473377615419412,
+      "learning_rate": 6.8356285762726385e-06,
+      "loss": 0.7618,
+      "step": 920
+    },
+    {
+      "epoch": 0.8942307692307693,
+      "grad_norm": 1.7908009618022396,
+      "learning_rate": 6.757323023895388e-06,
+      "loss": 0.7547,
+      "step": 930
+    },
+    {
+      "epoch": 0.9038461538461539,
+      "grad_norm": 1.7499349918180362,
+      "learning_rate": 6.678522557833025e-06,
+      "loss": 0.7595,
+      "step": 940
+    },
+    {
+      "epoch": 0.9134615384615384,
+      "grad_norm": 1.9193288351945685,
+      "learning_rate": 6.599249370607083e-06,
+      "loss": 0.7465,
+      "step": 950
+    },
+    {
+      "epoch": 0.9230769230769231,
+      "grad_norm": 1.7382141318072688,
+      "learning_rate": 6.519525787871235e-06,
+      "loss": 0.7829,
+      "step": 960
+    },
+    {
+      "epoch": 0.9326923076923077,
+      "grad_norm": 1.6639815518816288,
+      "learning_rate": 6.439374262123731e-06,
+      "loss": 0.7483,
+      "step": 970
+    },
+    {
+      "epoch": 0.9423076923076923,
+      "grad_norm": 2.040809216745246,
+      "learning_rate": 6.358817366384122e-06,
+      "loss": 0.7695,
+      "step": 980
+    },
+    {
+      "epoch": 0.9519230769230769,
+      "grad_norm": 1.9765992795556684,
+      "learning_rate": 6.277877787836034e-06,
+      "loss": 0.8039,
+      "step": 990
+    },
+    {
+      "epoch": 0.9615384615384616,
+      "grad_norm": 1.7379165898055615,
+      "learning_rate": 6.1965783214377895e-06,
+      "loss": 0.7708,
+      "step": 1000
+    },
+    {
+      "epoch": 0.9615384615384616,
+      "eval_loss": 0.757188618183136,
+      "eval_runtime": 30.718,
+      "eval_samples_per_second": 60.193,
+      "eval_steps_per_second": 7.553,
+      "step": 1000
+    },
+    {
+      "epoch": 0.9711538461538461,
+      "grad_norm": 1.91002204642886,
+      "learning_rate": 6.114941863502682e-06,
+      "loss": 0.791,
+      "step": 1010
+    },
+    {
+      "epoch": 0.9807692307692307,
+      "grad_norm": 1.7547449600143168,
+      "learning_rate": 6.032991405250702e-06,
+      "loss": 0.7584,
+      "step": 1020
+    },
+    {
+      "epoch": 0.9903846153846154,
+      "grad_norm": 1.7518146591240198,
+      "learning_rate": 5.950750026333534e-06,
+      "loss": 0.7911,
+      "step": 1030
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 1.8856712522978225,
+      "learning_rate": 5.8682408883346535e-06,
+      "loss": 0.7762,
+      "step": 1040
+    },
+    {
+      "epoch": 1.0096153846153846,
+      "grad_norm": 2.6930087290136417,
+      "learning_rate": 5.785487228246339e-06,
+      "loss": 0.6172,
+      "step": 1050
+    },
+    {
+      "epoch": 1.0192307692307692,
+      "grad_norm": 2.0128509132494217,
+      "learning_rate": 5.7025123519254644e-06,
+      "loss": 0.5335,
+      "step": 1060
+    },
+    {
+      "epoch": 1.0288461538461537,
+      "grad_norm": 2.081106502499814,
+      "learning_rate": 5.619339627529876e-06,
+      "loss": 0.5475,
+      "step": 1070
+    },
+    {
+      "epoch": 1.0384615384615385,
+      "grad_norm": 2.089872779664623,
+      "learning_rate": 5.53599247893724e-06,
+      "loss": 0.5491,
+      "step": 1080
+    },
+    {
+      "epoch": 1.0480769230769231,
+      "grad_norm": 2.487028007682742,
+      "learning_rate": 5.45249437914819e-06,
+      "loss": 0.5396,
+      "step": 1090
+    },
+    {
+      "epoch": 1.0576923076923077,
+      "grad_norm": 2.1183702883792015,
+      "learning_rate": 5.368868843675642e-06,
+      "loss": 0.5602,
+      "step": 1100
+    },
+    {
+      "epoch": 1.0673076923076923,
+      "grad_norm": 2.231623903215653,
+      "learning_rate": 5.285139423922138e-06,
+      "loss": 0.5341,
+      "step": 1110
+    },
+    {
+      "epoch": 1.0769230769230769,
+      "grad_norm": 1.8590648999967128,
+      "learning_rate": 5.201329700547077e-06,
+      "loss": 0.5499,
+      "step": 1120
+    },
+    {
+      "epoch": 1.0865384615384615,
+      "grad_norm": 2.031456951588171,
+      "learning_rate": 5.117463276825711e-06,
+      "loss": 0.5541,
+      "step": 1130
+    },
+    {
+      "epoch": 1.0961538461538463,
+      "grad_norm": 1.996025281675884,
+      "learning_rate": 5.033563772001782e-06,
+      "loss": 0.5229,
+      "step": 1140
+    },
+    {
+      "epoch": 1.1057692307692308,
+      "grad_norm": 2.01803070793536,
+      "learning_rate": 4.949654814635623e-06,
+      "loss": 0.5534,
+      "step": 1150
+    },
+    {
+      "epoch": 1.1153846153846154,
+      "grad_norm": 2.0131103606600163,
+      "learning_rate": 4.865760035949695e-06,
+      "loss": 0.5503,
+      "step": 1160
+    },
+    {
+      "epoch": 1.125,
+      "grad_norm": 1.9109776540155083,
+      "learning_rate": 4.781903063173321e-06,
+      "loss": 0.5587,
+      "step": 1170
+    },
+    {
+      "epoch": 1.1346153846153846,
+      "grad_norm": 2.240195731738998,
+      "learning_rate": 4.69810751288857e-06,
+      "loss": 0.5345,
+      "step": 1180
+    },
+    {
+      "epoch": 1.1442307692307692,
+      "grad_norm": 2.21815291482556,
+      "learning_rate": 4.61439698437914e-06,
+      "loss": 0.5105,
+      "step": 1190
+    },
+    {
+      "epoch": 1.1538461538461537,
+      "grad_norm": 1.959424085301074,
+      "learning_rate": 4.530795052984104e-06,
+      "loss": 0.5472,
+      "step": 1200
+    },
+    {
+      "epoch": 1.1634615384615385,
+      "grad_norm": 1.9724626057174217,
+      "learning_rate": 4.447325263458401e-06,
+      "loss": 0.5425,
+      "step": 1210
+    },
+    {
+      "epoch": 1.1730769230769231,
+      "grad_norm": 1.9404277504052594,
+      "learning_rate": 4.364011123341947e-06,
+      "loss": 0.5383,
+      "step": 1220
+    },
+    {
+      "epoch": 1.1826923076923077,
+      "grad_norm": 1.975957154057773,
+      "learning_rate": 4.280876096339222e-06,
+      "loss": 0.5635,
+      "step": 1230
+    },
+    {
+      "epoch": 1.1923076923076923,
+      "grad_norm": 3.7893846099890203,
+      "learning_rate": 4.1979435957111984e-06,
+      "loss": 0.5508,
+      "step": 1240
+    },
+    {
+      "epoch": 1.2019230769230769,
+      "grad_norm": 2.088379071314605,
+      "learning_rate": 4.11523697768149e-06,
+      "loss": 0.5917,
+      "step": 1250
+    },
+    {
+      "epoch": 1.2115384615384615,
+      "grad_norm": 2.0699986543823914,
+      "learning_rate": 4.032779534858544e-06,
+      "loss": 0.5549,
+      "step": 1260
+    },
+    {
+      "epoch": 1.2211538461538463,
+      "grad_norm": 2.4531750427167744,
+      "learning_rate": 3.9505944896757635e-06,
+      "loss": 0.5314,
+      "step": 1270
+    },
+    {
+      "epoch": 1.2307692307692308,
+      "grad_norm": 2.130560765169762,
+      "learning_rate": 3.86870498785139e-06,
+      "loss": 0.5327,
+      "step": 1280
+    },
+    {
+      "epoch": 1.2403846153846154,
+      "grad_norm": 2.2163755882504947,
+      "learning_rate": 3.7871340918699945e-06,
+      "loss": 0.509,
+      "step": 1290
+    },
+    {
+      "epoch": 1.25,
+      "grad_norm": 2.1107171131060136,
+      "learning_rate": 3.705904774487396e-06,
+      "loss": 0.5153,
+      "step": 1300
+    },
+    {
+      "epoch": 1.2596153846153846,
+      "grad_norm": 2.1786622272309413,
+      "learning_rate": 3.6250399122608713e-06,
+      "loss": 0.5589,
+      "step": 1310
+    },
+    {
+      "epoch": 1.2692307692307692,
+      "grad_norm": 1.9962608207111503,
+      "learning_rate": 3.544562279106436e-06,
+      "loss": 0.5425,
+      "step": 1320
+    },
+    {
+      "epoch": 1.2788461538461537,
+      "grad_norm": 1.8464394968329445,
+      "learning_rate": 3.464494539885047e-06,
+      "loss": 0.5349,
+      "step": 1330
+    },
+    {
+      "epoch": 1.2884615384615383,
+      "grad_norm": 2.138267726845245,
+      "learning_rate": 3.3848592440195118e-06,
+      "loss": 0.5461,
+      "step": 1340
+    },
+    {
+      "epoch": 1.2980769230769231,
+      "grad_norm": 2.0087619862264767,
+      "learning_rate": 3.3056788191439116e-06,
+      "loss": 0.559,
+      "step": 1350
+    },
+    {
+      "epoch": 1.3076923076923077,
+      "grad_norm": 2.1318708987886854,
+      "learning_rate": 3.226975564787322e-06,
+      "loss": 0.5326,
+      "step": 1360
+    },
+    {
+      "epoch": 1.3173076923076923,
+      "grad_norm": 2.0595039744355725,
+      "learning_rate": 3.148771646093608e-06,
+      "loss": 0.5215,
+      "step": 1370
+    },
+    {
+      "epoch": 1.3269230769230769,
+      "grad_norm": 2.106731128516135,
+      "learning_rate": 3.0710890875790745e-06,
+      "loss": 0.5205,
+      "step": 1380
+    },
+    {
+      "epoch": 1.3365384615384617,
+      "grad_norm": 2.015598237138461,
+      "learning_rate": 2.993949766929711e-06,
+      "loss": 0.5155,
+      "step": 1390
+    },
+    {
+      "epoch": 1.3461538461538463,
+      "grad_norm": 2.089932096970866,
+      "learning_rate": 2.917375408839803e-06,
+      "loss": 0.5321,
+      "step": 1400
+    },
+    {
+      "epoch": 1.3557692307692308,
+      "grad_norm": 2.1011121825872414,
+      "learning_rate": 2.8413875788936067e-06,
+      "loss": 0.5654,
+      "step": 1410
+    },
+    {
+      "epoch": 1.3653846153846154,
+      "grad_norm": 2.5049128653170607,
+      "learning_rate": 2.766007677491871e-06,
+      "loss": 0.5332,
+      "step": 1420
+    },
+    {
+      "epoch": 1.375,
+      "grad_norm": 2.199796386555421,
+      "learning_rate": 2.6912569338248317e-06,
+      "loss": 0.5129,
+      "step": 1430
+    },
+    {
+      "epoch": 1.3846153846153846,
+      "grad_norm": 2.122643085103428,
+      "learning_rate": 2.6171563998934605e-06,
+      "loss": 0.5315,
+      "step": 1440
+    },
+    {
+      "epoch": 1.3942307692307692,
+      "grad_norm": 1.9680294145904305,
+      "learning_rate": 2.5437269445806146e-06,
+      "loss": 0.5388,
+      "step": 1450
+    },
+    {
+      "epoch": 1.4038461538461537,
+      "grad_norm": 2.1649286481102505,
+      "learning_rate": 2.4709892477737263e-06,
+      "loss": 0.5449,
+      "step": 1460
+    },
+    {
+      "epoch": 1.4134615384615383,
+      "grad_norm": 1.9108821348630196,
+      "learning_rate": 2.3989637945407547e-06,
+      "loss": 0.5124,
+      "step": 1470
+    },
+    {
+      "epoch": 1.4230769230769231,
+      "grad_norm": 1.8705842302466216,
+      "learning_rate": 2.3276708693609947e-06,
+      "loss": 0.5271,
+      "step": 1480
+    },
+    {
+      "epoch": 1.4326923076923077,
+      "grad_norm": 2.218892686030775,
+      "learning_rate": 2.2571305504123547e-06,
+      "loss": 0.536,
+      "step": 1490
+    },
+    {
+      "epoch": 1.4423076923076923,
+      "grad_norm": 2.151762003437118,
+      "learning_rate": 2.187362703916766e-06,
+      "loss": 0.5513,
+      "step": 1500
+    },
+    {
+      "epoch": 1.4423076923076923,
+      "eval_loss": 0.7693456411361694,
+      "eval_runtime": 31.5006,
+      "eval_samples_per_second": 58.697,
+      "eval_steps_per_second": 7.365,
+      "step": 1500
+    },
+    {
+      "epoch": 1.4519230769230769,
+      "grad_norm": 2.2335086500452768,
+      "learning_rate": 2.1183869785452744e-06,
+      "loss": 0.5343,
+      "step": 1510
+    },
+    {
+      "epoch": 1.4615384615384617,
+      "grad_norm": 2.201272848216364,
+      "learning_rate": 2.050222799884387e-06,
+      "loss": 0.5203,
+      "step": 1520
+    },
+    {
+      "epoch": 1.4711538461538463,
+      "grad_norm": 2.245522812744825,
+      "learning_rate": 1.9828893649652653e-06,
+      "loss": 0.5492,
+      "step": 1530
+    },
+    {
+      "epoch": 1.4807692307692308,
+      "grad_norm": 2.4219731931643134,
+      "learning_rate": 1.9164056368572847e-06,
+      "loss": 0.531,
+      "step": 1540
+    },
+    {
+      "epoch": 1.4903846153846154,
+      "grad_norm": 1.9852458155211052,
+      "learning_rate": 1.8507903393274622e-06,
+      "loss": 0.5413,
+      "step": 1550
+    },
+    {
+      "epoch": 1.5,
+      "grad_norm": 2.057532782936977,
+      "learning_rate": 1.7860619515673034e-06,
+      "loss": 0.5271,
+      "step": 1560
+    },
+    {
+      "epoch": 1.5096153846153846,
+      "grad_norm": 2.3471986544512187,
+      "learning_rate": 1.7222387029885268e-06,
+      "loss": 0.5579,
+      "step": 1570
+    },
+    {
+      "epoch": 1.5192307692307692,
+      "grad_norm": 2.12927805118844,
+      "learning_rate": 1.6593385680891139e-06,
+      "loss": 0.524,
+      "step": 1580
+    },
+    {
+      "epoch": 1.5288461538461537,
+      "grad_norm": 2.3452167989714785,
+      "learning_rate": 1.5973792613911698e-06,
+      "loss": 0.5263,
+      "step": 1590
+    },
+    {
+      "epoch": 1.5384615384615383,
+      "grad_norm": 2.3303569386029115,
+      "learning_rate": 1.5363782324520033e-06,
+      "loss": 0.5075,
+      "step": 1600
+    },
+    {
+      "epoch": 1.5480769230769231,
+      "grad_norm": 2.3477866769254514,
+      "learning_rate": 1.476352660949802e-06,
+      "loss": 0.5611,
+      "step": 1610
+    },
+    {
+      "epoch": 1.5576923076923077,
+      "grad_norm": 1.9831558221073002,
+      "learning_rate": 1.4173194518453415e-06,
+      "loss": 0.5124,
+      "step": 1620
+    },
+    {
+      "epoch": 1.5673076923076923,
+      "grad_norm": 2.5946859927202732,
+      "learning_rate": 1.3592952306210589e-06,
+      "loss": 0.5193,
+      "step": 1630
+    },
+    {
+      "epoch": 1.5769230769230769,
+      "grad_norm": 2.3095838494615433,
+      "learning_rate": 1.3022963385988153e-06,
+      "loss": 0.5374,
+      "step": 1640
+    },
+    {
+      "epoch": 1.5865384615384617,
+      "grad_norm": 2.0028485071747806,
+      "learning_rate": 1.246338828337707e-06,
+      "loss": 0.512,
+      "step": 1650
+    },
+    {
+      "epoch": 1.5961538461538463,
+      "grad_norm": 2.029535183303286,
+      "learning_rate": 1.1914384591132045e-06,
+      "loss": 0.5128,
+      "step": 1660
+    },
+    {
+      "epoch": 1.6057692307692308,
+      "grad_norm": 2.147413310284848,
+      "learning_rate": 1.1376106924788594e-06,
+      "loss": 0.5316,
+      "step": 1670
+    },
+    {
+      "epoch": 1.6153846153846154,
+      "grad_norm": 2.3899145312729693,
+      "learning_rate": 1.0848706879118893e-06,
+      "loss": 0.5658,
+      "step": 1680
+    },
+    {
+      "epoch": 1.625,
+      "grad_norm": 2.0348123827167335,
+      "learning_rate": 1.0332332985438248e-06,
+      "loss": 0.5269,
+      "step": 1690
+    },
+    {
+      "epoch": 1.6346153846153846,
+      "grad_norm": 1.8531683575644768,
+      "learning_rate": 9.82713066977427e-07,
+      "loss": 0.5405,
+      "step": 1700
+    },
+    {
+      "epoch": 1.6442307692307692,
+      "grad_norm": 1.9592530320960306,
+      "learning_rate": 9.333242211910687e-07,
+      "loss": 0.5184,
+      "step": 1710
+    },
+    {
+      "epoch": 1.6538461538461537,
+      "grad_norm": 1.9954409383029053,
+      "learning_rate": 8.850806705317183e-07,
+      "loss": 0.4852,
+      "step": 1720
+    },
+    {
+      "epoch": 1.6634615384615383,
+      "grad_norm": 2.098998703354889,
+      "learning_rate": 8.379960017976546e-07,
+      "loss": 0.5139,
+      "step": 1730
+    },
+    {
+      "epoch": 1.6730769230769231,
+      "grad_norm": 2.5491851954980658,
+      "learning_rate": 7.920834754120305e-07,
+      "loss": 0.5392,
+      "step": 1740
+    },
+    {
+      "epoch": 1.6826923076923077,
+      "grad_norm": 2.3935228227975,
+      "learning_rate": 7.473560216883524e-07,
+      "loss": 0.533,
+      "step": 1750
+    },
+    {
+      "epoch": 1.6923076923076923,
+      "grad_norm": 2.170304242236588,
+      "learning_rate": 7.03826237188916e-07,
+      "loss": 0.5523,
+      "step": 1760
+    },
+    {
+      "epoch": 1.7019230769230769,
+      "grad_norm": 2.238634395178553,
+      "learning_rate": 6.615063811772532e-07,
+      "loss": 0.5122,
+      "step": 1770
+    },
+    {
+      "epoch": 1.7115384615384617,
+      "grad_norm": 2.0570639625612985,
+      "learning_rate": 6.204083721655607e-07,
+      "loss": 0.4928,
+      "step": 1780
+    },
+    {
+      "epoch": 1.7211538461538463,
+      "grad_norm": 2.103504186819526,
+      "learning_rate": 5.805437845580958e-07,
+      "loss": 0.5203,
+      "step": 1790
+    },
+    {
+      "epoch": 1.7307692307692308,
+      "grad_norm": 2.197105961623965,
+      "learning_rate": 5.41923845391486e-07,
+      "loss": 0.5124,
+      "step": 1800
+    },
+    {
+      "epoch": 1.7403846153846154,
+      "grad_norm": 2.1152456056469227,
+      "learning_rate": 5.045594311728708e-07,
+      "loss": 0.4916,
+      "step": 1810
+    },
+    {
+      "epoch": 1.75,
+      "grad_norm": 2.0939964989579973,
+      "learning_rate": 4.6846106481675035e-07,
+      "loss": 0.5111,
+      "step": 1820
+    },
+    {
+      "epoch": 1.7596153846153846,
+      "grad_norm": 2.4677432405873256,
+      "learning_rate": 4.336389126814311e-07,
+      "loss": 0.5346,
+      "step": 1830
+    },
+    {
+      "epoch": 1.7692307692307692,
+      "grad_norm": 1.9790895263145134,
+      "learning_rate": 4.001027817058789e-07,
+      "loss": 0.5051,
+      "step": 1840
+    },
+    {
+      "epoch": 1.7788461538461537,
+      "grad_norm": 2.2327516887468417,
+      "learning_rate": 3.6786211664779583e-07,
+      "loss": 0.5376,
+      "step": 1850
+    },
+    {
+      "epoch": 1.7884615384615383,
+      "grad_norm": 2.0971604625086777,
+      "learning_rate": 3.369259974236988e-07,
+      "loss": 0.4795,
+      "step": 1860
+    },
+    {
+      "epoch": 1.7980769230769231,
+      "grad_norm": 2.124716857194864,
+      "learning_rate": 3.0730313655175647e-07,
+      "loss": 0.5388,
+      "step": 1870
+    },
+    {
+      "epoch": 1.8076923076923077,
+      "grad_norm": 2.1095913682765977,
+      "learning_rate": 2.790018766980773e-07,
+      "loss": 0.545,
+      "step": 1880
+    },
+    {
+      "epoch": 1.8173076923076923,
+      "grad_norm": 2.0130617931416275,
+      "learning_rate": 2.520301883271797e-07,
+      "loss": 0.5169,
+      "step": 1890
+    },
+    {
+      "epoch": 1.8269230769230769,
+      "grad_norm": 2.083745653036941,
+      "learning_rate": 2.2639566745727203e-07,
+      "loss": 0.4928,
+      "step": 1900
+    },
+    {
+      "epoch": 1.8365384615384617,
+      "grad_norm": 2.220101162444105,
+      "learning_rate": 2.0210553352098815e-07,
+      "loss": 0.5075,
+      "step": 1910
+    },
+    {
+      "epoch": 1.8461538461538463,
+      "grad_norm": 2.308065945824805,
+      "learning_rate": 1.7916662733218848e-07,
+      "loss": 0.5116,
+      "step": 1920
+    },
+    {
+      "epoch": 1.8557692307692308,
+      "grad_norm": 1.9607562596689283,
+      "learning_rate": 1.575854091593837e-07,
+      "loss": 0.5183,
+      "step": 1930
+    },
+    {
+      "epoch": 1.8653846153846154,
+      "grad_norm": 2.1443103265105634,
+      "learning_rate": 1.3736795690633353e-07,
+      "loss": 0.5123,
+      "step": 1940
+    },
+    {
+      "epoch": 1.875,
+      "grad_norm": 1.9320175413394869,
+      "learning_rate": 1.185199644003332e-07,
+      "loss": 0.4931,
+      "step": 1950
+    },
+    {
+      "epoch": 1.8846153846153846,
+      "grad_norm": 2.2160725485465567,
+      "learning_rate": 1.0104673978866164e-07,
+      "loss": 0.513,
+      "step": 1960
+    },
+    {
+      "epoch": 1.8942307692307692,
+      "grad_norm": 2.2787872837737697,
+      "learning_rate": 8.495320404365348e-08,
+      "loss": 0.5062,
+      "step": 1970
+    },
+    {
+      "epoch": 1.9038461538461537,
+      "grad_norm": 2.168799751436692,
+      "learning_rate": 7.024388957680705e-08,
+      "loss": 0.5168,
+      "step": 1980
+    },
+    {
+      "epoch": 1.9134615384615383,
+      "grad_norm": 2.064037817068646,
+      "learning_rate": 5.6922938962329364e-08,
+      "loss": 0.5154,
+      "step": 1990
+    },
+    {
+      "epoch": 1.9230769230769231,
+      "grad_norm": 2.085831001764275,
+      "learning_rate": 4.499410377045765e-08,
+      "loss": 0.5059,
+      "step": 2000
+    },
+    {
+      "epoch": 1.9230769230769231,
+      "eval_loss": 0.7636520862579346,
+      "eval_runtime": 31.002,
+      "eval_samples_per_second": 59.641,
+      "eval_steps_per_second": 7.483,
+      "step": 2000
+    },
+    {
+      "epoch": 1.9326923076923077,
+      "grad_norm": 2.0866363192357245,
+      "learning_rate": 3.446074351091566e-08,
+      "loss": 0.535,
+      "step": 2010
+    },
+    {
+      "epoch": 1.9423076923076923,
+      "grad_norm": 1.9303541701779643,
+      "learning_rate": 2.5325824686772138e-08,
+      "loss": 0.5193,
+      "step": 2020
+    },
+    {
+      "epoch": 1.9519230769230769,
+      "grad_norm": 2.035058945568711,
+      "learning_rate": 1.7591919958986348e-08,
+      "loss": 0.5242,
+      "step": 2030
+    },
+    {
+      "epoch": 1.9615384615384617,
+      "grad_norm": 1.906676822404482,
+      "learning_rate": 1.1261207421874309e-08,
+      "loss": 0.4826,
+      "step": 2040
+    },
+    {
+      "epoch": 1.9711538461538463,
+      "grad_norm": 2.0734277093423965,
+      "learning_rate": 6.335469989692255e-09,
+      "loss": 0.5064,
+      "step": 2050
+    },
+    {
+      "epoch": 1.9807692307692308,
+      "grad_norm": 2.320470177512831,
+      "learning_rate": 2.816094894513843e-09,
+      "loss": 0.5264,
+      "step": 2060
+    },
+    {
+      "epoch": 1.9903846153846154,
+      "grad_norm": 2.2494248679357955,
+      "learning_rate": 7.040732955487795e-10,
+      "loss": 0.51,
+      "step": 2070
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 2.2699709074042307,
+      "learning_rate": 0.0,
+      "loss": 0.5211,
+      "step": 2080
+    },
+    {
+      "epoch": 2.0,
+      "step": 2080,
+      "total_flos": 34318199685120.0,
+      "train_loss": 0.6641153321816371,
+      "train_runtime": 2465.8736,
+      "train_samples_per_second": 13.491,
+      "train_steps_per_second": 0.844
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 2080,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 50000,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 34318199685120.0,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

training_eval_loss.png ADDED Viewed

training_loss.png ADDED Viewed