Training in progress, step 450, checkpoint
last-checkpoint/adapter_model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:afb10fcd25ab47f6824a47a0e69a659b482321bd359014b11cd02b39c18c712b
 size 912336848
last-checkpoint/optimizer.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:f412242b8b4c1088fdef955be25e987f1a3ae214ed80585611bc33ab628f9141
 size 463916756
last-checkpoint/rng_state.pth
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:f56a3eb79b75d9133a792f2e57fd736a8e6c8d874fe198547337ba33d890f211
 size 14244
last-checkpoint/scheduler.pt
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:c9636ae38b683f4b5b714bdf172e563b0c593e0efe94f07eea78547963bfbfae
 size 1064
last-checkpoint/trainer_state.json
CHANGED
@@ -1,9 +1,9 @@
 {
-  "best_metric": 0.
-  "best_model_checkpoint": "miner_id_24/checkpoint-
-  "epoch": 0.
+  "best_metric": 0.8302884697914124,
+  "best_model_checkpoint": "miner_id_24/checkpoint-450",
+  "epoch": 0.08468197214904027,
   "eval_steps": 150,
-  "global_step":
+  "global_step": 450,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -2131,6 +2131,1064 @@
       "eval_samples_per_second": 9.109,
       "eval_steps_per_second": 2.278,
       "step": 300
+    },
+    {
+      "epoch": 0.056642830259691385,
+      "grad_norm": 1.5962673425674438,
+      "learning_rate": 2.6813618894527138e-05,
+      "loss": 3.5087,
+      "step": 301
+    },
+    {
+      "epoch": 0.05683101242002258,
+      "grad_norm": 1.5255632400512695,
+      "learning_rate": 2.6490591592961578e-05,
+      "loss": 3.4712,
+      "step": 302
+    },
+    {
+      "epoch": 0.05701919458035378,
+      "grad_norm": 1.542389154434204,
+      "learning_rate": 2.6168819172567392e-05,
+      "loss": 3.2852,
+      "step": 303
+    },
+    {
+      "epoch": 0.05720737674068498,
+      "grad_norm": 1.664982557296753,
+      "learning_rate": 2.5848318808857606e-05,
+      "loss": 3.5038,
+      "step": 304
+    },
+    {
+      "epoch": 0.057395558901016185,
+      "grad_norm": 1.4651890993118286,
+      "learning_rate": 2.5529107609445733e-05,
+      "loss": 3.1837,
+      "step": 305
+    },
+    {
+      "epoch": 0.057583741061347386,
+      "grad_norm": 1.486304521560669,
+      "learning_rate": 2.521120261313241e-05,
+      "loss": 3.2311,
+      "step": 306
+    },
+    {
+      "epoch": 0.05777192322167859,
+      "grad_norm": 1.4881970882415771,
+      "learning_rate": 2.4894620788996037e-05,
+      "loss": 3.419,
+      "step": 307
+    },
+    {
+      "epoch": 0.05796010538200978,
+      "grad_norm": 1.4943639039993286,
+      "learning_rate": 2.457937903548695e-05,
+      "loss": 3.3085,
+      "step": 308
+    },
+    {
+      "epoch": 0.058148287542340985,
+      "grad_norm": 1.5298432111740112,
+      "learning_rate": 2.426549417952542e-05,
+      "loss": 3.608,
+      "step": 309
+    },
+    {
+      "epoch": 0.058336469702672186,
+      "grad_norm": 1.5149232149124146,
+      "learning_rate": 2.3952982975603496e-05,
+      "loss": 3.3398,
+      "step": 310
+    },
+    {
+      "epoch": 0.05852465186300339,
+      "grad_norm": 1.492951512336731,
+      "learning_rate": 2.3641862104890595e-05,
+      "loss": 3.3529,
+      "step": 311
+    },
+    {
+      "epoch": 0.05871283402333459,
+      "grad_norm": 1.448451280593872,
+      "learning_rate": 2.3332148174343254e-05,
+      "loss": 3.2021,
+      "step": 312
+    },
+    {
+      "epoch": 0.05890101618366579,
+      "grad_norm": 1.4983866214752197,
+      "learning_rate": 2.3023857715818532e-05,
+      "loss": 3.3579,
+      "step": 313
+    },
+    {
+      "epoch": 0.059089198343996986,
+      "grad_norm": 1.5230140686035156,
+      "learning_rate": 2.2717007185191674e-05,
+      "loss": 3.4931,
+      "step": 314
+    },
+    {
+      "epoch": 0.05927738050432819,
+      "grad_norm": 1.5284234285354614,
+      "learning_rate": 2.24116129614777e-05,
+      "loss": 3.4331,
+      "step": 315
+    },
+    {
+      "epoch": 0.05946556266465939,
+      "grad_norm": 1.4961637258529663,
+      "learning_rate": 2.2107691345957133e-05,
+      "loss": 3.3148,
+      "step": 316
+    },
+    {
+      "epoch": 0.05965374482499059,
+      "grad_norm": 1.547951340675354,
+      "learning_rate": 2.1805258561305862e-05,
+      "loss": 3.3194,
+      "step": 317
+    },
+    {
+      "epoch": 0.05984192698532179,
+      "grad_norm": 1.5121986865997314,
+      "learning_rate": 2.1504330750729186e-05,
+      "loss": 3.4504,
+      "step": 318
+    },
+    {
+      "epoch": 0.060030109145652995,
+      "grad_norm": 1.51499605178833,
+      "learning_rate": 2.120492397710022e-05,
+      "loss": 3.4349,
+      "step": 319
+    },
+    {
+      "epoch": 0.06021829130598419,
+      "grad_norm": 1.520011305809021,
+      "learning_rate": 2.090705422210237e-05,
+      "loss": 3.3619,
+      "step": 320
+    },
+    {
+      "epoch": 0.06040647346631539,
+      "grad_norm": 1.5512540340423584,
+      "learning_rate": 2.061073738537635e-05,
+      "loss": 3.38,
+      "step": 321
+    },
+    {
+      "epoch": 0.06059465562664659,
+      "grad_norm": 1.463521122932434,
+      "learning_rate": 2.0315989283671473e-05,
+      "loss": 3.1915,
+      "step": 322
+    },
+    {
+      "epoch": 0.060782837786977795,
+      "grad_norm": 1.5051774978637695,
+      "learning_rate": 2.0022825650001387e-05,
+      "loss": 3.4725,
+      "step": 323
+    },
+    {
+      "epoch": 0.060971019947308996,
+      "grad_norm": 1.4965063333511353,
+      "learning_rate": 1.9731262132804274e-05,
+      "loss": 3.3025,
+      "step": 324
+    },
+    {
+      "epoch": 0.0611592021076402,
+      "grad_norm": 1.4896841049194336,
+      "learning_rate": 1.9441314295107537e-05,
+      "loss": 3.4046,
+      "step": 325
+    },
+    {
+      "epoch": 0.06134738426797139,
+      "grad_norm": 1.4985949993133545,
+      "learning_rate": 1.9152997613697183e-05,
+      "loss": 3.4278,
+      "step": 326
+    },
+    {
+      "epoch": 0.061535566428302595,
+      "grad_norm": 1.5722819566726685,
+      "learning_rate": 1.8866327478291546e-05,
+      "loss": 3.4424,
+      "step": 327
+    },
+    {
+      "epoch": 0.061723748588633796,
+      "grad_norm": 1.457684874534607,
+      "learning_rate": 1.8581319190720035e-05,
+      "loss": 3.175,
+      "step": 328
+    },
+    {
+      "epoch": 0.061911930748965,
+      "grad_norm": 1.501202940940857,
+      "learning_rate": 1.8297987964106115e-05,
+      "loss": 3.2706,
+      "step": 329
+    },
+    {
+      "epoch": 0.0621001129092962,
+      "grad_norm": 1.4955599308013916,
+      "learning_rate": 1.801634892205545e-05,
+      "loss": 3.3183,
+      "step": 330
+    },
+    {
+      "epoch": 0.0622882950696274,
+      "grad_norm": 1.52448308467865,
+      "learning_rate": 1.7736417097848506e-05,
+      "loss": 3.3863,
+      "step": 331
+    },
+    {
+      "epoch": 0.0624764772299586,
+      "grad_norm": 1.5245345830917358,
+      "learning_rate": 1.7458207433638223e-05,
+      "loss": 3.4718,
+      "step": 332
+    },
+    {
+      "epoch": 0.0626646593902898,
+      "grad_norm": 1.5418884754180908,
+      "learning_rate": 1.718173477965236e-05,
+      "loss": 3.3681,
+      "step": 333
+    },
+    {
+      "epoch": 0.062852841550621,
+      "grad_norm": 1.487151861190796,
+      "learning_rate": 1.6907013893400837e-05,
+      "loss": 3.3089,
+      "step": 334
+    },
+    {
+      "epoch": 0.0630410237109522,
+      "grad_norm": 1.4714837074279785,
+      "learning_rate": 1.6634059438888033e-05,
+      "loss": 3.181,
+      "step": 335
+    },
+    {
+      "epoch": 0.0632292058712834,
+      "grad_norm": 1.5032023191452026,
+      "learning_rate": 1.636288598583e-05,
+      "loss": 3.3265,
+      "step": 336
+    },
+    {
+      "epoch": 0.0634173880316146,
+      "grad_norm": 1.5267105102539062,
+      "learning_rate": 1.6093508008876857e-05,
+      "loss": 3.1744,
+      "step": 337
+    },
+    {
+      "epoch": 0.0636055701919458,
+      "grad_norm": 1.517006516456604,
+      "learning_rate": 1.5825939886840037e-05,
+      "loss": 3.2575,
+      "step": 338
+    },
+    {
+      "epoch": 0.06379375235227701,
+      "grad_norm": 1.4955118894577026,
+      "learning_rate": 1.5560195901924894e-05,
+      "loss": 3.2745,
+      "step": 339
+    },
+    {
+      "epoch": 0.06398193451260821,
+      "grad_norm": 1.7153363227844238,
+      "learning_rate": 1.5296290238968303e-05,
+      "loss": 3.4614,
+      "step": 340
+    },
+    {
+      "epoch": 0.06417011667293941,
+      "grad_norm": 1.5209360122680664,
+      "learning_rate": 1.50342369846815e-05,
+      "loss": 3.3276,
+      "step": 341
+    },
+    {
+      "epoch": 0.0643582988332706,
+      "grad_norm": 1.5184190273284912,
+      "learning_rate": 1.4774050126898164e-05,
+      "loss": 3.3869,
+      "step": 342
+    },
+    {
+      "epoch": 0.0645464809936018,
+      "grad_norm": 1.5385315418243408,
+      "learning_rate": 1.451574355382776e-05,
+      "loss": 3.4677,
+      "step": 343
+    },
+    {
+      "epoch": 0.064734663153933,
+      "grad_norm": 1.4663329124450684,
+      "learning_rate": 1.425933105331429e-05,
+      "loss": 3.3069,
+      "step": 344
+    },
+    {
+      "epoch": 0.0649228453142642,
+      "grad_norm": 1.5196608304977417,
+      "learning_rate": 1.4004826312100216e-05,
+      "loss": 3.3985,
+      "step": 345
+    },
+    {
+      "epoch": 0.0651110274745954,
+      "grad_norm": 1.5887420177459717,
+      "learning_rate": 1.3752242915095992e-05,
+      "loss": 3.309,
+      "step": 346
+    },
+    {
+      "epoch": 0.06529920963492661,
+      "grad_norm": 1.5067335367202759,
+      "learning_rate": 1.3501594344654884e-05,
+      "loss": 3.186,
+      "step": 347
+    },
+    {
+      "epoch": 0.06548739179525781,
+      "grad_norm": 1.4992436170578003,
+      "learning_rate": 1.3252893979853304e-05,
+      "loss": 3.3428,
+      "step": 348
+    },
+    {
+      "epoch": 0.06567557395558901,
+      "grad_norm": 1.6664701700210571,
+      "learning_rate": 1.3006155095776707e-05,
+      "loss": 3.4061,
+      "step": 349
+    },
+    {
+      "epoch": 0.06586375611592021,
+      "grad_norm": 2.502802848815918,
+      "learning_rate": 1.2761390862810907e-05,
+      "loss": 3.0303,
+      "step": 350
+    },
+    {
+      "epoch": 0.06605193827625142,
+      "grad_norm": 1.500586748123169,
+      "learning_rate": 1.2518614345939212e-05,
+      "loss": 3.4372,
+      "step": 351
+    },
+    {
+      "epoch": 0.06624012043658262,
+      "grad_norm": 1.5125335454940796,
+      "learning_rate": 1.227783850404487e-05,
+      "loss": 3.3834,
+      "step": 352
+    },
+    {
+      "epoch": 0.06642830259691382,
+      "grad_norm": 1.5365911722183228,
+      "learning_rate": 1.2039076189219517e-05,
+      "loss": 3.4872,
+      "step": 353
+    },
+    {
+      "epoch": 0.066616484757245,
+      "grad_norm": 1.5221909284591675,
+      "learning_rate": 1.1802340146077045e-05,
+      "loss": 3.3993,
+      "step": 354
+    },
+    {
+      "epoch": 0.06680466691757621,
+      "grad_norm": 1.5232350826263428,
+      "learning_rate": 1.1567643011073392e-05,
+      "loss": 3.5355,
+      "step": 355
+    },
+    {
+      "epoch": 0.06699284907790741,
+      "grad_norm": 1.5335094928741455,
+      "learning_rate": 1.1334997311832002e-05,
+      "loss": 3.4007,
+      "step": 356
+    },
+    {
+      "epoch": 0.06718103123823861,
+      "grad_norm": 1.4804530143737793,
+      "learning_rate": 1.1104415466475087e-05,
+      "loss": 3.3991,
+      "step": 357
+    },
+    {
+      "epoch": 0.06736921339856981,
+      "grad_norm": 1.4774835109710693,
+      "learning_rate": 1.0875909782960886e-05,
+      "loss": 3.3081,
+      "step": 358
+    },
+    {
+      "epoch": 0.06755739555890101,
+      "grad_norm": 1.507690191268921,
+      "learning_rate": 1.0649492458426564e-05,
+      "loss": 3.5331,
+      "step": 359
+    },
+    {
+      "epoch": 0.06774557771923222,
+      "grad_norm": 1.4900294542312622,
+      "learning_rate": 1.0425175578537299e-05,
+      "loss": 3.4034,
+      "step": 360
+    },
+    {
+      "epoch": 0.06793375987956342,
+      "grad_norm": 1.536511778831482,
+      "learning_rate": 1.020297111684101e-05,
+      "loss": 3.3285,
+      "step": 361
+    },
+    {
+      "epoch": 0.06812194203989462,
+      "grad_norm": 1.4938386678695679,
+      "learning_rate": 9.98289093412938e-06,
+      "loss": 3.3401,
+      "step": 362
+    },
+    {
+      "epoch": 0.06831012420022582,
+      "grad_norm": 1.498859167098999,
+      "learning_rate": 9.764946777804646e-06,
+      "loss": 3.4705,
+      "step": 363
+    },
+    {
+      "epoch": 0.06849830636055702,
+      "grad_norm": 1.505251407623291,
+      "learning_rate": 9.549150281252633e-06,
+      "loss": 3.3769,
+      "step": 364
+    },
+    {
+      "epoch": 0.06868648852088823,
+      "grad_norm": 1.5352188348770142,
+      "learning_rate": 9.335512963221732e-06,
+      "loss": 3.4495,
+      "step": 365
+    },
+    {
+      "epoch": 0.06887467068121943,
+      "grad_norm": 1.5243737697601318,
+      "learning_rate": 9.124046227208082e-06,
+      "loss": 3.3756,
+      "step": 366
+    },
+    {
+      "epoch": 0.06906285284155061,
+      "grad_norm": 1.4866875410079956,
+      "learning_rate": 8.914761360846869e-06,
+      "loss": 3.2548,
+      "step": 367
+    },
+    {
+      "epoch": 0.06925103500188182,
+      "grad_norm": 1.70474374294281,
+      "learning_rate": 8.707669535309793e-06,
+      "loss": 3.3533,
+      "step": 368
+    },
+    {
+      "epoch": 0.06943921716221302,
+      "grad_norm": 1.4734491109848022,
+      "learning_rate": 8.502781804708826e-06,
+      "loss": 3.3556,
+      "step": 369
+    },
+    {
+      "epoch": 0.06962739932254422,
+      "grad_norm": 1.5178672075271606,
+      "learning_rate": 8.30010910550611e-06,
+      "loss": 3.4011,
+      "step": 370
+    },
+    {
+      "epoch": 0.06981558148287542,
+      "grad_norm": 1.5426009893417358,
+      "learning_rate": 8.09966225593024e-06,
+      "loss": 3.3262,
+      "step": 371
+    },
+    {
+      "epoch": 0.07000376364320662,
+      "grad_norm": 1.488108515739441,
+      "learning_rate": 7.901451955398792e-06,
+      "loss": 3.3038,
+      "step": 372
+    },
+    {
+      "epoch": 0.07019194580353783,
+      "grad_norm": 1.5539402961730957,
+      "learning_rate": 7.705488783947202e-06,
+      "loss": 3.4263,
+      "step": 373
+    },
+    {
+      "epoch": 0.07038012796386903,
+      "grad_norm": 1.5011879205703735,
+      "learning_rate": 7.511783201664052e-06,
+      "loss": 3.1954,
+      "step": 374
+    },
+    {
+      "epoch": 0.07056831012420023,
+      "grad_norm": 1.5092988014221191,
+      "learning_rate": 7.320345548132679e-06,
+      "loss": 3.328,
+      "step": 375
+    },
+    {
+      "epoch": 0.07075649228453143,
+      "grad_norm": 1.498982310295105,
+      "learning_rate": 7.131186041879357e-06,
+      "loss": 3.2402,
+      "step": 376
+    },
+    {
+      "epoch": 0.07094467444486263,
+      "grad_norm": 1.5231132507324219,
+      "learning_rate": 6.944314779827749e-06,
+      "loss": 3.4769,
+      "step": 377
+    },
+    {
+      "epoch": 0.07113285660519383,
+      "grad_norm": 1.5165536403656006,
+      "learning_rate": 6.759741736760061e-06,
+      "loss": 3.4978,
+      "step": 378
+    },
+    {
+      "epoch": 0.07132103876552502,
+      "grad_norm": 1.531844973564148,
+      "learning_rate": 6.577476764784546e-06,
+      "loss": 3.3847,
+      "step": 379
+    },
+    {
+      "epoch": 0.07150922092585622,
+      "grad_norm": 1.4768074750900269,
+      "learning_rate": 6.397529592809614e-06,
+      "loss": 3.1253,
+      "step": 380
+    },
+    {
+      "epoch": 0.07169740308618743,
+      "grad_norm": 1.5122772455215454,
+      "learning_rate": 6.219909826024589e-06,
+      "loss": 3.3401,
+      "step": 381
+    },
+    {
+      "epoch": 0.07188558524651863,
+      "grad_norm": 1.5160380601882935,
+      "learning_rate": 6.0446269453868945e-06,
+      "loss": 3.3382,
+      "step": 382
+    },
+    {
+      "epoch": 0.07207376740684983,
+      "grad_norm": 1.5226458311080933,
+      "learning_rate": 5.871690307116107e-06,
+      "loss": 3.431,
+      "step": 383
+    },
+    {
+      "epoch": 0.07226194956718103,
+      "grad_norm": 1.4903466701507568,
+      "learning_rate": 5.701109142194422e-06,
+      "loss": 3.2707,
+      "step": 384
+    },
+    {
+      "epoch": 0.07245013172751223,
+      "grad_norm": 1.5170961618423462,
+      "learning_rate": 5.532892555874059e-06,
+      "loss": 3.3027,
+      "step": 385
+    },
+    {
+      "epoch": 0.07263831388784343,
+      "grad_norm": 1.5108839273452759,
+      "learning_rate": 5.3670495271910925e-06,
+      "loss": 3.3152,
+      "step": 386
+    },
+    {
+      "epoch": 0.07282649604817464,
+      "grad_norm": 1.51710045337677,
+      "learning_rate": 5.203588908486279e-06,
+      "loss": 3.2884,
+      "step": 387
+    },
+    {
+      "epoch": 0.07301467820850584,
+      "grad_norm": 1.5793352127075195,
+      "learning_rate": 5.042519424932513e-06,
+      "loss": 3.5139,
+      "step": 388
+    },
+    {
+      "epoch": 0.07320286036883704,
+      "grad_norm": 1.481472373008728,
+      "learning_rate": 4.883849674069058e-06,
+      "loss": 3.2427,
+      "step": 389
+    },
+    {
+      "epoch": 0.07339104252916824,
+      "grad_norm": 1.5381510257720947,
+      "learning_rate": 4.727588125342669e-06,
+      "loss": 3.3485,
+      "step": 390
+    },
+    {
+      "epoch": 0.07357922468949944,
+      "grad_norm": 1.508355975151062,
+      "learning_rate": 4.573743119655516e-06,
+      "loss": 3.295,
+      "step": 391
+    },
+    {
+      "epoch": 0.07376740684983063,
+      "grad_norm": 1.527873158454895,
+      "learning_rate": 4.422322868919937e-06,
+      "loss": 3.2918,
+      "step": 392
+    },
+    {
+      "epoch": 0.07395558901016183,
+      "grad_norm": 1.5130548477172852,
+      "learning_rate": 4.273335455620097e-06,
+      "loss": 3.3859,
+      "step": 393
+    },
+    {
+      "epoch": 0.07414377117049303,
+      "grad_norm": 1.5391393899917603,
+      "learning_rate": 4.126788832380629e-06,
+      "loss": 3.3711,
+      "step": 394
+    },
+    {
+      "epoch": 0.07433195333082424,
+      "grad_norm": 1.5213444232940674,
+      "learning_rate": 3.982690821542035e-06,
+      "loss": 3.4224,
+      "step": 395
+    },
+    {
+      "epoch": 0.07452013549115544,
+      "grad_norm": 1.4749212265014648,
+      "learning_rate": 3.8410491147432395e-06,
+      "loss": 3.1882,
+      "step": 396
+    },
+    {
+      "epoch": 0.07470831765148664,
+      "grad_norm": 1.4860862493515015,
+      "learning_rate": 3.7018712725109926e-06,
+      "loss": 3.2488,
+      "step": 397
+    },
+    {
+      "epoch": 0.07489649981181784,
+      "grad_norm": 1.5500200986862183,
+      "learning_rate": 3.5651647238562904e-06,
+      "loss": 3.3369,
+      "step": 398
+    },
+    {
+      "epoch": 0.07508468197214904,
+      "grad_norm": 1.8019529581069946,
+      "learning_rate": 3.430936765877857e-06,
+      "loss": 3.1557,
+      "step": 399
+    },
+    {
+      "epoch": 0.07527286413248024,
+      "grad_norm": 2.5598955154418945,
+      "learning_rate": 3.299194563372604e-06,
+      "loss": 3.03,
+      "step": 400
+    },
+    {
+      "epoch": 0.07546104629281145,
+      "grad_norm": 1.5204044580459595,
+      "learning_rate": 3.1699451484532463e-06,
+      "loss": 3.4419,
+      "step": 401
+    },
+    {
+      "epoch": 0.07564922845314265,
+      "grad_norm": 1.4903305768966675,
+      "learning_rate": 3.0431954201728784e-06,
+      "loss": 3.3295,
+      "step": 402
+    },
+    {
+      "epoch": 0.07583741061347385,
+      "grad_norm": 1.5254848003387451,
+      "learning_rate": 2.9189521441567726e-06,
+      "loss": 3.4107,
+      "step": 403
+    },
+    {
+      "epoch": 0.07602559277380504,
+      "grad_norm": 1.5176067352294922,
+      "learning_rate": 2.797221952241219e-06,
+      "loss": 3.3387,
+      "step": 404
+    },
+    {
+      "epoch": 0.07621377493413624,
+      "grad_norm": 1.495973825454712,
+      "learning_rate": 2.6780113421195298e-06,
+      "loss": 3.3161,
+      "step": 405
+    },
+    {
+      "epoch": 0.07640195709446744,
+      "grad_norm": 1.5124212503433228,
+      "learning_rate": 2.561326676995218e-06,
+      "loss": 3.3538,
+      "step": 406
+    },
+    {
+      "epoch": 0.07659013925479864,
+      "grad_norm": 1.4950138330459595,
+      "learning_rate": 2.4471741852423237e-06,
+      "loss": 3.363,
+      "step": 407
+    },
+    {
+      "epoch": 0.07677832141512984,
+      "grad_norm": 1.5359971523284912,
+      "learning_rate": 2.3355599600729915e-06,
+      "loss": 3.4671,
+      "step": 408
+    },
+    {
+      "epoch": 0.07696650357546105,
+      "grad_norm": 1.4483258724212646,
+      "learning_rate": 2.2264899592121744e-06,
+      "loss": 3.1561,
+      "step": 409
+    },
+    {
+      "epoch": 0.07715468573579225,
+      "grad_norm": 1.483723521232605,
+      "learning_rate": 2.1199700045797077e-06,
+      "loss": 3.3924,
+      "step": 410
+    },
+    {
+      "epoch": 0.07734286789612345,
+      "grad_norm": 1.4614593982696533,
+      "learning_rate": 2.0160057819794466e-06,
+      "loss": 3.2969,
+      "step": 411
+    },
+    {
+      "epoch": 0.07753105005645465,
+      "grad_norm": 1.4894236326217651,
+      "learning_rate": 1.9146028407958484e-06,
+      "loss": 3.3376,
+      "step": 412
+    },
+    {
+      "epoch": 0.07771923221678585,
+      "grad_norm": 1.501068115234375,
+      "learning_rate": 1.8157665936977263e-06,
+      "loss": 3.3462,
+      "step": 413
+    },
+    {
+      "epoch": 0.07790741437711705,
+      "grad_norm": 1.4664418697357178,
+      "learning_rate": 1.7195023163493252e-06,
+      "loss": 3.1851,
+      "step": 414
+    },
+    {
+      "epoch": 0.07809559653744826,
+      "grad_norm": 1.5121281147003174,
+      "learning_rate": 1.6258151471287396e-06,
+      "loss": 3.3562,
+      "step": 415
+    },
+    {
+      "epoch": 0.07828377869777944,
+      "grad_norm": 1.5005747079849243,
+      "learning_rate": 1.5347100868536246e-06,
+      "loss": 3.276,
+      "step": 416
+    },
+    {
+      "epoch": 0.07847196085811065,
+      "grad_norm": 1.5016510486602783,
+      "learning_rate": 1.4461919985142735e-06,
+      "loss": 3.1685,
+      "step": 417
+    },
+    {
+      "epoch": 0.07866014301844185,
+      "grad_norm": 1.490427017211914,
+      "learning_rate": 1.3602656070140275e-06,
+      "loss": 3.3398,
+      "step": 418
+    },
+    {
+      "epoch": 0.07884832517877305,
+      "grad_norm": 1.5081381797790527,
+      "learning_rate": 1.27693549891707e-06,
+      "loss": 3.378,
+      "step": 419
+    },
+    {
+      "epoch": 0.07903650733910425,
+      "grad_norm": 1.5135114192962646,
+      "learning_rate": 1.196206122203647e-06,
+      "loss": 3.2584,
+      "step": 420
+    },
+    {
+      "epoch": 0.07922468949943545,
+      "grad_norm": 1.503239631652832,
+      "learning_rate": 1.1180817860325599e-06,
+      "loss": 3.3742,
+      "step": 421
+    },
+    {
+      "epoch": 0.07941287165976665,
+      "grad_norm": 1.5126641988754272,
+      "learning_rate": 1.0425666605112517e-06,
+      "loss": 3.4263,
+      "step": 422
+    },
+    {
+      "epoch": 0.07960105382009786,
+      "grad_norm": 1.5308884382247925,
+      "learning_rate": 9.696647764731337e-07,
+      "loss": 3.3315,
+      "step": 423
+    },
+    {
+      "epoch": 0.07978923598042906,
+      "grad_norm": 1.5186878442764282,
+      "learning_rate": 8.993800252624862e-07,
+      "loss": 3.3336,
+      "step": 424
+    },
+    {
+      "epoch": 0.07997741814076026,
+      "grad_norm": 1.5928887128829956,
+      "learning_rate": 8.317161585266964e-07,
+      "loss": 3.641,
+      "step": 425
+    },
+    {
+      "epoch": 0.08016560030109146,
+      "grad_norm": 1.5158437490463257,
+      "learning_rate": 7.666767880160464e-07,
+      "loss": 3.4028,
+      "step": 426
+    },
+    {
+      "epoch": 0.08035378246142266,
+      "grad_norm": 1.5123188495635986,
+      "learning_rate": 7.042653853909064e-07,
+      "loss": 3.4125,
+      "step": 427
+    },
+    {
+      "epoch": 0.08054196462175386,
+      "grad_norm": 1.4982678890228271,
+      "learning_rate": 6.444852820364222e-07,
+      "loss": 3.2116,
+      "step": 428
+    },
+    {
+      "epoch": 0.08073014678208505,
+      "grad_norm": 1.5639753341674805,
+      "learning_rate": 5.87339668884701e-07,
+      "loss": 3.1289,
+      "step": 429
+    },
+    {
+      "epoch": 0.08091832894241625,
+      "grad_norm": 1.572089672088623,
+      "learning_rate": 5.328315962444874e-07,
+      "loss": 3.3213,
+      "step": 430
+    },
+    {
+      "epoch": 0.08110651110274746,
+      "grad_norm": 1.5132265090942383,
+      "learning_rate": 4.809639736383431e-07,
+      "loss": 3.3827,
+      "step": 431
+    },
+    {
+      "epoch": 0.08129469326307866,
+      "grad_norm": 1.4733517169952393,
+      "learning_rate": 4.317395696473214e-07,
+      "loss": 3.2562,
+      "step": 432
+    },
+    {
+      "epoch": 0.08148287542340986,
+      "grad_norm": 1.548933982849121,
+      "learning_rate": 3.851610117632354e-07,
+      "loss": 3.4471,
+      "step": 433
+    },
+    {
+      "epoch": 0.08167105758374106,
+      "grad_norm": 1.4967180490493774,
+      "learning_rate": 3.4123078624834216e-07,
+      "loss": 3.0655,
+      "step": 434
+    },
+    {
+      "epoch": 0.08185923974407226,
+      "grad_norm": 1.6242047548294067,
+      "learning_rate": 2.9995123800270476e-07,
+      "loss": 3.2837,
+      "step": 435
+    },
+    {
+      "epoch": 0.08204742190440346,
+      "grad_norm": 1.4935169219970703,
+      "learning_rate": 2.613245704389644e-07,
+      "loss": 3.3263,
+      "step": 436
+    },
+    {
+      "epoch": 0.08223560406473467,
+      "grad_norm": 1.4887654781341553,
+      "learning_rate": 2.2535284536476242e-07,
+      "loss": 3.1296,
+      "step": 437
+    },
+    {
+      "epoch": 0.08242378622506587,
+      "grad_norm": 1.511015772819519,
+      "learning_rate": 1.920379828726726e-07,
+      "loss": 3.3212,
+      "step": 438
+    },
+    {
+      "epoch": 0.08261196838539707,
+      "grad_norm": 1.5536808967590332,
+      "learning_rate": 1.6138176123770554e-07,
+      "loss": 3.5203,
+      "step": 439
+    },
+    {
+      "epoch": 0.08280015054572827,
+      "grad_norm": 1.5585739612579346,
+      "learning_rate": 1.333858168224178e-07,
+      "loss": 3.2727,
+      "step": 440
+    },
+    {
+      "epoch": 0.08298833270605946,
+      "grad_norm": 1.5475882291793823,
+      "learning_rate": 1.0805164398952072e-07,
+      "loss": 3.4168,
+      "step": 441
+    },
+    {
+      "epoch": 0.08317651486639066,
+      "grad_norm": 1.5009870529174805,
+      "learning_rate": 8.53805950221498e-08,
+      "loss": 3.3267,
+      "step": 442
+    },
+    {
+      "epoch": 0.08336469702672186,
+      "grad_norm": 1.5358216762542725,
+      "learning_rate": 6.537388005167233e-08,
+      "loss": 3.4914,
+      "step": 443
+    },
+    {
+      "epoch": 0.08355287918705306,
+      "grad_norm": 1.5157291889190674,
+      "learning_rate": 4.8032566993089225e-08,
+      "loss": 3.3647,
+      "step": 444
+    },
+    {
+      "epoch": 0.08374106134738427,
+      "grad_norm": 1.5044530630111694,
+      "learning_rate": 3.3357581488030475e-08,
+      "loss": 3.2696,
+      "step": 445
+    },
+    {
+      "epoch": 0.08392924350771547,
+      "grad_norm": 1.5259320735931396,
+      "learning_rate": 2.134970685536697e-08,
+      "loss": 3.3305,
+      "step": 446
+    },
+    {
+      "epoch": 0.08411742566804667,
+      "grad_norm": 1.4801864624023438,
+      "learning_rate": 1.200958404936059e-08,
+      "loss": 3.1884,
+      "step": 447
+    },
+    {
+      "epoch": 0.08430560782837787,
+      "grad_norm": 1.4972193241119385,
+      "learning_rate": 5.337711625497121e-09,
+      "loss": 3.3929,
+      "step": 448
+    },
+    {
+      "epoch": 0.08449378998870907,
+      "grad_norm": 1.7666032314300537,
+      "learning_rate": 1.3344457138297906e-09,
+      "loss": 3.4298,
+      "step": 449
+    },
+    {
+      "epoch": 0.08468197214904027,
+      "grad_norm": 2.5446274280548096,
+      "learning_rate": 0.0,
+      "loss": 2.8081,
+      "step": 450
+    },
+    {
+      "epoch": 0.08468197214904027,
+      "eval_loss": 0.8302884697914124,
+      "eval_runtime": 982.1123,
+      "eval_samples_per_second": 9.113,
+      "eval_steps_per_second": 2.279,
+      "step": 450
     }
   ],
   "logging_steps": 1,
@@ -2154,12 +3212,12 @@
         "should_evaluate": false,
         "should_log": false,
         "should_save": true,
-        "should_training_stop":
+        "should_training_stop": true
       },
       "attributes": {}
     }
   },
-  "total_flos":
+  "total_flos": 1.0272928176694886e+18,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null