Training in progress, step 550, checkpoint

Browse files

Files changed (5) hide show

last-checkpoint/adapter_model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +1936 -3

last-checkpoint/adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8626b77f2c20a16df18627f1586ffdbf4a4e68cc0d49fc23fc4cf5cbe5c9a980
 size 80792096

 version https://git-lfs.github.com/spec/v1
+oid sha256:7f2bba67c1c1484d2bc04c25f371989dddfa7218d9db23366bcfd17cb36894c8
 size 80792096

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:02a6932ab63a65dab89982c10ada890636f14f61756bc24f2a30352d27e6684d
 size 41460084

 version https://git-lfs.github.com/spec/v1
+oid sha256:507ef66950bd52137d14c378fc2f78b5ad9af0c9387506f9ca6699bcba5321d8
 size 41460084

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:cf9673839065b58d76bc3b02d183cf197c172d16da9dabd5f8c5d915fe61be07
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:b5d154d045ee189af4c648f80535098cfde6139351de9c4d32c890f904602cee
 size 14244

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c5be8b274530ed5517e32e7b23ab26f34602144201990badc0a09d01cd9796b2
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:789cac547d76905ddc88036180d9f246f307a104c94da93e131a174052f790e8
 size 1064

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.2503128911138924,
   "eval_steps": 275,
-  "global_step": 275,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -1948,6 +1948,1939 @@
       "eval_samples_per_second": 9.017,
       "eval_steps_per_second": 4.511,
       "step": 275
     }
   ],
   "logging_steps": 1,
@@ -1967,7 +3900,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 5.1442616080583885e+17,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null

 {
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.5006257822277848,
   "eval_steps": 275,
+  "global_step": 550,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 9.017,
       "eval_steps_per_second": 4.511,
       "step": 275
+    },
+    {
+      "epoch": 0.251223119808852,
+      "grad_norm": 0.39927324652671814,
+      "learning_rate": 0.00017197404764772805,
+      "loss": 2.1982,
+      "step": 276
+    },
+    {
+      "epoch": 0.2521333485038116,
+      "grad_norm": 0.4287830889225006,
+      "learning_rate": 0.00017177347024911562,
+      "loss": 2.2733,
+      "step": 277
+    },
+    {
+      "epoch": 0.2530435771987712,
+      "grad_norm": 0.3960012197494507,
+      "learning_rate": 0.00017157229552971487,
+      "loss": 2.1884,
+      "step": 278
+    },
+    {
+      "epoch": 0.2539538058937308,
+      "grad_norm": 0.40106138586997986,
+      "learning_rate": 0.00017137052516376345,
+      "loss": 2.1207,
+      "step": 279
+    },
+    {
+      "epoch": 0.2548640345886904,
+      "grad_norm": 0.4410867393016815,
+      "learning_rate": 0.00017116816083045602,
+      "loss": 2.3589,
+      "step": 280
+    },
+    {
+      "epoch": 0.25577426328365,
+      "grad_norm": 0.4092939794063568,
+      "learning_rate": 0.0001709652042139306,
+      "loss": 2.0842,
+      "step": 281
+    },
+    {
+      "epoch": 0.25668449197860965,
+      "grad_norm": 0.40820494294166565,
+      "learning_rate": 0.0001707616570032542,
+      "loss": 2.1658,
+      "step": 282
+    },
+    {
+      "epoch": 0.25759472067356926,
+      "grad_norm": 0.41664186120033264,
+      "learning_rate": 0.00017055752089240907,
+      "loss": 2.1389,
+      "step": 283
+    },
+    {
+      "epoch": 0.25850494936852886,
+      "grad_norm": 0.4125240445137024,
+      "learning_rate": 0.00017035279758027832,
+      "loss": 2.0615,
+      "step": 284
+    },
+    {
+      "epoch": 0.25941517806348846,
+      "grad_norm": 0.42702898383140564,
+      "learning_rate": 0.00017014748877063214,
+      "loss": 2.017,
+      "step": 285
+    },
+    {
+      "epoch": 0.26032540675844806,
+      "grad_norm": 0.44943541288375854,
+      "learning_rate": 0.00016994159617211317,
+      "loss": 2.1901,
+      "step": 286
+    },
+    {
+      "epoch": 0.26123563545340767,
+      "grad_norm": 0.4286860227584839,
+      "learning_rate": 0.00016973512149822274,
+      "loss": 2.0643,
+      "step": 287
+    },
+    {
+      "epoch": 0.26214586414836727,
+      "grad_norm": 0.44938111305236816,
+      "learning_rate": 0.0001695280664673062,
+      "loss": 2.1539,
+      "step": 288
+    },
+    {
+      "epoch": 0.26305609284332687,
+      "grad_norm": 0.4638296067714691,
+      "learning_rate": 0.0001693204328025389,
+      "loss": 2.291,
+      "step": 289
+    },
+    {
+      "epoch": 0.2639663215382865,
+      "grad_norm": 0.49295714497566223,
+      "learning_rate": 0.00016911222223191182,
+      "loss": 2.2538,
+      "step": 290
+    },
+    {
+      "epoch": 0.2648765502332461,
+      "grad_norm": 0.48185715079307556,
+      "learning_rate": 0.00016890343648821697,
+      "loss": 2.2792,
+      "step": 291
+    },
+    {
+      "epoch": 0.26578677892820574,
+      "grad_norm": 0.4750272035598755,
+      "learning_rate": 0.0001686940773090333,
+      "loss": 2.2774,
+      "step": 292
+    },
+    {
+      "epoch": 0.26669700762316534,
+      "grad_norm": 0.5073033571243286,
+      "learning_rate": 0.00016848414643671195,
+      "loss": 2.3261,
+      "step": 293
+    },
+    {
+      "epoch": 0.26760723631812494,
+      "grad_norm": 0.5343424081802368,
+      "learning_rate": 0.00016827364561836187,
+      "loss": 2.4097,
+      "step": 294
+    },
+    {
+      "epoch": 0.26851746501308454,
+      "grad_norm": 0.5311369895935059,
+      "learning_rate": 0.00016806257660583534,
+      "loss": 2.3821,
+      "step": 295
+    },
+    {
+      "epoch": 0.26942769370804415,
+      "grad_norm": 0.5551429986953735,
+      "learning_rate": 0.00016785094115571322,
+      "loss": 2.3795,
+      "step": 296
+    },
+    {
+      "epoch": 0.27033792240300375,
+      "grad_norm": 0.6279783248901367,
+      "learning_rate": 0.0001676387410292906,
+      "loss": 2.435,
+      "step": 297
+    },
+    {
+      "epoch": 0.27124815109796335,
+      "grad_norm": 0.7317250967025757,
+      "learning_rate": 0.00016742597799256182,
+      "loss": 2.6991,
+      "step": 298
+    },
+    {
+      "epoch": 0.27215837979292296,
+      "grad_norm": 0.8485302329063416,
+      "learning_rate": 0.000167212653816206,
+      "loss": 2.7005,
+      "step": 299
+    },
+    {
+      "epoch": 0.27306860848788256,
+      "grad_norm": 1.5959185361862183,
+      "learning_rate": 0.00016699877027557226,
+      "loss": 2.7536,
+      "step": 300
+    },
+    {
+      "epoch": 0.2739788371828422,
+      "grad_norm": 0.4755174219608307,
+      "learning_rate": 0.00016678432915066488,
+      "loss": 2.5907,
+      "step": 301
+    },
+    {
+      "epoch": 0.2748890658778018,
+      "grad_norm": 0.45389342308044434,
+      "learning_rate": 0.00016656933222612854,
+      "loss": 2.4622,
+      "step": 302
+    },
+    {
+      "epoch": 0.2757992945727614,
+      "grad_norm": 0.4949435591697693,
+      "learning_rate": 0.00016635378129123342,
+      "loss": 2.4185,
+      "step": 303
+    },
+    {
+      "epoch": 0.276709523267721,
+      "grad_norm": 0.4521631896495819,
+      "learning_rate": 0.00016613767813986044,
+      "loss": 2.4918,
+      "step": 304
+    },
+    {
+      "epoch": 0.2776197519626806,
+      "grad_norm": 0.4228963553905487,
+      "learning_rate": 0.0001659210245704861,
+      "loss": 2.4194,
+      "step": 305
+    },
+    {
+      "epoch": 0.27852998065764023,
+      "grad_norm": 0.4170341491699219,
+      "learning_rate": 0.00016570382238616777,
+      "loss": 2.4185,
+      "step": 306
+    },
+    {
+      "epoch": 0.27944020935259983,
+      "grad_norm": 0.4123315215110779,
+      "learning_rate": 0.00016548607339452853,
+      "loss": 2.3737,
+      "step": 307
+    },
+    {
+      "epoch": 0.28035043804755944,
+      "grad_norm": 0.4320162832736969,
+      "learning_rate": 0.00016526777940774204,
+      "loss": 2.3317,
+      "step": 308
+    },
+    {
+      "epoch": 0.28126066674251904,
+      "grad_norm": 0.4118390381336212,
+      "learning_rate": 0.00016504894224251778,
+      "loss": 2.3786,
+      "step": 309
+    },
+    {
+      "epoch": 0.28217089543747864,
+      "grad_norm": 0.39763331413269043,
+      "learning_rate": 0.0001648295637200856,
+      "loss": 2.2968,
+      "step": 310
+    },
+    {
+      "epoch": 0.2830811241324383,
+      "grad_norm": 0.4391527473926544,
+      "learning_rate": 0.0001646096456661807,
+      "loss": 2.3764,
+      "step": 311
+    },
+    {
+      "epoch": 0.2839913528273979,
+      "grad_norm": 0.43077877163887024,
+      "learning_rate": 0.00016438918991102842,
+      "loss": 2.2013,
+      "step": 312
+    },
+    {
+      "epoch": 0.2849015815223575,
+      "grad_norm": 0.43149155378341675,
+      "learning_rate": 0.000164168198289329,
+      "loss": 2.3097,
+      "step": 313
+    },
+    {
+      "epoch": 0.2858118102173171,
+      "grad_norm": 0.40134817361831665,
+      "learning_rate": 0.00016394667264024246,
+      "loss": 2.3306,
+      "step": 314
+    },
+    {
+      "epoch": 0.2867220389122767,
+      "grad_norm": 0.4056681990623474,
+      "learning_rate": 0.00016372461480737297,
+      "loss": 2.3146,
+      "step": 315
+    },
+    {
+      "epoch": 0.2876322676072363,
+      "grad_norm": 0.41738027334213257,
+      "learning_rate": 0.00016350202663875386,
+      "loss": 1.9997,
+      "step": 316
+    },
+    {
+      "epoch": 0.2885424963021959,
+      "grad_norm": 0.38182246685028076,
+      "learning_rate": 0.00016327890998683192,
+      "loss": 2.0466,
+      "step": 317
+    },
+    {
+      "epoch": 0.2894527249971555,
+      "grad_norm": 0.39759719371795654,
+      "learning_rate": 0.00016305526670845226,
+      "loss": 2.1788,
+      "step": 318
+    },
+    {
+      "epoch": 0.2903629536921151,
+      "grad_norm": 0.3982352614402771,
+      "learning_rate": 0.0001628310986648427,
+      "loss": 2.2115,
+      "step": 319
+    },
+    {
+      "epoch": 0.2912731823870747,
+      "grad_norm": 0.41679051518440247,
+      "learning_rate": 0.0001626064077215983,
+      "loss": 2.3036,
+      "step": 320
+    },
+    {
+      "epoch": 0.2921834110820344,
+      "grad_norm": 0.40436604619026184,
+      "learning_rate": 0.00016238119574866588,
+      "loss": 2.1493,
+      "step": 321
+    },
+    {
+      "epoch": 0.293093639776994,
+      "grad_norm": 0.4502476751804352,
+      "learning_rate": 0.0001621554646203284,
+      "loss": 1.8572,
+      "step": 322
+    },
+    {
+      "epoch": 0.2940038684719536,
+      "grad_norm": 0.44303473830223083,
+      "learning_rate": 0.00016192921621518944,
+      "loss": 2.1832,
+      "step": 323
+    },
+    {
+      "epoch": 0.2949140971669132,
+      "grad_norm": 0.4064692258834839,
+      "learning_rate": 0.0001617024524161574,
+      "loss": 2.2656,
+      "step": 324
+    },
+    {
+      "epoch": 0.2958243258618728,
+      "grad_norm": 0.4479392170906067,
+      "learning_rate": 0.0001614751751104301,
+      "loss": 2.2462,
+      "step": 325
+    },
+    {
+      "epoch": 0.2967345545568324,
+      "grad_norm": 0.4629363715648651,
+      "learning_rate": 0.0001612473861894788,
+      "loss": 1.9715,
+      "step": 326
+    },
+    {
+      "epoch": 0.297644783251792,
+      "grad_norm": 0.3991665542125702,
+      "learning_rate": 0.00016101908754903268,
+      "loss": 2.0642,
+      "step": 327
+    },
+    {
+      "epoch": 0.2985550119467516,
+      "grad_norm": 0.42503711581230164,
+      "learning_rate": 0.00016079028108906282,
+      "loss": 2.1403,
+      "step": 328
+    },
+    {
+      "epoch": 0.2994652406417112,
+      "grad_norm": 0.4499455392360687,
+      "learning_rate": 0.00016056096871376667,
+      "loss": 2.0534,
+      "step": 329
+    },
+    {
+      "epoch": 0.30037546933667086,
+      "grad_norm": 0.4549277424812317,
+      "learning_rate": 0.00016033115233155202,
+      "loss": 2.2083,
+      "step": 330
+    },
+    {
+      "epoch": 0.30128569803163047,
+      "grad_norm": 0.3974262773990631,
+      "learning_rate": 0.0001601008338550211,
+      "loss": 2.0156,
+      "step": 331
+    },
+    {
+      "epoch": 0.30219592672659007,
+      "grad_norm": 0.43566057085990906,
+      "learning_rate": 0.00015987001520095478,
+      "loss": 2.1801,
+      "step": 332
+    },
+    {
+      "epoch": 0.3031061554215497,
+      "grad_norm": 0.47677701711654663,
+      "learning_rate": 0.00015963869829029658,
+      "loss": 2.1415,
+      "step": 333
+    },
+    {
+      "epoch": 0.3040163841165093,
+      "grad_norm": 0.4603672921657562,
+      "learning_rate": 0.00015940688504813662,
+      "loss": 2.2967,
+      "step": 334
+    },
+    {
+      "epoch": 0.3049266128114689,
+      "grad_norm": 0.4428515136241913,
+      "learning_rate": 0.00015917457740369565,
+      "loss": 2.1447,
+      "step": 335
+    },
+    {
+      "epoch": 0.3058368415064285,
+      "grad_norm": 0.4379275441169739,
+      "learning_rate": 0.000158941777290309,
+      "loss": 2.0957,
+      "step": 336
+    },
+    {
+      "epoch": 0.3067470702013881,
+      "grad_norm": 0.4831966459751129,
+      "learning_rate": 0.00015870848664541044,
+      "loss": 2.2457,
+      "step": 337
+    },
+    {
+      "epoch": 0.3076572988963477,
+      "grad_norm": 0.45160865783691406,
+      "learning_rate": 0.00015847470741051618,
+      "loss": 2.1441,
+      "step": 338
+    },
+    {
+      "epoch": 0.3085675275913073,
+      "grad_norm": 0.44453370571136475,
+      "learning_rate": 0.00015824044153120852,
+      "loss": 2.1073,
+      "step": 339
+    },
+    {
+      "epoch": 0.30947775628626695,
+      "grad_norm": 0.49965375661849976,
+      "learning_rate": 0.00015800569095711982,
+      "loss": 2.1574,
+      "step": 340
+    },
+    {
+      "epoch": 0.31038798498122655,
+      "grad_norm": 0.48138341307640076,
+      "learning_rate": 0.00015777045764191625,
+      "loss": 2.0205,
+      "step": 341
+    },
+    {
+      "epoch": 0.31129821367618615,
+      "grad_norm": 0.5034924745559692,
+      "learning_rate": 0.00015753474354328142,
+      "loss": 2.2319,
+      "step": 342
+    },
+    {
+      "epoch": 0.31220844237114576,
+      "grad_norm": 0.5034711956977844,
+      "learning_rate": 0.00015729855062290022,
+      "loss": 2.4066,
+      "step": 343
+    },
+    {
+      "epoch": 0.31311867106610536,
+      "grad_norm": 0.5409703254699707,
+      "learning_rate": 0.00015706188084644242,
+      "loss": 2.2435,
+      "step": 344
+    },
+    {
+      "epoch": 0.31402889976106496,
+      "grad_norm": 0.544597327709198,
+      "learning_rate": 0.00015682473618354635,
+      "loss": 2.2625,
+      "step": 345
+    },
+    {
+      "epoch": 0.31493912845602456,
+      "grad_norm": 0.6114000082015991,
+      "learning_rate": 0.0001565871186078025,
+      "loss": 2.4302,
+      "step": 346
+    },
+    {
+      "epoch": 0.31584935715098417,
+      "grad_norm": 0.6364843845367432,
+      "learning_rate": 0.00015634903009673705,
+      "loss": 2.5153,
+      "step": 347
+    },
+    {
+      "epoch": 0.31675958584594377,
+      "grad_norm": 0.7510351538658142,
+      "learning_rate": 0.00015611047263179548,
+      "loss": 2.5605,
+      "step": 348
+    },
+    {
+      "epoch": 0.31766981454090343,
+      "grad_norm": 0.8501291275024414,
+      "learning_rate": 0.000155871448198326,
+      "loss": 2.6519,
+      "step": 349
+    },
+    {
+      "epoch": 0.31858004323586303,
+      "grad_norm": 1.7441632747650146,
+      "learning_rate": 0.0001556319587855631,
+      "loss": 2.7517,
+      "step": 350
+    },
+    {
+      "epoch": 0.31949027193082263,
+      "grad_norm": 0.5301811695098877,
+      "learning_rate": 0.00015539200638661104,
+      "loss": 2.6647,
+      "step": 351
+    },
+    {
+      "epoch": 0.32040050062578224,
+      "grad_norm": 0.5063616633415222,
+      "learning_rate": 0.00015515159299842707,
+      "loss": 2.4961,
+      "step": 352
+    },
+    {
+      "epoch": 0.32131072932074184,
+      "grad_norm": 0.4843781590461731,
+      "learning_rate": 0.00015491072062180503,
+      "loss": 2.496,
+      "step": 353
+    },
+    {
+      "epoch": 0.32222095801570144,
+      "grad_norm": 0.4524553716182709,
+      "learning_rate": 0.00015466939126135856,
+      "loss": 2.448,
+      "step": 354
+    },
+    {
+      "epoch": 0.32313118671066104,
+      "grad_norm": 0.43678200244903564,
+      "learning_rate": 0.00015442760692550443,
+      "loss": 2.2687,
+      "step": 355
+    },
+    {
+      "epoch": 0.32404141540562065,
+      "grad_norm": 0.4301970303058624,
+      "learning_rate": 0.00015418536962644592,
+      "loss": 2.4826,
+      "step": 356
+    },
+    {
+      "epoch": 0.32495164410058025,
+      "grad_norm": 0.42540326714515686,
+      "learning_rate": 0.00015394268138015598,
+      "loss": 2.4205,
+      "step": 357
+    },
+    {
+      "epoch": 0.32586187279553985,
+      "grad_norm": 0.4173906445503235,
+      "learning_rate": 0.00015369954420636048,
+      "loss": 2.394,
+      "step": 358
+    },
+    {
+      "epoch": 0.3267721014904995,
+      "grad_norm": 0.43184736371040344,
+      "learning_rate": 0.00015345596012852138,
+      "loss": 2.3504,
+      "step": 359
+    },
+    {
+      "epoch": 0.3276823301854591,
+      "grad_norm": 0.4002053141593933,
+      "learning_rate": 0.00015321193117381996,
+      "loss": 2.2951,
+      "step": 360
+    },
+    {
+      "epoch": 0.3285925588804187,
+      "grad_norm": 0.39067134261131287,
+      "learning_rate": 0.00015296745937313987,
+      "loss": 2.2768,
+      "step": 361
+    },
+    {
+      "epoch": 0.3295027875753783,
+      "grad_norm": 0.40051525831222534,
+      "learning_rate": 0.00015272254676105025,
+      "loss": 2.2235,
+      "step": 362
+    },
+    {
+      "epoch": 0.3304130162703379,
+      "grad_norm": 0.3954068422317505,
+      "learning_rate": 0.00015247719537578883,
+      "loss": 2.2502,
+      "step": 363
+    },
+    {
+      "epoch": 0.3313232449652975,
+      "grad_norm": 0.4123362600803375,
+      "learning_rate": 0.00015223140725924495,
+      "loss": 2.3309,
+      "step": 364
+    },
+    {
+      "epoch": 0.33223347366025713,
+      "grad_norm": 0.4138774871826172,
+      "learning_rate": 0.00015198518445694255,
+      "loss": 2.4107,
+      "step": 365
+    },
+    {
+      "epoch": 0.33314370235521673,
+      "grad_norm": 0.3983847498893738,
+      "learning_rate": 0.0001517385290180231,
+      "loss": 2.2718,
+      "step": 366
+    },
+    {
+      "epoch": 0.33405393105017633,
+      "grad_norm": 0.36962834000587463,
+      "learning_rate": 0.00015149144299522873,
+      "loss": 2.1744,
+      "step": 367
+    },
+    {
+      "epoch": 0.334964159745136,
+      "grad_norm": 0.37924104928970337,
+      "learning_rate": 0.0001512439284448849,
+      "loss": 2.1451,
+      "step": 368
+    },
+    {
+      "epoch": 0.3358743884400956,
+      "grad_norm": 0.39990487694740295,
+      "learning_rate": 0.0001509959874268835,
+      "loss": 2.2508,
+      "step": 369
+    },
+    {
+      "epoch": 0.3367846171350552,
+      "grad_norm": 0.3862214684486389,
+      "learning_rate": 0.00015074762200466556,
+      "loss": 2.1483,
+      "step": 370
+    },
+    {
+      "epoch": 0.3376948458300148,
+      "grad_norm": 0.4037676751613617,
+      "learning_rate": 0.00015049883424520414,
+      "loss": 2.2179,
+      "step": 371
+    },
+    {
+      "epoch": 0.3386050745249744,
+      "grad_norm": 0.40439948439598083,
+      "learning_rate": 0.00015024962621898715,
+      "loss": 2.2054,
+      "step": 372
+    },
+    {
+      "epoch": 0.339515303219934,
+      "grad_norm": 0.3871942460536957,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 2.129,
+      "step": 373
+    },
+    {
+      "epoch": 0.3404255319148936,
+      "grad_norm": 0.4091387093067169,
+      "learning_rate": 0.00014974995766570855,
+      "loss": 2.1395,
+      "step": 374
+    },
+    {
+      "epoch": 0.3413357606098532,
+      "grad_norm": 0.4097527265548706,
+      "learning_rate": 0.00014949950129704162,
+      "loss": 2.1789,
+      "step": 375
+    },
+    {
+      "epoch": 0.3422459893048128,
+      "grad_norm": 0.4139934480190277,
+      "learning_rate": 0.00014924863297837378,
+      "loss": 2.0611,
+      "step": 376
+    },
+    {
+      "epoch": 0.3431562179997724,
+      "grad_norm": 0.4146927297115326,
+      "learning_rate": 0.00014899735479750794,
+      "loss": 2.2488,
+      "step": 377
+    },
+    {
+      "epoch": 0.3440664466947321,
+      "grad_norm": 0.4194958209991455,
+      "learning_rate": 0.00014874566884565807,
+      "loss": 2.0164,
+      "step": 378
+    },
+    {
+      "epoch": 0.3449766753896917,
+      "grad_norm": 0.41280898451805115,
+      "learning_rate": 0.00014849357721743168,
+      "loss": 2.1503,
+      "step": 379
+    },
+    {
+      "epoch": 0.3458869040846513,
+      "grad_norm": 0.4133208692073822,
+      "learning_rate": 0.00014824108201081247,
+      "loss": 2.0895,
+      "step": 380
+    },
+    {
+      "epoch": 0.3467971327796109,
+      "grad_norm": 0.41347819566726685,
+      "learning_rate": 0.00014798818532714279,
+      "loss": 2.0479,
+      "step": 381
+    },
+    {
+      "epoch": 0.3477073614745705,
+      "grad_norm": 0.43102580308914185,
+      "learning_rate": 0.00014773488927110633,
+      "loss": 2.1458,
+      "step": 382
+    },
+    {
+      "epoch": 0.3486175901695301,
+      "grad_norm": 0.41427451372146606,
+      "learning_rate": 0.00014748119595071034,
+      "loss": 1.9396,
+      "step": 383
+    },
+    {
+      "epoch": 0.3495278188644897,
+      "grad_norm": 0.46386152505874634,
+      "learning_rate": 0.0001472271074772683,
+      "loss": 2.2446,
+      "step": 384
+    },
+    {
+      "epoch": 0.3504380475594493,
+      "grad_norm": 0.4310764670372009,
+      "learning_rate": 0.00014697262596538227,
+      "loss": 2.2144,
+      "step": 385
+    },
+    {
+      "epoch": 0.3513482762544089,
+      "grad_norm": 0.4956878423690796,
+      "learning_rate": 0.00014671775353292525,
+      "loss": 2.1875,
+      "step": 386
+    },
+    {
+      "epoch": 0.35225850494936856,
+      "grad_norm": 0.4793931543827057,
+      "learning_rate": 0.00014646249230102366,
+      "loss": 2.2733,
+      "step": 387
+    },
+    {
+      "epoch": 0.35316873364432816,
+      "grad_norm": 0.46217313408851624,
+      "learning_rate": 0.00014620684439403962,
+      "loss": 2.2812,
+      "step": 388
+    },
+    {
+      "epoch": 0.35407896233928776,
+      "grad_norm": 0.4721885323524475,
+      "learning_rate": 0.00014595081193955324,
+      "loss": 2.1223,
+      "step": 389
+    },
+    {
+      "epoch": 0.35498919103424736,
+      "grad_norm": 0.49550965428352356,
+      "learning_rate": 0.000145694397068345,
+      "loss": 2.156,
+      "step": 390
+    },
+    {
+      "epoch": 0.35589941972920697,
+      "grad_norm": 0.5109139084815979,
+      "learning_rate": 0.0001454376019143779,
+      "loss": 2.1494,
+      "step": 391
+    },
+    {
+      "epoch": 0.35680964842416657,
+      "grad_norm": 0.4725574553012848,
+      "learning_rate": 0.00014518042861477986,
+      "loss": 2.1793,
+      "step": 392
+    },
+    {
+      "epoch": 0.3577198771191262,
+      "grad_norm": 0.4739914536476135,
+      "learning_rate": 0.00014492287930982576,
+      "loss": 2.1763,
+      "step": 393
+    },
+    {
+      "epoch": 0.3586301058140858,
+      "grad_norm": 0.5420114994049072,
+      "learning_rate": 0.00014466495614291977,
+      "loss": 2.4521,
+      "step": 394
+    },
+    {
+      "epoch": 0.3595403345090454,
+      "grad_norm": 0.5225427150726318,
+      "learning_rate": 0.00014440666126057744,
+      "loss": 2.372,
+      "step": 395
+    },
+    {
+      "epoch": 0.360450563204005,
+      "grad_norm": 0.5337964296340942,
+      "learning_rate": 0.0001441479968124078,
+      "loss": 2.397,
+      "step": 396
+    },
+    {
+      "epoch": 0.36136079189896464,
+      "grad_norm": 0.5906230807304382,
+      "learning_rate": 0.0001438889649510956,
+      "loss": 2.506,
+      "step": 397
+    },
+    {
+      "epoch": 0.36227102059392424,
+      "grad_norm": 0.6578875780105591,
+      "learning_rate": 0.00014362956783238324,
+      "loss": 2.6408,
+      "step": 398
+    },
+    {
+      "epoch": 0.36318124928888385,
+      "grad_norm": 0.7982918620109558,
+      "learning_rate": 0.00014336980761505297,
+      "loss": 2.6612,
+      "step": 399
+    },
+    {
+      "epoch": 0.36409147798384345,
+      "grad_norm": 1.4390262365341187,
+      "learning_rate": 0.00014310968646090883,
+      "loss": 2.7073,
+      "step": 400
+    },
+    {
+      "epoch": 0.36500170667880305,
+      "grad_norm": 0.5260487198829651,
+      "learning_rate": 0.00014284920653475866,
+      "loss": 2.6269,
+      "step": 401
+    },
+    {
+      "epoch": 0.36591193537376265,
+      "grad_norm": 0.4492892026901245,
+      "learning_rate": 0.00014258837000439618,
+      "loss": 2.3863,
+      "step": 402
+    },
+    {
+      "epoch": 0.36682216406872226,
+      "grad_norm": 0.4619944095611572,
+      "learning_rate": 0.0001423271790405828,
+      "loss": 2.4595,
+      "step": 403
+    },
+    {
+      "epoch": 0.36773239276368186,
+      "grad_norm": 0.4437786638736725,
+      "learning_rate": 0.00014206563581702964,
+      "loss": 2.3674,
+      "step": 404
+    },
+    {
+      "epoch": 0.36864262145864146,
+      "grad_norm": 0.4789164364337921,
+      "learning_rate": 0.0001418037425103795,
+      "loss": 2.5203,
+      "step": 405
+    },
+    {
+      "epoch": 0.36955285015360106,
+      "grad_norm": 0.44783228635787964,
+      "learning_rate": 0.00014154150130018866,
+      "loss": 2.5183,
+      "step": 406
+    },
+    {
+      "epoch": 0.3704630788485607,
+      "grad_norm": 0.40067169070243835,
+      "learning_rate": 0.00014127891436890868,
+      "loss": 2.3846,
+      "step": 407
+    },
+    {
+      "epoch": 0.3713733075435203,
+      "grad_norm": 0.3978015184402466,
+      "learning_rate": 0.0001410159839018684,
+      "loss": 2.3146,
+      "step": 408
+    },
+    {
+      "epoch": 0.37228353623847993,
+      "grad_norm": 0.4096076190471649,
+      "learning_rate": 0.0001407527120872557,
+      "loss": 2.3617,
+      "step": 409
+    },
+    {
+      "epoch": 0.37319376493343953,
+      "grad_norm": 0.4160764217376709,
+      "learning_rate": 0.00014048910111609915,
+      "loss": 2.2909,
+      "step": 410
+    },
+    {
+      "epoch": 0.37410399362839913,
+      "grad_norm": 0.3976461887359619,
+      "learning_rate": 0.0001402251531822499,
+      "loss": 2.3111,
+      "step": 411
+    },
+    {
+      "epoch": 0.37501422232335874,
+      "grad_norm": 0.3890199065208435,
+      "learning_rate": 0.00013996087048236358,
+      "loss": 2.0969,
+      "step": 412
+    },
+    {
+      "epoch": 0.37592445101831834,
+      "grad_norm": 0.4157082140445709,
+      "learning_rate": 0.00013969625521588158,
+      "loss": 2.3205,
+      "step": 413
+    },
+    {
+      "epoch": 0.37683467971327794,
+      "grad_norm": 0.4103608727455139,
+      "learning_rate": 0.00013943130958501317,
+      "loss": 2.2622,
+      "step": 414
+    },
+    {
+      "epoch": 0.37774490840823755,
+      "grad_norm": 0.40916207432746887,
+      "learning_rate": 0.00013916603579471705,
+      "loss": 2.3585,
+      "step": 415
+    },
+    {
+      "epoch": 0.3786551371031972,
+      "grad_norm": 0.39642858505249023,
+      "learning_rate": 0.00013890043605268283,
+      "loss": 2.2196,
+      "step": 416
+    },
+    {
+      "epoch": 0.3795653657981568,
+      "grad_norm": 0.3851282596588135,
+      "learning_rate": 0.00013863451256931287,
+      "loss": 2.0298,
+      "step": 417
+    },
+    {
+      "epoch": 0.3804755944931164,
+      "grad_norm": 0.38890305161476135,
+      "learning_rate": 0.00013836826755770384,
+      "loss": 2.1601,
+      "step": 418
+    },
+    {
+      "epoch": 0.381385823188076,
+      "grad_norm": 0.41382652521133423,
+      "learning_rate": 0.00013810170323362816,
+      "loss": 2.2656,
+      "step": 419
+    },
+    {
+      "epoch": 0.3822960518830356,
+      "grad_norm": 0.3820722699165344,
+      "learning_rate": 0.0001378348218155158,
+      "loss": 2.0094,
+      "step": 420
+    },
+    {
+      "epoch": 0.3832062805779952,
+      "grad_norm": 0.4150048494338989,
+      "learning_rate": 0.00013756762552443553,
+      "loss": 2.2529,
+      "step": 421
+    },
+    {
+      "epoch": 0.3841165092729548,
+      "grad_norm": 0.452776700258255,
+      "learning_rate": 0.00013730011658407676,
+      "loss": 2.1972,
+      "step": 422
+    },
+    {
+      "epoch": 0.3850267379679144,
+      "grad_norm": 0.4173040986061096,
+      "learning_rate": 0.00013703229722073065,
+      "loss": 2.1502,
+      "step": 423
+    },
+    {
+      "epoch": 0.385936966662874,
+      "grad_norm": 0.4115488529205322,
+      "learning_rate": 0.000136764169663272,
+      "loss": 1.9828,
+      "step": 424
+    },
+    {
+      "epoch": 0.38684719535783363,
+      "grad_norm": 0.4060666561126709,
+      "learning_rate": 0.00013649573614314044,
+      "loss": 2.267,
+      "step": 425
+    },
+    {
+      "epoch": 0.3877574240527933,
+      "grad_norm": 0.4049409031867981,
+      "learning_rate": 0.00013622699889432184,
+      "loss": 2.2044,
+      "step": 426
+    },
+    {
+      "epoch": 0.3886676527477529,
+      "grad_norm": 0.40970832109451294,
+      "learning_rate": 0.00013595796015332984,
+      "loss": 2.0984,
+      "step": 427
+    },
+    {
+      "epoch": 0.3895778814427125,
+      "grad_norm": 0.4141111671924591,
+      "learning_rate": 0.00013568862215918717,
+      "loss": 2.109,
+      "step": 428
+    },
+    {
+      "epoch": 0.3904881101376721,
+      "grad_norm": 0.43404263257980347,
+      "learning_rate": 0.00013541898715340716,
+      "loss": 2.1763,
+      "step": 429
+    },
+    {
+      "epoch": 0.3913983388326317,
+      "grad_norm": 0.41949963569641113,
+      "learning_rate": 0.00013514905737997473,
+      "loss": 2.3086,
+      "step": 430
+    },
+    {
+      "epoch": 0.3923085675275913,
+      "grad_norm": 0.41665390133857727,
+      "learning_rate": 0.00013487883508532815,
+      "loss": 2.0726,
+      "step": 431
+    },
+    {
+      "epoch": 0.3932187962225509,
+      "grad_norm": 0.4305708110332489,
+      "learning_rate": 0.00013460832251834011,
+      "loss": 2.1975,
+      "step": 432
+    },
+    {
+      "epoch": 0.3941290249175105,
+      "grad_norm": 0.44775405526161194,
+      "learning_rate": 0.00013433752193029886,
+      "loss": 2.1503,
+      "step": 433
+    },
+    {
+      "epoch": 0.3950392536124701,
+      "grad_norm": 0.44451820850372314,
+      "learning_rate": 0.0001340664355748899,
+      "loss": 2.1004,
+      "step": 434
+    },
+    {
+      "epoch": 0.39594948230742977,
+      "grad_norm": 0.44242945313453674,
+      "learning_rate": 0.0001337950657081768,
+      "loss": 2.1074,
+      "step": 435
+    },
+    {
+      "epoch": 0.39685971100238937,
+      "grad_norm": 0.4649699926376343,
+      "learning_rate": 0.00013352341458858265,
+      "loss": 2.2468,
+      "step": 436
+    },
+    {
+      "epoch": 0.397769939697349,
+      "grad_norm": 0.4718558192253113,
+      "learning_rate": 0.00013325148447687125,
+      "loss": 2.225,
+      "step": 437
+    },
+    {
+      "epoch": 0.3986801683923086,
+      "grad_norm": 0.44748789072036743,
+      "learning_rate": 0.0001329792776361282,
+      "loss": 2.0243,
+      "step": 438
+    },
+    {
+      "epoch": 0.3995903970872682,
+      "grad_norm": 0.4730619192123413,
+      "learning_rate": 0.00013270679633174218,
+      "loss": 2.0262,
+      "step": 439
+    },
+    {
+      "epoch": 0.4005006257822278,
+      "grad_norm": 0.4742071032524109,
+      "learning_rate": 0.00013243404283138597,
+      "loss": 2.1171,
+      "step": 440
+    },
+    {
+      "epoch": 0.4014108544771874,
+      "grad_norm": 0.4963454306125641,
+      "learning_rate": 0.00013216101940499768,
+      "loss": 2.051,
+      "step": 441
+    },
+    {
+      "epoch": 0.402321083172147,
+      "grad_norm": 0.5127780437469482,
+      "learning_rate": 0.00013188772832476188,
+      "loss": 2.1664,
+      "step": 442
+    },
+    {
+      "epoch": 0.4032313118671066,
+      "grad_norm": 0.5129209756851196,
+      "learning_rate": 0.00013161417186509052,
+      "loss": 2.2272,
+      "step": 443
+    },
+    {
+      "epoch": 0.4041415405620662,
+      "grad_norm": 0.5068848133087158,
+      "learning_rate": 0.00013134035230260427,
+      "loss": 2.1007,
+      "step": 444
+    },
+    {
+      "epoch": 0.40505176925702585,
+      "grad_norm": 0.5721228718757629,
+      "learning_rate": 0.00013106627191611332,
+      "loss": 2.255,
+      "step": 445
+    },
+    {
+      "epoch": 0.40596199795198545,
+      "grad_norm": 0.6085918545722961,
+      "learning_rate": 0.0001307919329865985,
+      "loss": 2.456,
+      "step": 446
+    },
+    {
+      "epoch": 0.40687222664694506,
+      "grad_norm": 0.6652196645736694,
+      "learning_rate": 0.00013051733779719234,
+      "loss": 2.5504,
+      "step": 447
+    },
+    {
+      "epoch": 0.40778245534190466,
+      "grad_norm": 0.7234418392181396,
+      "learning_rate": 0.00013024248863316012,
+      "loss": 2.5796,
+      "step": 448
+    },
+    {
+      "epoch": 0.40869268403686426,
+      "grad_norm": 0.8588744401931763,
+      "learning_rate": 0.00012996738778188067,
+      "loss": 2.5756,
+      "step": 449
+    },
+    {
+      "epoch": 0.40960291273182386,
+      "grad_norm": 1.2627683877944946,
+      "learning_rate": 0.0001296920375328275,
+      "loss": 2.203,
+      "step": 450
+    },
+    {
+      "epoch": 0.41051314142678347,
+      "grad_norm": 0.4838164746761322,
+      "learning_rate": 0.00012941644017754964,
+      "loss": 2.434,
+      "step": 451
+    },
+    {
+      "epoch": 0.41142337012174307,
+      "grad_norm": 0.44005534052848816,
+      "learning_rate": 0.00012914059800965268,
+      "loss": 2.55,
+      "step": 452
+    },
+    {
+      "epoch": 0.4123335988167027,
+      "grad_norm": 0.4343414604663849,
+      "learning_rate": 0.0001288645133247795,
+      "loss": 2.432,
+      "step": 453
+    },
+    {
+      "epoch": 0.41324382751166233,
+      "grad_norm": 0.4588654339313507,
+      "learning_rate": 0.00012858818842059145,
+      "loss": 2.4434,
+      "step": 454
+    },
+    {
+      "epoch": 0.41415405620662193,
+      "grad_norm": 0.4294244647026062,
+      "learning_rate": 0.00012831162559674887,
+      "loss": 2.4241,
+      "step": 455
+    },
+    {
+      "epoch": 0.41506428490158154,
+      "grad_norm": 0.40034809708595276,
+      "learning_rate": 0.0001280348271548923,
+      "loss": 2.3191,
+      "step": 456
+    },
+    {
+      "epoch": 0.41597451359654114,
+      "grad_norm": 0.40817153453826904,
+      "learning_rate": 0.00012775779539862304,
+      "loss": 2.589,
+      "step": 457
+    },
+    {
+      "epoch": 0.41688474229150074,
+      "grad_norm": 0.40605810284614563,
+      "learning_rate": 0.0001274805326334842,
+      "loss": 2.3445,
+      "step": 458
+    },
+    {
+      "epoch": 0.41779497098646035,
+      "grad_norm": 0.4386533200740814,
+      "learning_rate": 0.00012720304116694138,
+      "loss": 2.4002,
+      "step": 459
+    },
+    {
+      "epoch": 0.41870519968141995,
+      "grad_norm": 0.40985172986984253,
+      "learning_rate": 0.00012692532330836346,
+      "loss": 2.3964,
+      "step": 460
+    },
+    {
+      "epoch": 0.41961542837637955,
+      "grad_norm": 0.4220562279224396,
+      "learning_rate": 0.00012664738136900348,
+      "loss": 2.3145,
+      "step": 461
+    },
+    {
+      "epoch": 0.42052565707133915,
+      "grad_norm": 0.4068267047405243,
+      "learning_rate": 0.00012636921766197943,
+      "loss": 2.3274,
+      "step": 462
+    },
+    {
+      "epoch": 0.42143588576629876,
+      "grad_norm": 0.3973187208175659,
+      "learning_rate": 0.0001260908345022547,
+      "loss": 2.1801,
+      "step": 463
+    },
+    {
+      "epoch": 0.4223461144612584,
+      "grad_norm": 0.432224303483963,
+      "learning_rate": 0.00012581223420661913,
+      "loss": 2.4079,
+      "step": 464
+    },
+    {
+      "epoch": 0.423256343156218,
+      "grad_norm": 0.3939046859741211,
+      "learning_rate": 0.00012553341909366978,
+      "loss": 2.0749,
+      "step": 465
+    },
+    {
+      "epoch": 0.4241665718511776,
+      "grad_norm": 0.36949658393859863,
+      "learning_rate": 0.00012525439148379128,
+      "loss": 2.1471,
+      "step": 466
+    },
+    {
+      "epoch": 0.4250768005461372,
+      "grad_norm": 0.3828236758708954,
+      "learning_rate": 0.00012497515369913685,
+      "loss": 2.0466,
+      "step": 467
+    },
+    {
+      "epoch": 0.4259870292410968,
+      "grad_norm": 0.3874993920326233,
+      "learning_rate": 0.00012469570806360875,
+      "loss": 2.1605,
+      "step": 468
+    },
+    {
+      "epoch": 0.42689725793605643,
+      "grad_norm": 0.3854924738407135,
+      "learning_rate": 0.00012441605690283915,
+      "loss": 2.0584,
+      "step": 469
+    },
+    {
+      "epoch": 0.42780748663101603,
+      "grad_norm": 0.40301740169525146,
+      "learning_rate": 0.00012413620254417057,
+      "loss": 2.1481,
+      "step": 470
+    },
+    {
+      "epoch": 0.42871771532597563,
+      "grad_norm": 0.3891369104385376,
+      "learning_rate": 0.00012385614731663666,
+      "loss": 2.1968,
+      "step": 471
+    },
+    {
+      "epoch": 0.42962794402093524,
+      "grad_norm": 0.4305795729160309,
+      "learning_rate": 0.00012357589355094275,
+      "loss": 2.0421,
+      "step": 472
+    },
+    {
+      "epoch": 0.4305381727158949,
+      "grad_norm": 0.44661635160446167,
+      "learning_rate": 0.0001232954435794464,
+      "loss": 2.3347,
+      "step": 473
+    },
+    {
+      "epoch": 0.4314484014108545,
+      "grad_norm": 0.3984116315841675,
+      "learning_rate": 0.00012301479973613822,
+      "loss": 2.1093,
+      "step": 474
+    },
+    {
+      "epoch": 0.4323586301058141,
+      "grad_norm": 0.4153747856616974,
+      "learning_rate": 0.00012273396435662212,
+      "loss": 2.0698,
+      "step": 475
+    },
+    {
+      "epoch": 0.4332688588007737,
+      "grad_norm": 0.4589189887046814,
+      "learning_rate": 0.00012245293977809605,
+      "loss": 2.1707,
+      "step": 476
+    },
+    {
+      "epoch": 0.4341790874957333,
+      "grad_norm": 0.43936577439308167,
+      "learning_rate": 0.0001221717283393326,
+      "loss": 2.2608,
+      "step": 477
+    },
+    {
+      "epoch": 0.4350893161906929,
+      "grad_norm": 0.4170132279396057,
+      "learning_rate": 0.0001218903323806595,
+      "loss": 2.0813,
+      "step": 478
+    },
+    {
+      "epoch": 0.4359995448856525,
+      "grad_norm": 0.43124523758888245,
+      "learning_rate": 0.00012160875424393996,
+      "loss": 2.1674,
+      "step": 479
+    },
+    {
+      "epoch": 0.4369097735806121,
+      "grad_norm": 0.4394627511501312,
+      "learning_rate": 0.00012132699627255347,
+      "loss": 2.1904,
+      "step": 480
+    },
+    {
+      "epoch": 0.4378200022755717,
+      "grad_norm": 0.4404590427875519,
+      "learning_rate": 0.00012104506081137608,
+      "loss": 2.1313,
+      "step": 481
+    },
+    {
+      "epoch": 0.4387302309705313,
+      "grad_norm": 0.4580220878124237,
+      "learning_rate": 0.00012076295020676103,
+      "loss": 2.16,
+      "step": 482
+    },
+    {
+      "epoch": 0.439640459665491,
+      "grad_norm": 0.4533630311489105,
+      "learning_rate": 0.00012048066680651908,
+      "loss": 2.1153,
+      "step": 483
+    },
+    {
+      "epoch": 0.4405506883604506,
+      "grad_norm": 0.47520536184310913,
+      "learning_rate": 0.00012019821295989912,
+      "loss": 2.2152,
+      "step": 484
+    },
+    {
+      "epoch": 0.4414609170554102,
+      "grad_norm": 0.44196072220802307,
+      "learning_rate": 0.00011991559101756852,
+      "loss": 2.1375,
+      "step": 485
+    },
+    {
+      "epoch": 0.4423711457503698,
+      "grad_norm": 0.43681493401527405,
+      "learning_rate": 0.00011963280333159358,
+      "loss": 2.0552,
+      "step": 486
+    },
+    {
+      "epoch": 0.4432813744453294,
+      "grad_norm": 0.4537602961063385,
+      "learning_rate": 0.00011934985225541998,
+      "loss": 2.1473,
+      "step": 487
+    },
+    {
+      "epoch": 0.444191603140289,
+      "grad_norm": 0.4935773015022278,
+      "learning_rate": 0.00011906674014385318,
+      "loss": 2.0623,
+      "step": 488
+    },
+    {
+      "epoch": 0.4451018318352486,
+      "grad_norm": 0.4802737236022949,
+      "learning_rate": 0.00011878346935303883,
+      "loss": 2.2908,
+      "step": 489
+    },
+    {
+      "epoch": 0.4460120605302082,
+      "grad_norm": 0.5020537376403809,
+      "learning_rate": 0.00011850004224044315,
+      "loss": 2.3101,
+      "step": 490
+    },
+    {
+      "epoch": 0.4469222892251678,
+      "grad_norm": 0.5106056332588196,
+      "learning_rate": 0.00011821646116483335,
+      "loss": 2.2838,
+      "step": 491
+    },
+    {
+      "epoch": 0.44783251792012746,
+      "grad_norm": 0.473910391330719,
+      "learning_rate": 0.00011793272848625797,
+      "loss": 2.0599,
+      "step": 492
+    },
+    {
+      "epoch": 0.44874274661508706,
+      "grad_norm": 0.5086584091186523,
+      "learning_rate": 0.0001176488465660271,
+      "loss": 2.1578,
+      "step": 493
+    },
+    {
+      "epoch": 0.44965297531004667,
+      "grad_norm": 0.5282394886016846,
+      "learning_rate": 0.00011736481776669306,
+      "loss": 2.2965,
+      "step": 494
+    },
+    {
+      "epoch": 0.45056320400500627,
+      "grad_norm": 0.5987780094146729,
+      "learning_rate": 0.00011708064445203042,
+      "loss": 2.3542,
+      "step": 495
+    },
+    {
+      "epoch": 0.45147343269996587,
+      "grad_norm": 0.5943189859390259,
+      "learning_rate": 0.00011679632898701649,
+      "loss": 2.4294,
+      "step": 496
+    },
+    {
+      "epoch": 0.4523836613949255,
+      "grad_norm": 0.6443737149238586,
+      "learning_rate": 0.0001165118737378116,
+      "loss": 2.605,
+      "step": 497
+    },
+    {
+      "epoch": 0.4532938900898851,
+      "grad_norm": 0.7082577347755432,
+      "learning_rate": 0.00011622728107173946,
+      "loss": 2.4254,
+      "step": 498
+    },
+    {
+      "epoch": 0.4542041187848447,
+      "grad_norm": 0.8503845930099487,
+      "learning_rate": 0.00011594255335726724,
+      "loss": 2.5187,
+      "step": 499
+    },
+    {
+      "epoch": 0.4551143474798043,
+      "grad_norm": 1.6775977611541748,
+      "learning_rate": 0.00011565769296398618,
+      "loss": 2.6669,
+      "step": 500
+    },
+    {
+      "epoch": 0.4560245761747639,
+      "grad_norm": 0.45572495460510254,
+      "learning_rate": 0.00011537270226259169,
+      "loss": 2.5806,
+      "step": 501
+    },
+    {
+      "epoch": 0.45693480486972354,
+      "grad_norm": 0.45138293504714966,
+      "learning_rate": 0.00011508758362486358,
+      "loss": 2.3935,
+      "step": 502
+    },
+    {
+      "epoch": 0.45784503356468315,
+      "grad_norm": 0.4548013210296631,
+      "learning_rate": 0.00011480233942364645,
+      "loss": 2.321,
+      "step": 503
+    },
+    {
+      "epoch": 0.45875526225964275,
+      "grad_norm": 0.434442400932312,
+      "learning_rate": 0.00011451697203282982,
+      "loss": 2.375,
+      "step": 504
+    },
+    {
+      "epoch": 0.45966549095460235,
+      "grad_norm": 0.4139295816421509,
+      "learning_rate": 0.00011423148382732853,
+      "loss": 2.3997,
+      "step": 505
+    },
+    {
+      "epoch": 0.46057571964956195,
+      "grad_norm": 0.46020230650901794,
+      "learning_rate": 0.00011394587718306275,
+      "loss": 2.5745,
+      "step": 506
+    },
+    {
+      "epoch": 0.46148594834452156,
+      "grad_norm": 0.4194343090057373,
+      "learning_rate": 0.00011366015447693837,
+      "loss": 2.2597,
+      "step": 507
+    },
+    {
+      "epoch": 0.46239617703948116,
+      "grad_norm": 0.43983832001686096,
+      "learning_rate": 0.0001133743180868273,
+      "loss": 2.3511,
+      "step": 508
+    },
+    {
+      "epoch": 0.46330640573444076,
+      "grad_norm": 0.41047292947769165,
+      "learning_rate": 0.00011308837039154739,
+      "loss": 2.2614,
+      "step": 509
+    },
+    {
+      "epoch": 0.46421663442940037,
+      "grad_norm": 0.4110110104084015,
+      "learning_rate": 0.0001128023137708429,
+      "loss": 2.2719,
+      "step": 510
+    },
+    {
+      "epoch": 0.46512686312435997,
+      "grad_norm": 0.41848358511924744,
+      "learning_rate": 0.0001125161506053646,
+      "loss": 2.3872,
+      "step": 511
+    },
+    {
+      "epoch": 0.4660370918193196,
+      "grad_norm": 0.39852631092071533,
+      "learning_rate": 0.00011222988327664997,
+      "loss": 2.2001,
+      "step": 512
+    },
+    {
+      "epoch": 0.46694732051427923,
+      "grad_norm": 0.4060978293418884,
+      "learning_rate": 0.00011194351416710324,
+      "loss": 2.2474,
+      "step": 513
+    },
+    {
+      "epoch": 0.46785754920923883,
+      "grad_norm": 0.4010358452796936,
+      "learning_rate": 0.00011165704565997593,
+      "loss": 2.1262,
+      "step": 514
+    },
+    {
+      "epoch": 0.46876777790419843,
+      "grad_norm": 0.4063378572463989,
+      "learning_rate": 0.00011137048013934656,
+      "loss": 2.1583,
+      "step": 515
+    },
+    {
+      "epoch": 0.46967800659915804,
+      "grad_norm": 0.40287846326828003,
+      "learning_rate": 0.00011108381999010111,
+      "loss": 2.2351,
+      "step": 516
+    },
+    {
+      "epoch": 0.47058823529411764,
+      "grad_norm": 0.3861018717288971,
+      "learning_rate": 0.00011079706759791311,
+      "loss": 2.195,
+      "step": 517
+    },
+    {
+      "epoch": 0.47149846398907724,
+      "grad_norm": 0.38855546712875366,
+      "learning_rate": 0.00011051022534922371,
+      "loss": 2.1575,
+      "step": 518
+    },
+    {
+      "epoch": 0.47240869268403685,
+      "grad_norm": 0.3941628038883209,
+      "learning_rate": 0.00011022329563122191,
+      "loss": 2.2324,
+      "step": 519
+    },
+    {
+      "epoch": 0.47331892137899645,
+      "grad_norm": 0.40604814887046814,
+      "learning_rate": 0.00010993628083182467,
+      "loss": 2.1641,
+      "step": 520
+    },
+    {
+      "epoch": 0.4742291500739561,
+      "grad_norm": 0.407815158367157,
+      "learning_rate": 0.000109649183339657,
+      "loss": 2.1648,
+      "step": 521
+    },
+    {
+      "epoch": 0.4751393787689157,
+      "grad_norm": 0.400680810213089,
+      "learning_rate": 0.00010936200554403209,
+      "loss": 2.1939,
+      "step": 522
+    },
+    {
+      "epoch": 0.4760496074638753,
+      "grad_norm": 0.416537344455719,
+      "learning_rate": 0.00010907474983493144,
+      "loss": 2.1694,
+      "step": 523
+    },
+    {
+      "epoch": 0.4769598361588349,
+      "grad_norm": 0.4097869396209717,
+      "learning_rate": 0.00010878741860298503,
+      "loss": 2.1785,
+      "step": 524
+    },
+    {
+      "epoch": 0.4778700648537945,
+      "grad_norm": 0.4243004024028778,
+      "learning_rate": 0.00010850001423945126,
+      "loss": 1.9963,
+      "step": 525
+    },
+    {
+      "epoch": 0.4787802935487541,
+      "grad_norm": 0.41958731412887573,
+      "learning_rate": 0.00010821253913619726,
+      "loss": 2.1629,
+      "step": 526
+    },
+    {
+      "epoch": 0.4796905222437137,
+      "grad_norm": 0.4177284240722656,
+      "learning_rate": 0.00010792499568567884,
+      "loss": 2.1276,
+      "step": 527
+    },
+    {
+      "epoch": 0.4806007509386733,
+      "grad_norm": 0.41077664494514465,
+      "learning_rate": 0.00010763738628092062,
+      "loss": 2.0852,
+      "step": 528
+    },
+    {
+      "epoch": 0.48151097963363293,
+      "grad_norm": 0.4098223149776459,
+      "learning_rate": 0.00010734971331549603,
+      "loss": 1.9977,
+      "step": 529
+    },
+    {
+      "epoch": 0.48242120832859253,
+      "grad_norm": 0.42255935072898865,
+      "learning_rate": 0.00010706197918350758,
+      "loss": 1.9822,
+      "step": 530
+    },
+    {
+      "epoch": 0.4833314370235522,
+      "grad_norm": 0.45597127079963684,
+      "learning_rate": 0.0001067741862795668,
+      "loss": 2.1072,
+      "step": 531
+    },
+    {
+      "epoch": 0.4842416657185118,
+      "grad_norm": 0.4538208544254303,
+      "learning_rate": 0.0001064863369987743,
+      "loss": 2.41,
+      "step": 532
+    },
+    {
+      "epoch": 0.4851518944134714,
+      "grad_norm": 0.4586673676967621,
+      "learning_rate": 0.00010619843373669993,
+      "loss": 2.1736,
+      "step": 533
+    },
+    {
+      "epoch": 0.486062123108431,
+      "grad_norm": 0.4433608055114746,
+      "learning_rate": 0.00010591047888936274,
+      "loss": 2.1324,
+      "step": 534
+    },
+    {
+      "epoch": 0.4869723518033906,
+      "grad_norm": 0.4421234428882599,
+      "learning_rate": 0.00010562247485321115,
+      "loss": 2.0689,
+      "step": 535
+    },
+    {
+      "epoch": 0.4878825804983502,
+      "grad_norm": 0.46843069791793823,
+      "learning_rate": 0.00010533442402510284,
+      "loss": 2.2252,
+      "step": 536
+    },
+    {
+      "epoch": 0.4887928091933098,
+      "grad_norm": 0.4747142493724823,
+      "learning_rate": 0.00010504632880228498,
+      "loss": 2.2503,
+      "step": 537
+    },
+    {
+      "epoch": 0.4897030378882694,
+      "grad_norm": 0.46643224358558655,
+      "learning_rate": 0.00010475819158237425,
+      "loss": 2.2628,
+      "step": 538
+    },
+    {
+      "epoch": 0.490613266583229,
+      "grad_norm": 0.47085490822792053,
+      "learning_rate": 0.00010447001476333673,
+      "loss": 2.0888,
+      "step": 539
+    },
+    {
+      "epoch": 0.49152349527818867,
+      "grad_norm": 0.5102598071098328,
+      "learning_rate": 0.00010418180074346815,
+      "loss": 2.2736,
+      "step": 540
+    },
+    {
+      "epoch": 0.4924337239731483,
+      "grad_norm": 0.49878573417663574,
+      "learning_rate": 0.00010389355192137377,
+      "loss": 2.1107,
+      "step": 541
+    },
+    {
+      "epoch": 0.4933439526681079,
+      "grad_norm": 0.5236616134643555,
+      "learning_rate": 0.00010360527069594859,
+      "loss": 2.4099,
+      "step": 542
+    },
+    {
+      "epoch": 0.4942541813630675,
+      "grad_norm": 0.49875032901763916,
+      "learning_rate": 0.00010331695946635708,
+      "loss": 2.1381,
+      "step": 543
+    },
+    {
+      "epoch": 0.4951644100580271,
+      "grad_norm": 0.5333012938499451,
+      "learning_rate": 0.00010302862063201367,
+      "loss": 2.2274,
+      "step": 544
+    },
+    {
+      "epoch": 0.4960746387529867,
+      "grad_norm": 0.5504993200302124,
+      "learning_rate": 0.00010274025659256232,
+      "loss": 2.2348,
+      "step": 545
+    },
+    {
+      "epoch": 0.4969848674479463,
+      "grad_norm": 0.5924202799797058,
+      "learning_rate": 0.00010245186974785685,
+      "loss": 2.3686,
+      "step": 546
+    },
+    {
+      "epoch": 0.4978950961429059,
+      "grad_norm": 0.6003567576408386,
+      "learning_rate": 0.00010216346249794087,
+      "loss": 2.3336,
+      "step": 547
+    },
+    {
+      "epoch": 0.4988053248378655,
+      "grad_norm": 0.6700019836425781,
+      "learning_rate": 0.00010187503724302776,
+      "loss": 2.4446,
+      "step": 548
+    },
+    {
+      "epoch": 0.4997155535328251,
+      "grad_norm": 0.8171781897544861,
+      "learning_rate": 0.00010158659638348081,
+      "loss": 2.4278,
+      "step": 549
+    },
+    {
+      "epoch": 0.5006257822277848,
+      "grad_norm": 1.4212020635604858,
+      "learning_rate": 0.0001012981423197931,
+      "loss": 2.6229,
+      "step": 550
+    },
+    {
+      "epoch": 0.5006257822277848,
+      "eval_loss": 2.2479705810546875,
+      "eval_runtime": 205.3622,
+      "eval_samples_per_second": 9.013,
+      "eval_steps_per_second": 4.509,
+      "step": 550
     }
   ],
   "logging_steps": 1,
       "attributes": {}
     }
   },
+  "total_flos": 1.0074178982447677e+18,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null