{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 3022,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0026472534745201853,
"grad_norm": 24.50491714477539,
"learning_rate": 2.631578947368421e-06,
"loss": 6.5473,
"step": 4
},
{
"epoch": 0.005294506949040371,
"grad_norm": 17.426511764526367,
"learning_rate": 5.263157894736842e-06,
"loss": 6.2116,
"step": 8
},
{
"epoch": 0.007941760423560556,
"grad_norm": 6.35976505279541,
"learning_rate": 7.894736842105263e-06,
"loss": 5.7967,
"step": 12
},
{
"epoch": 0.010589013898080741,
"grad_norm": 5.454939842224121,
"learning_rate": 1.0526315789473684e-05,
"loss": 5.3365,
"step": 16
},
{
"epoch": 0.013236267372600927,
"grad_norm": 4.607099533081055,
"learning_rate": 1.3157894736842106e-05,
"loss": 4.7105,
"step": 20
},
{
"epoch": 0.01588352084712111,
"grad_norm": 3.6498019695281982,
"learning_rate": 1.5789473684210526e-05,
"loss": 4.286,
"step": 24
},
{
"epoch": 0.018530774321641297,
"grad_norm": 4.196900844573975,
"learning_rate": 1.8421052631578947e-05,
"loss": 4.3134,
"step": 28
},
{
"epoch": 0.021178027796161483,
"grad_norm": 3.617469072341919,
"learning_rate": 2.105263157894737e-05,
"loss": 3.7494,
"step": 32
},
{
"epoch": 0.02382528127068167,
"grad_norm": 3.05267333984375,
"learning_rate": 2.368421052631579e-05,
"loss": 3.8046,
"step": 36
},
{
"epoch": 0.026472534745201854,
"grad_norm": 2.607614517211914,
"learning_rate": 2.6315789473684212e-05,
"loss": 3.385,
"step": 40
},
{
"epoch": 0.02911978821972204,
"grad_norm": 2.536888837814331,
"learning_rate": 2.8947368421052634e-05,
"loss": 3.3516,
"step": 44
},
{
"epoch": 0.03176704169424222,
"grad_norm": 2.315871000289917,
"learning_rate": 3.157894736842105e-05,
"loss": 3.0795,
"step": 48
},
{
"epoch": 0.03441429516876241,
"grad_norm": 2.3058571815490723,
"learning_rate": 3.421052631578947e-05,
"loss": 3.0708,
"step": 52
},
{
"epoch": 0.037061548643282594,
"grad_norm": 2.067796230316162,
"learning_rate": 3.6842105263157895e-05,
"loss": 2.8311,
"step": 56
},
{
"epoch": 0.03970880211780278,
"grad_norm": 1.9578440189361572,
"learning_rate": 3.9473684210526316e-05,
"loss": 2.696,
"step": 60
},
{
"epoch": 0.042356055592322965,
"grad_norm": 2.043933629989624,
"learning_rate": 4.210526315789474e-05,
"loss": 2.7501,
"step": 64
},
{
"epoch": 0.04500330906684315,
"grad_norm": 1.82830810546875,
"learning_rate": 4.473684210526316e-05,
"loss": 2.5058,
"step": 68
},
{
"epoch": 0.04765056254136334,
"grad_norm": 1.8841806650161743,
"learning_rate": 4.736842105263158e-05,
"loss": 2.5708,
"step": 72
},
{
"epoch": 0.05029781601588352,
"grad_norm": 1.9775539636611938,
"learning_rate": 5e-05,
"loss": 2.6332,
"step": 76
},
{
"epoch": 0.05294506949040371,
"grad_norm": 1.7908610105514526,
"learning_rate": 5.2631578947368424e-05,
"loss": 2.5441,
"step": 80
},
{
"epoch": 0.05559232296492389,
"grad_norm": 1.977647066116333,
"learning_rate": 5.526315789473685e-05,
"loss": 2.3617,
"step": 84
},
{
"epoch": 0.05823957643944408,
"grad_norm": 2.008470296859741,
"learning_rate": 5.789473684210527e-05,
"loss": 2.3994,
"step": 88
},
{
"epoch": 0.06088682991396426,
"grad_norm": 2.070720911026001,
"learning_rate": 6.052631578947369e-05,
"loss": 2.3509,
"step": 92
},
{
"epoch": 0.06353408338848444,
"grad_norm": 2.0442869663238525,
"learning_rate": 6.31578947368421e-05,
"loss": 2.35,
"step": 96
},
{
"epoch": 0.06618133686300463,
"grad_norm": 1.8274725675582886,
"learning_rate": 6.578947368421054e-05,
"loss": 2.2802,
"step": 100
},
{
"epoch": 0.06882859033752482,
"grad_norm": 1.9744892120361328,
"learning_rate": 6.842105263157895e-05,
"loss": 2.4711,
"step": 104
},
{
"epoch": 0.071475843812045,
"grad_norm": 1.881946086883545,
"learning_rate": 7.105263157894737e-05,
"loss": 2.3495,
"step": 108
},
{
"epoch": 0.07412309728656519,
"grad_norm": 1.7632906436920166,
"learning_rate": 7.368421052631579e-05,
"loss": 2.1906,
"step": 112
},
{
"epoch": 0.07677035076108538,
"grad_norm": 1.8465447425842285,
"learning_rate": 7.631578947368422e-05,
"loss": 2.4193,
"step": 116
},
{
"epoch": 0.07941760423560557,
"grad_norm": 1.978273868560791,
"learning_rate": 7.894736842105263e-05,
"loss": 2.3229,
"step": 120
},
{
"epoch": 0.08206485771012574,
"grad_norm": 1.9878270626068115,
"learning_rate": 8.157894736842105e-05,
"loss": 2.3028,
"step": 124
},
{
"epoch": 0.08471211118464593,
"grad_norm": 1.7065322399139404,
"learning_rate": 8.421052631578948e-05,
"loss": 2.244,
"step": 128
},
{
"epoch": 0.08735936465916612,
"grad_norm": 1.8170701265335083,
"learning_rate": 8.68421052631579e-05,
"loss": 2.1112,
"step": 132
},
{
"epoch": 0.0900066181336863,
"grad_norm": 1.9288476705551147,
"learning_rate": 8.947368421052632e-05,
"loss": 2.3551,
"step": 136
},
{
"epoch": 0.09265387160820648,
"grad_norm": 1.8695253133773804,
"learning_rate": 9.210526315789474e-05,
"loss": 2.2814,
"step": 140
},
{
"epoch": 0.09530112508272667,
"grad_norm": 1.7066093683242798,
"learning_rate": 9.473684210526316e-05,
"loss": 1.9036,
"step": 144
},
{
"epoch": 0.09794837855724686,
"grad_norm": 1.8588757514953613,
"learning_rate": 9.736842105263158e-05,
"loss": 2.0139,
"step": 148
},
{
"epoch": 0.10059563203176704,
"grad_norm": 1.789518117904663,
"learning_rate": 0.0001,
"loss": 2.1809,
"step": 152
},
{
"epoch": 0.10324288550628723,
"grad_norm": 1.9242740869522095,
"learning_rate": 9.999952071344157e-05,
"loss": 2.301,
"step": 156
},
{
"epoch": 0.10589013898080742,
"grad_norm": 1.7974549531936646,
"learning_rate": 9.999808286295485e-05,
"loss": 2.2312,
"step": 160
},
{
"epoch": 0.10853739245532759,
"grad_norm": 1.7276233434677124,
"learning_rate": 9.999568647610555e-05,
"loss": 2.1109,
"step": 164
},
{
"epoch": 0.11118464592984778,
"grad_norm": 1.8286519050598145,
"learning_rate": 9.999233159883593e-05,
"loss": 2.0782,
"step": 168
},
{
"epoch": 0.11383189940436797,
"grad_norm": 1.919313907623291,
"learning_rate": 9.998801829546386e-05,
"loss": 2.0693,
"step": 172
},
{
"epoch": 0.11647915287888816,
"grad_norm": 1.6544960737228394,
"learning_rate": 9.998274664868173e-05,
"loss": 2.0982,
"step": 176
},
{
"epoch": 0.11912640635340833,
"grad_norm": 1.8223872184753418,
"learning_rate": 9.997651675955466e-05,
"loss": 2.1379,
"step": 180
},
{
"epoch": 0.12177365982792852,
"grad_norm": 1.7743052244186401,
"learning_rate": 9.996932874751877e-05,
"loss": 2.0637,
"step": 184
},
{
"epoch": 0.12442091330244871,
"grad_norm": 1.7228261232376099,
"learning_rate": 9.996118275037873e-05,
"loss": 2.1696,
"step": 188
},
{
"epoch": 0.1270681667769689,
"grad_norm": 1.6266913414001465,
"learning_rate": 9.995207892430524e-05,
"loss": 2.1247,
"step": 192
},
{
"epoch": 0.12971542025148908,
"grad_norm": 1.8206615447998047,
"learning_rate": 9.994201744383196e-05,
"loss": 2.1831,
"step": 196
},
{
"epoch": 0.13236267372600927,
"grad_norm": 1.943579912185669,
"learning_rate": 9.993099850185216e-05,
"loss": 1.9262,
"step": 200
},
{
"epoch": 0.13500992720052946,
"grad_norm": 1.89098060131073,
"learning_rate": 9.991902230961511e-05,
"loss": 2.2636,
"step": 204
},
{
"epoch": 0.13765718067504965,
"grad_norm": 1.8418017625808716,
"learning_rate": 9.99060890967219e-05,
"loss": 2.2454,
"step": 208
},
{
"epoch": 0.14030443414956983,
"grad_norm": 1.7433375120162964,
"learning_rate": 9.989219911112113e-05,
"loss": 2.2591,
"step": 212
},
{
"epoch": 0.14295168762409,
"grad_norm": 1.885964035987854,
"learning_rate": 9.987735261910417e-05,
"loss": 1.9402,
"step": 216
},
{
"epoch": 0.14559894109861019,
"grad_norm": 1.626397728919983,
"learning_rate": 9.986154990529995e-05,
"loss": 2.119,
"step": 220
},
{
"epoch": 0.14824619457313037,
"grad_norm": 1.5490047931671143,
"learning_rate": 9.984479127266961e-05,
"loss": 1.8635,
"step": 224
},
{
"epoch": 0.15089344804765056,
"grad_norm": 1.5588316917419434,
"learning_rate": 9.982707704250065e-05,
"loss": 1.8135,
"step": 228
},
{
"epoch": 0.15354070152217075,
"grad_norm": 1.9416462182998657,
"learning_rate": 9.980840755440075e-05,
"loss": 2.2288,
"step": 232
},
{
"epoch": 0.15618795499669094,
"grad_norm": 1.5774728059768677,
"learning_rate": 9.978878316629133e-05,
"loss": 1.9254,
"step": 236
},
{
"epoch": 0.15883520847121113,
"grad_norm": 1.6661707162857056,
"learning_rate": 9.976820425440058e-05,
"loss": 2.0111,
"step": 240
},
{
"epoch": 0.1614824619457313,
"grad_norm": 1.5805509090423584,
"learning_rate": 9.974667121325634e-05,
"loss": 2.0657,
"step": 244
},
{
"epoch": 0.16412971542025148,
"grad_norm": 1.7854478359222412,
"learning_rate": 9.972418445567851e-05,
"loss": 1.8586,
"step": 248
},
{
"epoch": 0.16677696889477167,
"grad_norm": 1.61441171169281,
"learning_rate": 9.97007444127711e-05,
"loss": 1.9234,
"step": 252
},
{
"epoch": 0.16942422236929186,
"grad_norm": 2.154454469680786,
"learning_rate": 9.967635153391401e-05,
"loss": 1.949,
"step": 256
},
{
"epoch": 0.17207147584381205,
"grad_norm": 1.5182636976242065,
"learning_rate": 9.965100628675441e-05,
"loss": 2.013,
"step": 260
},
{
"epoch": 0.17471872931833224,
"grad_norm": 1.751714825630188,
"learning_rate": 9.962470915719775e-05,
"loss": 1.9629,
"step": 264
},
{
"epoch": 0.17736598279285243,
"grad_norm": 1.5807703733444214,
"learning_rate": 9.959746064939846e-05,
"loss": 1.8705,
"step": 268
},
{
"epoch": 0.1800132362673726,
"grad_norm": 1.7142225503921509,
"learning_rate": 9.956926128575026e-05,
"loss": 2.0033,
"step": 272
},
{
"epoch": 0.18266048974189278,
"grad_norm": 1.555530309677124,
"learning_rate": 9.954011160687622e-05,
"loss": 1.8995,
"step": 276
},
{
"epoch": 0.18530774321641297,
"grad_norm": 1.5679693222045898,
"learning_rate": 9.951001217161829e-05,
"loss": 2.042,
"step": 280
},
{
"epoch": 0.18795499669093316,
"grad_norm": 1.6399418115615845,
"learning_rate": 9.947896355702666e-05,
"loss": 2.0388,
"step": 284
},
{
"epoch": 0.19060225016545335,
"grad_norm": 1.7505602836608887,
"learning_rate": 9.944696635834867e-05,
"loss": 1.9648,
"step": 288
},
{
"epoch": 0.19324950363997354,
"grad_norm": 1.4888848066329956,
"learning_rate": 9.941402118901744e-05,
"loss": 1.8595,
"step": 292
},
{
"epoch": 0.19589675711449372,
"grad_norm": 1.4739277362823486,
"learning_rate": 9.938012868064e-05,
"loss": 1.7959,
"step": 296
},
{
"epoch": 0.1985440105890139,
"grad_norm": 1.5393471717834473,
"learning_rate": 9.934528948298533e-05,
"loss": 1.9469,
"step": 300
},
{
"epoch": 0.20119126406353408,
"grad_norm": 1.5673627853393555,
"learning_rate": 9.930950426397179e-05,
"loss": 1.9332,
"step": 304
},
{
"epoch": 0.20383851753805426,
"grad_norm": 1.6461111307144165,
"learning_rate": 9.927277370965435e-05,
"loss": 1.8055,
"step": 308
},
{
"epoch": 0.20648577101257445,
"grad_norm": 1.5950462818145752,
"learning_rate": 9.923509852421145e-05,
"loss": 1.8414,
"step": 312
},
{
"epoch": 0.20913302448709464,
"grad_norm": 1.433727741241455,
"learning_rate": 9.919647942993148e-05,
"loss": 1.9514,
"step": 316
},
{
"epoch": 0.21178027796161483,
"grad_norm": 1.445776343345642,
"learning_rate": 9.915691716719898e-05,
"loss": 1.7297,
"step": 320
},
{
"epoch": 0.21442753143613502,
"grad_norm": 1.9325745105743408,
"learning_rate": 9.911641249448036e-05,
"loss": 1.9855,
"step": 324
},
{
"epoch": 0.21707478491065518,
"grad_norm": 1.494813323020935,
"learning_rate": 9.907496618830942e-05,
"loss": 1.7916,
"step": 328
},
{
"epoch": 0.21972203838517537,
"grad_norm": 1.4863932132720947,
"learning_rate": 9.903257904327249e-05,
"loss": 1.8029,
"step": 332
},
{
"epoch": 0.22236929185969556,
"grad_norm": 1.594827651977539,
"learning_rate": 9.898925187199308e-05,
"loss": 1.9516,
"step": 336
},
{
"epoch": 0.22501654533421575,
"grad_norm": 1.5738781690597534,
"learning_rate": 9.894498550511646e-05,
"loss": 1.8997,
"step": 340
},
{
"epoch": 0.22766379880873594,
"grad_norm": 1.5598024129867554,
"learning_rate": 9.88997807912936e-05,
"loss": 1.9249,
"step": 344
},
{
"epoch": 0.23031105228325613,
"grad_norm": 1.4761321544647217,
"learning_rate": 9.885363859716497e-05,
"loss": 1.7571,
"step": 348
},
{
"epoch": 0.23295830575777632,
"grad_norm": 1.4266904592514038,
"learning_rate": 9.88065598073439e-05,
"loss": 1.9811,
"step": 352
},
{
"epoch": 0.23560555923229648,
"grad_norm": 1.5371057987213135,
"learning_rate": 9.875854532439964e-05,
"loss": 1.8021,
"step": 356
},
{
"epoch": 0.23825281270681667,
"grad_norm": 1.380096673965454,
"learning_rate": 9.870959606884004e-05,
"loss": 1.8223,
"step": 360
},
{
"epoch": 0.24090006618133686,
"grad_norm": 1.632664680480957,
"learning_rate": 9.865971297909393e-05,
"loss": 2.006,
"step": 364
},
{
"epoch": 0.24354731965585705,
"grad_norm": 1.3765276670455933,
"learning_rate": 9.860889701149307e-05,
"loss": 1.7893,
"step": 368
},
{
"epoch": 0.24619457313037724,
"grad_norm": 1.5789958238601685,
"learning_rate": 9.855714914025384e-05,
"loss": 1.9381,
"step": 372
},
{
"epoch": 0.24884182660489743,
"grad_norm": 1.8294042348861694,
"learning_rate": 9.850447035745866e-05,
"loss": 1.8584,
"step": 376
},
{
"epoch": 0.2514890800794176,
"grad_norm": 1.5388972759246826,
"learning_rate": 9.845086167303679e-05,
"loss": 1.8763,
"step": 380
},
{
"epoch": 0.2541363335539378,
"grad_norm": 1.5301390886306763,
"learning_rate": 9.839632411474513e-05,
"loss": 2.0612,
"step": 384
},
{
"epoch": 0.256783587028458,
"grad_norm": 1.546277642250061,
"learning_rate": 9.83408587281484e-05,
"loss": 1.9085,
"step": 388
},
{
"epoch": 0.25943084050297816,
"grad_norm": 1.5818853378295898,
"learning_rate": 9.828446657659918e-05,
"loss": 2.0181,
"step": 392
},
{
"epoch": 0.26207809397749837,
"grad_norm": 1.2648255825042725,
"learning_rate": 9.82271487412175e-05,
"loss": 1.6947,
"step": 396
},
{
"epoch": 0.26472534745201853,
"grad_norm": 1.541934847831726,
"learning_rate": 9.816890632087006e-05,
"loss": 1.8053,
"step": 400
},
{
"epoch": 0.2673726009265387,
"grad_norm": 1.5966472625732422,
"learning_rate": 9.810974043214922e-05,
"loss": 1.8733,
"step": 404
},
{
"epoch": 0.2700198544010589,
"grad_norm": 1.5871154069900513,
"learning_rate": 9.804965220935161e-05,
"loss": 1.896,
"step": 408
},
{
"epoch": 0.2726671078755791,
"grad_norm": 1.4850573539733887,
"learning_rate": 9.798864280445632e-05,
"loss": 1.8494,
"step": 412
},
{
"epoch": 0.2753143613500993,
"grad_norm": 1.4737725257873535,
"learning_rate": 9.792671338710285e-05,
"loss": 1.8145,
"step": 416
},
{
"epoch": 0.27796161482461945,
"grad_norm": 1.5895408391952515,
"learning_rate": 9.786386514456872e-05,
"loss": 1.9279,
"step": 420
},
{
"epoch": 0.28060886829913967,
"grad_norm": 1.522838830947876,
"learning_rate": 9.780009928174661e-05,
"loss": 1.9103,
"step": 424
},
{
"epoch": 0.28325612177365983,
"grad_norm": 1.4890238046646118,
"learning_rate": 9.773541702112137e-05,
"loss": 1.9306,
"step": 428
},
{
"epoch": 0.28590337524818,
"grad_norm": 1.5047945976257324,
"learning_rate": 9.766981960274653e-05,
"loss": 1.8459,
"step": 432
},
{
"epoch": 0.2885506287227002,
"grad_norm": 1.4997539520263672,
"learning_rate": 9.760330828422053e-05,
"loss": 1.7442,
"step": 436
},
{
"epoch": 0.29119788219722037,
"grad_norm": 1.389294981956482,
"learning_rate": 9.753588434066258e-05,
"loss": 1.9077,
"step": 440
},
{
"epoch": 0.2938451356717406,
"grad_norm": 1.3641945123672485,
"learning_rate": 9.746754906468832e-05,
"loss": 1.8979,
"step": 444
},
{
"epoch": 0.29649238914626075,
"grad_norm": 1.5315138101577759,
"learning_rate": 9.73983037663849e-05,
"loss": 1.8207,
"step": 448
},
{
"epoch": 0.29913964262078097,
"grad_norm": 1.5057647228240967,
"learning_rate": 9.732814977328592e-05,
"loss": 1.911,
"step": 452
},
{
"epoch": 0.3017868960953011,
"grad_norm": 1.368912696838379,
"learning_rate": 9.725708843034605e-05,
"loss": 1.8377,
"step": 456
},
{
"epoch": 0.3044341495698213,
"grad_norm": 1.389817714691162,
"learning_rate": 9.718512109991514e-05,
"loss": 1.7907,
"step": 460
},
{
"epoch": 0.3070814030443415,
"grad_norm": 1.7318735122680664,
"learning_rate": 9.711224916171215e-05,
"loss": 1.9412,
"step": 464
},
{
"epoch": 0.30972865651886167,
"grad_norm": 1.4791710376739502,
"learning_rate": 9.703847401279871e-05,
"loss": 1.7754,
"step": 468
},
{
"epoch": 0.3123759099933819,
"grad_norm": 1.3618526458740234,
"learning_rate": 9.69637970675523e-05,
"loss": 1.73,
"step": 472
},
{
"epoch": 0.31502316346790205,
"grad_norm": 1.5649083852767944,
"learning_rate": 9.688821975763918e-05,
"loss": 1.9635,
"step": 476
},
{
"epoch": 0.31767041694242226,
"grad_norm": 1.3701534271240234,
"learning_rate": 9.681174353198687e-05,
"loss": 1.6581,
"step": 480
},
{
"epoch": 0.3203176704169424,
"grad_norm": 1.4764872789382935,
"learning_rate": 9.673436985675645e-05,
"loss": 1.794,
"step": 484
},
{
"epoch": 0.3229649238914626,
"grad_norm": 1.4432624578475952,
"learning_rate": 9.665610021531447e-05,
"loss": 1.9016,
"step": 488
},
{
"epoch": 0.3256121773659828,
"grad_norm": 1.572975993156433,
"learning_rate": 9.657693610820437e-05,
"loss": 2.035,
"step": 492
},
{
"epoch": 0.32825943084050296,
"grad_norm": 1.5382163524627686,
"learning_rate": 9.649687905311785e-05,
"loss": 2.0041,
"step": 496
},
{
"epoch": 0.3309066843150232,
"grad_norm": 1.3413423299789429,
"learning_rate": 9.641593058486574e-05,
"loss": 1.7448,
"step": 500
},
{
"epoch": 0.33355393778954334,
"grad_norm": 1.4374409914016724,
"learning_rate": 9.633409225534855e-05,
"loss": 1.7816,
"step": 504
},
{
"epoch": 0.33620119126406356,
"grad_norm": 1.4096835851669312,
"learning_rate": 9.625136563352671e-05,
"loss": 1.772,
"step": 508
},
{
"epoch": 0.3388484447385837,
"grad_norm": 2.1890769004821777,
"learning_rate": 9.616775230539057e-05,
"loss": 1.8641,
"step": 512
},
{
"epoch": 0.3414956982131039,
"grad_norm": 1.4621169567108154,
"learning_rate": 9.608325387392986e-05,
"loss": 1.7406,
"step": 516
},
{
"epoch": 0.3441429516876241,
"grad_norm": 1.4140963554382324,
"learning_rate": 9.599787195910313e-05,
"loss": 1.6127,
"step": 520
},
{
"epoch": 0.34679020516214426,
"grad_norm": 1.459409236907959,
"learning_rate": 9.591160819780649e-05,
"loss": 1.7579,
"step": 524
},
{
"epoch": 0.3494374586366645,
"grad_norm": 1.7444220781326294,
"learning_rate": 9.582446424384242e-05,
"loss": 1.8177,
"step": 528
},
{
"epoch": 0.35208471211118464,
"grad_norm": 1.4114232063293457,
"learning_rate": 9.573644176788794e-05,
"loss": 1.7955,
"step": 532
},
{
"epoch": 0.35473196558570486,
"grad_norm": 1.4076716899871826,
"learning_rate": 9.564754245746264e-05,
"loss": 1.9122,
"step": 536
},
{
"epoch": 0.357379219060225,
"grad_norm": 1.4209445714950562,
"learning_rate": 9.555776801689632e-05,
"loss": 1.8108,
"step": 540
},
{
"epoch": 0.3600264725347452,
"grad_norm": 1.5626829862594604,
"learning_rate": 9.546712016729624e-05,
"loss": 1.9285,
"step": 544
},
{
"epoch": 0.3626737260092654,
"grad_norm": 1.4253438711166382,
"learning_rate": 9.537560064651427e-05,
"loss": 1.6505,
"step": 548
},
{
"epoch": 0.36532097948378556,
"grad_norm": 1.447141170501709,
"learning_rate": 9.528321120911346e-05,
"loss": 1.8303,
"step": 552
},
{
"epoch": 0.3679682329583058,
"grad_norm": 1.4913408756256104,
"learning_rate": 9.51899536263344e-05,
"loss": 1.8382,
"step": 556
},
{
"epoch": 0.37061548643282594,
"grad_norm": 1.5191394090652466,
"learning_rate": 9.509582968606136e-05,
"loss": 1.7477,
"step": 560
},
{
"epoch": 0.37326273990734615,
"grad_norm": 1.3612414598464966,
"learning_rate": 9.500084119278788e-05,
"loss": 1.7101,
"step": 564
},
{
"epoch": 0.3759099933818663,
"grad_norm": 1.3365185260772705,
"learning_rate": 9.49049899675823e-05,
"loss": 1.8855,
"step": 568
},
{
"epoch": 0.3785572468563865,
"grad_norm": 1.4907687902450562,
"learning_rate": 9.480827784805278e-05,
"loss": 1.8158,
"step": 572
},
{
"epoch": 0.3812045003309067,
"grad_norm": 1.2549834251403809,
"learning_rate": 9.471070668831208e-05,
"loss": 1.6304,
"step": 576
},
{
"epoch": 0.38385175380542685,
"grad_norm": 1.6914743185043335,
"learning_rate": 9.4612278358942e-05,
"loss": 1.6976,
"step": 580
},
{
"epoch": 0.38649900727994707,
"grad_norm": 1.5349342823028564,
"learning_rate": 9.451299474695754e-05,
"loss": 1.7323,
"step": 584
},
{
"epoch": 0.38914626075446723,
"grad_norm": 1.4379171133041382,
"learning_rate": 9.441285775577075e-05,
"loss": 1.7762,
"step": 588
},
{
"epoch": 0.39179351422898745,
"grad_norm": 1.360475778579712,
"learning_rate": 9.431186930515419e-05,
"loss": 1.7328,
"step": 592
},
{
"epoch": 0.3944407677035076,
"grad_norm": 1.4364429712295532,
"learning_rate": 9.421003133120412e-05,
"loss": 1.7363,
"step": 596
},
{
"epoch": 0.3970880211780278,
"grad_norm": 1.4598385095596313,
"learning_rate": 9.410734578630343e-05,
"loss": 1.6917,
"step": 600
},
{
"epoch": 0.399735274652548,
"grad_norm": 1.3313078880310059,
"learning_rate": 9.400381463908416e-05,
"loss": 1.8008,
"step": 604
},
{
"epoch": 0.40238252812706815,
"grad_norm": 1.5070075988769531,
"learning_rate": 9.389943987438983e-05,
"loss": 1.669,
"step": 608
},
{
"epoch": 0.40502978160158837,
"grad_norm": 1.3858133554458618,
"learning_rate": 9.379422349323728e-05,
"loss": 1.6599,
"step": 612
},
{
"epoch": 0.40767703507610853,
"grad_norm": 1.3775012493133545,
"learning_rate": 9.368816751277843e-05,
"loss": 1.628,
"step": 616
},
{
"epoch": 0.41032428855062875,
"grad_norm": 1.3733761310577393,
"learning_rate": 9.358127396626147e-05,
"loss": 1.6797,
"step": 620
},
{
"epoch": 0.4129715420251489,
"grad_norm": 1.760237455368042,
"learning_rate": 9.347354490299205e-05,
"loss": 1.7479,
"step": 624
},
{
"epoch": 0.41561879549966907,
"grad_norm": 1.2483643293380737,
"learning_rate": 9.336498238829384e-05,
"loss": 1.6595,
"step": 628
},
{
"epoch": 0.4182660489741893,
"grad_norm": 2.099116563796997,
"learning_rate": 9.325558850346897e-05,
"loss": 1.6933,
"step": 632
},
{
"epoch": 0.42091330244870945,
"grad_norm": 1.3913215398788452,
"learning_rate": 9.31453653457582e-05,
"loss": 1.6433,
"step": 636
},
{
"epoch": 0.42356055592322966,
"grad_norm": 1.3813973665237427,
"learning_rate": 9.303431502830065e-05,
"loss": 1.6652,
"step": 640
},
{
"epoch": 0.4262078093977498,
"grad_norm": 1.496819019317627,
"learning_rate": 9.292243968009331e-05,
"loss": 1.747,
"step": 644
},
{
"epoch": 0.42885506287227004,
"grad_norm": 1.37201988697052,
"learning_rate": 9.280974144595018e-05,
"loss": 1.6331,
"step": 648
},
{
"epoch": 0.4315023163467902,
"grad_norm": 1.505353331565857,
"learning_rate": 9.269622248646124e-05,
"loss": 1.7717,
"step": 652
},
{
"epoch": 0.43414956982131037,
"grad_norm": 1.8498897552490234,
"learning_rate": 9.258188497795093e-05,
"loss": 1.6643,
"step": 656
},
{
"epoch": 0.4367968232958306,
"grad_norm": 1.2886799573898315,
"learning_rate": 9.24667311124365e-05,
"loss": 1.777,
"step": 660
},
{
"epoch": 0.43944407677035074,
"grad_norm": 1.283218502998352,
"learning_rate": 9.23507630975859e-05,
"loss": 1.6958,
"step": 664
},
{
"epoch": 0.44209133024487096,
"grad_norm": 1.3919546604156494,
"learning_rate": 9.223398315667561e-05,
"loss": 1.6515,
"step": 668
},
{
"epoch": 0.4447385837193911,
"grad_norm": 1.4083247184753418,
"learning_rate": 9.211639352854787e-05,
"loss": 1.7531,
"step": 672
},
{
"epoch": 0.44738583719391134,
"grad_norm": 1.2739989757537842,
"learning_rate": 9.199799646756777e-05,
"loss": 1.7694,
"step": 676
},
{
"epoch": 0.4500330906684315,
"grad_norm": 1.4435306787490845,
"learning_rate": 9.187879424358014e-05,
"loss": 1.8044,
"step": 680
},
{
"epoch": 0.45268034414295166,
"grad_norm": 1.4848833084106445,
"learning_rate": 9.17587891418659e-05,
"loss": 1.6531,
"step": 684
},
{
"epoch": 0.4553275976174719,
"grad_norm": 1.527485966682434,
"learning_rate": 9.163798346309837e-05,
"loss": 1.8783,
"step": 688
},
{
"epoch": 0.45797485109199204,
"grad_norm": 1.2369976043701172,
"learning_rate": 9.151637952329903e-05,
"loss": 1.5479,
"step": 692
},
{
"epoch": 0.46062210456651226,
"grad_norm": 1.4693775177001953,
"learning_rate": 9.139397965379327e-05,
"loss": 1.7891,
"step": 696
},
{
"epoch": 0.4632693580410324,
"grad_norm": 1.6788188219070435,
"learning_rate": 9.127078620116556e-05,
"loss": 1.7637,
"step": 700
},
{
"epoch": 0.46591661151555264,
"grad_norm": 1.3309741020202637,
"learning_rate": 9.114680152721453e-05,
"loss": 1.6053,
"step": 704
},
{
"epoch": 0.4685638649900728,
"grad_norm": 1.509023904800415,
"learning_rate": 9.102202800890772e-05,
"loss": 1.8784,
"step": 708
},
{
"epoch": 0.47121111846459296,
"grad_norm": 1.3232872486114502,
"learning_rate": 9.089646803833589e-05,
"loss": 1.6745,
"step": 712
},
{
"epoch": 0.4738583719391132,
"grad_norm": 1.3540325164794922,
"learning_rate": 9.077012402266731e-05,
"loss": 1.6668,
"step": 716
},
{
"epoch": 0.47650562541363334,
"grad_norm": 1.3100489377975464,
"learning_rate": 9.064299838410152e-05,
"loss": 1.6188,
"step": 720
},
{
"epoch": 0.47915287888815355,
"grad_norm": 1.3783172369003296,
"learning_rate": 9.051509355982293e-05,
"loss": 1.6491,
"step": 724
},
{
"epoch": 0.4818001323626737,
"grad_norm": 1.27851402759552,
"learning_rate": 9.038641200195404e-05,
"loss": 1.8925,
"step": 728
},
{
"epoch": 0.48444738583719393,
"grad_norm": 1.4370380640029907,
"learning_rate": 9.025695617750848e-05,
"loss": 1.7996,
"step": 732
},
{
"epoch": 0.4870946393117141,
"grad_norm": 1.4078205823898315,
"learning_rate": 9.012672856834373e-05,
"loss": 1.8554,
"step": 736
},
{
"epoch": 0.48974189278623426,
"grad_norm": 1.3553669452667236,
"learning_rate": 8.999573167111348e-05,
"loss": 1.5417,
"step": 740
},
{
"epoch": 0.4923891462607545,
"grad_norm": 1.4759166240692139,
"learning_rate": 8.986396799721983e-05,
"loss": 1.6143,
"step": 744
},
{
"epoch": 0.49503639973527463,
"grad_norm": 1.3601372241973877,
"learning_rate": 8.973144007276508e-05,
"loss": 1.7011,
"step": 748
},
{
"epoch": 0.49768365320979485,
"grad_norm": 1.425181269645691,
"learning_rate": 8.959815043850336e-05,
"loss": 1.672,
"step": 752
},
{
"epoch": 0.500330906684315,
"grad_norm": 1.440303921699524,
"learning_rate": 8.946410164979184e-05,
"loss": 1.8008,
"step": 756
},
{
"epoch": 0.5029781601588352,
"grad_norm": 1.4576961994171143,
"learning_rate": 8.932929627654185e-05,
"loss": 1.5234,
"step": 760
},
{
"epoch": 0.5056254136333554,
"grad_norm": 1.3088816404342651,
"learning_rate": 8.919373690316952e-05,
"loss": 1.701,
"step": 764
},
{
"epoch": 0.5082726671078756,
"grad_norm": 3.7521555423736572,
"learning_rate": 8.905742612854628e-05,
"loss": 1.6714,
"step": 768
},
{
"epoch": 0.5109199205823958,
"grad_norm": 1.4540220499038696,
"learning_rate": 8.892036656594898e-05,
"loss": 1.6276,
"step": 772
},
{
"epoch": 0.513567174056916,
"grad_norm": 1.3043605089187622,
"learning_rate": 8.87825608430099e-05,
"loss": 1.635,
"step": 776
},
{
"epoch": 0.5162144275314361,
"grad_norm": 1.3931020498275757,
"learning_rate": 8.864401160166624e-05,
"loss": 1.5822,
"step": 780
},
{
"epoch": 0.5188616810059563,
"grad_norm": 1.3738582134246826,
"learning_rate": 8.85047214981096e-05,
"loss": 1.694,
"step": 784
},
{
"epoch": 0.5215089344804765,
"grad_norm": 1.3968422412872314,
"learning_rate": 8.83646932027349e-05,
"loss": 1.6673,
"step": 788
},
{
"epoch": 0.5241561879549967,
"grad_norm": 1.4195423126220703,
"learning_rate": 8.822392940008937e-05,
"loss": 1.5422,
"step": 792
},
{
"epoch": 0.5268034414295168,
"grad_norm": 1.2660058736801147,
"learning_rate": 8.808243278882094e-05,
"loss": 1.4875,
"step": 796
},
{
"epoch": 0.5294506949040371,
"grad_norm": 1.3500608205795288,
"learning_rate": 8.794020608162656e-05,
"loss": 1.6946,
"step": 800
},
{
"epoch": 0.5320979483785573,
"grad_norm": 1.6274265050888062,
"learning_rate": 8.779725200520021e-05,
"loss": 1.6943,
"step": 804
},
{
"epoch": 0.5347452018530774,
"grad_norm": 1.2186963558197021,
"learning_rate": 8.765357330018056e-05,
"loss": 1.4563,
"step": 808
},
{
"epoch": 0.5373924553275976,
"grad_norm": 1.501142978668213,
"learning_rate": 8.750917272109848e-05,
"loss": 1.6729,
"step": 812
},
{
"epoch": 0.5400397088021178,
"grad_norm": 1.372517466545105,
"learning_rate": 8.736405303632427e-05,
"loss": 1.636,
"step": 816
},
{
"epoch": 0.542686962276638,
"grad_norm": 1.4448741674423218,
"learning_rate": 8.721821702801449e-05,
"loss": 1.6977,
"step": 820
},
{
"epoch": 0.5453342157511581,
"grad_norm": 1.4774208068847656,
"learning_rate": 8.707166749205866e-05,
"loss": 1.7892,
"step": 824
},
{
"epoch": 0.5479814692256784,
"grad_norm": 1.3137487173080444,
"learning_rate": 8.692440723802571e-05,
"loss": 1.5086,
"step": 828
},
{
"epoch": 0.5506287227001986,
"grad_norm": 1.4480420351028442,
"learning_rate": 8.677643908911007e-05,
"loss": 1.6694,
"step": 832
},
{
"epoch": 0.5532759761747187,
"grad_norm": 1.4660981893539429,
"learning_rate": 8.662776588207747e-05,
"loss": 1.632,
"step": 836
},
{
"epoch": 0.5559232296492389,
"grad_norm": 1.2639222145080566,
"learning_rate": 8.647839046721076e-05,
"loss": 1.5101,
"step": 840
},
{
"epoch": 0.5585704831237591,
"grad_norm": 1.3556458950042725,
"learning_rate": 8.632831570825508e-05,
"loss": 1.7912,
"step": 844
},
{
"epoch": 0.5612177365982793,
"grad_norm": 1.2261251211166382,
"learning_rate": 8.617754448236298e-05,
"loss": 1.6547,
"step": 848
},
{
"epoch": 0.5638649900727994,
"grad_norm": 1.2850754261016846,
"learning_rate": 8.602607968003935e-05,
"loss": 1.5365,
"step": 852
},
{
"epoch": 0.5665122435473197,
"grad_norm": 1.3346043825149536,
"learning_rate": 8.587392420508598e-05,
"loss": 1.6175,
"step": 856
},
{
"epoch": 0.5691594970218399,
"grad_norm": 1.5381152629852295,
"learning_rate": 8.572108097454578e-05,
"loss": 1.7967,
"step": 860
},
{
"epoch": 0.57180675049636,
"grad_norm": 1.2237263917922974,
"learning_rate": 8.556755291864701e-05,
"loss": 1.6057,
"step": 864
},
{
"epoch": 0.5744540039708802,
"grad_norm": 1.233619213104248,
"learning_rate": 8.541334298074701e-05,
"loss": 1.7107,
"step": 868
},
{
"epoch": 0.5771012574454004,
"grad_norm": 1.2423778772354126,
"learning_rate": 8.525845411727581e-05,
"loss": 1.4729,
"step": 872
},
{
"epoch": 0.5797485109199206,
"grad_norm": 7.3384480476379395,
"learning_rate": 8.51028892976794e-05,
"loss": 1.6363,
"step": 876
},
{
"epoch": 0.5823957643944407,
"grad_norm": 1.3198407888412476,
"learning_rate": 8.494665150436288e-05,
"loss": 1.646,
"step": 880
},
{
"epoch": 0.585043017868961,
"grad_norm": 1.172568678855896,
"learning_rate": 8.478974373263318e-05,
"loss": 1.4356,
"step": 884
},
{
"epoch": 0.5876902713434812,
"grad_norm": 1.4879450798034668,
"learning_rate": 8.463216899064179e-05,
"loss": 1.7847,
"step": 888
},
{
"epoch": 0.5903375248180013,
"grad_norm": 1.3998438119888306,
"learning_rate": 8.447393029932692e-05,
"loss": 1.7818,
"step": 892
},
{
"epoch": 0.5929847782925215,
"grad_norm": 1.3567726612091064,
"learning_rate": 8.431503069235565e-05,
"loss": 1.5539,
"step": 896
},
{
"epoch": 0.5956320317670417,
"grad_norm": 1.4983903169631958,
"learning_rate": 8.415547321606584e-05,
"loss": 1.6477,
"step": 900
},
{
"epoch": 0.5982792852415619,
"grad_norm": 1.2646454572677612,
"learning_rate": 8.399526092940768e-05,
"loss": 1.6087,
"step": 904
},
{
"epoch": 0.600926538716082,
"grad_norm": 1.4137752056121826,
"learning_rate": 8.38343969038849e-05,
"loss": 1.7626,
"step": 908
},
{
"epoch": 0.6035737921906023,
"grad_norm": 1.4016697406768799,
"learning_rate": 8.367288422349617e-05,
"loss": 1.6947,
"step": 912
},
{
"epoch": 0.6062210456651225,
"grad_norm": 1.331425666809082,
"learning_rate": 8.351072598467576e-05,
"loss": 1.6358,
"step": 916
},
{
"epoch": 0.6088682991396426,
"grad_norm": 1.2292309999465942,
"learning_rate": 8.334792529623419e-05,
"loss": 1.4613,
"step": 920
},
{
"epoch": 0.6115155526141628,
"grad_norm": 1.3756728172302246,
"learning_rate": 8.318448527929877e-05,
"loss": 1.5771,
"step": 924
},
{
"epoch": 0.614162806088683,
"grad_norm": 1.4124281406402588,
"learning_rate": 8.302040906725361e-05,
"loss": 1.7364,
"step": 928
},
{
"epoch": 0.6168100595632032,
"grad_norm": 1.298540472984314,
"learning_rate": 8.285569980567964e-05,
"loss": 1.6394,
"step": 932
},
{
"epoch": 0.6194573130377233,
"grad_norm": 1.3905584812164307,
"learning_rate": 8.269036065229427e-05,
"loss": 1.7034,
"step": 936
},
{
"epoch": 0.6221045665122436,
"grad_norm": 1.4072821140289307,
"learning_rate": 8.252439477689082e-05,
"loss": 1.6315,
"step": 940
},
{
"epoch": 0.6247518199867638,
"grad_norm": 1.239159345626831,
"learning_rate": 8.235780536127787e-05,
"loss": 1.5178,
"step": 944
},
{
"epoch": 0.6273990734612839,
"grad_norm": 1.3636091947555542,
"learning_rate": 8.21905955992181e-05,
"loss": 1.6564,
"step": 948
},
{
"epoch": 0.6300463269358041,
"grad_norm": 1.3506637811660767,
"learning_rate": 8.202276869636713e-05,
"loss": 1.646,
"step": 952
},
{
"epoch": 0.6326935804103243,
"grad_norm": 1.4368304014205933,
"learning_rate": 8.185432787021216e-05,
"loss": 1.5073,
"step": 956
},
{
"epoch": 0.6353408338848445,
"grad_norm": 1.3278450965881348,
"learning_rate": 8.168527635001015e-05,
"loss": 1.5203,
"step": 960
},
{
"epoch": 0.6379880873593646,
"grad_norm": 1.2450168132781982,
"learning_rate": 8.151561737672591e-05,
"loss": 1.7171,
"step": 964
},
{
"epoch": 0.6406353408338848,
"grad_norm": 1.2755018472671509,
"learning_rate": 8.134535420297008e-05,
"loss": 1.5675,
"step": 968
},
{
"epoch": 0.6432825943084051,
"grad_norm": 1.3066191673278809,
"learning_rate": 8.117449009293668e-05,
"loss": 1.6525,
"step": 972
},
{
"epoch": 0.6459298477829252,
"grad_norm": 1.2875075340270996,
"learning_rate": 8.100302832234056e-05,
"loss": 1.6484,
"step": 976
},
{
"epoch": 0.6485771012574454,
"grad_norm": 1.5069595575332642,
"learning_rate": 8.083097217835461e-05,
"loss": 1.6251,
"step": 980
},
{
"epoch": 0.6512243547319656,
"grad_norm": 1.334075927734375,
"learning_rate": 8.065832495954668e-05,
"loss": 1.743,
"step": 984
},
{
"epoch": 0.6538716082064858,
"grad_norm": 1.3219469785690308,
"learning_rate": 8.048508997581647e-05,
"loss": 1.6345,
"step": 988
},
{
"epoch": 0.6565188616810059,
"grad_norm": 1.4275529384613037,
"learning_rate": 8.03112705483319e-05,
"loss": 1.7515,
"step": 992
},
{
"epoch": 0.6591661151555261,
"grad_norm": 1.349526286125183,
"learning_rate": 8.013687000946561e-05,
"loss": 1.5209,
"step": 996
},
{
"epoch": 0.6618133686300464,
"grad_norm": 1.3620506525039673,
"learning_rate": 7.996189170273096e-05,
"loss": 1.6789,
"step": 1000
},
{
"epoch": 0.6644606221045665,
"grad_norm": 1.2079874277114868,
"learning_rate": 7.978633898271795e-05,
"loss": 1.3453,
"step": 1004
},
{
"epoch": 0.6671078755790867,
"grad_norm": 1.3527398109436035,
"learning_rate": 7.961021521502895e-05,
"loss": 1.5927,
"step": 1008
},
{
"epoch": 0.6697551290536069,
"grad_norm": 1.3048250675201416,
"learning_rate": 7.943352377621414e-05,
"loss": 1.643,
"step": 1012
},
{
"epoch": 0.6724023825281271,
"grad_norm": 1.2111921310424805,
"learning_rate": 7.925626805370678e-05,
"loss": 1.4432,
"step": 1016
},
{
"epoch": 0.6750496360026472,
"grad_norm": 1.3531336784362793,
"learning_rate": 7.907845144575829e-05,
"loss": 1.6235,
"step": 1020
},
{
"epoch": 0.6776968894771674,
"grad_norm": 1.204720139503479,
"learning_rate": 7.890007736137307e-05,
"loss": 1.5377,
"step": 1024
},
{
"epoch": 0.6803441429516877,
"grad_norm": 1.3632683753967285,
"learning_rate": 7.872114922024313e-05,
"loss": 1.5758,
"step": 1028
},
{
"epoch": 0.6829913964262078,
"grad_norm": 1.4058332443237305,
"learning_rate": 7.854167045268264e-05,
"loss": 1.4645,
"step": 1032
},
{
"epoch": 0.685638649900728,
"grad_norm": 1.2490967512130737,
"learning_rate": 7.836164449956199e-05,
"loss": 1.5723,
"step": 1036
},
{
"epoch": 0.6882859033752482,
"grad_norm": 1.3228312730789185,
"learning_rate": 7.818107481224198e-05,
"loss": 1.466,
"step": 1040
},
{
"epoch": 0.6909331568497684,
"grad_norm": 1.3664582967758179,
"learning_rate": 7.799996485250755e-05,
"loss": 1.4823,
"step": 1044
},
{
"epoch": 0.6935804103242885,
"grad_norm": 1.1946579217910767,
"learning_rate": 7.781831809250151e-05,
"loss": 1.6093,
"step": 1048
},
{
"epoch": 0.6962276637988087,
"grad_norm": 1.3534433841705322,
"learning_rate": 7.763613801465786e-05,
"loss": 1.5823,
"step": 1052
},
{
"epoch": 0.698874917273329,
"grad_norm": 1.275877833366394,
"learning_rate": 7.745342811163507e-05,
"loss": 1.508,
"step": 1056
},
{
"epoch": 0.7015221707478491,
"grad_norm": 1.2870965003967285,
"learning_rate": 7.727019188624922e-05,
"loss": 1.6452,
"step": 1060
},
{
"epoch": 0.7041694242223693,
"grad_norm": 1.2805050611495972,
"learning_rate": 7.708643285140667e-05,
"loss": 1.7463,
"step": 1064
},
{
"epoch": 0.7068166776968895,
"grad_norm": 1.331794261932373,
"learning_rate": 7.690215453003684e-05,
"loss": 1.4428,
"step": 1068
},
{
"epoch": 0.7094639311714097,
"grad_norm": 1.3701887130737305,
"learning_rate": 7.671736045502462e-05,
"loss": 1.6868,
"step": 1072
},
{
"epoch": 0.7121111846459298,
"grad_norm": 1.3474302291870117,
"learning_rate": 7.653205416914267e-05,
"loss": 1.4919,
"step": 1076
},
{
"epoch": 0.71475843812045,
"grad_norm": 1.6028352975845337,
"learning_rate": 7.634623922498348e-05,
"loss": 1.5958,
"step": 1080
},
{
"epoch": 0.7174056915949703,
"grad_norm": 1.2263597249984741,
"learning_rate": 7.615991918489125e-05,
"loss": 1.7238,
"step": 1084
},
{
"epoch": 0.7200529450694904,
"grad_norm": 1.4178084135055542,
"learning_rate": 7.597309762089359e-05,
"loss": 1.48,
"step": 1088
},
{
"epoch": 0.7227001985440106,
"grad_norm": 1.3942856788635254,
"learning_rate": 7.57857781146331e-05,
"loss": 1.5336,
"step": 1092
},
{
"epoch": 0.7253474520185308,
"grad_norm": 1.2155961990356445,
"learning_rate": 7.559796425729863e-05,
"loss": 1.4977,
"step": 1096
},
{
"epoch": 0.727994705493051,
"grad_norm": 1.3590655326843262,
"learning_rate": 7.540965964955649e-05,
"loss": 1.6736,
"step": 1100
},
{
"epoch": 0.7306419589675711,
"grad_norm": 1.1585520505905151,
"learning_rate": 7.522086790148133e-05,
"loss": 1.6883,
"step": 1104
},
{
"epoch": 0.7332892124420913,
"grad_norm": 1.2694188356399536,
"learning_rate": 7.503159263248709e-05,
"loss": 1.657,
"step": 1108
},
{
"epoch": 0.7359364659166115,
"grad_norm": 1.2413800954818726,
"learning_rate": 7.484183747125742e-05,
"loss": 1.4757,
"step": 1112
},
{
"epoch": 0.7385837193911317,
"grad_norm": 1.1527191400527954,
"learning_rate": 7.46516060556763e-05,
"loss": 1.5628,
"step": 1116
},
{
"epoch": 0.7412309728656519,
"grad_norm": 1.5187007188796997,
"learning_rate": 7.446090203275809e-05,
"loss": 1.6387,
"step": 1120
},
{
"epoch": 0.7438782263401721,
"grad_norm": 1.3278498649597168,
"learning_rate": 7.426972905857781e-05,
"loss": 1.5212,
"step": 1124
},
{
"epoch": 0.7465254798146923,
"grad_norm": 1.4994242191314697,
"learning_rate": 7.407809079820094e-05,
"loss": 1.7582,
"step": 1128
},
{
"epoch": 0.7491727332892124,
"grad_norm": 1.2623709440231323,
"learning_rate": 7.388599092561315e-05,
"loss": 1.6223,
"step": 1132
},
{
"epoch": 0.7518199867637326,
"grad_norm": 1.3785511255264282,
"learning_rate": 7.369343312364993e-05,
"loss": 1.5051,
"step": 1136
},
{
"epoch": 0.7544672402382528,
"grad_norm": 1.2472020387649536,
"learning_rate": 7.350042108392594e-05,
"loss": 1.419,
"step": 1140
},
{
"epoch": 0.757114493712773,
"grad_norm": 1.6892167329788208,
"learning_rate": 7.330695850676421e-05,
"loss": 1.5718,
"step": 1144
},
{
"epoch": 0.7597617471872932,
"grad_norm": 1.4521297216415405,
"learning_rate": 7.311304910112525e-05,
"loss": 1.6383,
"step": 1148
},
{
"epoch": 0.7624090006618134,
"grad_norm": 1.450149655342102,
"learning_rate": 7.291869658453594e-05,
"loss": 1.771,
"step": 1152
},
{
"epoch": 0.7650562541363336,
"grad_norm": 1.3068790435791016,
"learning_rate": 7.272390468301821e-05,
"loss": 1.6414,
"step": 1156
},
{
"epoch": 0.7677035076108537,
"grad_norm": 1.1887469291687012,
"learning_rate": 7.252867713101771e-05,
"loss": 1.3455,
"step": 1160
},
{
"epoch": 0.7703507610853739,
"grad_norm": 1.2392699718475342,
"learning_rate": 7.233301767133205e-05,
"loss": 1.5139,
"step": 1164
},
{
"epoch": 0.7729980145598941,
"grad_norm": 1.353925347328186,
"learning_rate": 7.213693005503924e-05,
"loss": 1.6324,
"step": 1168
},
{
"epoch": 0.7756452680344142,
"grad_norm": 1.2792888879776,
"learning_rate": 7.194041804142557e-05,
"loss": 1.69,
"step": 1172
},
{
"epoch": 0.7782925215089345,
"grad_norm": 1.1825402975082397,
"learning_rate": 7.174348539791375e-05,
"loss": 1.3613,
"step": 1176
},
{
"epoch": 0.7809397749834547,
"grad_norm": 1.2615066766738892,
"learning_rate": 7.154613589999054e-05,
"loss": 1.6972,
"step": 1180
},
{
"epoch": 0.7835870284579749,
"grad_norm": 1.239867091178894,
"learning_rate": 7.13483733311344e-05,
"loss": 1.403,
"step": 1184
},
{
"epoch": 0.786234281932495,
"grad_norm": 1.3656786680221558,
"learning_rate": 7.115020148274295e-05,
"loss": 1.6528,
"step": 1188
},
{
"epoch": 0.7888815354070152,
"grad_norm": 1.2590436935424805,
"learning_rate": 7.095162415406034e-05,
"loss": 1.5411,
"step": 1192
},
{
"epoch": 0.7915287888815354,
"grad_norm": 1.2784417867660522,
"learning_rate": 7.075264515210435e-05,
"loss": 1.5618,
"step": 1196
},
{
"epoch": 0.7941760423560555,
"grad_norm": 1.3260300159454346,
"learning_rate": 7.055326829159341e-05,
"loss": 1.5295,
"step": 1200
},
{
"epoch": 0.7968232958305758,
"grad_norm": 5.832207202911377,
"learning_rate": 7.03534973948735e-05,
"loss": 1.5864,
"step": 1204
},
{
"epoch": 0.799470549305096,
"grad_norm": 1.2828547954559326,
"learning_rate": 7.015333629184484e-05,
"loss": 1.5081,
"step": 1208
},
{
"epoch": 0.8021178027796162,
"grad_norm": 1.2997095584869385,
"learning_rate": 6.995278881988847e-05,
"loss": 1.5827,
"step": 1212
},
{
"epoch": 0.8047650562541363,
"grad_norm": 1.2829680442810059,
"learning_rate": 6.975185882379271e-05,
"loss": 1.4565,
"step": 1216
},
{
"epoch": 0.8074123097286565,
"grad_norm": 1.3034470081329346,
"learning_rate": 6.955055015567942e-05,
"loss": 1.4973,
"step": 1220
},
{
"epoch": 0.8100595632031767,
"grad_norm": 1.170404314994812,
"learning_rate": 6.934886667493012e-05,
"loss": 1.4518,
"step": 1224
},
{
"epoch": 0.8127068166776968,
"grad_norm": 1.2815779447555542,
"learning_rate": 6.914681224811208e-05,
"loss": 1.546,
"step": 1228
},
{
"epoch": 0.8153540701522171,
"grad_norm": 1.227200984954834,
"learning_rate": 6.894439074890414e-05,
"loss": 1.5478,
"step": 1232
},
{
"epoch": 0.8180013236267373,
"grad_norm": 1.2927132844924927,
"learning_rate": 6.874160605802244e-05,
"loss": 1.6184,
"step": 1236
},
{
"epoch": 0.8206485771012575,
"grad_norm": 1.2327131032943726,
"learning_rate": 6.853846206314605e-05,
"loss": 1.5553,
"step": 1240
},
{
"epoch": 0.8232958305757776,
"grad_norm": 1.1886876821517944,
"learning_rate": 6.833496265884241e-05,
"loss": 1.4956,
"step": 1244
},
{
"epoch": 0.8259430840502978,
"grad_norm": 1.4828628301620483,
"learning_rate": 6.813111174649269e-05,
"loss": 1.7339,
"step": 1248
},
{
"epoch": 0.828590337524818,
"grad_norm": 1.2269375324249268,
"learning_rate": 6.792691323421698e-05,
"loss": 1.5712,
"step": 1252
},
{
"epoch": 0.8312375909993381,
"grad_norm": 1.4898347854614258,
"learning_rate": 6.772237103679937e-05,
"loss": 1.6172,
"step": 1256
},
{
"epoch": 0.8338848444738584,
"grad_norm": 1.1373467445373535,
"learning_rate": 6.751748907561288e-05,
"loss": 1.3869,
"step": 1260
},
{
"epoch": 0.8365320979483786,
"grad_norm": 1.2607003450393677,
"learning_rate": 6.731227127854434e-05,
"loss": 1.5501,
"step": 1264
},
{
"epoch": 0.8391793514228988,
"grad_norm": 1.357080340385437,
"learning_rate": 6.710672157991899e-05,
"loss": 1.5804,
"step": 1268
},
{
"epoch": 0.8418266048974189,
"grad_norm": 1.300445318222046,
"learning_rate": 6.690084392042513e-05,
"loss": 1.4547,
"step": 1272
},
{
"epoch": 0.8444738583719391,
"grad_norm": 1.281031608581543,
"learning_rate": 6.669464224703861e-05,
"loss": 1.5843,
"step": 1276
},
{
"epoch": 0.8471211118464593,
"grad_norm": 1.2201812267303467,
"learning_rate": 6.648812051294697e-05,
"loss": 1.4422,
"step": 1280
},
{
"epoch": 0.8497683653209794,
"grad_norm": 1.2445136308670044,
"learning_rate": 6.628128267747391e-05,
"loss": 1.5826,
"step": 1284
},
{
"epoch": 0.8524156187954997,
"grad_norm": 1.383170247077942,
"learning_rate": 6.607413270600319e-05,
"loss": 1.6194,
"step": 1288
},
{
"epoch": 0.8550628722700199,
"grad_norm": 1.370076060295105,
"learning_rate": 6.586667456990267e-05,
"loss": 1.6408,
"step": 1292
},
{
"epoch": 0.8577101257445401,
"grad_norm": 1.293721318244934,
"learning_rate": 6.565891224644822e-05,
"loss": 1.5066,
"step": 1296
},
{
"epoch": 0.8603573792190602,
"grad_norm": 1.4381659030914307,
"learning_rate": 6.545084971874738e-05,
"loss": 1.5161,
"step": 1300
},
{
"epoch": 0.8630046326935804,
"grad_norm": 1.3525183200836182,
"learning_rate": 6.524249097566306e-05,
"loss": 1.6022,
"step": 1304
},
{
"epoch": 0.8656518861681006,
"grad_norm": 1.1742914915084839,
"learning_rate": 6.503384001173707e-05,
"loss": 1.3307,
"step": 1308
},
{
"epoch": 0.8682991396426207,
"grad_norm": 1.275770664215088,
"learning_rate": 6.48249008271135e-05,
"loss": 1.5092,
"step": 1312
},
{
"epoch": 0.870946393117141,
"grad_norm": 1.3267558813095093,
"learning_rate": 6.461567742746206e-05,
"loss": 1.6288,
"step": 1316
},
{
"epoch": 0.8735936465916612,
"grad_norm": 1.1977699995040894,
"learning_rate": 6.440617382390128e-05,
"loss": 1.5567,
"step": 1320
},
{
"epoch": 0.8762409000661814,
"grad_norm": 1.1399099826812744,
"learning_rate": 6.419639403292161e-05,
"loss": 1.5925,
"step": 1324
},
{
"epoch": 0.8788881535407015,
"grad_norm": 1.3445255756378174,
"learning_rate": 6.398634207630841e-05,
"loss": 1.5288,
"step": 1328
},
{
"epoch": 0.8815354070152217,
"grad_norm": 1.2953174114227295,
"learning_rate": 6.377602198106483e-05,
"loss": 1.5119,
"step": 1332
},
{
"epoch": 0.8841826604897419,
"grad_norm": 1.2466961145401,
"learning_rate": 6.356543777933468e-05,
"loss": 1.4559,
"step": 1336
},
{
"epoch": 0.886829913964262,
"grad_norm": 1.410008430480957,
"learning_rate": 6.335459350832504e-05,
"loss": 1.6239,
"step": 1340
},
{
"epoch": 0.8894771674387822,
"grad_norm": 1.2374393939971924,
"learning_rate": 6.314349321022893e-05,
"loss": 1.4162,
"step": 1344
},
{
"epoch": 0.8921244209133025,
"grad_norm": 1.3700758218765259,
"learning_rate": 6.293214093214775e-05,
"loss": 1.4784,
"step": 1348
},
{
"epoch": 0.8947716743878227,
"grad_norm": 1.345596432685852,
"learning_rate": 6.272054072601374e-05,
"loss": 1.5489,
"step": 1352
},
{
"epoch": 0.8974189278623428,
"grad_norm": 1.1666315793991089,
"learning_rate": 6.250869664851227e-05,
"loss": 1.3515,
"step": 1356
},
{
"epoch": 0.900066181336863,
"grad_norm": 1.2450063228607178,
"learning_rate": 6.229661276100412e-05,
"loss": 1.4763,
"step": 1360
},
{
"epoch": 0.9027134348113832,
"grad_norm": 1.1888995170593262,
"learning_rate": 6.208429312944754e-05,
"loss": 1.4322,
"step": 1364
},
{
"epoch": 0.9053606882859033,
"grad_norm": 1.3319921493530273,
"learning_rate": 6.187174182432033e-05,
"loss": 1.5044,
"step": 1368
},
{
"epoch": 0.9080079417604235,
"grad_norm": 1.2023800611495972,
"learning_rate": 6.165896292054187e-05,
"loss": 1.5033,
"step": 1372
},
{
"epoch": 0.9106551952349438,
"grad_norm": 1.3017017841339111,
"learning_rate": 6.14459604973949e-05,
"loss": 1.4683,
"step": 1376
},
{
"epoch": 0.913302448709464,
"grad_norm": 1.2657389640808105,
"learning_rate": 6.12327386384473e-05,
"loss": 1.5533,
"step": 1380
},
{
"epoch": 0.9159497021839841,
"grad_norm": 1.3227919340133667,
"learning_rate": 6.101930143147395e-05,
"loss": 1.5239,
"step": 1384
},
{
"epoch": 0.9185969556585043,
"grad_norm": 1.3174325227737427,
"learning_rate": 6.080565296837821e-05,
"loss": 1.5259,
"step": 1388
},
{
"epoch": 0.9212442091330245,
"grad_norm": 1.2424542903900146,
"learning_rate": 6.059179734511356e-05,
"loss": 1.3573,
"step": 1392
},
{
"epoch": 0.9238914626075446,
"grad_norm": 1.2109280824661255,
"learning_rate": 6.037773866160502e-05,
"loss": 1.3831,
"step": 1396
},
{
"epoch": 0.9265387160820648,
"grad_norm": 1.2729474306106567,
"learning_rate": 6.0163481021670575e-05,
"loss": 1.674,
"step": 1400
},
{
"epoch": 0.9291859695565851,
"grad_norm": 1.1736104488372803,
"learning_rate": 5.994902853294251e-05,
"loss": 1.4935,
"step": 1404
},
{
"epoch": 0.9318332230311053,
"grad_norm": 1.3021750450134277,
"learning_rate": 5.973438530678861e-05,
"loss": 1.6066,
"step": 1408
},
{
"epoch": 0.9344804765056254,
"grad_norm": 1.3625566959381104,
"learning_rate": 5.951955545823342e-05,
"loss": 1.629,
"step": 1412
},
{
"epoch": 0.9371277299801456,
"grad_norm": 1.1946239471435547,
"learning_rate": 5.930454310587929e-05,
"loss": 1.4444,
"step": 1416
},
{
"epoch": 0.9397749834546658,
"grad_norm": 1.4337393045425415,
"learning_rate": 5.9089352371827446e-05,
"loss": 1.6888,
"step": 1420
},
{
"epoch": 0.9424222369291859,
"grad_norm": 1.3422842025756836,
"learning_rate": 5.8873987381598924e-05,
"loss": 1.6227,
"step": 1424
},
{
"epoch": 0.9450694904037061,
"grad_norm": 1.2459781169891357,
"learning_rate": 5.865845226405553e-05,
"loss": 1.4704,
"step": 1428
},
{
"epoch": 0.9477167438782264,
"grad_norm": 1.5130873918533325,
"learning_rate": 5.844275115132064e-05,
"loss": 1.5029,
"step": 1432
},
{
"epoch": 0.9503639973527466,
"grad_norm": 1.127805471420288,
"learning_rate": 5.822688817870004e-05,
"loss": 1.5289,
"step": 1436
},
{
"epoch": 0.9530112508272667,
"grad_norm": 1.283653736114502,
"learning_rate": 5.801086748460255e-05,
"loss": 1.545,
"step": 1440
},
{
"epoch": 0.9556585043017869,
"grad_norm": 1.290038824081421,
"learning_rate": 5.7794693210460804e-05,
"loss": 1.5588,
"step": 1444
},
{
"epoch": 0.9583057577763071,
"grad_norm": 1.2246005535125732,
"learning_rate": 5.757836950065172e-05,
"loss": 1.4577,
"step": 1448
},
{
"epoch": 0.9609530112508272,
"grad_norm": 1.3036789894104004,
"learning_rate": 5.736190050241719e-05,
"loss": 1.6891,
"step": 1452
},
{
"epoch": 0.9636002647253474,
"grad_norm": 1.2149336338043213,
"learning_rate": 5.714529036578443e-05,
"loss": 1.4114,
"step": 1456
},
{
"epoch": 0.9662475181998676,
"grad_norm": 1.1539721488952637,
"learning_rate": 5.692854324348653e-05,
"loss": 1.5497,
"step": 1460
},
{
"epoch": 0.9688947716743879,
"grad_norm": 1.3573237657546997,
"learning_rate": 5.6711663290882776e-05,
"loss": 1.4812,
"step": 1464
},
{
"epoch": 0.971542025148908,
"grad_norm": 1.338049292564392,
"learning_rate": 5.649465466587902e-05,
"loss": 1.6043,
"step": 1468
},
{
"epoch": 0.9741892786234282,
"grad_norm": 1.3066737651824951,
"learning_rate": 5.627752152884794e-05,
"loss": 1.582,
"step": 1472
},
{
"epoch": 0.9768365320979484,
"grad_norm": 1.2373597621917725,
"learning_rate": 5.606026804254931e-05,
"loss": 1.4099,
"step": 1476
},
{
"epoch": 0.9794837855724685,
"grad_norm": 1.1805121898651123,
"learning_rate": 5.584289837205012e-05,
"loss": 1.3914,
"step": 1480
},
{
"epoch": 0.9821310390469887,
"grad_norm": 1.3286755084991455,
"learning_rate": 5.5625416684644874e-05,
"loss": 1.4803,
"step": 1484
},
{
"epoch": 0.984778292521509,
"grad_norm": 1.2538626194000244,
"learning_rate": 5.540782714977549e-05,
"loss": 1.5063,
"step": 1488
},
{
"epoch": 0.9874255459960292,
"grad_norm": 1.2164523601531982,
"learning_rate": 5.51901339389516e-05,
"loss": 1.3555,
"step": 1492
},
{
"epoch": 0.9900727994705493,
"grad_norm": 1.217489242553711,
"learning_rate": 5.4972341225670354e-05,
"loss": 1.4818,
"step": 1496
},
{
"epoch": 0.9927200529450695,
"grad_norm": 1.170462727546692,
"learning_rate": 5.4754453185336586e-05,
"loss": 1.5693,
"step": 1500
},
{
"epoch": 0.9953673064195897,
"grad_norm": 1.2590230703353882,
"learning_rate": 5.453647399518262e-05,
"loss": 1.3735,
"step": 1504
},
{
"epoch": 0.9980145598941098,
"grad_norm": 1.1807870864868164,
"learning_rate": 5.431840783418832e-05,
"loss": 1.3643,
"step": 1508
},
{
"epoch": 1.00066181336863,
"grad_norm": 1.1421568393707275,
"learning_rate": 5.410025888300087e-05,
"loss": 1.4336,
"step": 1512
},
{
"epoch": 1.0033090668431501,
"grad_norm": 1.161148190498352,
"learning_rate": 5.388203132385467e-05,
"loss": 1.2284,
"step": 1516
},
{
"epoch": 1.0059563203176705,
"grad_norm": 1.129975438117981,
"learning_rate": 5.366372934049114e-05,
"loss": 1.2385,
"step": 1520
},
{
"epoch": 1.0086035737921906,
"grad_norm": 1.0889602899551392,
"learning_rate": 5.3445357118078545e-05,
"loss": 1.0735,
"step": 1524
},
{
"epoch": 1.011250827266711,
"grad_norm": 1.2157572507858276,
"learning_rate": 5.322691884313172e-05,
"loss": 1.1803,
"step": 1528
},
{
"epoch": 1.013898080741231,
"grad_norm": 1.1153740882873535,
"learning_rate": 5.300841870343183e-05,
"loss": 1.0574,
"step": 1532
},
{
"epoch": 1.016545334215751,
"grad_norm": 1.1907968521118164,
"learning_rate": 5.2789860887946066e-05,
"loss": 1.0691,
"step": 1536
},
{
"epoch": 1.0191925876902714,
"grad_norm": 1.1797744035720825,
"learning_rate": 5.257124958674736e-05,
"loss": 1.1063,
"step": 1540
},
{
"epoch": 1.0218398411647915,
"grad_norm": 1.0647462606430054,
"learning_rate": 5.235258899093406e-05,
"loss": 1.0512,
"step": 1544
},
{
"epoch": 1.0244870946393116,
"grad_norm": 1.1768978834152222,
"learning_rate": 5.213388329254949e-05,
"loss": 1.197,
"step": 1548
},
{
"epoch": 1.027134348113832,
"grad_norm": 1.282067060470581,
"learning_rate": 5.191513668450178e-05,
"loss": 1.231,
"step": 1552
},
{
"epoch": 1.029781601588352,
"grad_norm": 1.3294609785079956,
"learning_rate": 5.1696353360483216e-05,
"loss": 1.2719,
"step": 1556
},
{
"epoch": 1.0324288550628722,
"grad_norm": 1.187889814376831,
"learning_rate": 5.1477537514890116e-05,
"loss": 1.2815,
"step": 1560
},
{
"epoch": 1.0350761085373925,
"grad_norm": 1.152590036392212,
"learning_rate": 5.125869334274219e-05,
"loss": 1.126,
"step": 1564
},
{
"epoch": 1.0377233620119126,
"grad_norm": 1.1706854104995728,
"learning_rate": 5.103982503960224e-05,
"loss": 1.22,
"step": 1568
},
{
"epoch": 1.0403706154864327,
"grad_norm": 1.1738533973693848,
"learning_rate": 5.082093680149571e-05,
"loss": 1.2386,
"step": 1572
},
{
"epoch": 1.043017868960953,
"grad_norm": 1.299540400505066,
"learning_rate": 5.060203282483022e-05,
"loss": 1.2308,
"step": 1576
},
{
"epoch": 1.0456651224354732,
"grad_norm": 1.1205031871795654,
"learning_rate": 5.038311730631509e-05,
"loss": 1.1254,
"step": 1580
},
{
"epoch": 1.0483123759099935,
"grad_norm": 1.1589237451553345,
"learning_rate": 5.016419444288096e-05,
"loss": 1.046,
"step": 1584
},
{
"epoch": 1.0509596293845136,
"grad_norm": 1.1844594478607178,
"learning_rate": 4.9945268431599245e-05,
"loss": 1.1835,
"step": 1588
},
{
"epoch": 1.0536068828590337,
"grad_norm": 1.319905400276184,
"learning_rate": 4.972634346960173e-05,
"loss": 1.2235,
"step": 1592
},
{
"epoch": 1.056254136333554,
"grad_norm": 1.1240413188934326,
"learning_rate": 4.950742375400007e-05,
"loss": 1.0733,
"step": 1596
},
{
"epoch": 1.0589013898080741,
"grad_norm": 1.27524995803833,
"learning_rate": 4.9288513481805374e-05,
"loss": 1.1595,
"step": 1600
},
{
"epoch": 1.0615486432825942,
"grad_norm": 1.2067784070968628,
"learning_rate": 4.906961684984767e-05,
"loss": 1.1771,
"step": 1604
},
{
"epoch": 1.0641958967571146,
"grad_norm": 1.154008150100708,
"learning_rate": 4.8850738054695486e-05,
"loss": 1.1934,
"step": 1608
},
{
"epoch": 1.0668431502316347,
"grad_norm": 1.1568691730499268,
"learning_rate": 4.863188129257539e-05,
"loss": 1.1032,
"step": 1612
},
{
"epoch": 1.0694904037061548,
"grad_norm": 1.1935631036758423,
"learning_rate": 4.8413050759291585e-05,
"loss": 1.1457,
"step": 1616
},
{
"epoch": 1.072137657180675,
"grad_norm": 1.1685223579406738,
"learning_rate": 4.8194250650145374e-05,
"loss": 1.0371,
"step": 1620
},
{
"epoch": 1.0747849106551952,
"grad_norm": 1.2918758392333984,
"learning_rate": 4.797548515985481e-05,
"loss": 1.1128,
"step": 1624
},
{
"epoch": 1.0774321641297153,
"grad_norm": 1.232910394668579,
"learning_rate": 4.775675848247427e-05,
"loss": 1.0407,
"step": 1628
},
{
"epoch": 1.0800794176042356,
"grad_norm": 1.27483069896698,
"learning_rate": 4.7538074811313975e-05,
"loss": 1.1523,
"step": 1632
},
{
"epoch": 1.0827266710787558,
"grad_norm": 1.2089005708694458,
"learning_rate": 4.731943833885973e-05,
"loss": 1.0901,
"step": 1636
},
{
"epoch": 1.0853739245532759,
"grad_norm": 1.272049069404602,
"learning_rate": 4.7100853256692406e-05,
"loss": 1.1968,
"step": 1640
},
{
"epoch": 1.0880211780277962,
"grad_norm": 1.1610321998596191,
"learning_rate": 4.6882323755407706e-05,
"loss": 1.0379,
"step": 1644
},
{
"epoch": 1.0906684315023163,
"grad_norm": 1.0861836671829224,
"learning_rate": 4.666385402453568e-05,
"loss": 1.1274,
"step": 1648
},
{
"epoch": 1.0933156849768366,
"grad_norm": 1.2042131423950195,
"learning_rate": 4.644544825246059e-05,
"loss": 1.1502,
"step": 1652
},
{
"epoch": 1.0959629384513567,
"grad_norm": 1.5976825952529907,
"learning_rate": 4.622711062634046e-05,
"loss": 1.1527,
"step": 1656
},
{
"epoch": 1.0986101919258768,
"grad_norm": 1.2653815746307373,
"learning_rate": 4.600884533202686e-05,
"loss": 1.0946,
"step": 1660
},
{
"epoch": 1.1012574454003972,
"grad_norm": 1.129782795906067,
"learning_rate": 4.579065655398465e-05,
"loss": 1.1376,
"step": 1664
},
{
"epoch": 1.1039046988749173,
"grad_norm": 1.0471429824829102,
"learning_rate": 4.5572548475211805e-05,
"loss": 1.1488,
"step": 1668
},
{
"epoch": 1.1065519523494374,
"grad_norm": 1.281714916229248,
"learning_rate": 4.535452527715911e-05,
"loss": 1.2245,
"step": 1672
},
{
"epoch": 1.1091992058239577,
"grad_norm": 1.1683017015457153,
"learning_rate": 4.5136591139650105e-05,
"loss": 1.1307,
"step": 1676
},
{
"epoch": 1.1118464592984778,
"grad_norm": 1.1847896575927734,
"learning_rate": 4.491875024080088e-05,
"loss": 1.0821,
"step": 1680
},
{
"epoch": 1.114493712772998,
"grad_norm": 1.196803331375122,
"learning_rate": 4.470100675694007e-05,
"loss": 1.0633,
"step": 1684
},
{
"epoch": 1.1171409662475182,
"grad_norm": 1.1869444847106934,
"learning_rate": 4.4483364862528646e-05,
"loss": 1.1864,
"step": 1688
},
{
"epoch": 1.1197882197220383,
"grad_norm": 1.221575140953064,
"learning_rate": 4.4265828730079987e-05,
"loss": 1.0547,
"step": 1692
},
{
"epoch": 1.1224354731965587,
"grad_norm": 1.164784550666809,
"learning_rate": 4.404840253007987e-05,
"loss": 1.1614,
"step": 1696
},
{
"epoch": 1.1250827266710788,
"grad_norm": 1.0524084568023682,
"learning_rate": 4.3831090430906484e-05,
"loss": 1.1285,
"step": 1700
},
{
"epoch": 1.1277299801455989,
"grad_norm": 1.6504064798355103,
"learning_rate": 4.361389659875058e-05,
"loss": 1.1689,
"step": 1704
},
{
"epoch": 1.1303772336201192,
"grad_norm": 1.1136175394058228,
"learning_rate": 4.339682519753551e-05,
"loss": 1.0815,
"step": 1708
},
{
"epoch": 1.1330244870946393,
"grad_norm": 1.1745281219482422,
"learning_rate": 4.3179880388837496e-05,
"loss": 1.1722,
"step": 1712
},
{
"epoch": 1.1356717405691594,
"grad_norm": 1.0880483388900757,
"learning_rate": 4.2963066331805725e-05,
"loss": 1.0361,
"step": 1716
},
{
"epoch": 1.1383189940436798,
"grad_norm": 1.137968897819519,
"learning_rate": 4.2746387183082755e-05,
"loss": 1.1,
"step": 1720
},
{
"epoch": 1.1409662475181999,
"grad_norm": 1.2682772874832153,
"learning_rate": 4.252984709672473e-05,
"loss": 1.134,
"step": 1724
},
{
"epoch": 1.14361350099272,
"grad_norm": 1.128180742263794,
"learning_rate": 4.231345022412174e-05,
"loss": 1.0812,
"step": 1728
},
{
"epoch": 1.1462607544672403,
"grad_norm": 1.0430972576141357,
"learning_rate": 4.2097200713918264e-05,
"loss": 1.034,
"step": 1732
},
{
"epoch": 1.1489080079417604,
"grad_norm": 1.1832259893417358,
"learning_rate": 4.188110271193371e-05,
"loss": 1.1422,
"step": 1736
},
{
"epoch": 1.1515552614162807,
"grad_norm": 1.1320624351501465,
"learning_rate": 4.1665160361082704e-05,
"loss": 1.0688,
"step": 1740
},
{
"epoch": 1.1542025148908008,
"grad_norm": 1.2752870321273804,
"learning_rate": 4.144937780129594e-05,
"loss": 1.1926,
"step": 1744
},
{
"epoch": 1.156849768365321,
"grad_norm": 1.2092264890670776,
"learning_rate": 4.123375916944061e-05,
"loss": 1.0973,
"step": 1748
},
{
"epoch": 1.159497021839841,
"grad_norm": 1.1710125207901,
"learning_rate": 4.101830859924124e-05,
"loss": 1.2602,
"step": 1752
},
{
"epoch": 1.1621442753143614,
"grad_norm": 1.4670571088790894,
"learning_rate": 4.080303022120025e-05,
"loss": 1.2005,
"step": 1756
},
{
"epoch": 1.1647915287888815,
"grad_norm": 1.1942548751831055,
"learning_rate": 4.058792816251902e-05,
"loss": 1.2164,
"step": 1760
},
{
"epoch": 1.1674387822634018,
"grad_norm": 1.2230584621429443,
"learning_rate": 4.037300654701856e-05,
"loss": 1.0395,
"step": 1764
},
{
"epoch": 1.170086035737922,
"grad_norm": 1.3117454051971436,
"learning_rate": 4.015826949506049e-05,
"loss": 1.1848,
"step": 1768
},
{
"epoch": 1.172733289212442,
"grad_norm": 1.2102235555648804,
"learning_rate": 3.994372112346812e-05,
"loss": 1.1349,
"step": 1772
},
{
"epoch": 1.1753805426869623,
"grad_norm": 1.3425853252410889,
"learning_rate": 3.9729365545447514e-05,
"loss": 1.1756,
"step": 1776
},
{
"epoch": 1.1780277961614825,
"grad_norm": 1.1865317821502686,
"learning_rate": 3.9515206870508534e-05,
"loss": 1.1298,
"step": 1780
},
{
"epoch": 1.1806750496360026,
"grad_norm": 1.0945122241973877,
"learning_rate": 3.930124920438616e-05,
"loss": 1.1275,
"step": 1784
},
{
"epoch": 1.1833223031105229,
"grad_norm": 1.2114017009735107,
"learning_rate": 3.908749664896171e-05,
"loss": 1.1958,
"step": 1788
},
{
"epoch": 1.185969556585043,
"grad_norm": 1.1771973371505737,
"learning_rate": 3.887395330218429e-05,
"loss": 1.0868,
"step": 1792
},
{
"epoch": 1.188616810059563,
"grad_norm": 1.2639689445495605,
"learning_rate": 3.866062325799209e-05,
"loss": 1.213,
"step": 1796
},
{
"epoch": 1.1912640635340834,
"grad_norm": 1.1774057149887085,
"learning_rate": 3.844751060623404e-05,
"loss": 1.0974,
"step": 1800
},
{
"epoch": 1.1939113170086035,
"grad_norm": 1.1269370317459106,
"learning_rate": 3.823461943259132e-05,
"loss": 1.1296,
"step": 1804
},
{
"epoch": 1.1965585704831239,
"grad_norm": 1.2880319356918335,
"learning_rate": 3.802195381849901e-05,
"loss": 1.1121,
"step": 1808
},
{
"epoch": 1.199205823957644,
"grad_norm": 1.1425657272338867,
"learning_rate": 3.7809517841067976e-05,
"loss": 1.0818,
"step": 1812
},
{
"epoch": 1.201853077432164,
"grad_norm": 1.1727538108825684,
"learning_rate": 3.759731557300652e-05,
"loss": 1.025,
"step": 1816
},
{
"epoch": 1.2045003309066844,
"grad_norm": 1.2917152643203735,
"learning_rate": 3.738535108254246e-05,
"loss": 1.21,
"step": 1820
},
{
"epoch": 1.2071475843812045,
"grad_norm": 1.1989338397979736,
"learning_rate": 3.7173628433345006e-05,
"loss": 1.1712,
"step": 1824
},
{
"epoch": 1.2097948378557246,
"grad_norm": 1.2029826641082764,
"learning_rate": 3.696215168444699e-05,
"loss": 1.1146,
"step": 1828
},
{
"epoch": 1.212442091330245,
"grad_norm": 1.173412561416626,
"learning_rate": 3.675092489016693e-05,
"loss": 1.1237,
"step": 1832
},
{
"epoch": 1.215089344804765,
"grad_norm": 1.250653862953186,
"learning_rate": 3.6539952100031326e-05,
"loss": 1.1326,
"step": 1836
},
{
"epoch": 1.2177365982792852,
"grad_norm": 1.1222728490829468,
"learning_rate": 3.632923735869711e-05,
"loss": 1.1575,
"step": 1840
},
{
"epoch": 1.2203838517538055,
"grad_norm": 1.098129153251648,
"learning_rate": 3.611878470587402e-05,
"loss": 1.1357,
"step": 1844
},
{
"epoch": 1.2230311052283256,
"grad_norm": 1.2261312007904053,
"learning_rate": 3.5908598176247124e-05,
"loss": 1.075,
"step": 1848
},
{
"epoch": 1.225678358702846,
"grad_norm": 1.145168423652649,
"learning_rate": 3.569868179939958e-05,
"loss": 1.1333,
"step": 1852
},
{
"epoch": 1.228325612177366,
"grad_norm": 1.1339921951293945,
"learning_rate": 3.5489039599735294e-05,
"loss": 1.0158,
"step": 1856
},
{
"epoch": 1.2309728656518861,
"grad_norm": 1.2139281034469604,
"learning_rate": 3.5279675596401846e-05,
"loss": 1.1726,
"step": 1860
},
{
"epoch": 1.2336201191264062,
"grad_norm": 1.2778246402740479,
"learning_rate": 3.5070593803213267e-05,
"loss": 1.182,
"step": 1864
},
{
"epoch": 1.2362673726009266,
"grad_norm": 1.2227150201797485,
"learning_rate": 3.4861798228573325e-05,
"loss": 1.0037,
"step": 1868
},
{
"epoch": 1.2389146260754467,
"grad_norm": 1.2715504169464111,
"learning_rate": 3.465329287539852e-05,
"loss": 1.21,
"step": 1872
},
{
"epoch": 1.241561879549967,
"grad_norm": 1.300766944885254,
"learning_rate": 3.444508174104136e-05,
"loss": 1.1,
"step": 1876
},
{
"epoch": 1.244209133024487,
"grad_norm": 1.1540982723236084,
"learning_rate": 3.423716881721375e-05,
"loss": 1.1127,
"step": 1880
},
{
"epoch": 1.2468563864990072,
"grad_norm": 1.4233511686325073,
"learning_rate": 3.402955808991052e-05,
"loss": 1.1692,
"step": 1884
},
{
"epoch": 1.2495036399735275,
"grad_norm": 1.2163995504379272,
"learning_rate": 3.382225353933288e-05,
"loss": 1.0856,
"step": 1888
},
{
"epoch": 1.2521508934480476,
"grad_norm": 1.2361574172973633,
"learning_rate": 3.3615259139812225e-05,
"loss": 1.2024,
"step": 1892
},
{
"epoch": 1.254798146922568,
"grad_norm": 1.0741496086120605,
"learning_rate": 3.340857885973388e-05,
"loss": 1.0447,
"step": 1896
},
{
"epoch": 1.257445400397088,
"grad_norm": 1.1579320430755615,
"learning_rate": 3.320221666146107e-05,
"loss": 1.0772,
"step": 1900
},
{
"epoch": 1.2600926538716082,
"grad_norm": 1.2062878608703613,
"learning_rate": 3.299617650125889e-05,
"loss": 1.1011,
"step": 1904
},
{
"epoch": 1.2627399073461283,
"grad_norm": 1.2862952947616577,
"learning_rate": 3.279046232921852e-05,
"loss": 1.2596,
"step": 1908
},
{
"epoch": 1.2653871608206486,
"grad_norm": 1.2335329055786133,
"learning_rate": 3.2585078089181464e-05,
"loss": 1.2462,
"step": 1912
},
{
"epoch": 1.2680344142951687,
"grad_norm": 1.0968290567398071,
"learning_rate": 3.238002771866391e-05,
"loss": 1.0543,
"step": 1916
},
{
"epoch": 1.270681667769689,
"grad_norm": 1.06516695022583,
"learning_rate": 3.217531514878136e-05,
"loss": 1.1669,
"step": 1920
},
{
"epoch": 1.2733289212442092,
"grad_norm": 1.1616246700286865,
"learning_rate": 3.1970944304173126e-05,
"loss": 1.2252,
"step": 1924
},
{
"epoch": 1.2759761747187293,
"grad_norm": 1.1696902513504028,
"learning_rate": 3.176691910292715e-05,
"loss": 1.2329,
"step": 1928
},
{
"epoch": 1.2786234281932494,
"grad_norm": 1.210041880607605,
"learning_rate": 3.156324345650488e-05,
"loss": 1.3271,
"step": 1932
},
{
"epoch": 1.2812706816677697,
"grad_norm": 1.0774304866790771,
"learning_rate": 3.1359921269666324e-05,
"loss": 1.0306,
"step": 1936
},
{
"epoch": 1.2839179351422898,
"grad_norm": 1.166651725769043,
"learning_rate": 3.1156956440395136e-05,
"loss": 1.021,
"step": 1940
},
{
"epoch": 1.2865651886168101,
"grad_norm": 1.2745511531829834,
"learning_rate": 3.095435285982387e-05,
"loss": 1.1301,
"step": 1944
},
{
"epoch": 1.2892124420913302,
"grad_norm": 1.0762966871261597,
"learning_rate": 3.075211441215944e-05,
"loss": 1.0831,
"step": 1948
},
{
"epoch": 1.2918596955658503,
"grad_norm": 1.298743486404419,
"learning_rate": 3.055024497460867e-05,
"loss": 1.1705,
"step": 1952
},
{
"epoch": 1.2945069490403707,
"grad_norm": 1.243034839630127,
"learning_rate": 3.0348748417303823e-05,
"loss": 1.1282,
"step": 1956
},
{
"epoch": 1.2971542025148908,
"grad_norm": 1.2496618032455444,
"learning_rate": 3.0147628603228594e-05,
"loss": 1.0639,
"step": 1960
},
{
"epoch": 1.299801455989411,
"grad_norm": 1.141508936882019,
"learning_rate": 2.9946889388143913e-05,
"loss": 1.1297,
"step": 1964
},
{
"epoch": 1.3024487094639312,
"grad_norm": 1.188610553741455,
"learning_rate": 2.974653462051411e-05,
"loss": 1.1628,
"step": 1968
},
{
"epoch": 1.3050959629384513,
"grad_norm": 1.1807959079742432,
"learning_rate": 2.9546568141433006e-05,
"loss": 1.0527,
"step": 1972
},
{
"epoch": 1.3077432164129714,
"grad_norm": 1.1804313659667969,
"learning_rate": 2.9346993784550474e-05,
"loss": 1.196,
"step": 1976
},
{
"epoch": 1.3103904698874917,
"grad_norm": 1.1646931171417236,
"learning_rate": 2.9147815375998766e-05,
"loss": 1.0773,
"step": 1980
},
{
"epoch": 1.3130377233620119,
"grad_norm": 1.4130630493164062,
"learning_rate": 2.8949036734319247e-05,
"loss": 1.2183,
"step": 1984
},
{
"epoch": 1.3156849768365322,
"grad_norm": 1.1829743385314941,
"learning_rate": 2.8750661670389135e-05,
"loss": 1.1457,
"step": 1988
},
{
"epoch": 1.3183322303110523,
"grad_norm": 1.1480798721313477,
"learning_rate": 2.8552693987348532e-05,
"loss": 1.0502,
"step": 1992
},
{
"epoch": 1.3209794837855724,
"grad_norm": 1.1411528587341309,
"learning_rate": 2.835513748052738e-05,
"loss": 1.1938,
"step": 1996
},
{
"epoch": 1.3236267372600927,
"grad_norm": 1.1550084352493286,
"learning_rate": 2.815799593737285e-05,
"loss": 1.1577,
"step": 2000
},
{
"epoch": 1.3262739907346128,
"grad_norm": 1.1829745769500732,
"learning_rate": 2.7961273137376566e-05,
"loss": 1.097,
"step": 2004
},
{
"epoch": 1.3289212442091332,
"grad_norm": 1.229865312576294,
"learning_rate": 2.7764972852002323e-05,
"loss": 1.0721,
"step": 2008
},
{
"epoch": 1.3315684976836533,
"grad_norm": 1.1786168813705444,
"learning_rate": 2.7569098844613616e-05,
"loss": 1.094,
"step": 2012
},
{
"epoch": 1.3342157511581734,
"grad_norm": 1.4941198825836182,
"learning_rate": 2.7373654870401634e-05,
"loss": 1.2017,
"step": 2016
},
{
"epoch": 1.3368630046326935,
"grad_norm": 1.1714154481887817,
"learning_rate": 2.7178644676313143e-05,
"loss": 0.9992,
"step": 2020
},
{
"epoch": 1.3395102581072138,
"grad_norm": 1.2153651714324951,
"learning_rate": 2.698407200097872e-05,
"loss": 1.1801,
"step": 2024
},
{
"epoch": 1.342157511581734,
"grad_norm": 1.2198010683059692,
"learning_rate": 2.6789940574641102e-05,
"loss": 1.0585,
"step": 2028
},
{
"epoch": 1.3448047650562542,
"grad_norm": 1.2211023569107056,
"learning_rate": 2.6596254119083656e-05,
"loss": 1.111,
"step": 2032
},
{
"epoch": 1.3474520185307743,
"grad_norm": 1.2999107837677002,
"learning_rate": 2.6403016347558894e-05,
"loss": 1.1344,
"step": 2036
},
{
"epoch": 1.3500992720052944,
"grad_norm": 1.181583046913147,
"learning_rate": 2.6210230964717513e-05,
"loss": 1.0638,
"step": 2040
},
{
"epoch": 1.3527465254798146,
"grad_norm": 1.1883265972137451,
"learning_rate": 2.6017901666537216e-05,
"loss": 1.0218,
"step": 2044
},
{
"epoch": 1.3553937789543349,
"grad_norm": 1.2537999153137207,
"learning_rate": 2.5826032140251943e-05,
"loss": 1.0679,
"step": 2048
},
{
"epoch": 1.358041032428855,
"grad_norm": 1.1566420793533325,
"learning_rate": 2.563462606428101e-05,
"loss": 1.116,
"step": 2052
},
{
"epoch": 1.3606882859033753,
"grad_norm": 1.1046433448791504,
"learning_rate": 2.5443687108158836e-05,
"loss": 1.0058,
"step": 2056
},
{
"epoch": 1.3633355393778954,
"grad_norm": 1.307966709136963,
"learning_rate": 2.525321893246444e-05,
"loss": 1.2426,
"step": 2060
},
{
"epoch": 1.3659827928524155,
"grad_norm": 1.0436811447143555,
"learning_rate": 2.5063225188751273e-05,
"loss": 1.0737,
"step": 2064
},
{
"epoch": 1.3686300463269359,
"grad_norm": 1.0671106576919556,
"learning_rate": 2.4873709519477202e-05,
"loss": 1.083,
"step": 2068
},
{
"epoch": 1.371277299801456,
"grad_norm": 1.3584109544754028,
"learning_rate": 2.4684675557934767e-05,
"loss": 1.0333,
"step": 2072
},
{
"epoch": 1.3739245532759763,
"grad_norm": 1.180293321609497,
"learning_rate": 2.4496126928181467e-05,
"loss": 1.0714,
"step": 2076
},
{
"epoch": 1.3765718067504964,
"grad_norm": 1.102691888809204,
"learning_rate": 2.4308067244970228e-05,
"loss": 1.0386,
"step": 2080
},
{
"epoch": 1.3792190602250165,
"grad_norm": 1.156723976135254,
"learning_rate": 2.4120500113680177e-05,
"loss": 1.0593,
"step": 2084
},
{
"epoch": 1.3818663136995366,
"grad_norm": 1.2727686166763306,
"learning_rate": 2.3933429130247538e-05,
"loss": 1.2251,
"step": 2088
},
{
"epoch": 1.384513567174057,
"grad_norm": 1.213897466659546,
"learning_rate": 2.3746857881096584e-05,
"loss": 1.0509,
"step": 2092
},
{
"epoch": 1.387160820648577,
"grad_norm": 1.1525429487228394,
"learning_rate": 2.3560789943071033e-05,
"loss": 1.0187,
"step": 2096
},
{
"epoch": 1.3898080741230974,
"grad_norm": 1.1950461864471436,
"learning_rate": 2.3375228883365334e-05,
"loss": 1.0912,
"step": 2100
},
{
"epoch": 1.3924553275976175,
"grad_norm": 1.1531497240066528,
"learning_rate": 2.319017825945633e-05,
"loss": 1.128,
"step": 2104
},
{
"epoch": 1.3951025810721376,
"grad_norm": 1.2713518142700195,
"learning_rate": 2.300564161903511e-05,
"loss": 1.0656,
"step": 2108
},
{
"epoch": 1.397749834546658,
"grad_norm": 1.1415860652923584,
"learning_rate": 2.282162249993895e-05,
"loss": 1.1084,
"step": 2112
},
{
"epoch": 1.400397088021178,
"grad_norm": 1.114864468574524,
"learning_rate": 2.263812443008343e-05,
"loss": 1.0531,
"step": 2116
},
{
"epoch": 1.4030443414956983,
"grad_norm": 1.3787562847137451,
"learning_rate": 2.245515092739488e-05,
"loss": 1.072,
"step": 2120
},
{
"epoch": 1.4056915949702184,
"grad_norm": 1.014003872871399,
"learning_rate": 2.2272705499742925e-05,
"loss": 1.0156,
"step": 2124
},
{
"epoch": 1.4083388484447386,
"grad_norm": 1.1538441181182861,
"learning_rate": 2.209079164487323e-05,
"loss": 1.0101,
"step": 2128
},
{
"epoch": 1.4109861019192587,
"grad_norm": 1.2096091508865356,
"learning_rate": 2.1909412850340394e-05,
"loss": 1.0201,
"step": 2132
},
{
"epoch": 1.413633355393779,
"grad_norm": 1.1149653196334839,
"learning_rate": 2.1728572593441133e-05,
"loss": 1.1124,
"step": 2136
},
{
"epoch": 1.416280608868299,
"grad_norm": 1.3355867862701416,
"learning_rate": 2.154827434114765e-05,
"loss": 1.1943,
"step": 2140
},
{
"epoch": 1.4189278623428194,
"grad_norm": 1.2160899639129639,
"learning_rate": 2.1368521550041066e-05,
"loss": 1.1481,
"step": 2144
},
{
"epoch": 1.4215751158173395,
"grad_norm": 1.163010597229004,
"learning_rate": 2.1189317666245285e-05,
"loss": 1.0703,
"step": 2148
},
{
"epoch": 1.4242223692918596,
"grad_norm": 1.1877809762954712,
"learning_rate": 2.1010666125360767e-05,
"loss": 1.1211,
"step": 2152
},
{
"epoch": 1.42686962276638,
"grad_norm": 1.4443504810333252,
"learning_rate": 2.083257035239885e-05,
"loss": 1.2918,
"step": 2156
},
{
"epoch": 1.4295168762409,
"grad_norm": 1.2549368143081665,
"learning_rate": 2.0655033761715897e-05,
"loss": 1.1117,
"step": 2160
},
{
"epoch": 1.4321641297154202,
"grad_norm": 1.2271883487701416,
"learning_rate": 2.0478059756948002e-05,
"loss": 1.1452,
"step": 2164
},
{
"epoch": 1.4348113831899405,
"grad_norm": 1.2357865571975708,
"learning_rate": 2.0301651730945627e-05,
"loss": 1.0594,
"step": 2168
},
{
"epoch": 1.4374586366644606,
"grad_norm": 1.08621346950531,
"learning_rate": 2.0125813065708566e-05,
"loss": 1.0332,
"step": 2172
},
{
"epoch": 1.4401058901389807,
"grad_norm": 1.1553773880004883,
"learning_rate": 1.9950547132321183e-05,
"loss": 1.0823,
"step": 2176
},
{
"epoch": 1.442753143613501,
"grad_norm": 1.2597051858901978,
"learning_rate": 1.9775857290887757e-05,
"loss": 1.0197,
"step": 2180
},
{
"epoch": 1.4454003970880211,
"grad_norm": 1.2433415651321411,
"learning_rate": 1.9601746890467965e-05,
"loss": 1.0602,
"step": 2184
},
{
"epoch": 1.4480476505625415,
"grad_norm": 1.3405801057815552,
"learning_rate": 1.942821926901279e-05,
"loss": 1.1459,
"step": 2188
},
{
"epoch": 1.4506949040370616,
"grad_norm": 1.1183578968048096,
"learning_rate": 1.9255277753300487e-05,
"loss": 1.08,
"step": 2192
},
{
"epoch": 1.4533421575115817,
"grad_norm": 1.011930227279663,
"learning_rate": 1.9082925658872853e-05,
"loss": 1.0511,
"step": 2196
},
{
"epoch": 1.4559894109861018,
"grad_norm": 1.1752732992172241,
"learning_rate": 1.8911166289971545e-05,
"loss": 1.0437,
"step": 2200
},
{
"epoch": 1.4586366644606221,
"grad_norm": 1.1920056343078613,
"learning_rate": 1.8740002939474822e-05,
"loss": 1.0756,
"step": 2204
},
{
"epoch": 1.4612839179351422,
"grad_norm": 1.1798444986343384,
"learning_rate": 1.856943888883444e-05,
"loss": 1.0473,
"step": 2208
},
{
"epoch": 1.4639311714096626,
"grad_norm": 1.4702142477035522,
"learning_rate": 1.8399477408012643e-05,
"loss": 1.0968,
"step": 2212
},
{
"epoch": 1.4665784248841827,
"grad_norm": 1.2086206674575806,
"learning_rate": 1.82301217554196e-05,
"loss": 1.0752,
"step": 2216
},
{
"epoch": 1.4692256783587028,
"grad_norm": 1.2675915956497192,
"learning_rate": 1.8061375177850774e-05,
"loss": 1.1505,
"step": 2220
},
{
"epoch": 1.471872931833223,
"grad_norm": 1.1746258735656738,
"learning_rate": 1.7893240910424876e-05,
"loss": 1.0708,
"step": 2224
},
{
"epoch": 1.4745201853077432,
"grad_norm": 1.2071187496185303,
"learning_rate": 1.772572217652163e-05,
"loss": 1.085,
"step": 2228
},
{
"epoch": 1.4771674387822635,
"grad_norm": 1.321071743965149,
"learning_rate": 1.755882218772018e-05,
"loss": 1.1952,
"step": 2232
},
{
"epoch": 1.4798146922567836,
"grad_norm": 1.1357455253601074,
"learning_rate": 1.7392544143737355e-05,
"loss": 0.9572,
"step": 2236
},
{
"epoch": 1.4824619457313037,
"grad_norm": 1.1780970096588135,
"learning_rate": 1.7226891232366394e-05,
"loss": 0.9885,
"step": 2240
},
{
"epoch": 1.4851091992058238,
"grad_norm": 1.017472505569458,
"learning_rate": 1.7061866629415862e-05,
"loss": 1.0184,
"step": 2244
},
{
"epoch": 1.4877564526803442,
"grad_norm": 1.0961604118347168,
"learning_rate": 1.6897473498648765e-05,
"loss": 1.0232,
"step": 2248
},
{
"epoch": 1.4904037061548643,
"grad_norm": 1.187002182006836,
"learning_rate": 1.673371499172174e-05,
"loss": 0.9823,
"step": 2252
},
{
"epoch": 1.4930509596293846,
"grad_norm": 1.1367725133895874,
"learning_rate": 1.6570594248124875e-05,
"loss": 1.0288,
"step": 2256
},
{
"epoch": 1.4956982131039047,
"grad_norm": 1.129102110862732,
"learning_rate": 1.640811439512136e-05,
"loss": 1.0688,
"step": 2260
},
{
"epoch": 1.4983454665784248,
"grad_norm": 1.1886552572250366,
"learning_rate": 1.6246278547687604e-05,
"loss": 1.0209,
"step": 2264
},
{
"epoch": 1.500992720052945,
"grad_norm": 1.2786222696304321,
"learning_rate": 1.6085089808453408e-05,
"loss": 1.1101,
"step": 2268
},
{
"epoch": 1.5036399735274653,
"grad_norm": 1.2403247356414795,
"learning_rate": 1.592455126764264e-05,
"loss": 1.0919,
"step": 2272
},
{
"epoch": 1.5062872270019856,
"grad_norm": 1.1364173889160156,
"learning_rate": 1.5764666003013905e-05,
"loss": 1.0854,
"step": 2276
},
{
"epoch": 1.5089344804765057,
"grad_norm": 1.0539426803588867,
"learning_rate": 1.560543707980152e-05,
"loss": 1.014,
"step": 2280
},
{
"epoch": 1.5115817339510258,
"grad_norm": 1.2470543384552002,
"learning_rate": 1.544686755065677e-05,
"loss": 1.0845,
"step": 2284
},
{
"epoch": 1.514228987425546,
"grad_norm": 1.3111423254013062,
"learning_rate": 1.5288960455589447e-05,
"loss": 1.1363,
"step": 2288
},
{
"epoch": 1.5168762409000662,
"grad_norm": 1.076616883277893,
"learning_rate": 1.5131718821909435e-05,
"loss": 1.0104,
"step": 2292
},
{
"epoch": 1.5195234943745863,
"grad_norm": 1.082895040512085,
"learning_rate": 1.4975145664168839e-05,
"loss": 1.0468,
"step": 2296
},
{
"epoch": 1.5221707478491067,
"grad_norm": 1.2314468622207642,
"learning_rate": 1.4819243984104015e-05,
"loss": 1.0802,
"step": 2300
},
{
"epoch": 1.5248180013236268,
"grad_norm": 1.7986695766448975,
"learning_rate": 1.4664016770578182e-05,
"loss": 1.0324,
"step": 2304
},
{
"epoch": 1.5274652547981469,
"grad_norm": 1.2059293985366821,
"learning_rate": 1.4509466999523985e-05,
"loss": 1.0119,
"step": 2308
},
{
"epoch": 1.530112508272667,
"grad_norm": 1.1547520160675049,
"learning_rate": 1.4355597633886575e-05,
"loss": 1.0348,
"step": 2312
},
{
"epoch": 1.5327597617471873,
"grad_norm": 1.1303229331970215,
"learning_rate": 1.4202411623566685e-05,
"loss": 0.9453,
"step": 2316
},
{
"epoch": 1.5354070152217076,
"grad_norm": 1.3329232931137085,
"learning_rate": 1.4049911905364128e-05,
"loss": 1.1958,
"step": 2320
},
{
"epoch": 1.5380542686962277,
"grad_norm": 1.2855108976364136,
"learning_rate": 1.3898101402921516e-05,
"loss": 1.1197,
"step": 2324
},
{
"epoch": 1.5407015221707479,
"grad_norm": 1.1098500490188599,
"learning_rate": 1.3746983026668198e-05,
"loss": 1.0392,
"step": 2328
},
{
"epoch": 1.543348775645268,
"grad_norm": 1.232391119003296,
"learning_rate": 1.359655967376442e-05,
"loss": 1.0877,
"step": 2332
},
{
"epoch": 1.545996029119788,
"grad_norm": 1.2778176069259644,
"learning_rate": 1.3446834228045812e-05,
"loss": 1.0646,
"step": 2336
},
{
"epoch": 1.5486432825943084,
"grad_norm": 1.0760436058044434,
"learning_rate": 1.3297809559968133e-05,
"loss": 1.0476,
"step": 2340
},
{
"epoch": 1.5512905360688287,
"grad_norm": 1.0470277070999146,
"learning_rate": 1.3149488526552201e-05,
"loss": 0.9706,
"step": 2344
},
{
"epoch": 1.5539377895433488,
"grad_norm": 1.3804305791854858,
"learning_rate": 1.3001873971329121e-05,
"loss": 1.0437,
"step": 2348
},
{
"epoch": 1.556585043017869,
"grad_norm": 1.1428264379501343,
"learning_rate": 1.2854968724285754e-05,
"loss": 1.0923,
"step": 2352
},
{
"epoch": 1.559232296492389,
"grad_norm": 1.1798884868621826,
"learning_rate": 1.270877560181054e-05,
"loss": 1.1306,
"step": 2356
},
{
"epoch": 1.5618795499669094,
"grad_norm": 1.1382559537887573,
"learning_rate": 1.2563297406639395e-05,
"loss": 1.1029,
"step": 2360
},
{
"epoch": 1.5645268034414295,
"grad_norm": 1.0915166139602661,
"learning_rate": 1.2418536927802094e-05,
"loss": 0.9779,
"step": 2364
},
{
"epoch": 1.5671740569159498,
"grad_norm": 1.1595373153686523,
"learning_rate": 1.2274496940568664e-05,
"loss": 1.1744,
"step": 2368
},
{
"epoch": 1.56982131039047,
"grad_norm": 1.1752400398254395,
"learning_rate": 1.213118020639633e-05,
"loss": 1.0246,
"step": 2372
},
{
"epoch": 1.57246856386499,
"grad_norm": 1.064510464668274,
"learning_rate": 1.1988589472876438e-05,
"loss": 1.1571,
"step": 2376
},
{
"epoch": 1.5751158173395101,
"grad_norm": 1.2771798372268677,
"learning_rate": 1.184672747368189e-05,
"loss": 1.0656,
"step": 2380
},
{
"epoch": 1.5777630708140304,
"grad_norm": 1.2218413352966309,
"learning_rate": 1.1705596928514645e-05,
"loss": 1.0626,
"step": 2384
},
{
"epoch": 1.5804103242885508,
"grad_norm": 1.0653800964355469,
"learning_rate": 1.1565200543053623e-05,
"loss": 1.0626,
"step": 2388
},
{
"epoch": 1.5830575777630709,
"grad_norm": 1.2271225452423096,
"learning_rate": 1.1425541008902851e-05,
"loss": 1.1017,
"step": 2392
},
{
"epoch": 1.585704831237591,
"grad_norm": 1.1287221908569336,
"learning_rate": 1.128662100353985e-05,
"loss": 0.9612,
"step": 2396
},
{
"epoch": 1.588352084712111,
"grad_norm": 1.1722044944763184,
"learning_rate": 1.1148443190264246e-05,
"loss": 0.9906,
"step": 2400
},
{
"epoch": 1.5909993381866314,
"grad_norm": 1.3099933862686157,
"learning_rate": 1.1011010218146777e-05,
"loss": 1.0637,
"step": 2404
},
{
"epoch": 1.5936465916611515,
"grad_norm": 1.1737853288650513,
"learning_rate": 1.0874324721978501e-05,
"loss": 1.082,
"step": 2408
},
{
"epoch": 1.5962938451356719,
"grad_norm": 1.258298635482788,
"learning_rate": 1.0738389322220276e-05,
"loss": 1.0151,
"step": 2412
},
{
"epoch": 1.598941098610192,
"grad_norm": 1.198495864868164,
"learning_rate": 1.0603206624952482e-05,
"loss": 1.0566,
"step": 2416
},
{
"epoch": 1.601588352084712,
"grad_norm": 1.1976563930511475,
"learning_rate": 1.0468779221825103e-05,
"loss": 1.1149,
"step": 2420
},
{
"epoch": 1.6042356055592322,
"grad_norm": 1.0899832248687744,
"learning_rate": 1.0335109690008055e-05,
"loss": 1.0187,
"step": 2424
},
{
"epoch": 1.6068828590337525,
"grad_norm": 1.3058562278747559,
"learning_rate": 1.0202200592141703e-05,
"loss": 1.1494,
"step": 2428
},
{
"epoch": 1.6095301125082728,
"grad_norm": 1.304995059967041,
"learning_rate": 1.0070054476287849e-05,
"loss": 1.1067,
"step": 2432
},
{
"epoch": 1.612177365982793,
"grad_norm": 1.2065619230270386,
"learning_rate": 9.938673875880755e-06,
"loss": 1.03,
"step": 2436
},
{
"epoch": 1.614824619457313,
"grad_norm": 1.3018181324005127,
"learning_rate": 9.808061309678634e-06,
"loss": 1.1286,
"step": 2440
},
{
"epoch": 1.6174718729318331,
"grad_norm": 1.257094144821167,
"learning_rate": 9.678219281715412e-06,
"loss": 1.2452,
"step": 2444
},
{
"epoch": 1.6201191264063532,
"grad_norm": 1.1389868259429932,
"learning_rate": 9.549150281252633e-06,
"loss": 1.1589,
"step": 2448
},
{
"epoch": 1.6227663798808736,
"grad_norm": 1.2208179235458374,
"learning_rate": 9.420856782731774e-06,
"loss": 1.0969,
"step": 2452
},
{
"epoch": 1.625413633355394,
"grad_norm": 1.2272435426712036,
"learning_rate": 9.293341245726794e-06,
"loss": 0.9552,
"step": 2456
},
{
"epoch": 1.628060886829914,
"grad_norm": 1.1400785446166992,
"learning_rate": 9.16660611489702e-06,
"loss": 0.9583,
"step": 2460
},
{
"epoch": 1.6307081403044341,
"grad_norm": 1.1277272701263428,
"learning_rate": 9.040653819940259e-06,
"loss": 1.0511,
"step": 2464
},
{
"epoch": 1.6333553937789542,
"grad_norm": 1.1486189365386963,
"learning_rate": 8.915486775546173e-06,
"loss": 0.9686,
"step": 2468
},
{
"epoch": 1.6360026472534746,
"grad_norm": 1.1076239347457886,
"learning_rate": 8.791107381350027e-06,
"loss": 0.9773,
"step": 2472
},
{
"epoch": 1.6386499007279947,
"grad_norm": 1.0638751983642578,
"learning_rate": 8.6675180218867e-06,
"loss": 1.0176,
"step": 2476
},
{
"epoch": 1.641297154202515,
"grad_norm": 1.201035499572754,
"learning_rate": 8.544721066544964e-06,
"loss": 1.0009,
"step": 2480
},
{
"epoch": 1.643944407677035,
"grad_norm": 1.2673206329345703,
"learning_rate": 8.422718869522006e-06,
"loss": 1.1548,
"step": 2484
},
{
"epoch": 1.6465916611515552,
"grad_norm": 1.1903181076049805,
"learning_rate": 8.30151376977834e-06,
"loss": 1.0678,
"step": 2488
},
{
"epoch": 1.6492389146260753,
"grad_norm": 1.1597754955291748,
"learning_rate": 8.181108090993001e-06,
"loss": 1.0756,
"step": 2492
},
{
"epoch": 1.6518861681005956,
"grad_norm": 1.142747163772583,
"learning_rate": 8.061504141518888e-06,
"loss": 1.1026,
"step": 2496
},
{
"epoch": 1.654533421575116,
"grad_norm": 1.187888741493225,
"learning_rate": 7.942704214338648e-06,
"loss": 1.0138,
"step": 2500
},
{
"epoch": 1.657180675049636,
"grad_norm": 1.1005282402038574,
"learning_rate": 7.824710587020596e-06,
"loss": 1.015,
"step": 2504
},
{
"epoch": 1.6598279285241562,
"grad_norm": 1.2265509366989136,
"learning_rate": 7.707525521675097e-06,
"loss": 1.3109,
"step": 2508
},
{
"epoch": 1.6624751819986763,
"grad_norm": 1.1046435832977295,
"learning_rate": 7.591151264911239e-06,
"loss": 1.0726,
"step": 2512
},
{
"epoch": 1.6651224354731966,
"grad_norm": 1.1124870777130127,
"learning_rate": 7.475590047793712e-06,
"loss": 1.0319,
"step": 2516
},
{
"epoch": 1.6677696889477167,
"grad_norm": 1.0768115520477295,
"learning_rate": 7.360844085800023e-06,
"loss": 0.9718,
"step": 2520
},
{
"epoch": 1.670416942422237,
"grad_norm": 1.1033765077590942,
"learning_rate": 7.246915578778046e-06,
"loss": 0.9838,
"step": 2524
},
{
"epoch": 1.6730641958967571,
"grad_norm": 1.1125131845474243,
"learning_rate": 7.133806710903884e-06,
"loss": 0.9366,
"step": 2528
},
{
"epoch": 1.6757114493712773,
"grad_norm": 1.0644124746322632,
"learning_rate": 7.0215196506399515e-06,
"loss": 0.9442,
"step": 2532
},
{
"epoch": 1.6783587028457974,
"grad_norm": 1.4144614934921265,
"learning_rate": 6.910056550693356e-06,
"loss": 1.0511,
"step": 2536
},
{
"epoch": 1.6810059563203177,
"grad_norm": 1.1880645751953125,
"learning_rate": 6.799419547974739e-06,
"loss": 1.069,
"step": 2540
},
{
"epoch": 1.683653209794838,
"grad_norm": 1.2131253480911255,
"learning_rate": 6.6896107635572414e-06,
"loss": 1.11,
"step": 2544
},
{
"epoch": 1.6863004632693581,
"grad_norm": 1.1012145280838013,
"learning_rate": 6.580632302635831e-06,
"loss": 1.0216,
"step": 2548
},
{
"epoch": 1.6889477167438782,
"grad_norm": 1.4158655405044556,
"learning_rate": 6.472486254486954e-06,
"loss": 0.989,
"step": 2552
},
{
"epoch": 1.6915949702183983,
"grad_norm": 1.168895959854126,
"learning_rate": 6.36517469242851e-06,
"loss": 1.1558,
"step": 2556
},
{
"epoch": 1.6942422236929184,
"grad_norm": 1.180389642715454,
"learning_rate": 6.258699673780083e-06,
"loss": 1.0815,
"step": 2560
},
{
"epoch": 1.6968894771674388,
"grad_norm": 1.186112642288208,
"learning_rate": 6.15306323982347e-06,
"loss": 1.0766,
"step": 2564
},
{
"epoch": 1.699536730641959,
"grad_norm": 1.3972220420837402,
"learning_rate": 6.04826741576357e-06,
"loss": 0.933,
"step": 2568
},
{
"epoch": 1.7021839841164792,
"grad_norm": 1.0709800720214844,
"learning_rate": 5.944314210689611e-06,
"loss": 0.9295,
"step": 2572
},
{
"epoch": 1.7048312375909993,
"grad_norm": 1.131684422492981,
"learning_rate": 5.841205617536516e-06,
"loss": 1.0127,
"step": 2576
},
{
"epoch": 1.7074784910655194,
"grad_norm": 1.1289499998092651,
"learning_rate": 5.738943613046821e-06,
"loss": 1.0566,
"step": 2580
},
{
"epoch": 1.7101257445400397,
"grad_norm": 1.0850427150726318,
"learning_rate": 5.637530157732673e-06,
"loss": 0.929,
"step": 2584
},
{
"epoch": 1.7127729980145598,
"grad_norm": 1.3074991703033447,
"learning_rate": 5.536967195838333e-06,
"loss": 1.1549,
"step": 2588
},
{
"epoch": 1.7154202514890802,
"grad_norm": 1.286634922027588,
"learning_rate": 5.437256655302814e-06,
"loss": 1.0361,
"step": 2592
},
{
"epoch": 1.7180675049636003,
"grad_norm": 1.098363995552063,
"learning_rate": 5.338400447723008e-06,
"loss": 1.0157,
"step": 2596
},
{
"epoch": 1.7207147584381204,
"grad_norm": 1.2663805484771729,
"learning_rate": 5.240400468316975e-06,
"loss": 1.0805,
"step": 2600
},
{
"epoch": 1.7233620119126405,
"grad_norm": 1.2380725145339966,
"learning_rate": 5.143258595887607e-06,
"loss": 1.0504,
"step": 2604
},
{
"epoch": 1.7260092653871608,
"grad_norm": 1.534725546836853,
"learning_rate": 5.046976692786665e-06,
"loss": 1.0683,
"step": 2608
},
{
"epoch": 1.7286565188616811,
"grad_norm": 1.2903854846954346,
"learning_rate": 4.951556604879048e-06,
"loss": 1.1924,
"step": 2612
},
{
"epoch": 1.7313037723362013,
"grad_norm": 1.378965139389038,
"learning_rate": 4.857000161507353e-06,
"loss": 1.1261,
"step": 2616
},
{
"epoch": 1.7339510258107214,
"grad_norm": 1.3099424839019775,
"learning_rate": 4.763309175456876e-06,
"loss": 1.1385,
"step": 2620
},
{
"epoch": 1.7365982792852415,
"grad_norm": 1.1315497159957886,
"learning_rate": 4.67048544292083e-06,
"loss": 1.0022,
"step": 2624
},
{
"epoch": 1.7392455327597618,
"grad_norm": 1.0618172883987427,
"learning_rate": 4.5785307434659195e-06,
"loss": 0.933,
"step": 2628
},
{
"epoch": 1.741892786234282,
"grad_norm": 1.1535784006118774,
"learning_rate": 4.487446839998194e-06,
"loss": 1.0693,
"step": 2632
},
{
"epoch": 1.7445400397088022,
"grad_norm": 1.208883285522461,
"learning_rate": 4.397235478729262e-06,
"loss": 1.0487,
"step": 2636
},
{
"epoch": 1.7471872931833223,
"grad_norm": 1.079362392425537,
"learning_rate": 4.307898389142867e-06,
"loss": 1.0225,
"step": 2640
},
{
"epoch": 1.7498345466578424,
"grad_norm": 1.1642612218856812,
"learning_rate": 4.21943728396163e-06,
"loss": 1.0915,
"step": 2644
},
{
"epoch": 1.7524818001323625,
"grad_norm": 1.202144742012024,
"learning_rate": 4.1318538591143204e-06,
"loss": 0.9903,
"step": 2648
},
{
"epoch": 1.7551290536068829,
"grad_norm": 1.182325839996338,
"learning_rate": 4.045149793703257e-06,
"loss": 1.0321,
"step": 2652
},
{
"epoch": 1.7577763070814032,
"grad_norm": 1.1768420934677124,
"learning_rate": 3.959326749972159e-06,
"loss": 1.0065,
"step": 2656
},
{
"epoch": 1.7604235605559233,
"grad_norm": 1.1037213802337646,
"learning_rate": 3.8743863732742855e-06,
"loss": 1.0145,
"step": 2660
},
{
"epoch": 1.7630708140304434,
"grad_norm": 1.0442618131637573,
"learning_rate": 3.790330292040878e-06,
"loss": 0.9401,
"step": 2664
},
{
"epoch": 1.7657180675049635,
"grad_norm": 1.2205618619918823,
"learning_rate": 3.7071601177499193e-06,
"loss": 1.0445,
"step": 2668
},
{
"epoch": 1.7683653209794836,
"grad_norm": 0.982466995716095,
"learning_rate": 3.6248774448952695e-06,
"loss": 0.9302,
"step": 2672
},
{
"epoch": 1.771012574454004,
"grad_norm": 1.2503985166549683,
"learning_rate": 3.5434838509560974e-06,
"loss": 0.9465,
"step": 2676
},
{
"epoch": 1.7736598279285243,
"grad_norm": 1.2538197040557861,
"learning_rate": 3.4629808963666355e-06,
"loss": 1.1634,
"step": 2680
},
{
"epoch": 1.7763070814030444,
"grad_norm": 1.1053706407546997,
"learning_rate": 3.3833701244862347e-06,
"loss": 0.9964,
"step": 2684
},
{
"epoch": 1.7789543348775645,
"grad_norm": 1.2324868440628052,
"learning_rate": 3.304653061569807e-06,
"loss": 1.009,
"step": 2688
},
{
"epoch": 1.7816015883520846,
"grad_norm": 1.1064050197601318,
"learning_rate": 3.226831216738568e-06,
"loss": 0.9975,
"step": 2692
},
{
"epoch": 1.784248841826605,
"grad_norm": 1.1996777057647705,
"learning_rate": 3.149906081951076e-06,
"loss": 1.1181,
"step": 2696
},
{
"epoch": 1.786896095301125,
"grad_norm": 1.0701042413711548,
"learning_rate": 3.0738791319746606e-06,
"loss": 0.9735,
"step": 2700
},
{
"epoch": 1.7895433487756454,
"grad_norm": 1.426613211631775,
"learning_rate": 2.9987518243571266e-06,
"loss": 1.0882,
"step": 2704
},
{
"epoch": 1.7921906022501655,
"grad_norm": 1.1900283098220825,
"learning_rate": 2.924525599398831e-06,
"loss": 1.0896,
"step": 2708
},
{
"epoch": 1.7948378557246856,
"grad_norm": 1.203924536705017,
"learning_rate": 2.8512018801250428e-06,
"loss": 1.0041,
"step": 2712
},
{
"epoch": 1.7974851091992057,
"grad_norm": 1.1849395036697388,
"learning_rate": 2.7787820722586844e-06,
"loss": 1.018,
"step": 2716
},
{
"epoch": 1.800132362673726,
"grad_norm": 1.3121761083602905,
"learning_rate": 2.707267564193383e-06,
"loss": 1.0887,
"step": 2720
},
{
"epoch": 1.8027796161482463,
"grad_norm": 1.0863194465637207,
"learning_rate": 2.636659726966817e-06,
"loss": 0.9601,
"step": 2724
},
{
"epoch": 1.8054268696227664,
"grad_norm": 1.2052465677261353,
"learning_rate": 2.5669599142344958e-06,
"loss": 1.1252,
"step": 2728
},
{
"epoch": 1.8080741230972865,
"grad_norm": 1.2324072122573853,
"learning_rate": 2.4981694622437545e-06,
"loss": 1.0962,
"step": 2732
},
{
"epoch": 1.8107213765718067,
"grad_norm": 1.1981109380722046,
"learning_rate": 2.4302896898081516e-06,
"loss": 1.1382,
"step": 2736
},
{
"epoch": 1.813368630046327,
"grad_norm": 1.0790292024612427,
"learning_rate": 2.3633218982821724e-06,
"loss": 1.0246,
"step": 2740
},
{
"epoch": 1.816015883520847,
"grad_norm": 1.188328504562378,
"learning_rate": 2.2972673715363268e-06,
"loss": 1.1037,
"step": 2744
},
{
"epoch": 1.8186631369953674,
"grad_norm": 2.650550365447998,
"learning_rate": 2.232127375932491e-06,
"loss": 0.9985,
"step": 2748
},
{
"epoch": 1.8213103904698875,
"grad_norm": 1.209547758102417,
"learning_rate": 2.1679031602996168e-06,
"loss": 1.0379,
"step": 2752
},
{
"epoch": 1.8239576439444076,
"grad_norm": 1.2373130321502686,
"learning_rate": 2.104595955909844e-06,
"loss": 1.1138,
"step": 2756
},
{
"epoch": 1.8266048974189277,
"grad_norm": 1.1303315162658691,
"learning_rate": 2.042206976454869e-06,
"loss": 1.0872,
"step": 2760
},
{
"epoch": 1.829252150893448,
"grad_norm": 1.1631232500076294,
"learning_rate": 1.980737418022649e-06,
"loss": 0.9993,
"step": 2764
},
{
"epoch": 1.8318994043679684,
"grad_norm": 0.9920935034751892,
"learning_rate": 1.9201884590745122e-06,
"loss": 0.9902,
"step": 2768
},
{
"epoch": 1.8345466578424885,
"grad_norm": 1.1404036283493042,
"learning_rate": 1.8605612604225387e-06,
"loss": 0.9403,
"step": 2772
},
{
"epoch": 1.8371939113170086,
"grad_norm": 1.3009891510009766,
"learning_rate": 1.8018569652073381e-06,
"loss": 1.065,
"step": 2776
},
{
"epoch": 1.8398411647915287,
"grad_norm": 1.0856890678405762,
"learning_rate": 1.7440766988760793e-06,
"loss": 1.0082,
"step": 2780
},
{
"epoch": 1.8424884182660488,
"grad_norm": 1.2409597635269165,
"learning_rate": 1.6872215691609684e-06,
"loss": 1.2227,
"step": 2784
},
{
"epoch": 1.8451356717405691,
"grad_norm": 1.229095458984375,
"learning_rate": 1.631292666057982e-06,
"loss": 1.1196,
"step": 2788
},
{
"epoch": 1.8477829252150895,
"grad_norm": 1.1981017589569092,
"learning_rate": 1.5762910618059789e-06,
"loss": 1.1182,
"step": 2792
},
{
"epoch": 1.8504301786896096,
"grad_norm": 1.2496317625045776,
"learning_rate": 1.5222178108661444e-06,
"loss": 1.011,
"step": 2796
},
{
"epoch": 1.8530774321641297,
"grad_norm": 1.3405871391296387,
"learning_rate": 1.469073949901778e-06,
"loss": 0.9571,
"step": 2800
},
{
"epoch": 1.8557246856386498,
"grad_norm": 1.1392794847488403,
"learning_rate": 1.4168604977583989e-06,
"loss": 0.9235,
"step": 2804
},
{
"epoch": 1.8583719391131701,
"grad_norm": 1.3417925834655762,
"learning_rate": 1.3655784554442385e-06,
"loss": 0.9861,
"step": 2808
},
{
"epoch": 1.8610191925876902,
"grad_norm": 1.2177116870880127,
"learning_rate": 1.3152288061110518e-06,
"loss": 1.0414,
"step": 2812
},
{
"epoch": 1.8636664460622105,
"grad_norm": 1.18758225440979,
"learning_rate": 1.2658125150352361e-06,
"loss": 1.0958,
"step": 2816
},
{
"epoch": 1.8663136995367307,
"grad_norm": 1.068544864654541,
"learning_rate": 1.2173305295993477e-06,
"loss": 0.8817,
"step": 2820
},
{
"epoch": 1.8689609530112508,
"grad_norm": 1.0975282192230225,
"learning_rate": 1.169783779273953e-06,
"loss": 0.9843,
"step": 2824
},
{
"epoch": 1.8716082064857709,
"grad_norm": 1.1519986391067505,
"learning_rate": 1.1231731755997954e-06,
"loss": 1.1748,
"step": 2828
},
{
"epoch": 1.8742554599602912,
"grad_norm": 1.3243839740753174,
"learning_rate": 1.0774996121702908e-06,
"loss": 1.0024,
"step": 2832
},
{
"epoch": 1.8769027134348115,
"grad_norm": 1.1130131483078003,
"learning_rate": 1.0327639646144415e-06,
"loss": 0.9669,
"step": 2836
},
{
"epoch": 1.8795499669093316,
"grad_norm": 1.2060186862945557,
"learning_rate": 9.889670905800397e-07,
"loss": 0.9385,
"step": 2840
},
{
"epoch": 1.8821972203838517,
"grad_norm": 1.1549471616744995,
"learning_rate": 9.461098297172011e-07,
"loss": 0.9559,
"step": 2844
},
{
"epoch": 1.8848444738583718,
"grad_norm": 1.1581448316574097,
"learning_rate": 9.041930036622903e-07,
"loss": 1.069,
"step": 2848
},
{
"epoch": 1.8874917273328922,
"grad_norm": 1.1043188571929932,
"learning_rate": 8.632174160221496e-07,
"loss": 1.0042,
"step": 2852
},
{
"epoch": 1.8901389808074123,
"grad_norm": 1.1459840536117554,
"learning_rate": 8.231838523587277e-07,
"loss": 0.9267,
"step": 2856
},
{
"epoch": 1.8927862342819326,
"grad_norm": 1.2066096067428589,
"learning_rate": 7.840930801739754e-07,
"loss": 1.0465,
"step": 2860
},
{
"epoch": 1.8954334877564527,
"grad_norm": 1.2505649328231812,
"learning_rate": 7.459458488951632e-07,
"loss": 1.0685,
"step": 2864
},
{
"epoch": 1.8980807412309728,
"grad_norm": 1.138899564743042,
"learning_rate": 7.087428898604975e-07,
"loss": 1.0052,
"step": 2868
},
{
"epoch": 1.900727994705493,
"grad_norm": 1.1179523468017578,
"learning_rate": 6.724849163050995e-07,
"loss": 0.9854,
"step": 2872
},
{
"epoch": 1.9033752481800132,
"grad_norm": 1.3499395847320557,
"learning_rate": 6.37172623347354e-07,
"loss": 1.0413,
"step": 2876
},
{
"epoch": 1.9060225016545336,
"grad_norm": 1.0739634037017822,
"learning_rate": 6.02806687975549e-07,
"loss": 1.1554,
"step": 2880
},
{
"epoch": 1.9086697551290537,
"grad_norm": 1.0829598903656006,
"learning_rate": 5.693877690349292e-07,
"loss": 1.0416,
"step": 2884
},
{
"epoch": 1.9113170086035738,
"grad_norm": 1.0071786642074585,
"learning_rate": 5.369165072150239e-07,
"loss": 0.929,
"step": 2888
},
{
"epoch": 1.913964262078094,
"grad_norm": 1.1580030918121338,
"learning_rate": 5.053935250374176e-07,
"loss": 1.0629,
"step": 2892
},
{
"epoch": 1.916611515552614,
"grad_norm": 1.2572953701019287,
"learning_rate": 4.7481942684378113e-07,
"loss": 1.1105,
"step": 2896
},
{
"epoch": 1.9192587690271343,
"grad_norm": 1.1861546039581299,
"learning_rate": 4.451947987842764e-07,
"loss": 1.0511,
"step": 2900
},
{
"epoch": 1.9219060225016547,
"grad_norm": 1.1360516548156738,
"learning_rate": 4.165202088063425e-07,
"loss": 1.0623,
"step": 2904
},
{
"epoch": 1.9245532759761748,
"grad_norm": 1.1186720132827759,
"learning_rate": 3.8879620664381e-07,
"loss": 0.9999,
"step": 2908
},
{
"epoch": 1.9272005294506949,
"grad_norm": 1.2490679025650024,
"learning_rate": 3.620233238063375e-07,
"loss": 1.0442,
"step": 2912
},
{
"epoch": 1.929847782925215,
"grad_norm": 1.309167504310608,
"learning_rate": 3.362020735692417e-07,
"loss": 1.1706,
"step": 2916
},
{
"epoch": 1.9324950363997353,
"grad_norm": 1.1864930391311646,
"learning_rate": 3.1133295096364977e-07,
"loss": 1.0731,
"step": 2920
},
{
"epoch": 1.9351422898742554,
"grad_norm": 1.1746701002120972,
"learning_rate": 2.87416432767007e-07,
"loss": 1.0544,
"step": 2924
},
{
"epoch": 1.9377895433487757,
"grad_norm": 1.272407054901123,
"learning_rate": 2.644529774939397e-07,
"loss": 1.0909,
"step": 2928
},
{
"epoch": 1.9404367968232958,
"grad_norm": 1.1303869485855103,
"learning_rate": 2.4244302538746766e-07,
"loss": 0.9551,
"step": 2932
},
{
"epoch": 1.943084050297816,
"grad_norm": 1.0882586240768433,
"learning_rate": 2.2138699841056655e-07,
"loss": 0.9893,
"step": 2936
},
{
"epoch": 1.945731303772336,
"grad_norm": 1.2608906030654907,
"learning_rate": 2.012853002380466e-07,
"loss": 1.0569,
"step": 2940
},
{
"epoch": 1.9483785572468564,
"grad_norm": 1.2106075286865234,
"learning_rate": 1.8213831624887545e-07,
"loss": 0.9922,
"step": 2944
},
{
"epoch": 1.9510258107213767,
"grad_norm": 1.1815046072006226,
"learning_rate": 1.6394641351872297e-07,
"loss": 1.0113,
"step": 2948
},
{
"epoch": 1.9536730641958968,
"grad_norm": 1.1953189373016357,
"learning_rate": 1.4670994081297795e-07,
"loss": 1.0361,
"step": 2952
},
{
"epoch": 1.956320317670417,
"grad_norm": 1.0204826593399048,
"learning_rate": 1.3042922858002015e-07,
"loss": 0.9583,
"step": 2956
},
{
"epoch": 1.958967571144937,
"grad_norm": 1.1778640747070312,
"learning_rate": 1.1510458894490871e-07,
"loss": 1.0795,
"step": 2960
},
{
"epoch": 1.9616148246194574,
"grad_norm": 1.1050951480865479,
"learning_rate": 1.0073631570340358e-07,
"loss": 0.947,
"step": 2964
},
{
"epoch": 1.9642620780939775,
"grad_norm": 1.4342139959335327,
"learning_rate": 8.732468431630892e-08,
"loss": 0.9858,
"step": 2968
},
{
"epoch": 1.9669093315684978,
"grad_norm": 1.3275805711746216,
"learning_rate": 7.486995190420509e-08,
"loss": 1.0232,
"step": 2972
},
{
"epoch": 1.969556585043018,
"grad_norm": 1.862630844116211,
"learning_rate": 6.337235724254154e-08,
"loss": 1.1036,
"step": 2976
},
{
"epoch": 1.972203838517538,
"grad_norm": 1.1249923706054688,
"learning_rate": 5.2832120757007054e-08,
"loss": 1.1517,
"step": 2980
},
{
"epoch": 1.974851091992058,
"grad_norm": 1.4025081396102905,
"learning_rate": 4.324944451934987e-08,
"loss": 1.1827,
"step": 2984
},
{
"epoch": 1.9774983454665784,
"grad_norm": 1.2881486415863037,
"learning_rate": 3.4624512243497386e-08,
"loss": 0.9921,
"step": 2988
},
{
"epoch": 1.9801455989410988,
"grad_norm": 1.256659746170044,
"learning_rate": 2.6957489281997926e-08,
"loss": 1.0058,
"step": 2992
},
{
"epoch": 1.9827928524156189,
"grad_norm": 1.2083126306533813,
"learning_rate": 2.0248522622906552e-08,
"loss": 1.0364,
"step": 2996
},
{
"epoch": 1.985440105890139,
"grad_norm": 1.2300423383712769,
"learning_rate": 1.4497740886920685e-08,
"loss": 1.056,
"step": 3000
},
{
"epoch": 1.988087359364659,
"grad_norm": 1.1946439743041992,
"learning_rate": 9.70525432493763e-09,
"loss": 1.1,
"step": 3004
},
{
"epoch": 1.9907346128391792,
"grad_norm": 1.1879000663757324,
"learning_rate": 5.8711548159229305e-09,
"loss": 0.9764,
"step": 3008
},
{
"epoch": 1.9933818663136995,
"grad_norm": 1.7793687582015991,
"learning_rate": 2.9955158651839845e-09,
"loss": 1.0218,
"step": 3012
},
{
"epoch": 1.9960291197882198,
"grad_norm": 1.2599058151245117,
"learning_rate": 1.0783926029211966e-09,
"loss": 1.0414,
"step": 3016
},
{
"epoch": 1.99867637326274,
"grad_norm": 1.2598057985305786,
"learning_rate": 1.1982178318437066e-10,
"loss": 1.1198,
"step": 3020
}
],
"logging_steps": 4,
"max_steps": 3022,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 1511,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.115408240900833e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}