{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 3462,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002021952628538417,
"grad_norm": 9.595681190490723,
"learning_rate": 2.017291066282421e-06,
"loss": 7.4786,
"step": 7
},
{
"epoch": 0.004043905257076834,
"grad_norm": 7.749298095703125,
"learning_rate": 4.034582132564842e-06,
"loss": 7.261,
"step": 14
},
{
"epoch": 0.006065857885615251,
"grad_norm": 9.839412689208984,
"learning_rate": 6.0518731988472625e-06,
"loss": 6.189,
"step": 21
},
{
"epoch": 0.008087810514153668,
"grad_norm": 4.33282470703125,
"learning_rate": 8.069164265129683e-06,
"loss": 6.0148,
"step": 28
},
{
"epoch": 0.010109763142692086,
"grad_norm": 4.110238552093506,
"learning_rate": 1.0086455331412104e-05,
"loss": 5.4958,
"step": 35
},
{
"epoch": 0.012131715771230503,
"grad_norm": 3.388286828994751,
"learning_rate": 1.2103746397694525e-05,
"loss": 5.3635,
"step": 42
},
{
"epoch": 0.01415366839976892,
"grad_norm": 2.6382975578308105,
"learning_rate": 1.4121037463976946e-05,
"loss": 5.1959,
"step": 49
},
{
"epoch": 0.016175621028307337,
"grad_norm": 3.417970895767212,
"learning_rate": 1.6138328530259367e-05,
"loss": 5.1294,
"step": 56
},
{
"epoch": 0.018197573656845753,
"grad_norm": 2.9546399116516113,
"learning_rate": 1.8155619596541786e-05,
"loss": 4.8528,
"step": 63
},
{
"epoch": 0.020219526285384173,
"grad_norm": 2.8038337230682373,
"learning_rate": 2.017291066282421e-05,
"loss": 4.6832,
"step": 70
},
{
"epoch": 0.02224147891392259,
"grad_norm": 2.5608632564544678,
"learning_rate": 2.219020172910663e-05,
"loss": 4.6793,
"step": 77
},
{
"epoch": 0.024263431542461005,
"grad_norm": 2.646967649459839,
"learning_rate": 2.420749279538905e-05,
"loss": 4.4008,
"step": 84
},
{
"epoch": 0.02628538417099942,
"grad_norm": 2.6493546962738037,
"learning_rate": 2.622478386167147e-05,
"loss": 4.4752,
"step": 91
},
{
"epoch": 0.02830733679953784,
"grad_norm": 2.450045585632324,
"learning_rate": 2.824207492795389e-05,
"loss": 4.441,
"step": 98
},
{
"epoch": 0.030329289428076257,
"grad_norm": 2.8270249366760254,
"learning_rate": 3.025936599423631e-05,
"loss": 4.2923,
"step": 105
},
{
"epoch": 0.032351242056614674,
"grad_norm": 2.512385368347168,
"learning_rate": 3.227665706051873e-05,
"loss": 4.2352,
"step": 112
},
{
"epoch": 0.03437319468515309,
"grad_norm": 2.7446236610412598,
"learning_rate": 3.4293948126801156e-05,
"loss": 4.3132,
"step": 119
},
{
"epoch": 0.036395147313691506,
"grad_norm": 2.433756113052368,
"learning_rate": 3.631123919308357e-05,
"loss": 4.1372,
"step": 126
},
{
"epoch": 0.03841709994222992,
"grad_norm": 2.985642433166504,
"learning_rate": 3.8328530259365994e-05,
"loss": 4.1249,
"step": 133
},
{
"epoch": 0.040439052570768345,
"grad_norm": 2.4674811363220215,
"learning_rate": 4.034582132564842e-05,
"loss": 4.0113,
"step": 140
},
{
"epoch": 0.04246100519930676,
"grad_norm": 2.5470809936523438,
"learning_rate": 4.236311239193084e-05,
"loss": 4.0147,
"step": 147
},
{
"epoch": 0.04448295782784518,
"grad_norm": 2.6432108879089355,
"learning_rate": 4.438040345821326e-05,
"loss": 3.9437,
"step": 154
},
{
"epoch": 0.046504910456383594,
"grad_norm": 2.5995123386383057,
"learning_rate": 4.639769452449568e-05,
"loss": 3.8956,
"step": 161
},
{
"epoch": 0.04852686308492201,
"grad_norm": 2.4285008907318115,
"learning_rate": 4.84149855907781e-05,
"loss": 4.002,
"step": 168
},
{
"epoch": 0.05054881571346043,
"grad_norm": 4.111863613128662,
"learning_rate": 5.0432276657060516e-05,
"loss": 3.9789,
"step": 175
},
{
"epoch": 0.05257076834199884,
"grad_norm": 2.493316173553467,
"learning_rate": 5.244956772334294e-05,
"loss": 3.7673,
"step": 182
},
{
"epoch": 0.05459272097053726,
"grad_norm": 2.6891896724700928,
"learning_rate": 5.446685878962536e-05,
"loss": 3.8416,
"step": 189
},
{
"epoch": 0.05661467359907568,
"grad_norm": 2.5575196743011475,
"learning_rate": 5.648414985590778e-05,
"loss": 3.7602,
"step": 196
},
{
"epoch": 0.0586366262276141,
"grad_norm": 2.6597752571105957,
"learning_rate": 5.850144092219021e-05,
"loss": 3.8453,
"step": 203
},
{
"epoch": 0.060658578856152515,
"grad_norm": 2.5099666118621826,
"learning_rate": 6.051873198847262e-05,
"loss": 3.7611,
"step": 210
},
{
"epoch": 0.06268053148469092,
"grad_norm": 2.335249185562134,
"learning_rate": 6.253602305475504e-05,
"loss": 3.7128,
"step": 217
},
{
"epoch": 0.06470248411322935,
"grad_norm": 2.349083423614502,
"learning_rate": 6.455331412103747e-05,
"loss": 3.7442,
"step": 224
},
{
"epoch": 0.06672443674176777,
"grad_norm": 2.6178603172302246,
"learning_rate": 6.657060518731989e-05,
"loss": 3.7445,
"step": 231
},
{
"epoch": 0.06874638937030618,
"grad_norm": 2.6607229709625244,
"learning_rate": 6.858789625360231e-05,
"loss": 3.7259,
"step": 238
},
{
"epoch": 0.0707683419988446,
"grad_norm": 2.389965295791626,
"learning_rate": 7.060518731988472e-05,
"loss": 3.7101,
"step": 245
},
{
"epoch": 0.07279029462738301,
"grad_norm": 2.3201496601104736,
"learning_rate": 7.262247838616714e-05,
"loss": 3.5793,
"step": 252
},
{
"epoch": 0.07481224725592144,
"grad_norm": 2.3502607345581055,
"learning_rate": 7.463976945244957e-05,
"loss": 3.6208,
"step": 259
},
{
"epoch": 0.07683419988445984,
"grad_norm": 2.284522294998169,
"learning_rate": 7.665706051873199e-05,
"loss": 3.6786,
"step": 266
},
{
"epoch": 0.07885615251299827,
"grad_norm": 2.3930606842041016,
"learning_rate": 7.867435158501441e-05,
"loss": 3.6442,
"step": 273
},
{
"epoch": 0.08087810514153669,
"grad_norm": 2.294808864593506,
"learning_rate": 8.069164265129683e-05,
"loss": 3.6099,
"step": 280
},
{
"epoch": 0.0829000577700751,
"grad_norm": 2.481281280517578,
"learning_rate": 8.270893371757926e-05,
"loss": 3.534,
"step": 287
},
{
"epoch": 0.08492201039861352,
"grad_norm": 2.197254180908203,
"learning_rate": 8.472622478386168e-05,
"loss": 3.6141,
"step": 294
},
{
"epoch": 0.08694396302715193,
"grad_norm": 2.4509365558624268,
"learning_rate": 8.67435158501441e-05,
"loss": 3.4953,
"step": 301
},
{
"epoch": 0.08896591565569036,
"grad_norm": 2.4953505992889404,
"learning_rate": 8.876080691642652e-05,
"loss": 3.457,
"step": 308
},
{
"epoch": 0.09098786828422877,
"grad_norm": 2.2001383304595947,
"learning_rate": 9.077809798270895e-05,
"loss": 3.4942,
"step": 315
},
{
"epoch": 0.09300982091276719,
"grad_norm": 2.1801021099090576,
"learning_rate": 9.279538904899135e-05,
"loss": 3.3884,
"step": 322
},
{
"epoch": 0.0950317735413056,
"grad_norm": 2.2749996185302734,
"learning_rate": 9.481268011527378e-05,
"loss": 3.495,
"step": 329
},
{
"epoch": 0.09705372616984402,
"grad_norm": 2.1388254165649414,
"learning_rate": 9.68299711815562e-05,
"loss": 3.5859,
"step": 336
},
{
"epoch": 0.09907567879838244,
"grad_norm": 2.1429038047790527,
"learning_rate": 9.884726224783862e-05,
"loss": 3.4566,
"step": 343
},
{
"epoch": 0.10109763142692085,
"grad_norm": 2.1946280002593994,
"learning_rate": 9.999994866347054e-05,
"loss": 3.4641,
"step": 350
},
{
"epoch": 0.10311958405545928,
"grad_norm": 3.0027740001678467,
"learning_rate": 9.999942959510397e-05,
"loss": 3.3817,
"step": 357
},
{
"epoch": 0.10514153668399769,
"grad_norm": 2.3069963455200195,
"learning_rate": 9.999835153577435e-05,
"loss": 3.3968,
"step": 364
},
{
"epoch": 0.10716348931253611,
"grad_norm": 2.2940797805786133,
"learning_rate": 9.999671449753431e-05,
"loss": 3.5512,
"step": 371
},
{
"epoch": 0.10918544194107452,
"grad_norm": 2.4967830181121826,
"learning_rate": 9.999451849868585e-05,
"loss": 3.4169,
"step": 378
},
{
"epoch": 0.11120739456961294,
"grad_norm": 2.4844703674316406,
"learning_rate": 9.99917635637801e-05,
"loss": 3.5323,
"step": 385
},
{
"epoch": 0.11322934719815136,
"grad_norm": 2.1567542552948,
"learning_rate": 9.998844972361712e-05,
"loss": 3.4126,
"step": 392
},
{
"epoch": 0.11525129982668977,
"grad_norm": 2.143780469894409,
"learning_rate": 9.998457701524546e-05,
"loss": 3.3793,
"step": 399
},
{
"epoch": 0.1172732524552282,
"grad_norm": 2.182628631591797,
"learning_rate": 9.998014548196178e-05,
"loss": 3.2996,
"step": 406
},
{
"epoch": 0.1192952050837666,
"grad_norm": 2.220407247543335,
"learning_rate": 9.99751551733104e-05,
"loss": 3.4636,
"step": 413
},
{
"epoch": 0.12131715771230503,
"grad_norm": 2.0776526927948,
"learning_rate": 9.996960614508271e-05,
"loss": 3.348,
"step": 420
},
{
"epoch": 0.12333911034084344,
"grad_norm": 2.2219507694244385,
"learning_rate": 9.996349845931651e-05,
"loss": 3.3982,
"step": 427
},
{
"epoch": 0.12536106296938185,
"grad_norm": 2.48421311378479,
"learning_rate": 9.995683218429546e-05,
"loss": 3.3587,
"step": 434
},
{
"epoch": 0.12738301559792028,
"grad_norm": 2.1980128288269043,
"learning_rate": 9.994960739454811e-05,
"loss": 3.3461,
"step": 441
},
{
"epoch": 0.1294049682264587,
"grad_norm": 2.3740694522857666,
"learning_rate": 9.994182417084725e-05,
"loss": 3.3966,
"step": 448
},
{
"epoch": 0.1314269208549971,
"grad_norm": 2.3175392150878906,
"learning_rate": 9.993348260020892e-05,
"loss": 3.2726,
"step": 455
},
{
"epoch": 0.13344887348353554,
"grad_norm": 2.2129814624786377,
"learning_rate": 9.99245827758914e-05,
"loss": 3.3214,
"step": 462
},
{
"epoch": 0.13547082611207395,
"grad_norm": 2.125340461730957,
"learning_rate": 9.991512479739428e-05,
"loss": 3.3272,
"step": 469
},
{
"epoch": 0.13749277874061236,
"grad_norm": 2.3547654151916504,
"learning_rate": 9.990510877045724e-05,
"loss": 3.2926,
"step": 476
},
{
"epoch": 0.13951473136915077,
"grad_norm": 3.001073122024536,
"learning_rate": 9.989453480705895e-05,
"loss": 3.3037,
"step": 483
},
{
"epoch": 0.1415366839976892,
"grad_norm": 2.411472797393799,
"learning_rate": 9.988340302541574e-05,
"loss": 3.1022,
"step": 490
},
{
"epoch": 0.14355863662622761,
"grad_norm": 2.169168472290039,
"learning_rate": 9.987171354998033e-05,
"loss": 3.2779,
"step": 497
},
{
"epoch": 0.14558058925476602,
"grad_norm": 2.482876777648926,
"learning_rate": 9.985946651144046e-05,
"loss": 3.3117,
"step": 504
},
{
"epoch": 0.14760254188330446,
"grad_norm": 2.099360704421997,
"learning_rate": 9.984666204671735e-05,
"loss": 3.293,
"step": 511
},
{
"epoch": 0.14962449451184287,
"grad_norm": 1.954262137413025,
"learning_rate": 9.983330029896423e-05,
"loss": 3.2429,
"step": 518
},
{
"epoch": 0.15164644714038128,
"grad_norm": 1.9902952909469604,
"learning_rate": 9.981938141756476e-05,
"loss": 3.227,
"step": 525
},
{
"epoch": 0.1536683997689197,
"grad_norm": 2.3210673332214355,
"learning_rate": 9.980490555813124e-05,
"loss": 3.1948,
"step": 532
},
{
"epoch": 0.15569035239745813,
"grad_norm": 2.481900453567505,
"learning_rate": 9.978987288250307e-05,
"loss": 3.2088,
"step": 539
},
{
"epoch": 0.15771230502599654,
"grad_norm": 2.0122852325439453,
"learning_rate": 9.977428355874472e-05,
"loss": 3.1562,
"step": 546
},
{
"epoch": 0.15973425765453494,
"grad_norm": 2.022824287414551,
"learning_rate": 9.975813776114401e-05,
"loss": 3.1278,
"step": 553
},
{
"epoch": 0.16175621028307338,
"grad_norm": 2.0773727893829346,
"learning_rate": 9.97414356702101e-05,
"loss": 3.1692,
"step": 560
},
{
"epoch": 0.1637781629116118,
"grad_norm": 2.01796817779541,
"learning_rate": 9.97241774726715e-05,
"loss": 3.1741,
"step": 567
},
{
"epoch": 0.1658001155401502,
"grad_norm": 1.8899658918380737,
"learning_rate": 9.970636336147391e-05,
"loss": 3.1241,
"step": 574
},
{
"epoch": 0.1678220681686886,
"grad_norm": 2.1450061798095703,
"learning_rate": 9.968799353577815e-05,
"loss": 3.131,
"step": 581
},
{
"epoch": 0.16984402079722705,
"grad_norm": 2.2159628868103027,
"learning_rate": 9.96690682009579e-05,
"loss": 3.262,
"step": 588
},
{
"epoch": 0.17186597342576546,
"grad_norm": 2.3034591674804688,
"learning_rate": 9.964958756859741e-05,
"loss": 3.1114,
"step": 595
},
{
"epoch": 0.17388792605430387,
"grad_norm": 1.9303778409957886,
"learning_rate": 9.962955185648909e-05,
"loss": 3.1515,
"step": 602
},
{
"epoch": 0.17590987868284227,
"grad_norm": 1.83698308467865,
"learning_rate": 9.960896128863115e-05,
"loss": 3.2162,
"step": 609
},
{
"epoch": 0.1779318313113807,
"grad_norm": 1.970585823059082,
"learning_rate": 9.958781609522504e-05,
"loss": 3.1861,
"step": 616
},
{
"epoch": 0.17995378393991912,
"grad_norm": 2.279804229736328,
"learning_rate": 9.95661165126729e-05,
"loss": 3.0684,
"step": 623
},
{
"epoch": 0.18197573656845753,
"grad_norm": 1.8226346969604492,
"learning_rate": 9.95438627835749e-05,
"loss": 3.1671,
"step": 630
},
{
"epoch": 0.18399768919699597,
"grad_norm": 1.8989958763122559,
"learning_rate": 9.952105515672654e-05,
"loss": 3.2091,
"step": 637
},
{
"epoch": 0.18601964182553438,
"grad_norm": 1.846562147140503,
"learning_rate": 9.949769388711591e-05,
"loss": 3.1451,
"step": 644
},
{
"epoch": 0.18804159445407279,
"grad_norm": 1.8738723993301392,
"learning_rate": 9.947377923592073e-05,
"loss": 3.0587,
"step": 651
},
{
"epoch": 0.1900635470826112,
"grad_norm": 1.9004042148590088,
"learning_rate": 9.944931147050553e-05,
"loss": 3.0329,
"step": 658
},
{
"epoch": 0.19208549971114963,
"grad_norm": 1.9021046161651611,
"learning_rate": 9.942429086441864e-05,
"loss": 3.0379,
"step": 665
},
{
"epoch": 0.19410745233968804,
"grad_norm": 2.065567970275879,
"learning_rate": 9.93987176973891e-05,
"loss": 3.2037,
"step": 672
},
{
"epoch": 0.19612940496822645,
"grad_norm": 1.881011724472046,
"learning_rate": 9.937259225532356e-05,
"loss": 3.0945,
"step": 679
},
{
"epoch": 0.1981513575967649,
"grad_norm": 1.9205586910247803,
"learning_rate": 9.934591483030306e-05,
"loss": 3.0818,
"step": 686
},
{
"epoch": 0.2001733102253033,
"grad_norm": 1.8970929384231567,
"learning_rate": 9.931868572057979e-05,
"loss": 3.0714,
"step": 693
},
{
"epoch": 0.2021952628538417,
"grad_norm": 2.0119457244873047,
"learning_rate": 9.929090523057377e-05,
"loss": 3.0832,
"step": 700
},
{
"epoch": 0.20421721548238012,
"grad_norm": 1.9726409912109375,
"learning_rate": 9.926257367086939e-05,
"loss": 3.0904,
"step": 707
},
{
"epoch": 0.20623916811091855,
"grad_norm": 1.8767212629318237,
"learning_rate": 9.923369135821198e-05,
"loss": 3.0606,
"step": 714
},
{
"epoch": 0.20826112073945696,
"grad_norm": 1.899286150932312,
"learning_rate": 9.920425861550425e-05,
"loss": 3.0126,
"step": 721
},
{
"epoch": 0.21028307336799537,
"grad_norm": 1.8871937990188599,
"learning_rate": 9.917427577180271e-05,
"loss": 3.1418,
"step": 728
},
{
"epoch": 0.2123050259965338,
"grad_norm": 2.15138578414917,
"learning_rate": 9.914374316231396e-05,
"loss": 3.12,
"step": 735
},
{
"epoch": 0.21432697862507222,
"grad_norm": 1.8373454809188843,
"learning_rate": 9.911266112839093e-05,
"loss": 3.0967,
"step": 742
},
{
"epoch": 0.21634893125361063,
"grad_norm": 1.924065113067627,
"learning_rate": 9.908103001752913e-05,
"loss": 2.9944,
"step": 749
},
{
"epoch": 0.21837088388214904,
"grad_norm": 1.8721510171890259,
"learning_rate": 9.90488501833627e-05,
"loss": 3.1021,
"step": 756
},
{
"epoch": 0.22039283651068747,
"grad_norm": 1.8645668029785156,
"learning_rate": 9.901612198566044e-05,
"loss": 3.0134,
"step": 763
},
{
"epoch": 0.22241478913922588,
"grad_norm": 1.8228344917297363,
"learning_rate": 9.898284579032188e-05,
"loss": 3.0468,
"step": 770
},
{
"epoch": 0.2244367417677643,
"grad_norm": 1.7747055292129517,
"learning_rate": 9.894902196937312e-05,
"loss": 3.0553,
"step": 777
},
{
"epoch": 0.22645869439630273,
"grad_norm": 1.8610613346099854,
"learning_rate": 9.891465090096265e-05,
"loss": 3.0393,
"step": 784
},
{
"epoch": 0.22848064702484114,
"grad_norm": 1.8580009937286377,
"learning_rate": 9.887973296935725e-05,
"loss": 3.0419,
"step": 791
},
{
"epoch": 0.23050259965337955,
"grad_norm": 1.7875131368637085,
"learning_rate": 9.884426856493746e-05,
"loss": 2.9642,
"step": 798
},
{
"epoch": 0.23252455228191796,
"grad_norm": 1.8746123313903809,
"learning_rate": 9.880825808419348e-05,
"loss": 3.0594,
"step": 805
},
{
"epoch": 0.2345465049104564,
"grad_norm": 1.8220926523208618,
"learning_rate": 9.877170192972056e-05,
"loss": 3.0327,
"step": 812
},
{
"epoch": 0.2365684575389948,
"grad_norm": 1.9343609809875488,
"learning_rate": 9.873460051021457e-05,
"loss": 3.0395,
"step": 819
},
{
"epoch": 0.2385904101675332,
"grad_norm": 1.9146010875701904,
"learning_rate": 9.86969542404674e-05,
"loss": 3.0361,
"step": 826
},
{
"epoch": 0.24061236279607162,
"grad_norm": 1.8916493654251099,
"learning_rate": 9.865876354136234e-05,
"loss": 3.0112,
"step": 833
},
{
"epoch": 0.24263431542461006,
"grad_norm": 1.8126282691955566,
"learning_rate": 9.862002883986938e-05,
"loss": 3.0325,
"step": 840
},
{
"epoch": 0.24465626805314847,
"grad_norm": 1.9282585382461548,
"learning_rate": 9.85807505690404e-05,
"loss": 3.0337,
"step": 847
},
{
"epoch": 0.24667822068168688,
"grad_norm": 1.7758145332336426,
"learning_rate": 9.854092916800442e-05,
"loss": 3.0062,
"step": 854
},
{
"epoch": 0.24870017331022531,
"grad_norm": 1.8073774576187134,
"learning_rate": 9.850056508196255e-05,
"loss": 2.9754,
"step": 861
},
{
"epoch": 0.2507221259387637,
"grad_norm": 1.9235025644302368,
"learning_rate": 9.845965876218312e-05,
"loss": 2.9062,
"step": 868
},
{
"epoch": 0.25274407856730213,
"grad_norm": 1.7717721462249756,
"learning_rate": 9.841821066599666e-05,
"loss": 3.0064,
"step": 875
},
{
"epoch": 0.25476603119584057,
"grad_norm": 1.7075670957565308,
"learning_rate": 9.837622125679062e-05,
"loss": 3.0164,
"step": 882
},
{
"epoch": 0.25678798382437895,
"grad_norm": 1.824540138244629,
"learning_rate": 9.83336910040044e-05,
"loss": 3.0539,
"step": 889
},
{
"epoch": 0.2588099364529174,
"grad_norm": 1.7004854679107666,
"learning_rate": 9.829062038312394e-05,
"loss": 3.0165,
"step": 896
},
{
"epoch": 0.2608318890814558,
"grad_norm": 1.7976192235946655,
"learning_rate": 9.824700987567653e-05,
"loss": 2.9922,
"step": 903
},
{
"epoch": 0.2628538417099942,
"grad_norm": 1.8717296123504639,
"learning_rate": 9.820285996922526e-05,
"loss": 3.0127,
"step": 910
},
{
"epoch": 0.26487579433853264,
"grad_norm": 2.026646137237549,
"learning_rate": 9.815817115736379e-05,
"loss": 3.0056,
"step": 917
},
{
"epoch": 0.2668977469670711,
"grad_norm": 1.7542977333068848,
"learning_rate": 9.811294393971063e-05,
"loss": 2.9186,
"step": 924
},
{
"epoch": 0.26891969959560946,
"grad_norm": 1.832772135734558,
"learning_rate": 9.806717882190368e-05,
"loss": 2.9559,
"step": 931
},
{
"epoch": 0.2709416522241479,
"grad_norm": 1.7612087726593018,
"learning_rate": 9.802087631559451e-05,
"loss": 2.9334,
"step": 938
},
{
"epoch": 0.2729636048526863,
"grad_norm": 1.7945784330368042,
"learning_rate": 9.797403693844271e-05,
"loss": 3.0131,
"step": 945
},
{
"epoch": 0.2749855574812247,
"grad_norm": 1.840769648551941,
"learning_rate": 9.792666121410998e-05,
"loss": 2.9896,
"step": 952
},
{
"epoch": 0.27700751010976316,
"grad_norm": 1.9374017715454102,
"learning_rate": 9.787874967225444e-05,
"loss": 2.9585,
"step": 959
},
{
"epoch": 0.27902946273830154,
"grad_norm": 1.8398560285568237,
"learning_rate": 9.783030284852454e-05,
"loss": 3.0073,
"step": 966
},
{
"epoch": 0.28105141536684,
"grad_norm": 1.7769570350646973,
"learning_rate": 9.778132128455322e-05,
"loss": 2.9579,
"step": 973
},
{
"epoch": 0.2830733679953784,
"grad_norm": 1.9076484441757202,
"learning_rate": 9.773180552795173e-05,
"loss": 3.0034,
"step": 980
},
{
"epoch": 0.2850953206239168,
"grad_norm": 1.818621039390564,
"learning_rate": 9.768175613230365e-05,
"loss": 2.9642,
"step": 987
},
{
"epoch": 0.28711727325245523,
"grad_norm": 1.9143321514129639,
"learning_rate": 9.76311736571585e-05,
"loss": 2.9598,
"step": 994
},
{
"epoch": 0.28913922588099367,
"grad_norm": 1.8136980533599854,
"learning_rate": 9.75800586680257e-05,
"loss": 3.0053,
"step": 1001
},
{
"epoch": 0.29116117850953205,
"grad_norm": 1.9737019538879395,
"learning_rate": 9.752841173636808e-05,
"loss": 2.9401,
"step": 1008
},
{
"epoch": 0.2931831311380705,
"grad_norm": 1.8300338983535767,
"learning_rate": 9.747623343959563e-05,
"loss": 2.9416,
"step": 1015
},
{
"epoch": 0.2952050837666089,
"grad_norm": 1.9094542264938354,
"learning_rate": 9.74235243610589e-05,
"loss": 2.8963,
"step": 1022
},
{
"epoch": 0.2972270363951473,
"grad_norm": 2.2808616161346436,
"learning_rate": 9.737028509004258e-05,
"loss": 2.9543,
"step": 1029
},
{
"epoch": 0.29924898902368574,
"grad_norm": 1.786083698272705,
"learning_rate": 9.73165162217589e-05,
"loss": 2.9339,
"step": 1036
},
{
"epoch": 0.3012709416522241,
"grad_norm": 1.8315051794052124,
"learning_rate": 9.726221835734096e-05,
"loss": 2.9229,
"step": 1043
},
{
"epoch": 0.30329289428076256,
"grad_norm": 1.701285719871521,
"learning_rate": 9.720739210383598e-05,
"loss": 2.8461,
"step": 1050
},
{
"epoch": 0.305314846909301,
"grad_norm": 1.8492621183395386,
"learning_rate": 9.715203807419855e-05,
"loss": 3.0137,
"step": 1057
},
{
"epoch": 0.3073367995378394,
"grad_norm": 1.8336840867996216,
"learning_rate": 9.70961568872838e-05,
"loss": 2.8932,
"step": 1064
},
{
"epoch": 0.3093587521663778,
"grad_norm": 1.8716745376586914,
"learning_rate": 9.70397491678404e-05,
"loss": 2.955,
"step": 1071
},
{
"epoch": 0.31138070479491625,
"grad_norm": 1.7288552522659302,
"learning_rate": 9.698281554650366e-05,
"loss": 2.8435,
"step": 1078
},
{
"epoch": 0.31340265742345463,
"grad_norm": 1.6746890544891357,
"learning_rate": 9.692535665978845e-05,
"loss": 2.9312,
"step": 1085
},
{
"epoch": 0.31542461005199307,
"grad_norm": 1.7941049337387085,
"learning_rate": 9.686737315008207e-05,
"loss": 2.8434,
"step": 1092
},
{
"epoch": 0.3174465626805315,
"grad_norm": 1.8552913665771484,
"learning_rate": 9.680886566563705e-05,
"loss": 3.04,
"step": 1099
},
{
"epoch": 0.3194685153090699,
"grad_norm": 1.8141459226608276,
"learning_rate": 9.674983486056399e-05,
"loss": 2.9887,
"step": 1106
},
{
"epoch": 0.3214904679376083,
"grad_norm": 1.7158820629119873,
"learning_rate": 9.66902813948241e-05,
"loss": 2.9345,
"step": 1113
},
{
"epoch": 0.32351242056614676,
"grad_norm": 2.014641761779785,
"learning_rate": 9.6630205934222e-05,
"loss": 2.9396,
"step": 1120
},
{
"epoch": 0.32553437319468514,
"grad_norm": 1.8080192804336548,
"learning_rate": 9.656960915039815e-05,
"loss": 2.9015,
"step": 1127
},
{
"epoch": 0.3275563258232236,
"grad_norm": 1.941286563873291,
"learning_rate": 9.650849172082132e-05,
"loss": 2.8438,
"step": 1134
},
{
"epoch": 0.32957827845176196,
"grad_norm": 1.7322505712509155,
"learning_rate": 9.644685432878117e-05,
"loss": 2.9456,
"step": 1141
},
{
"epoch": 0.3316002310803004,
"grad_norm": 1.6413743495941162,
"learning_rate": 9.638469766338045e-05,
"loss": 2.8915,
"step": 1148
},
{
"epoch": 0.33362218370883884,
"grad_norm": 1.8302034139633179,
"learning_rate": 9.632202241952737e-05,
"loss": 2.9105,
"step": 1155
},
{
"epoch": 0.3356441363373772,
"grad_norm": 1.7431873083114624,
"learning_rate": 9.625882929792781e-05,
"loss": 2.9286,
"step": 1162
},
{
"epoch": 0.33766608896591566,
"grad_norm": 1.8077412843704224,
"learning_rate": 9.619511900507753e-05,
"loss": 2.9155,
"step": 1169
},
{
"epoch": 0.3396880415944541,
"grad_norm": 1.8113936185836792,
"learning_rate": 9.613089225325421e-05,
"loss": 2.9043,
"step": 1176
},
{
"epoch": 0.3417099942229925,
"grad_norm": 1.6953517198562622,
"learning_rate": 9.606614976050949e-05,
"loss": 2.936,
"step": 1183
},
{
"epoch": 0.3437319468515309,
"grad_norm": 1.7508031129837036,
"learning_rate": 9.600089225066103e-05,
"loss": 2.7969,
"step": 1190
},
{
"epoch": 0.34575389948006935,
"grad_norm": 1.6380009651184082,
"learning_rate": 9.59351204532843e-05,
"loss": 2.8198,
"step": 1197
},
{
"epoch": 0.34777585210860773,
"grad_norm": 1.7703418731689453,
"learning_rate": 9.58688351037045e-05,
"loss": 2.836,
"step": 1204
},
{
"epoch": 0.34979780473714617,
"grad_norm": 1.90382719039917,
"learning_rate": 9.580203694298833e-05,
"loss": 2.9212,
"step": 1211
},
{
"epoch": 0.35181975736568455,
"grad_norm": 1.857691764831543,
"learning_rate": 9.573472671793564e-05,
"loss": 2.908,
"step": 1218
},
{
"epoch": 0.353841709994223,
"grad_norm": 1.7361656427383423,
"learning_rate": 9.56669051810712e-05,
"loss": 2.8179,
"step": 1225
},
{
"epoch": 0.3558636626227614,
"grad_norm": 1.681665062904358,
"learning_rate": 9.559857309063616e-05,
"loss": 2.9504,
"step": 1232
},
{
"epoch": 0.3578856152512998,
"grad_norm": 1.6940059661865234,
"learning_rate": 9.552973121057966e-05,
"loss": 2.8335,
"step": 1239
},
{
"epoch": 0.35990756787983824,
"grad_norm": 1.7692625522613525,
"learning_rate": 9.546038031055026e-05,
"loss": 2.8118,
"step": 1246
},
{
"epoch": 0.3619295205083767,
"grad_norm": 1.7423547506332397,
"learning_rate": 9.539052116588734e-05,
"loss": 2.8345,
"step": 1253
},
{
"epoch": 0.36395147313691506,
"grad_norm": 1.7952203750610352,
"learning_rate": 9.532015455761241e-05,
"loss": 2.8732,
"step": 1260
},
{
"epoch": 0.3659734257654535,
"grad_norm": 1.6462956666946411,
"learning_rate": 9.52492812724204e-05,
"loss": 2.8128,
"step": 1267
},
{
"epoch": 0.36799537839399193,
"grad_norm": 1.7551960945129395,
"learning_rate": 9.51779021026709e-05,
"loss": 2.8367,
"step": 1274
},
{
"epoch": 0.3700173310225303,
"grad_norm": 1.922524094581604,
"learning_rate": 9.510601784637921e-05,
"loss": 2.8275,
"step": 1281
},
{
"epoch": 0.37203928365106875,
"grad_norm": 1.74599027633667,
"learning_rate": 9.503362930720747e-05,
"loss": 2.9025,
"step": 1288
},
{
"epoch": 0.3740612362796072,
"grad_norm": 1.7603254318237305,
"learning_rate": 9.496073729445573e-05,
"loss": 2.9233,
"step": 1295
},
{
"epoch": 0.37608318890814557,
"grad_norm": 1.7627036571502686,
"learning_rate": 9.488734262305281e-05,
"loss": 2.8665,
"step": 1302
},
{
"epoch": 0.378105141536684,
"grad_norm": 1.642699956893921,
"learning_rate": 9.481344611354721e-05,
"loss": 2.8467,
"step": 1309
},
{
"epoch": 0.3801270941652224,
"grad_norm": 1.629868507385254,
"learning_rate": 9.473904859209801e-05,
"loss": 2.8086,
"step": 1316
},
{
"epoch": 0.3821490467937608,
"grad_norm": 1.7129733562469482,
"learning_rate": 9.466415089046551e-05,
"loss": 2.7833,
"step": 1323
},
{
"epoch": 0.38417099942229926,
"grad_norm": 1.6511763334274292,
"learning_rate": 9.458875384600206e-05,
"loss": 2.9041,
"step": 1330
},
{
"epoch": 0.38619295205083765,
"grad_norm": 1.7601238489151,
"learning_rate": 9.451285830164256e-05,
"loss": 2.8706,
"step": 1337
},
{
"epoch": 0.3882149046793761,
"grad_norm": 1.7456876039505005,
"learning_rate": 9.44364651058952e-05,
"loss": 2.8847,
"step": 1344
},
{
"epoch": 0.3902368573079145,
"grad_norm": 1.7066991329193115,
"learning_rate": 9.435957511283184e-05,
"loss": 2.8346,
"step": 1351
},
{
"epoch": 0.3922588099364529,
"grad_norm": 1.6667215824127197,
"learning_rate": 9.42821891820785e-05,
"loss": 2.7794,
"step": 1358
},
{
"epoch": 0.39428076256499134,
"grad_norm": 1.7139601707458496,
"learning_rate": 9.420430817880578e-05,
"loss": 2.8378,
"step": 1365
},
{
"epoch": 0.3963027151935298,
"grad_norm": 1.7201801538467407,
"learning_rate": 9.412593297371914e-05,
"loss": 2.8381,
"step": 1372
},
{
"epoch": 0.39832466782206816,
"grad_norm": 1.7901239395141602,
"learning_rate": 9.404706444304921e-05,
"loss": 2.7901,
"step": 1379
},
{
"epoch": 0.4003466204506066,
"grad_norm": 1.7698922157287598,
"learning_rate": 9.396770346854197e-05,
"loss": 2.9107,
"step": 1386
},
{
"epoch": 0.402368573079145,
"grad_norm": 1.6929407119750977,
"learning_rate": 9.38878509374489e-05,
"loss": 2.8586,
"step": 1393
},
{
"epoch": 0.4043905257076834,
"grad_norm": 1.8319777250289917,
"learning_rate": 9.380750774251702e-05,
"loss": 2.8823,
"step": 1400
},
{
"epoch": 0.40641247833622185,
"grad_norm": 1.7365424633026123,
"learning_rate": 9.372667478197901e-05,
"loss": 2.8193,
"step": 1407
},
{
"epoch": 0.40843443096476023,
"grad_norm": 1.6431219577789307,
"learning_rate": 9.364535295954304e-05,
"loss": 2.7777,
"step": 1414
},
{
"epoch": 0.41045638359329867,
"grad_norm": 1.824762225151062,
"learning_rate": 9.356354318438279e-05,
"loss": 2.8279,
"step": 1421
},
{
"epoch": 0.4124783362218371,
"grad_norm": 1.7178157567977905,
"learning_rate": 9.348124637112719e-05,
"loss": 2.9241,
"step": 1428
},
{
"epoch": 0.4145002888503755,
"grad_norm": 1.7042038440704346,
"learning_rate": 9.339846343985019e-05,
"loss": 2.826,
"step": 1435
},
{
"epoch": 0.4165222414789139,
"grad_norm": 1.691094160079956,
"learning_rate": 9.331519531606062e-05,
"loss": 2.7895,
"step": 1442
},
{
"epoch": 0.41854419410745236,
"grad_norm": 1.7455573081970215,
"learning_rate": 9.323144293069164e-05,
"loss": 2.8141,
"step": 1449
},
{
"epoch": 0.42056614673599074,
"grad_norm": 1.6580560207366943,
"learning_rate": 9.314720722009045e-05,
"loss": 2.7663,
"step": 1456
},
{
"epoch": 0.4225880993645292,
"grad_norm": 1.6528170108795166,
"learning_rate": 9.306248912600783e-05,
"loss": 2.7459,
"step": 1463
},
{
"epoch": 0.4246100519930676,
"grad_norm": 1.768887996673584,
"learning_rate": 9.29772895955876e-05,
"loss": 2.8386,
"step": 1470
},
{
"epoch": 0.426632004621606,
"grad_norm": 1.9209166765213013,
"learning_rate": 9.289160958135591e-05,
"loss": 2.7621,
"step": 1477
},
{
"epoch": 0.42865395725014444,
"grad_norm": 1.6556992530822754,
"learning_rate": 9.28054500412108e-05,
"loss": 2.8435,
"step": 1484
},
{
"epoch": 0.4306759098786828,
"grad_norm": 1.7342525720596313,
"learning_rate": 9.271881193841135e-05,
"loss": 2.7762,
"step": 1491
},
{
"epoch": 0.43269786250722125,
"grad_norm": 1.7639968395233154,
"learning_rate": 9.263169624156694e-05,
"loss": 2.8624,
"step": 1498
},
{
"epoch": 0.4347198151357597,
"grad_norm": 1.6903929710388184,
"learning_rate": 9.25441039246264e-05,
"loss": 2.8375,
"step": 1505
},
{
"epoch": 0.43674176776429807,
"grad_norm": 2.2824156284332275,
"learning_rate": 9.245603596686719e-05,
"loss": 2.8951,
"step": 1512
},
{
"epoch": 0.4387637203928365,
"grad_norm": 1.6158207654953003,
"learning_rate": 9.236749335288442e-05,
"loss": 2.7868,
"step": 1519
},
{
"epoch": 0.44078567302137495,
"grad_norm": 1.6459358930587769,
"learning_rate": 9.227847707257975e-05,
"loss": 2.7288,
"step": 1526
},
{
"epoch": 0.44280762564991333,
"grad_norm": 1.6403940916061401,
"learning_rate": 9.218898812115049e-05,
"loss": 2.7801,
"step": 1533
},
{
"epoch": 0.44482957827845176,
"grad_norm": 1.6620136499404907,
"learning_rate": 9.209902749907836e-05,
"loss": 2.7598,
"step": 1540
},
{
"epoch": 0.4468515309069902,
"grad_norm": 1.7667573690414429,
"learning_rate": 9.200859621211832e-05,
"loss": 2.7368,
"step": 1547
},
{
"epoch": 0.4488734835355286,
"grad_norm": 1.568248987197876,
"learning_rate": 9.191769527128736e-05,
"loss": 2.7518,
"step": 1554
},
{
"epoch": 0.450895436164067,
"grad_norm": 1.6497979164123535,
"learning_rate": 9.182632569285314e-05,
"loss": 2.8241,
"step": 1561
},
{
"epoch": 0.45291738879260546,
"grad_norm": 1.7366968393325806,
"learning_rate": 9.17344884983227e-05,
"loss": 2.8069,
"step": 1568
},
{
"epoch": 0.45493934142114384,
"grad_norm": 1.7804850339889526,
"learning_rate": 9.1642184714431e-05,
"loss": 2.7597,
"step": 1575
},
{
"epoch": 0.4569612940496823,
"grad_norm": 10.3452730178833,
"learning_rate": 9.15494153731294e-05,
"loss": 2.7507,
"step": 1582
},
{
"epoch": 0.45898324667822066,
"grad_norm": 1.733091950416565,
"learning_rate": 9.145618151157424e-05,
"loss": 2.7015,
"step": 1589
},
{
"epoch": 0.4610051993067591,
"grad_norm": 1.769957184791565,
"learning_rate": 9.136248417211512e-05,
"loss": 2.7582,
"step": 1596
},
{
"epoch": 0.46302715193529753,
"grad_norm": 1.633809208869934,
"learning_rate": 9.12683244022833e-05,
"loss": 2.7988,
"step": 1603
},
{
"epoch": 0.4650491045638359,
"grad_norm": 1.6593530178070068,
"learning_rate": 9.117370325478e-05,
"loss": 2.8712,
"step": 1610
},
{
"epoch": 0.46707105719237435,
"grad_norm": 1.607200026512146,
"learning_rate": 9.107862178746463e-05,
"loss": 2.7589,
"step": 1617
},
{
"epoch": 0.4690930098209128,
"grad_norm": 1.6576822996139526,
"learning_rate": 9.098308106334291e-05,
"loss": 2.7401,
"step": 1624
},
{
"epoch": 0.47111496244945117,
"grad_norm": 1.7137107849121094,
"learning_rate": 9.088708215055508e-05,
"loss": 2.7639,
"step": 1631
},
{
"epoch": 0.4731369150779896,
"grad_norm": 1.6344847679138184,
"learning_rate": 9.079062612236387e-05,
"loss": 2.8319,
"step": 1638
},
{
"epoch": 0.47515886770652804,
"grad_norm": 1.5891085863113403,
"learning_rate": 9.069371405714252e-05,
"loss": 2.6934,
"step": 1645
},
{
"epoch": 0.4771808203350664,
"grad_norm": 4.501755237579346,
"learning_rate": 9.05963470383628e-05,
"loss": 2.6837,
"step": 1652
},
{
"epoch": 0.47920277296360486,
"grad_norm": 1.567033290863037,
"learning_rate": 9.049852615458278e-05,
"loss": 2.7968,
"step": 1659
},
{
"epoch": 0.48122472559214324,
"grad_norm": 1.6929982900619507,
"learning_rate": 9.040025249943476e-05,
"loss": 2.6938,
"step": 1666
},
{
"epoch": 0.4832466782206817,
"grad_norm": 1.7686516046524048,
"learning_rate": 9.030152717161294e-05,
"loss": 2.8016,
"step": 1673
},
{
"epoch": 0.4852686308492201,
"grad_norm": 1.6347525119781494,
"learning_rate": 9.020235127486125e-05,
"loss": 2.7131,
"step": 1680
},
{
"epoch": 0.4872905834777585,
"grad_norm": 1.5744967460632324,
"learning_rate": 9.010272591796097e-05,
"loss": 2.7364,
"step": 1687
},
{
"epoch": 0.48931253610629694,
"grad_norm": 1.6553685665130615,
"learning_rate": 9.000265221471822e-05,
"loss": 2.7467,
"step": 1694
},
{
"epoch": 0.4913344887348354,
"grad_norm": 1.6058526039123535,
"learning_rate": 8.990213128395175e-05,
"loss": 2.7837,
"step": 1701
},
{
"epoch": 0.49335644136337375,
"grad_norm": 1.6541842222213745,
"learning_rate": 8.980116424948019e-05,
"loss": 2.7639,
"step": 1708
},
{
"epoch": 0.4953783939919122,
"grad_norm": 1.800726056098938,
"learning_rate": 8.969975224010961e-05,
"loss": 2.7972,
"step": 1715
},
{
"epoch": 0.49740034662045063,
"grad_norm": 1.7600321769714355,
"learning_rate": 8.959789638962089e-05,
"loss": 2.714,
"step": 1722
},
{
"epoch": 0.499422299248989,
"grad_norm": 1.6169345378875732,
"learning_rate": 8.9495597836757e-05,
"loss": 2.78,
"step": 1729
},
{
"epoch": 0.5014442518775274,
"grad_norm": 1.6731981039047241,
"learning_rate": 8.939285772521033e-05,
"loss": 2.7272,
"step": 1736
},
{
"epoch": 0.5034662045060658,
"grad_norm": 1.7356120347976685,
"learning_rate": 8.928967720360987e-05,
"loss": 2.7888,
"step": 1743
},
{
"epoch": 0.5054881571346043,
"grad_norm": 1.6834124326705933,
"learning_rate": 8.918605742550837e-05,
"loss": 2.7427,
"step": 1750
},
{
"epoch": 0.5075101097631427,
"grad_norm": 1.5424860715866089,
"learning_rate": 8.908199954936942e-05,
"loss": 2.7217,
"step": 1757
},
{
"epoch": 0.5095320623916811,
"grad_norm": 1.7886710166931152,
"learning_rate": 8.897750473855453e-05,
"loss": 2.7078,
"step": 1764
},
{
"epoch": 0.5115540150202196,
"grad_norm": 1.64043390750885,
"learning_rate": 8.887257416131016e-05,
"loss": 2.7356,
"step": 1771
},
{
"epoch": 0.5135759676487579,
"grad_norm": 1.6376231908798218,
"learning_rate": 8.876720899075455e-05,
"loss": 2.7536,
"step": 1778
},
{
"epoch": 0.5155979202772963,
"grad_norm": 1.877820611000061,
"learning_rate": 8.866141040486471e-05,
"loss": 2.7294,
"step": 1785
},
{
"epoch": 0.5176198729058348,
"grad_norm": 1.784562587738037,
"learning_rate": 8.85551795864632e-05,
"loss": 2.7104,
"step": 1792
},
{
"epoch": 0.5196418255343732,
"grad_norm": 1.5521408319473267,
"learning_rate": 8.844851772320494e-05,
"loss": 2.5452,
"step": 1799
},
{
"epoch": 0.5216637781629117,
"grad_norm": 1.6310935020446777,
"learning_rate": 8.834142600756386e-05,
"loss": 2.7134,
"step": 1806
},
{
"epoch": 0.52368573079145,
"grad_norm": 1.6490379571914673,
"learning_rate": 8.823390563681965e-05,
"loss": 2.7386,
"step": 1813
},
{
"epoch": 0.5257076834199884,
"grad_norm": 1.6033873558044434,
"learning_rate": 8.812595781304436e-05,
"loss": 2.7031,
"step": 1820
},
{
"epoch": 0.5277296360485269,
"grad_norm": 1.5489314794540405,
"learning_rate": 8.801758374308888e-05,
"loss": 2.6165,
"step": 1827
},
{
"epoch": 0.5297515886770653,
"grad_norm": 1.7704176902770996,
"learning_rate": 8.790878463856958e-05,
"loss": 2.6288,
"step": 1834
},
{
"epoch": 0.5317735413056037,
"grad_norm": 3.7646679878234863,
"learning_rate": 8.779956171585463e-05,
"loss": 2.718,
"step": 1841
},
{
"epoch": 0.5337954939341422,
"grad_norm": 1.51352858543396,
"learning_rate": 8.768991619605054e-05,
"loss": 2.7983,
"step": 1848
},
{
"epoch": 0.5358174465626805,
"grad_norm": 1.5823729038238525,
"learning_rate": 8.757984930498833e-05,
"loss": 2.6646,
"step": 1855
},
{
"epoch": 0.5378393991912189,
"grad_norm": 1.6353611946105957,
"learning_rate": 8.746936227321003e-05,
"loss": 2.7042,
"step": 1862
},
{
"epoch": 0.5398613518197574,
"grad_norm": 1.7938235998153687,
"learning_rate": 8.735845633595477e-05,
"loss": 2.7986,
"step": 1869
},
{
"epoch": 0.5418833044482958,
"grad_norm": 1.6479319334030151,
"learning_rate": 8.7247132733145e-05,
"loss": 2.6975,
"step": 1876
},
{
"epoch": 0.5439052570768342,
"grad_norm": 1.5863536596298218,
"learning_rate": 8.713539270937271e-05,
"loss": 2.788,
"step": 1883
},
{
"epoch": 0.5459272097053726,
"grad_norm": 1.684091567993164,
"learning_rate": 8.702323751388541e-05,
"loss": 2.7282,
"step": 1890
},
{
"epoch": 0.547949162333911,
"grad_norm": 1.654599666595459,
"learning_rate": 8.691066840057223e-05,
"loss": 2.6806,
"step": 1897
},
{
"epoch": 0.5499711149624494,
"grad_norm": 1.6643625497817993,
"learning_rate": 8.679768662794985e-05,
"loss": 2.7077,
"step": 1904
},
{
"epoch": 0.5519930675909879,
"grad_norm": 1.6734774112701416,
"learning_rate": 8.66842934591485e-05,
"loss": 2.7808,
"step": 1911
},
{
"epoch": 0.5540150202195263,
"grad_norm": 1.5683385133743286,
"learning_rate": 8.657049016189776e-05,
"loss": 2.75,
"step": 1918
},
{
"epoch": 0.5560369728480647,
"grad_norm": 1.7231022119522095,
"learning_rate": 8.645627800851244e-05,
"loss": 2.7396,
"step": 1925
},
{
"epoch": 0.5580589254766031,
"grad_norm": 1.6285254955291748,
"learning_rate": 8.634165827587834e-05,
"loss": 2.7408,
"step": 1932
},
{
"epoch": 0.5600808781051415,
"grad_norm": 1.6170200109481812,
"learning_rate": 8.622663224543797e-05,
"loss": 2.6624,
"step": 1939
},
{
"epoch": 0.56210283073368,
"grad_norm": 1.5798912048339844,
"learning_rate": 8.611120120317623e-05,
"loss": 2.6589,
"step": 1946
},
{
"epoch": 0.5641247833622184,
"grad_norm": 1.4929171800613403,
"learning_rate": 8.599536643960605e-05,
"loss": 2.6562,
"step": 1953
},
{
"epoch": 0.5661467359907568,
"grad_norm": 1.6575974225997925,
"learning_rate": 8.587912924975391e-05,
"loss": 2.6959,
"step": 1960
},
{
"epoch": 0.5681686886192953,
"grad_norm": 1.6966285705566406,
"learning_rate": 8.576249093314541e-05,
"loss": 2.6809,
"step": 1967
},
{
"epoch": 0.5701906412478336,
"grad_norm": 1.5209206342697144,
"learning_rate": 8.564545279379073e-05,
"loss": 2.6698,
"step": 1974
},
{
"epoch": 0.572212593876372,
"grad_norm": 1.6631150245666504,
"learning_rate": 8.552801614017004e-05,
"loss": 2.7044,
"step": 1981
},
{
"epoch": 0.5742345465049105,
"grad_norm": 1.6683377027511597,
"learning_rate": 8.541018228521886e-05,
"loss": 2.7116,
"step": 1988
},
{
"epoch": 0.5762564991334489,
"grad_norm": 1.5585025548934937,
"learning_rate": 8.529195254631345e-05,
"loss": 2.7277,
"step": 1995
},
{
"epoch": 0.5782784517619873,
"grad_norm": 1.5222512483596802,
"learning_rate": 8.517332824525599e-05,
"loss": 2.6625,
"step": 2002
},
{
"epoch": 0.5803004043905257,
"grad_norm": 1.634189486503601,
"learning_rate": 8.505431070825985e-05,
"loss": 2.7999,
"step": 2009
},
{
"epoch": 0.5823223570190641,
"grad_norm": 1.697341799736023,
"learning_rate": 8.493490126593479e-05,
"loss": 2.6942,
"step": 2016
},
{
"epoch": 0.5843443096476025,
"grad_norm": 1.6360241174697876,
"learning_rate": 8.481510125327198e-05,
"loss": 2.7039,
"step": 2023
},
{
"epoch": 0.586366262276141,
"grad_norm": 1.7048941850662231,
"learning_rate": 8.46949120096292e-05,
"loss": 2.5961,
"step": 2030
},
{
"epoch": 0.5883882149046794,
"grad_norm": 1.4944428205490112,
"learning_rate": 8.457433487871582e-05,
"loss": 2.692,
"step": 2037
},
{
"epoch": 0.5904101675332178,
"grad_norm": 1.6939690113067627,
"learning_rate": 8.445337120857775e-05,
"loss": 2.6814,
"step": 2044
},
{
"epoch": 0.5924321201617562,
"grad_norm": 1.5280512571334839,
"learning_rate": 8.433202235158237e-05,
"loss": 2.6694,
"step": 2051
},
{
"epoch": 0.5944540727902946,
"grad_norm": 1.6327159404754639,
"learning_rate": 8.421028966440345e-05,
"loss": 2.6265,
"step": 2058
},
{
"epoch": 0.596476025418833,
"grad_norm": 1.5076837539672852,
"learning_rate": 8.408817450800594e-05,
"loss": 2.6193,
"step": 2065
},
{
"epoch": 0.5984979780473715,
"grad_norm": 1.753414511680603,
"learning_rate": 8.396567824763084e-05,
"loss": 2.7144,
"step": 2072
},
{
"epoch": 0.6005199306759099,
"grad_norm": 1.5676695108413696,
"learning_rate": 8.384280225277978e-05,
"loss": 2.6933,
"step": 2079
},
{
"epoch": 0.6025418833044482,
"grad_norm": 1.5314265489578247,
"learning_rate": 8.371954789719986e-05,
"loss": 2.6253,
"step": 2086
},
{
"epoch": 0.6045638359329867,
"grad_norm": 1.5249189138412476,
"learning_rate": 8.359591655886822e-05,
"loss": 2.5989,
"step": 2093
},
{
"epoch": 0.6065857885615251,
"grad_norm": 1.6141725778579712,
"learning_rate": 8.347190961997666e-05,
"loss": 2.6513,
"step": 2100
},
{
"epoch": 0.6086077411900636,
"grad_norm": 2.041733980178833,
"learning_rate": 8.334752846691614e-05,
"loss": 2.7081,
"step": 2107
},
{
"epoch": 0.610629693818602,
"grad_norm": 1.6285977363586426,
"learning_rate": 8.322277449026135e-05,
"loss": 2.6404,
"step": 2114
},
{
"epoch": 0.6126516464471404,
"grad_norm": 1.5082623958587646,
"learning_rate": 8.309764908475508e-05,
"loss": 2.5684,
"step": 2121
},
{
"epoch": 0.6146735990756788,
"grad_norm": 1.5108692646026611,
"learning_rate": 8.297215364929274e-05,
"loss": 2.6821,
"step": 2128
},
{
"epoch": 0.6166955517042172,
"grad_norm": 1.6972891092300415,
"learning_rate": 8.28462895869066e-05,
"loss": 2.6732,
"step": 2135
},
{
"epoch": 0.6187175043327556,
"grad_norm": 69.6792984008789,
"learning_rate": 8.272005830475022e-05,
"loss": 2.6544,
"step": 2142
},
{
"epoch": 0.6207394569612941,
"grad_norm": 1.5854318141937256,
"learning_rate": 8.259346121408258e-05,
"loss": 2.7488,
"step": 2149
},
{
"epoch": 0.6227614095898325,
"grad_norm": 1.8268544673919678,
"learning_rate": 8.246649973025244e-05,
"loss": 2.5605,
"step": 2156
},
{
"epoch": 0.6247833622183708,
"grad_norm": 1.5942052602767944,
"learning_rate": 8.233917527268242e-05,
"loss": 2.661,
"step": 2163
},
{
"epoch": 0.6268053148469093,
"grad_norm": 1.622071385383606,
"learning_rate": 8.22114892648532e-05,
"loss": 2.7426,
"step": 2170
},
{
"epoch": 0.6288272674754477,
"grad_norm": 1.594794511795044,
"learning_rate": 8.208344313428753e-05,
"loss": 2.704,
"step": 2177
},
{
"epoch": 0.6308492201039861,
"grad_norm": 1.5333237648010254,
"learning_rate": 8.195503831253438e-05,
"loss": 2.6622,
"step": 2184
},
{
"epoch": 0.6328711727325246,
"grad_norm": 1.5856921672821045,
"learning_rate": 8.182627623515278e-05,
"loss": 2.6087,
"step": 2191
},
{
"epoch": 0.634893125361063,
"grad_norm": 1.5291564464569092,
"learning_rate": 8.169715834169593e-05,
"loss": 2.6231,
"step": 2198
},
{
"epoch": 0.6369150779896013,
"grad_norm": 1.6826353073120117,
"learning_rate": 8.156768607569501e-05,
"loss": 2.693,
"step": 2205
},
{
"epoch": 0.6389370306181398,
"grad_norm": 1.5820770263671875,
"learning_rate": 8.143786088464306e-05,
"loss": 2.5229,
"step": 2212
},
{
"epoch": 0.6409589832466782,
"grad_norm": 1.591989517211914,
"learning_rate": 8.130768421997881e-05,
"loss": 2.6624,
"step": 2219
},
{
"epoch": 0.6429809358752167,
"grad_norm": 1.6424845457077026,
"learning_rate": 8.117715753707045e-05,
"loss": 2.6297,
"step": 2226
},
{
"epoch": 0.6450028885037551,
"grad_norm": 1.6022992134094238,
"learning_rate": 8.104628229519935e-05,
"loss": 2.6345,
"step": 2233
},
{
"epoch": 0.6470248411322935,
"grad_norm": 1.543290138244629,
"learning_rate": 8.091505995754375e-05,
"loss": 2.5814,
"step": 2240
},
{
"epoch": 0.6490467937608319,
"grad_norm": 1.5112725496292114,
"learning_rate": 8.078349199116241e-05,
"loss": 2.637,
"step": 2247
},
{
"epoch": 0.6510687463893703,
"grad_norm": 1.6379178762435913,
"learning_rate": 8.065157986697819e-05,
"loss": 2.6515,
"step": 2254
},
{
"epoch": 0.6530906990179087,
"grad_norm": 1.5617547035217285,
"learning_rate": 8.051932505976161e-05,
"loss": 2.6124,
"step": 2261
},
{
"epoch": 0.6551126516464472,
"grad_norm": 1.7973439693450928,
"learning_rate": 8.03867290481144e-05,
"loss": 2.7264,
"step": 2268
},
{
"epoch": 0.6571346042749856,
"grad_norm": 1.5679056644439697,
"learning_rate": 8.025379331445291e-05,
"loss": 2.6324,
"step": 2275
},
{
"epoch": 0.6591565569035239,
"grad_norm": 1.546196460723877,
"learning_rate": 8.012051934499156e-05,
"loss": 2.6175,
"step": 2282
},
{
"epoch": 0.6611785095320624,
"grad_norm": 1.556950330734253,
"learning_rate": 7.998690862972626e-05,
"loss": 2.64,
"step": 2289
},
{
"epoch": 0.6632004621606008,
"grad_norm": 1.6919306516647339,
"learning_rate": 7.985296266241768e-05,
"loss": 2.5713,
"step": 2296
},
{
"epoch": 0.6652224147891392,
"grad_norm": 1.5550673007965088,
"learning_rate": 7.971868294057461e-05,
"loss": 2.6087,
"step": 2303
},
{
"epoch": 0.6672443674176777,
"grad_norm": 1.4847160577774048,
"learning_rate": 7.958407096543721e-05,
"loss": 2.6296,
"step": 2310
},
{
"epoch": 0.6692663200462161,
"grad_norm": 1.5030337572097778,
"learning_rate": 7.944912824196019e-05,
"loss": 2.5972,
"step": 2317
},
{
"epoch": 0.6712882726747544,
"grad_norm": 1.541245698928833,
"learning_rate": 7.931385627879603e-05,
"loss": 2.6291,
"step": 2324
},
{
"epoch": 0.6733102253032929,
"grad_norm": 1.5703462362289429,
"learning_rate": 7.917825658827807e-05,
"loss": 2.6247,
"step": 2331
},
{
"epoch": 0.6753321779318313,
"grad_norm": 1.6782206296920776,
"learning_rate": 7.904233068640364e-05,
"loss": 2.6725,
"step": 2338
},
{
"epoch": 0.6773541305603697,
"grad_norm": 1.490038275718689,
"learning_rate": 7.89060800928171e-05,
"loss": 2.6327,
"step": 2345
},
{
"epoch": 0.6793760831889082,
"grad_norm": 1.5980485677719116,
"learning_rate": 7.876950633079281e-05,
"loss": 2.5769,
"step": 2352
},
{
"epoch": 0.6813980358174465,
"grad_norm": 1.5050002336502075,
"learning_rate": 7.863261092721821e-05,
"loss": 2.4925,
"step": 2359
},
{
"epoch": 0.683419988445985,
"grad_norm": 1.6050291061401367,
"learning_rate": 7.84953954125766e-05,
"loss": 2.5922,
"step": 2366
},
{
"epoch": 0.6854419410745234,
"grad_norm": 1.6114401817321777,
"learning_rate": 7.835786132093014e-05,
"loss": 2.696,
"step": 2373
},
{
"epoch": 0.6874638937030618,
"grad_norm": 1.5678999423980713,
"learning_rate": 7.822001018990265e-05,
"loss": 2.6002,
"step": 2380
},
{
"epoch": 0.6894858463316003,
"grad_norm": 1.6375820636749268,
"learning_rate": 7.808184356066247e-05,
"loss": 2.6161,
"step": 2387
},
{
"epoch": 0.6915077989601387,
"grad_norm": 1.5844779014587402,
"learning_rate": 7.794336297790513e-05,
"loss": 2.6841,
"step": 2394
},
{
"epoch": 0.693529751588677,
"grad_norm": 1.6627087593078613,
"learning_rate": 7.780456998983619e-05,
"loss": 2.5834,
"step": 2401
},
{
"epoch": 0.6955517042172155,
"grad_norm": 1.545854091644287,
"learning_rate": 7.766546614815389e-05,
"loss": 2.5658,
"step": 2408
},
{
"epoch": 0.6975736568457539,
"grad_norm": 1.5244427919387817,
"learning_rate": 7.752605300803176e-05,
"loss": 2.6742,
"step": 2415
},
{
"epoch": 0.6995956094742923,
"grad_norm": 1.6053026914596558,
"learning_rate": 7.738633212810129e-05,
"loss": 2.5592,
"step": 2422
},
{
"epoch": 0.7016175621028308,
"grad_norm": 1.5977808237075806,
"learning_rate": 7.724630507043452e-05,
"loss": 2.6546,
"step": 2429
},
{
"epoch": 0.7036395147313691,
"grad_norm": 1.5113341808319092,
"learning_rate": 7.710597340052646e-05,
"loss": 2.5986,
"step": 2436
},
{
"epoch": 0.7056614673599075,
"grad_norm": 1.5165156126022339,
"learning_rate": 7.696533868727772e-05,
"loss": 2.6337,
"step": 2443
},
{
"epoch": 0.707683419988446,
"grad_norm": 1.6537933349609375,
"learning_rate": 7.682440250297693e-05,
"loss": 2.6414,
"step": 2450
},
{
"epoch": 0.7097053726169844,
"grad_norm": 1.6465896368026733,
"learning_rate": 7.668316642328312e-05,
"loss": 2.5988,
"step": 2457
},
{
"epoch": 0.7117273252455228,
"grad_norm": 1.5026781558990479,
"learning_rate": 7.654163202720818e-05,
"loss": 2.5555,
"step": 2464
},
{
"epoch": 0.7137492778740613,
"grad_norm": 1.5290915966033936,
"learning_rate": 7.63998008970991e-05,
"loss": 2.5399,
"step": 2471
},
{
"epoch": 0.7157712305025996,
"grad_norm": 1.5298501253128052,
"learning_rate": 7.625767461862036e-05,
"loss": 2.5872,
"step": 2478
},
{
"epoch": 0.717793183131138,
"grad_norm": 1.4506494998931885,
"learning_rate": 7.611525478073622e-05,
"loss": 2.581,
"step": 2485
},
{
"epoch": 0.7198151357596765,
"grad_norm": 1.572906255722046,
"learning_rate": 7.597254297569287e-05,
"loss": 2.6715,
"step": 2492
},
{
"epoch": 0.7218370883882149,
"grad_norm": 1.5634372234344482,
"learning_rate": 7.582954079900071e-05,
"loss": 2.4712,
"step": 2499
},
{
"epoch": 0.7238590410167534,
"grad_norm": 1.4667270183563232,
"learning_rate": 7.568624984941647e-05,
"loss": 2.6222,
"step": 2506
},
{
"epoch": 0.7258809936452918,
"grad_norm": 1.5092616081237793,
"learning_rate": 7.554267172892533e-05,
"loss": 2.6669,
"step": 2513
},
{
"epoch": 0.7279029462738301,
"grad_norm": 1.562249779701233,
"learning_rate": 7.539880804272306e-05,
"loss": 2.6305,
"step": 2520
},
{
"epoch": 0.7299248989023686,
"grad_norm": 1.6032488346099854,
"learning_rate": 7.5254660399198e-05,
"loss": 2.7047,
"step": 2527
},
{
"epoch": 0.731946851530907,
"grad_norm": 1.5360654592514038,
"learning_rate": 7.511023040991314e-05,
"loss": 2.6315,
"step": 2534
},
{
"epoch": 0.7339688041594454,
"grad_norm": 1.5899386405944824,
"learning_rate": 7.496551968958807e-05,
"loss": 2.6078,
"step": 2541
},
{
"epoch": 0.7359907567879839,
"grad_norm": 1.521978497505188,
"learning_rate": 7.482052985608097e-05,
"loss": 2.5834,
"step": 2548
},
{
"epoch": 0.7380127094165222,
"grad_norm": 1.43201744556427,
"learning_rate": 7.467526253037045e-05,
"loss": 2.6157,
"step": 2555
},
{
"epoch": 0.7400346620450606,
"grad_norm": 1.5489884614944458,
"learning_rate": 7.452971933653748e-05,
"loss": 2.6691,
"step": 2562
},
{
"epoch": 0.7420566146735991,
"grad_norm": 1.496980905532837,
"learning_rate": 7.438390190174724e-05,
"loss": 2.612,
"step": 2569
},
{
"epoch": 0.7440785673021375,
"grad_norm": 1.6220296621322632,
"learning_rate": 7.423781185623087e-05,
"loss": 2.5776,
"step": 2576
},
{
"epoch": 0.7461005199306759,
"grad_norm": 1.5279968976974487,
"learning_rate": 7.409145083326733e-05,
"loss": 2.5984,
"step": 2583
},
{
"epoch": 0.7481224725592144,
"grad_norm": 1.5506973266601562,
"learning_rate": 7.394482046916504e-05,
"loss": 2.5105,
"step": 2590
},
{
"epoch": 0.7501444251877527,
"grad_norm": 1.597609281539917,
"learning_rate": 7.379792240324372e-05,
"loss": 2.552,
"step": 2597
},
{
"epoch": 0.7521663778162911,
"grad_norm": 1.946540355682373,
"learning_rate": 7.365075827781589e-05,
"loss": 2.5808,
"step": 2604
},
{
"epoch": 0.7541883304448296,
"grad_norm": 1.557524561882019,
"learning_rate": 7.350332973816867e-05,
"loss": 2.5806,
"step": 2611
},
{
"epoch": 0.756210283073368,
"grad_norm": 1.6246957778930664,
"learning_rate": 7.335563843254527e-05,
"loss": 2.5267,
"step": 2618
},
{
"epoch": 0.7582322357019065,
"grad_norm": 1.5411909818649292,
"learning_rate": 7.320768601212663e-05,
"loss": 2.6094,
"step": 2625
},
{
"epoch": 0.7602541883304448,
"grad_norm": 1.5607600212097168,
"learning_rate": 7.305947413101295e-05,
"loss": 2.579,
"step": 2632
},
{
"epoch": 0.7622761409589832,
"grad_norm": 1.5495412349700928,
"learning_rate": 7.291100444620518e-05,
"loss": 2.6576,
"step": 2639
},
{
"epoch": 0.7642980935875217,
"grad_norm": 1.6648463010787964,
"learning_rate": 7.27622786175865e-05,
"loss": 2.5696,
"step": 2646
},
{
"epoch": 0.7663200462160601,
"grad_norm": 1.562121868133545,
"learning_rate": 7.261329830790376e-05,
"loss": 2.6125,
"step": 2653
},
{
"epoch": 0.7683419988445985,
"grad_norm": 1.5453815460205078,
"learning_rate": 7.246406518274886e-05,
"loss": 2.4759,
"step": 2660
},
{
"epoch": 0.770363951473137,
"grad_norm": 1.4709010124206543,
"learning_rate": 7.231458091054026e-05,
"loss": 2.5199,
"step": 2667
},
{
"epoch": 0.7723859041016753,
"grad_norm": 1.6227165460586548,
"learning_rate": 7.216484716250414e-05,
"loss": 2.6122,
"step": 2674
},
{
"epoch": 0.7744078567302137,
"grad_norm": 1.4720866680145264,
"learning_rate": 7.201486561265582e-05,
"loss": 2.5468,
"step": 2681
},
{
"epoch": 0.7764298093587522,
"grad_norm": 1.4698817729949951,
"learning_rate": 7.18646379377811e-05,
"loss": 2.6578,
"step": 2688
},
{
"epoch": 0.7784517619872906,
"grad_norm": 1.6236904859542847,
"learning_rate": 7.171416581741734e-05,
"loss": 2.5009,
"step": 2695
},
{
"epoch": 0.780473714615829,
"grad_norm": 1.4604315757751465,
"learning_rate": 7.156345093383489e-05,
"loss": 2.4697,
"step": 2702
},
{
"epoch": 0.7824956672443674,
"grad_norm": 1.632841944694519,
"learning_rate": 7.14124949720181e-05,
"loss": 2.5719,
"step": 2709
},
{
"epoch": 0.7845176198729058,
"grad_norm": 1.5395116806030273,
"learning_rate": 7.126129961964658e-05,
"loss": 2.5451,
"step": 2716
},
{
"epoch": 0.7865395725014442,
"grad_norm": 1.4505419731140137,
"learning_rate": 7.110986656707634e-05,
"loss": 2.5052,
"step": 2723
},
{
"epoch": 0.7885615251299827,
"grad_norm": 1.4465105533599854,
"learning_rate": 7.095819750732089e-05,
"loss": 2.6008,
"step": 2730
},
{
"epoch": 0.7905834777585211,
"grad_norm": 1.467899203300476,
"learning_rate": 7.08062941360322e-05,
"loss": 2.5364,
"step": 2737
},
{
"epoch": 0.7926054303870596,
"grad_norm": 1.5132942199707031,
"learning_rate": 7.065415815148189e-05,
"loss": 2.5405,
"step": 2744
},
{
"epoch": 0.7946273830155979,
"grad_norm": 1.4897785186767578,
"learning_rate": 7.050179125454217e-05,
"loss": 2.5331,
"step": 2751
},
{
"epoch": 0.7966493356441363,
"grad_norm": 1.5240031480789185,
"learning_rate": 7.034919514866688e-05,
"loss": 2.4342,
"step": 2758
},
{
"epoch": 0.7986712882726748,
"grad_norm": 1.5082119703292847,
"learning_rate": 7.019637153987232e-05,
"loss": 2.589,
"step": 2765
},
{
"epoch": 0.8006932409012132,
"grad_norm": 1.413856863975525,
"learning_rate": 7.004332213671832e-05,
"loss": 2.5479,
"step": 2772
},
{
"epoch": 0.8027151935297516,
"grad_norm": 1.492169737815857,
"learning_rate": 6.98900486502891e-05,
"loss": 2.5689,
"step": 2779
},
{
"epoch": 0.80473714615829,
"grad_norm": 1.478893756866455,
"learning_rate": 6.973655279417404e-05,
"loss": 2.5185,
"step": 2786
},
{
"epoch": 0.8067590987868284,
"grad_norm": 1.4768402576446533,
"learning_rate": 6.958283628444866e-05,
"loss": 2.5312,
"step": 2793
},
{
"epoch": 0.8087810514153668,
"grad_norm": 1.569712519645691,
"learning_rate": 6.942890083965538e-05,
"loss": 2.6235,
"step": 2800
},
{
"epoch": 0.8108030040439053,
"grad_norm": 1.5381864309310913,
"learning_rate": 6.927474818078424e-05,
"loss": 2.5677,
"step": 2807
},
{
"epoch": 0.8128249566724437,
"grad_norm": 2.156064987182617,
"learning_rate": 6.912038003125381e-05,
"loss": 2.5038,
"step": 2814
},
{
"epoch": 0.8148469093009821,
"grad_norm": 1.4678055047988892,
"learning_rate": 6.896579811689176e-05,
"loss": 2.5175,
"step": 2821
},
{
"epoch": 0.8168688619295205,
"grad_norm": 1.4652937650680542,
"learning_rate": 6.881100416591569e-05,
"loss": 2.5186,
"step": 2828
},
{
"epoch": 0.8188908145580589,
"grad_norm": 1.5202971696853638,
"learning_rate": 6.865599990891374e-05,
"loss": 2.5939,
"step": 2835
},
{
"epoch": 0.8209127671865973,
"grad_norm": 1.546286702156067,
"learning_rate": 6.850078707882524e-05,
"loss": 2.4651,
"step": 2842
},
{
"epoch": 0.8229347198151358,
"grad_norm": 1.5782599449157715,
"learning_rate": 6.834536741092137e-05,
"loss": 2.5271,
"step": 2849
},
{
"epoch": 0.8249566724436742,
"grad_norm": 1.4335618019104004,
"learning_rate": 6.818974264278578e-05,
"loss": 2.5743,
"step": 2856
},
{
"epoch": 0.8269786250722126,
"grad_norm": 1.5671149492263794,
"learning_rate": 6.803391451429505e-05,
"loss": 2.548,
"step": 2863
},
{
"epoch": 0.829000577700751,
"grad_norm": 1.423937439918518,
"learning_rate": 6.787788476759942e-05,
"loss": 2.5809,
"step": 2870
},
{
"epoch": 0.8310225303292894,
"grad_norm": 1.490062952041626,
"learning_rate": 6.772165514710314e-05,
"loss": 2.5932,
"step": 2877
},
{
"epoch": 0.8330444829578278,
"grad_norm": 1.6760882139205933,
"learning_rate": 6.756522739944503e-05,
"loss": 2.4622,
"step": 2884
},
{
"epoch": 0.8350664355863663,
"grad_norm": 1.506843090057373,
"learning_rate": 6.740860327347903e-05,
"loss": 2.4669,
"step": 2891
},
{
"epoch": 0.8370883882149047,
"grad_norm": 1.5637836456298828,
"learning_rate": 6.725178452025448e-05,
"loss": 2.5673,
"step": 2898
},
{
"epoch": 0.839110340843443,
"grad_norm": 1.503013253211975,
"learning_rate": 6.709477289299676e-05,
"loss": 2.5664,
"step": 2905
},
{
"epoch": 0.8411322934719815,
"grad_norm": 1.470767855644226,
"learning_rate": 6.693757014708747e-05,
"loss": 2.4871,
"step": 2912
},
{
"epoch": 0.8431542461005199,
"grad_norm": 1.5470950603485107,
"learning_rate": 6.678017804004495e-05,
"loss": 2.5527,
"step": 2919
},
{
"epoch": 0.8451761987290584,
"grad_norm": 1.5984236001968384,
"learning_rate": 6.662259833150462e-05,
"loss": 2.5518,
"step": 2926
},
{
"epoch": 0.8471981513575968,
"grad_norm": 1.5305312871932983,
"learning_rate": 6.646483278319919e-05,
"loss": 2.522,
"step": 2933
},
{
"epoch": 0.8492201039861352,
"grad_norm": 1.5044381618499756,
"learning_rate": 6.630688315893914e-05,
"loss": 2.5722,
"step": 2940
},
{
"epoch": 0.8512420566146736,
"grad_norm": 1.4263911247253418,
"learning_rate": 6.614875122459284e-05,
"loss": 2.5258,
"step": 2947
},
{
"epoch": 0.853264009243212,
"grad_norm": 1.5409590005874634,
"learning_rate": 6.59904387480669e-05,
"loss": 2.5226,
"step": 2954
},
{
"epoch": 0.8552859618717504,
"grad_norm": 1.4655554294586182,
"learning_rate": 6.58319474992864e-05,
"loss": 2.5071,
"step": 2961
},
{
"epoch": 0.8573079145002889,
"grad_norm": 1.4593043327331543,
"learning_rate": 6.567327925017507e-05,
"loss": 2.5224,
"step": 2968
},
{
"epoch": 0.8593298671288273,
"grad_norm": 1.6059690713882446,
"learning_rate": 6.551443577463549e-05,
"loss": 2.4767,
"step": 2975
},
{
"epoch": 0.8613518197573656,
"grad_norm": 3.554884433746338,
"learning_rate": 6.535541884852927e-05,
"loss": 2.5063,
"step": 2982
},
{
"epoch": 0.8633737723859041,
"grad_norm": 2.0433740615844727,
"learning_rate": 6.519623024965718e-05,
"loss": 2.5728,
"step": 2989
},
{
"epoch": 0.8653957250144425,
"grad_norm": 3.690753698348999,
"learning_rate": 6.503687175773928e-05,
"loss": 2.5452,
"step": 2996
},
{
"epoch": 0.8674176776429809,
"grad_norm": 1.5708411931991577,
"learning_rate": 6.487734515439505e-05,
"loss": 2.4689,
"step": 3003
},
{
"epoch": 0.8694396302715194,
"grad_norm": 1.6525423526763916,
"learning_rate": 6.471765222312342e-05,
"loss": 2.5138,
"step": 3010
},
{
"epoch": 0.8714615829000578,
"grad_norm": 1.5194200277328491,
"learning_rate": 6.455779474928286e-05,
"loss": 2.6165,
"step": 3017
},
{
"epoch": 0.8734835355285961,
"grad_norm": 1.4519085884094238,
"learning_rate": 6.439777452007144e-05,
"loss": 2.564,
"step": 3024
},
{
"epoch": 0.8755054881571346,
"grad_norm": 1.615209698677063,
"learning_rate": 6.423759332450681e-05,
"loss": 2.5355,
"step": 3031
},
{
"epoch": 0.877527440785673,
"grad_norm": 4.623706340789795,
"learning_rate": 6.407725295340619e-05,
"loss": 2.5307,
"step": 3038
},
{
"epoch": 0.8795493934142115,
"grad_norm": 1.492876648902893,
"learning_rate": 6.391675519936642e-05,
"loss": 2.5213,
"step": 3045
},
{
"epoch": 0.8815713460427499,
"grad_norm": 1.6647744178771973,
"learning_rate": 6.375610185674383e-05,
"loss": 2.5198,
"step": 3052
},
{
"epoch": 0.8835932986712882,
"grad_norm": 1.4380768537521362,
"learning_rate": 6.35952947216343e-05,
"loss": 2.531,
"step": 3059
},
{
"epoch": 0.8856152512998267,
"grad_norm": 1.5382163524627686,
"learning_rate": 6.343433559185296e-05,
"loss": 2.5012,
"step": 3066
},
{
"epoch": 0.8876372039283651,
"grad_norm": 1.5382838249206543,
"learning_rate": 6.327322626691441e-05,
"loss": 2.4709,
"step": 3073
},
{
"epoch": 0.8896591565569035,
"grad_norm": 1.5894644260406494,
"learning_rate": 6.311196854801227e-05,
"loss": 2.496,
"step": 3080
},
{
"epoch": 0.891681109185442,
"grad_norm": 1.4344818592071533,
"learning_rate": 6.295056423799927e-05,
"loss": 2.5521,
"step": 3087
},
{
"epoch": 0.8937030618139804,
"grad_norm": 1.4660284519195557,
"learning_rate": 6.278901514136704e-05,
"loss": 2.5297,
"step": 3094
},
{
"epoch": 0.8957250144425187,
"grad_norm": 1.585867166519165,
"learning_rate": 6.262732306422582e-05,
"loss": 2.5552,
"step": 3101
},
{
"epoch": 0.8977469670710572,
"grad_norm": 1.4793505668640137,
"learning_rate": 6.246548981428453e-05,
"loss": 2.5055,
"step": 3108
},
{
"epoch": 0.8997689196995956,
"grad_norm": 1.5395692586898804,
"learning_rate": 6.230351720083021e-05,
"loss": 2.5287,
"step": 3115
},
{
"epoch": 0.901790872328134,
"grad_norm": 1.640716314315796,
"learning_rate": 6.21414070347081e-05,
"loss": 2.3934,
"step": 3122
},
{
"epoch": 0.9038128249566725,
"grad_norm": 1.448921799659729,
"learning_rate": 6.197916112830122e-05,
"loss": 2.5192,
"step": 3129
},
{
"epoch": 0.9058347775852109,
"grad_norm": 1.545032024383545,
"learning_rate": 6.181678129551017e-05,
"loss": 2.4952,
"step": 3136
},
{
"epoch": 0.9078567302137492,
"grad_norm": 1.4347082376480103,
"learning_rate": 6.165426935173287e-05,
"loss": 2.4568,
"step": 3143
},
{
"epoch": 0.9098786828422877,
"grad_norm": 1.5237294435501099,
"learning_rate": 6.149162711384417e-05,
"loss": 2.477,
"step": 3150
},
{
"epoch": 0.9119006354708261,
"grad_norm": 1.49074125289917,
"learning_rate": 6.132885640017566e-05,
"loss": 2.4879,
"step": 3157
},
{
"epoch": 0.9139225880993646,
"grad_norm": 1.4706346988677979,
"learning_rate": 6.116595903049526e-05,
"loss": 2.5777,
"step": 3164
},
{
"epoch": 0.915944540727903,
"grad_norm": 1.555942177772522,
"learning_rate": 6.100293682598689e-05,
"loss": 2.5135,
"step": 3171
},
{
"epoch": 0.9179664933564413,
"grad_norm": 1.5904544591903687,
"learning_rate": 6.083979160923012e-05,
"loss": 2.6248,
"step": 3178
},
{
"epoch": 0.9199884459849798,
"grad_norm": 1.4976989030838013,
"learning_rate": 6.0676525204179815e-05,
"loss": 2.4641,
"step": 3185
},
{
"epoch": 0.9220103986135182,
"grad_norm": 1.5183124542236328,
"learning_rate": 6.051313943614566e-05,
"loss": 2.501,
"step": 3192
},
{
"epoch": 0.9240323512420566,
"grad_norm": 1.5499184131622314,
"learning_rate": 6.034963613177189e-05,
"loss": 2.4758,
"step": 3199
},
{
"epoch": 0.9260543038705951,
"grad_norm": 1.4297561645507812,
"learning_rate": 6.0186017119016744e-05,
"loss": 2.5047,
"step": 3206
},
{
"epoch": 0.9280762564991335,
"grad_norm": 1.6014001369476318,
"learning_rate": 6.002228422713205e-05,
"loss": 2.5153,
"step": 3213
},
{
"epoch": 0.9300982091276718,
"grad_norm": 2.9846770763397217,
"learning_rate": 5.9858439286642864e-05,
"loss": 2.4438,
"step": 3220
},
{
"epoch": 0.9321201617562103,
"grad_norm": 1.633302927017212,
"learning_rate": 5.969448412932688e-05,
"loss": 2.5063,
"step": 3227
},
{
"epoch": 0.9341421143847487,
"grad_norm": 1.5165661573410034,
"learning_rate": 5.953042058819405e-05,
"loss": 2.5131,
"step": 3234
},
{
"epoch": 0.9361640670132871,
"grad_norm": 1.5822478532791138,
"learning_rate": 5.9366250497466025e-05,
"loss": 2.4719,
"step": 3241
},
{
"epoch": 0.9381860196418256,
"grad_norm": 1.5682454109191895,
"learning_rate": 5.92019756925557e-05,
"loss": 2.4422,
"step": 3248
},
{
"epoch": 0.9402079722703639,
"grad_norm": 1.4735851287841797,
"learning_rate": 5.9037598010046644e-05,
"loss": 2.4137,
"step": 3255
},
{
"epoch": 0.9422299248989023,
"grad_norm": 1.435520887374878,
"learning_rate": 5.887311928767263e-05,
"loss": 2.5265,
"step": 3262
},
{
"epoch": 0.9442518775274408,
"grad_norm": 1.5351203680038452,
"learning_rate": 5.8708541364296966e-05,
"loss": 2.4811,
"step": 3269
},
{
"epoch": 0.9462738301559792,
"grad_norm": 1.3827540874481201,
"learning_rate": 5.854386607989214e-05,
"loss": 2.4361,
"step": 3276
},
{
"epoch": 0.9482957827845176,
"grad_norm": 1.46942937374115,
"learning_rate": 5.837909527551901e-05,
"loss": 2.5334,
"step": 3283
},
{
"epoch": 0.9503177354130561,
"grad_norm": 1.498850703239441,
"learning_rate": 5.821423079330648e-05,
"loss": 2.5236,
"step": 3290
},
{
"epoch": 0.9523396880415944,
"grad_norm": 1.5063371658325195,
"learning_rate": 5.804927447643065e-05,
"loss": 2.4448,
"step": 3297
},
{
"epoch": 0.9543616406701328,
"grad_norm": 1.611903429031372,
"learning_rate": 5.7884228169094346e-05,
"loss": 2.4297,
"step": 3304
},
{
"epoch": 0.9563835932986713,
"grad_norm": 1.4947395324707031,
"learning_rate": 5.771909371650655e-05,
"loss": 2.5439,
"step": 3311
},
{
"epoch": 0.9584055459272097,
"grad_norm": 1.4461942911148071,
"learning_rate": 5.755387296486161e-05,
"loss": 2.5017,
"step": 3318
},
{
"epoch": 0.9604274985557482,
"grad_norm": 1.4871538877487183,
"learning_rate": 5.738856776131878e-05,
"loss": 2.4582,
"step": 3325
},
{
"epoch": 0.9624494511842865,
"grad_norm": 1.6216068267822266,
"learning_rate": 5.722317995398142e-05,
"loss": 2.5564,
"step": 3332
},
{
"epoch": 0.9644714038128249,
"grad_norm": 1.5217235088348389,
"learning_rate": 5.705771139187642e-05,
"loss": 2.4783,
"step": 3339
},
{
"epoch": 0.9664933564413634,
"grad_norm": 1.5554358959197998,
"learning_rate": 5.689216392493352e-05,
"loss": 2.4965,
"step": 3346
},
{
"epoch": 0.9685153090699018,
"grad_norm": 1.455360770225525,
"learning_rate": 5.672653940396459e-05,
"loss": 2.4727,
"step": 3353
},
{
"epoch": 0.9705372616984402,
"grad_norm": 1.4881621599197388,
"learning_rate": 5.6560839680642916e-05,
"loss": 2.522,
"step": 3360
},
{
"epoch": 0.9725592143269787,
"grad_norm": 1.4497352838516235,
"learning_rate": 5.6395066607482663e-05,
"loss": 2.4652,
"step": 3367
},
{
"epoch": 0.974581166955517,
"grad_norm": 1.4684548377990723,
"learning_rate": 5.622922203781792e-05,
"loss": 2.3909,
"step": 3374
},
{
"epoch": 0.9766031195840554,
"grad_norm": 1.5175479650497437,
"learning_rate": 5.6063307825782166e-05,
"loss": 2.5645,
"step": 3381
},
{
"epoch": 0.9786250722125939,
"grad_norm": 1.4038801193237305,
"learning_rate": 5.589732582628747e-05,
"loss": 2.3975,
"step": 3388
},
{
"epoch": 0.9806470248411323,
"grad_norm": 1.4544683694839478,
"learning_rate": 5.5731277895003754e-05,
"loss": 2.4395,
"step": 3395
},
{
"epoch": 0.9826689774696707,
"grad_norm": 1.468361496925354,
"learning_rate": 5.556516588833807e-05,
"loss": 2.4758,
"step": 3402
},
{
"epoch": 0.9846909300982092,
"grad_norm": 1.4997249841690063,
"learning_rate": 5.539899166341378e-05,
"loss": 2.4686,
"step": 3409
},
{
"epoch": 0.9867128827267475,
"grad_norm": 1.4386577606201172,
"learning_rate": 5.5232757078049925e-05,
"loss": 2.567,
"step": 3416
},
{
"epoch": 0.988734835355286,
"grad_norm": 1.4930377006530762,
"learning_rate": 5.506646399074029e-05,
"loss": 2.381,
"step": 3423
},
{
"epoch": 0.9907567879838244,
"grad_norm": 1.5108468532562256,
"learning_rate": 5.4900114260632754e-05,
"loss": 2.59,
"step": 3430
},
{
"epoch": 0.9927787406123628,
"grad_norm": 1.8218153715133667,
"learning_rate": 5.4733709747508465e-05,
"loss": 2.4011,
"step": 3437
},
{
"epoch": 0.9948006932409013,
"grad_norm": 1.5193471908569336,
"learning_rate": 5.456725231176102e-05,
"loss": 2.4279,
"step": 3444
},
{
"epoch": 0.9968226458694396,
"grad_norm": 1.4844223260879517,
"learning_rate": 5.440074381437569e-05,
"loss": 2.4751,
"step": 3451
},
{
"epoch": 0.998844598497978,
"grad_norm": 1.5008902549743652,
"learning_rate": 5.423418611690862e-05,
"loss": 2.4627,
"step": 3458
}
],
"logging_steps": 7,
"max_steps": 6924,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 3462,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.85999467371561e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}