distil-bert / trainer_state.json
Gideonah's picture
Upload folder using huggingface_hub
9537917 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 1000,
"global_step": 10608,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002828054298642534,
"grad_norm": 2.258908748626709,
"learning_rate": 1.9981146304675718e-05,
"loss": 1.5648,
"step": 10
},
{
"epoch": 0.005656108597285068,
"grad_norm": 2.3489885330200195,
"learning_rate": 1.9962292609351435e-05,
"loss": 1.2587,
"step": 20
},
{
"epoch": 0.008484162895927601,
"grad_norm": 3.1505980491638184,
"learning_rate": 1.994343891402715e-05,
"loss": 0.996,
"step": 30
},
{
"epoch": 0.011312217194570135,
"grad_norm": 2.7015912532806396,
"learning_rate": 1.9924585218702868e-05,
"loss": 0.991,
"step": 40
},
{
"epoch": 0.01414027149321267,
"grad_norm": 2.6314280033111572,
"learning_rate": 1.9905731523378585e-05,
"loss": 0.934,
"step": 50
},
{
"epoch": 0.016968325791855202,
"grad_norm": 2.055577516555786,
"learning_rate": 1.98868778280543e-05,
"loss": 0.8245,
"step": 60
},
{
"epoch": 0.019796380090497737,
"grad_norm": 2.4679293632507324,
"learning_rate": 1.9868024132730018e-05,
"loss": 0.7501,
"step": 70
},
{
"epoch": 0.02262443438914027,
"grad_norm": 4.737666130065918,
"learning_rate": 1.9849170437405735e-05,
"loss": 0.7572,
"step": 80
},
{
"epoch": 0.025452488687782805,
"grad_norm": 3.135712146759033,
"learning_rate": 1.983031674208145e-05,
"loss": 0.6785,
"step": 90
},
{
"epoch": 0.02828054298642534,
"grad_norm": 3.1648664474487305,
"learning_rate": 1.9811463046757168e-05,
"loss": 0.6764,
"step": 100
},
{
"epoch": 0.031108597285067874,
"grad_norm": 2.4548099040985107,
"learning_rate": 1.9792609351432884e-05,
"loss": 0.6322,
"step": 110
},
{
"epoch": 0.033936651583710405,
"grad_norm": 2.9620838165283203,
"learning_rate": 1.97737556561086e-05,
"loss": 0.6756,
"step": 120
},
{
"epoch": 0.03676470588235294,
"grad_norm": 6.274707794189453,
"learning_rate": 1.9754901960784318e-05,
"loss": 0.6566,
"step": 130
},
{
"epoch": 0.03959276018099547,
"grad_norm": 4.661096096038818,
"learning_rate": 1.9736048265460034e-05,
"loss": 0.6179,
"step": 140
},
{
"epoch": 0.04242081447963801,
"grad_norm": 3.97485613822937,
"learning_rate": 1.971719457013575e-05,
"loss": 0.5623,
"step": 150
},
{
"epoch": 0.04524886877828054,
"grad_norm": 5.157923698425293,
"learning_rate": 1.9698340874811464e-05,
"loss": 0.5132,
"step": 160
},
{
"epoch": 0.04807692307692308,
"grad_norm": 4.4768385887146,
"learning_rate": 1.967948717948718e-05,
"loss": 0.5081,
"step": 170
},
{
"epoch": 0.05090497737556561,
"grad_norm": 4.282762050628662,
"learning_rate": 1.9660633484162897e-05,
"loss": 0.5104,
"step": 180
},
{
"epoch": 0.05373303167420815,
"grad_norm": 5.214530944824219,
"learning_rate": 1.9641779788838614e-05,
"loss": 0.6005,
"step": 190
},
{
"epoch": 0.05656108597285068,
"grad_norm": 2.7484629154205322,
"learning_rate": 1.962292609351433e-05,
"loss": 0.5388,
"step": 200
},
{
"epoch": 0.05938914027149321,
"grad_norm": 3.6074981689453125,
"learning_rate": 1.9604072398190047e-05,
"loss": 0.512,
"step": 210
},
{
"epoch": 0.06221719457013575,
"grad_norm": 5.487575054168701,
"learning_rate": 1.9585218702865764e-05,
"loss": 0.5233,
"step": 220
},
{
"epoch": 0.06504524886877829,
"grad_norm": 3.193223714828491,
"learning_rate": 1.956636500754148e-05,
"loss": 0.5091,
"step": 230
},
{
"epoch": 0.06787330316742081,
"grad_norm": 8.64730167388916,
"learning_rate": 1.9547511312217197e-05,
"loss": 0.472,
"step": 240
},
{
"epoch": 0.07070135746606335,
"grad_norm": 3.8944942951202393,
"learning_rate": 1.9528657616892914e-05,
"loss": 0.5551,
"step": 250
},
{
"epoch": 0.07352941176470588,
"grad_norm": 3.6671645641326904,
"learning_rate": 1.950980392156863e-05,
"loss": 0.5575,
"step": 260
},
{
"epoch": 0.07635746606334842,
"grad_norm": 5.698751449584961,
"learning_rate": 1.9490950226244343e-05,
"loss": 0.4495,
"step": 270
},
{
"epoch": 0.07918552036199095,
"grad_norm": 6.565758228302002,
"learning_rate": 1.947209653092006e-05,
"loss": 0.5017,
"step": 280
},
{
"epoch": 0.08201357466063348,
"grad_norm": 4.61959171295166,
"learning_rate": 1.9453242835595777e-05,
"loss": 0.4639,
"step": 290
},
{
"epoch": 0.08484162895927602,
"grad_norm": 4.656176567077637,
"learning_rate": 1.9434389140271493e-05,
"loss": 0.4566,
"step": 300
},
{
"epoch": 0.08766968325791855,
"grad_norm": 3.0015969276428223,
"learning_rate": 1.941553544494721e-05,
"loss": 0.5226,
"step": 310
},
{
"epoch": 0.09049773755656108,
"grad_norm": 3.7588982582092285,
"learning_rate": 1.9396681749622927e-05,
"loss": 0.4415,
"step": 320
},
{
"epoch": 0.09332579185520362,
"grad_norm": 8.415599822998047,
"learning_rate": 1.9377828054298643e-05,
"loss": 0.4992,
"step": 330
},
{
"epoch": 0.09615384615384616,
"grad_norm": 2.917985677719116,
"learning_rate": 1.935897435897436e-05,
"loss": 0.392,
"step": 340
},
{
"epoch": 0.09898190045248868,
"grad_norm": 3.867098093032837,
"learning_rate": 1.9340120663650076e-05,
"loss": 0.4382,
"step": 350
},
{
"epoch": 0.10180995475113122,
"grad_norm": 3.6737847328186035,
"learning_rate": 1.9321266968325793e-05,
"loss": 0.4458,
"step": 360
},
{
"epoch": 0.10463800904977376,
"grad_norm": 3.9890048503875732,
"learning_rate": 1.930241327300151e-05,
"loss": 0.4038,
"step": 370
},
{
"epoch": 0.1074660633484163,
"grad_norm": 7.732723236083984,
"learning_rate": 1.9283559577677226e-05,
"loss": 0.4541,
"step": 380
},
{
"epoch": 0.11029411764705882,
"grad_norm": 7.815601348876953,
"learning_rate": 1.9264705882352943e-05,
"loss": 0.5302,
"step": 390
},
{
"epoch": 0.11312217194570136,
"grad_norm": 4.053082466125488,
"learning_rate": 1.924585218702866e-05,
"loss": 0.4203,
"step": 400
},
{
"epoch": 0.1159502262443439,
"grad_norm": 5.627740859985352,
"learning_rate": 1.9226998491704376e-05,
"loss": 0.4473,
"step": 410
},
{
"epoch": 0.11877828054298642,
"grad_norm": 5.07185697555542,
"learning_rate": 1.9208144796380093e-05,
"loss": 0.4345,
"step": 420
},
{
"epoch": 0.12160633484162896,
"grad_norm": 3.9544167518615723,
"learning_rate": 1.918929110105581e-05,
"loss": 0.4849,
"step": 430
},
{
"epoch": 0.1244343891402715,
"grad_norm": 6.9721293449401855,
"learning_rate": 1.9170437405731526e-05,
"loss": 0.4029,
"step": 440
},
{
"epoch": 0.12726244343891402,
"grad_norm": 4.0150556564331055,
"learning_rate": 1.9151583710407243e-05,
"loss": 0.5236,
"step": 450
},
{
"epoch": 0.13009049773755657,
"grad_norm": 6.681572914123535,
"learning_rate": 1.913273001508296e-05,
"loss": 0.4059,
"step": 460
},
{
"epoch": 0.1329185520361991,
"grad_norm": 5.1267876625061035,
"learning_rate": 1.9113876319758676e-05,
"loss": 0.467,
"step": 470
},
{
"epoch": 0.13574660633484162,
"grad_norm": 3.66207218170166,
"learning_rate": 1.9095022624434392e-05,
"loss": 0.3689,
"step": 480
},
{
"epoch": 0.13857466063348417,
"grad_norm": 4.016237735748291,
"learning_rate": 1.907616892911011e-05,
"loss": 0.4502,
"step": 490
},
{
"epoch": 0.1414027149321267,
"grad_norm": 5.003229141235352,
"learning_rate": 1.9057315233785822e-05,
"loss": 0.3935,
"step": 500
},
{
"epoch": 0.14423076923076922,
"grad_norm": 7.917128562927246,
"learning_rate": 1.903846153846154e-05,
"loss": 0.4765,
"step": 510
},
{
"epoch": 0.14705882352941177,
"grad_norm": 4.975776672363281,
"learning_rate": 1.9019607843137255e-05,
"loss": 0.4564,
"step": 520
},
{
"epoch": 0.1498868778280543,
"grad_norm": 5.559664726257324,
"learning_rate": 1.9000754147812972e-05,
"loss": 0.3663,
"step": 530
},
{
"epoch": 0.15271493212669685,
"grad_norm": 4.463156700134277,
"learning_rate": 1.898190045248869e-05,
"loss": 0.358,
"step": 540
},
{
"epoch": 0.15554298642533937,
"grad_norm": 4.642306327819824,
"learning_rate": 1.8963046757164405e-05,
"loss": 0.4083,
"step": 550
},
{
"epoch": 0.1583710407239819,
"grad_norm": 10.270988464355469,
"learning_rate": 1.8944193061840122e-05,
"loss": 0.5147,
"step": 560
},
{
"epoch": 0.16119909502262444,
"grad_norm": 4.234387397766113,
"learning_rate": 1.892533936651584e-05,
"loss": 0.5115,
"step": 570
},
{
"epoch": 0.16402714932126697,
"grad_norm": 5.710938930511475,
"learning_rate": 1.8906485671191555e-05,
"loss": 0.4483,
"step": 580
},
{
"epoch": 0.1668552036199095,
"grad_norm": 8.131850242614746,
"learning_rate": 1.8887631975867272e-05,
"loss": 0.4282,
"step": 590
},
{
"epoch": 0.16968325791855204,
"grad_norm": 4.962357997894287,
"learning_rate": 1.886877828054299e-05,
"loss": 0.4691,
"step": 600
},
{
"epoch": 0.17251131221719457,
"grad_norm": 7.434023857116699,
"learning_rate": 1.8849924585218705e-05,
"loss": 0.3864,
"step": 610
},
{
"epoch": 0.1753393665158371,
"grad_norm": 3.2809391021728516,
"learning_rate": 1.883107088989442e-05,
"loss": 0.3506,
"step": 620
},
{
"epoch": 0.17816742081447964,
"grad_norm": 4.324213027954102,
"learning_rate": 1.8812217194570138e-05,
"loss": 0.3316,
"step": 630
},
{
"epoch": 0.18099547511312217,
"grad_norm": 4.655824661254883,
"learning_rate": 1.8793363499245855e-05,
"loss": 0.4192,
"step": 640
},
{
"epoch": 0.18382352941176472,
"grad_norm": 3.21587872505188,
"learning_rate": 1.877450980392157e-05,
"loss": 0.3091,
"step": 650
},
{
"epoch": 0.18665158371040724,
"grad_norm": 5.112785339355469,
"learning_rate": 1.8755656108597288e-05,
"loss": 0.4218,
"step": 660
},
{
"epoch": 0.18947963800904977,
"grad_norm": 2.902008533477783,
"learning_rate": 1.8736802413273005e-05,
"loss": 0.3595,
"step": 670
},
{
"epoch": 0.19230769230769232,
"grad_norm": 4.45237398147583,
"learning_rate": 1.8717948717948718e-05,
"loss": 0.4378,
"step": 680
},
{
"epoch": 0.19513574660633484,
"grad_norm": 3.8630735874176025,
"learning_rate": 1.8699095022624435e-05,
"loss": 0.4072,
"step": 690
},
{
"epoch": 0.19796380090497737,
"grad_norm": 3.8529324531555176,
"learning_rate": 1.868024132730015e-05,
"loss": 0.4335,
"step": 700
},
{
"epoch": 0.20079185520361992,
"grad_norm": 2.7820119857788086,
"learning_rate": 1.8661387631975868e-05,
"loss": 0.4457,
"step": 710
},
{
"epoch": 0.20361990950226244,
"grad_norm": 7.398665428161621,
"learning_rate": 1.8642533936651584e-05,
"loss": 0.4552,
"step": 720
},
{
"epoch": 0.20644796380090497,
"grad_norm": 5.17724609375,
"learning_rate": 1.86236802413273e-05,
"loss": 0.4322,
"step": 730
},
{
"epoch": 0.20927601809954752,
"grad_norm": 7.6877851486206055,
"learning_rate": 1.8604826546003018e-05,
"loss": 0.3493,
"step": 740
},
{
"epoch": 0.21210407239819004,
"grad_norm": 3.655296802520752,
"learning_rate": 1.8585972850678734e-05,
"loss": 0.4361,
"step": 750
},
{
"epoch": 0.2149321266968326,
"grad_norm": 5.219052791595459,
"learning_rate": 1.856711915535445e-05,
"loss": 0.391,
"step": 760
},
{
"epoch": 0.21776018099547512,
"grad_norm": 7.211420059204102,
"learning_rate": 1.8548265460030168e-05,
"loss": 0.3808,
"step": 770
},
{
"epoch": 0.22058823529411764,
"grad_norm": 3.348724842071533,
"learning_rate": 1.8529411764705884e-05,
"loss": 0.4041,
"step": 780
},
{
"epoch": 0.2234162895927602,
"grad_norm": 5.180337905883789,
"learning_rate": 1.85105580693816e-05,
"loss": 0.2996,
"step": 790
},
{
"epoch": 0.22624434389140272,
"grad_norm": 5.770839214324951,
"learning_rate": 1.8491704374057317e-05,
"loss": 0.3898,
"step": 800
},
{
"epoch": 0.22907239819004524,
"grad_norm": 2.4614834785461426,
"learning_rate": 1.8472850678733034e-05,
"loss": 0.3307,
"step": 810
},
{
"epoch": 0.2319004524886878,
"grad_norm": 5.4143548011779785,
"learning_rate": 1.845399698340875e-05,
"loss": 0.4487,
"step": 820
},
{
"epoch": 0.23472850678733032,
"grad_norm": 3.846611499786377,
"learning_rate": 1.8435143288084464e-05,
"loss": 0.4334,
"step": 830
},
{
"epoch": 0.23755656108597284,
"grad_norm": 7.2528510093688965,
"learning_rate": 1.841628959276018e-05,
"loss": 0.3471,
"step": 840
},
{
"epoch": 0.2403846153846154,
"grad_norm": 4.265413284301758,
"learning_rate": 1.8397435897435897e-05,
"loss": 0.3879,
"step": 850
},
{
"epoch": 0.24321266968325791,
"grad_norm": 4.564918518066406,
"learning_rate": 1.8378582202111614e-05,
"loss": 0.4142,
"step": 860
},
{
"epoch": 0.24604072398190044,
"grad_norm": 4.268716335296631,
"learning_rate": 1.835972850678733e-05,
"loss": 0.4358,
"step": 870
},
{
"epoch": 0.248868778280543,
"grad_norm": 6.909433841705322,
"learning_rate": 1.8340874811463047e-05,
"loss": 0.405,
"step": 880
},
{
"epoch": 0.2516968325791855,
"grad_norm": 6.801779270172119,
"learning_rate": 1.8322021116138764e-05,
"loss": 0.413,
"step": 890
},
{
"epoch": 0.25452488687782804,
"grad_norm": 4.851901531219482,
"learning_rate": 1.830316742081448e-05,
"loss": 0.4337,
"step": 900
},
{
"epoch": 0.25735294117647056,
"grad_norm": 2.650651693344116,
"learning_rate": 1.8284313725490197e-05,
"loss": 0.372,
"step": 910
},
{
"epoch": 0.26018099547511314,
"grad_norm": 3.660430669784546,
"learning_rate": 1.8265460030165913e-05,
"loss": 0.4142,
"step": 920
},
{
"epoch": 0.26300904977375567,
"grad_norm": 6.7838544845581055,
"learning_rate": 1.824660633484163e-05,
"loss": 0.4053,
"step": 930
},
{
"epoch": 0.2658371040723982,
"grad_norm": 4.861571311950684,
"learning_rate": 1.8227752639517347e-05,
"loss": 0.4416,
"step": 940
},
{
"epoch": 0.2686651583710407,
"grad_norm": 5.85540771484375,
"learning_rate": 1.8208898944193063e-05,
"loss": 0.3848,
"step": 950
},
{
"epoch": 0.27149321266968324,
"grad_norm": 2.6535346508026123,
"learning_rate": 1.819004524886878e-05,
"loss": 0.4228,
"step": 960
},
{
"epoch": 0.2743212669683258,
"grad_norm": 2.8000805377960205,
"learning_rate": 1.8171191553544496e-05,
"loss": 0.3476,
"step": 970
},
{
"epoch": 0.27714932126696834,
"grad_norm": 5.302433013916016,
"learning_rate": 1.8152337858220213e-05,
"loss": 0.3536,
"step": 980
},
{
"epoch": 0.27997737556561086,
"grad_norm": 4.889918327331543,
"learning_rate": 1.813348416289593e-05,
"loss": 0.3817,
"step": 990
},
{
"epoch": 0.2828054298642534,
"grad_norm": 6.261002540588379,
"learning_rate": 1.8114630467571646e-05,
"loss": 0.3976,
"step": 1000
},
{
"epoch": 0.2828054298642534,
"eval_accuracy": 0.8570633153038498,
"eval_loss": 0.3707549571990967,
"eval_runtime": 127.6312,
"eval_samples_per_second": 98.503,
"eval_steps_per_second": 3.079,
"step": 1000
},
{
"epoch": 0.2856334841628959,
"grad_norm": 4.15647554397583,
"learning_rate": 1.8095776772247363e-05,
"loss": 0.3198,
"step": 1010
},
{
"epoch": 0.28846153846153844,
"grad_norm": 3.652892589569092,
"learning_rate": 1.807692307692308e-05,
"loss": 0.376,
"step": 1020
},
{
"epoch": 0.291289592760181,
"grad_norm": 7.219604015350342,
"learning_rate": 1.8058069381598796e-05,
"loss": 0.4519,
"step": 1030
},
{
"epoch": 0.29411764705882354,
"grad_norm": 3.8920180797576904,
"learning_rate": 1.8039215686274513e-05,
"loss": 0.3336,
"step": 1040
},
{
"epoch": 0.29694570135746606,
"grad_norm": 4.875617504119873,
"learning_rate": 1.802036199095023e-05,
"loss": 0.357,
"step": 1050
},
{
"epoch": 0.2997737556561086,
"grad_norm": 3.1264288425445557,
"learning_rate": 1.8001508295625946e-05,
"loss": 0.3482,
"step": 1060
},
{
"epoch": 0.3026018099547511,
"grad_norm": 3.4531030654907227,
"learning_rate": 1.7982654600301663e-05,
"loss": 0.3533,
"step": 1070
},
{
"epoch": 0.3054298642533937,
"grad_norm": 3.0971388816833496,
"learning_rate": 1.796380090497738e-05,
"loss": 0.3737,
"step": 1080
},
{
"epoch": 0.3082579185520362,
"grad_norm": 3.3527133464813232,
"learning_rate": 1.7944947209653092e-05,
"loss": 0.4425,
"step": 1090
},
{
"epoch": 0.31108597285067874,
"grad_norm": 3.197056293487549,
"learning_rate": 1.792609351432881e-05,
"loss": 0.3657,
"step": 1100
},
{
"epoch": 0.31391402714932126,
"grad_norm": 4.942928791046143,
"learning_rate": 1.7907239819004526e-05,
"loss": 0.3988,
"step": 1110
},
{
"epoch": 0.3167420814479638,
"grad_norm": 4.839690208435059,
"learning_rate": 1.7888386123680242e-05,
"loss": 0.3794,
"step": 1120
},
{
"epoch": 0.3195701357466063,
"grad_norm": 5.171438694000244,
"learning_rate": 1.786953242835596e-05,
"loss": 0.3795,
"step": 1130
},
{
"epoch": 0.3223981900452489,
"grad_norm": 2.4731950759887695,
"learning_rate": 1.7850678733031676e-05,
"loss": 0.3396,
"step": 1140
},
{
"epoch": 0.3252262443438914,
"grad_norm": 4.658932209014893,
"learning_rate": 1.7831825037707392e-05,
"loss": 0.2841,
"step": 1150
},
{
"epoch": 0.32805429864253394,
"grad_norm": 3.5409414768218994,
"learning_rate": 1.781297134238311e-05,
"loss": 0.3701,
"step": 1160
},
{
"epoch": 0.33088235294117646,
"grad_norm": 3.814213275909424,
"learning_rate": 1.7794117647058825e-05,
"loss": 0.3445,
"step": 1170
},
{
"epoch": 0.333710407239819,
"grad_norm": 3.226147413253784,
"learning_rate": 1.7775263951734542e-05,
"loss": 0.3513,
"step": 1180
},
{
"epoch": 0.33653846153846156,
"grad_norm": 4.451591491699219,
"learning_rate": 1.775641025641026e-05,
"loss": 0.3648,
"step": 1190
},
{
"epoch": 0.3393665158371041,
"grad_norm": 4.0332818031311035,
"learning_rate": 1.7737556561085972e-05,
"loss": 0.4392,
"step": 1200
},
{
"epoch": 0.3421945701357466,
"grad_norm": 3.1572704315185547,
"learning_rate": 1.771870286576169e-05,
"loss": 0.3602,
"step": 1210
},
{
"epoch": 0.34502262443438914,
"grad_norm": 4.314695835113525,
"learning_rate": 1.7699849170437405e-05,
"loss": 0.29,
"step": 1220
},
{
"epoch": 0.34785067873303166,
"grad_norm": 5.7975239753723145,
"learning_rate": 1.768099547511312e-05,
"loss": 0.3716,
"step": 1230
},
{
"epoch": 0.3506787330316742,
"grad_norm": 5.377049446105957,
"learning_rate": 1.7662141779788838e-05,
"loss": 0.3566,
"step": 1240
},
{
"epoch": 0.35350678733031676,
"grad_norm": 3.84669828414917,
"learning_rate": 1.7643288084464555e-05,
"loss": 0.3961,
"step": 1250
},
{
"epoch": 0.3563348416289593,
"grad_norm": 5.146121501922607,
"learning_rate": 1.762443438914027e-05,
"loss": 0.4366,
"step": 1260
},
{
"epoch": 0.3591628959276018,
"grad_norm": 3.1066689491271973,
"learning_rate": 1.7605580693815988e-05,
"loss": 0.3698,
"step": 1270
},
{
"epoch": 0.36199095022624433,
"grad_norm": 4.1310296058654785,
"learning_rate": 1.7586726998491705e-05,
"loss": 0.3631,
"step": 1280
},
{
"epoch": 0.36481900452488686,
"grad_norm": 3.0287930965423584,
"learning_rate": 1.756787330316742e-05,
"loss": 0.3151,
"step": 1290
},
{
"epoch": 0.36764705882352944,
"grad_norm": 4.4270219802856445,
"learning_rate": 1.7549019607843138e-05,
"loss": 0.357,
"step": 1300
},
{
"epoch": 0.37047511312217196,
"grad_norm": 4.785469055175781,
"learning_rate": 1.7530165912518855e-05,
"loss": 0.3809,
"step": 1310
},
{
"epoch": 0.3733031674208145,
"grad_norm": 5.920436859130859,
"learning_rate": 1.751131221719457e-05,
"loss": 0.4003,
"step": 1320
},
{
"epoch": 0.376131221719457,
"grad_norm": 5.400911331176758,
"learning_rate": 1.7492458521870288e-05,
"loss": 0.4313,
"step": 1330
},
{
"epoch": 0.37895927601809953,
"grad_norm": 6.202630996704102,
"learning_rate": 1.7473604826546004e-05,
"loss": 0.3792,
"step": 1340
},
{
"epoch": 0.38178733031674206,
"grad_norm": 3.413867473602295,
"learning_rate": 1.745475113122172e-05,
"loss": 0.3881,
"step": 1350
},
{
"epoch": 0.38461538461538464,
"grad_norm": 5.005847930908203,
"learning_rate": 1.7435897435897438e-05,
"loss": 0.4344,
"step": 1360
},
{
"epoch": 0.38744343891402716,
"grad_norm": 4.416658878326416,
"learning_rate": 1.7417043740573154e-05,
"loss": 0.332,
"step": 1370
},
{
"epoch": 0.3902714932126697,
"grad_norm": 5.2433247566223145,
"learning_rate": 1.739819004524887e-05,
"loss": 0.3873,
"step": 1380
},
{
"epoch": 0.3930995475113122,
"grad_norm": 3.740522861480713,
"learning_rate": 1.7379336349924588e-05,
"loss": 0.3495,
"step": 1390
},
{
"epoch": 0.39592760180995473,
"grad_norm": 6.047609329223633,
"learning_rate": 1.7360482654600304e-05,
"loss": 0.3922,
"step": 1400
},
{
"epoch": 0.3987556561085973,
"grad_norm": 3.7461910247802734,
"learning_rate": 1.734162895927602e-05,
"loss": 0.3556,
"step": 1410
},
{
"epoch": 0.40158371040723984,
"grad_norm": 7.2883405685424805,
"learning_rate": 1.7322775263951737e-05,
"loss": 0.319,
"step": 1420
},
{
"epoch": 0.40441176470588236,
"grad_norm": 5.338521480560303,
"learning_rate": 1.7303921568627454e-05,
"loss": 0.3272,
"step": 1430
},
{
"epoch": 0.4072398190045249,
"grad_norm": 5.680319309234619,
"learning_rate": 1.728506787330317e-05,
"loss": 0.3336,
"step": 1440
},
{
"epoch": 0.4100678733031674,
"grad_norm": 3.7183480262756348,
"learning_rate": 1.7266214177978887e-05,
"loss": 0.3218,
"step": 1450
},
{
"epoch": 0.41289592760180993,
"grad_norm": 4.478979110717773,
"learning_rate": 1.7247360482654604e-05,
"loss": 0.3719,
"step": 1460
},
{
"epoch": 0.4157239819004525,
"grad_norm": 3.1170661449432373,
"learning_rate": 1.722850678733032e-05,
"loss": 0.3563,
"step": 1470
},
{
"epoch": 0.41855203619909503,
"grad_norm": 5.310198783874512,
"learning_rate": 1.7209653092006037e-05,
"loss": 0.3009,
"step": 1480
},
{
"epoch": 0.42138009049773756,
"grad_norm": 4.134536266326904,
"learning_rate": 1.7190799396681754e-05,
"loss": 0.4116,
"step": 1490
},
{
"epoch": 0.4242081447963801,
"grad_norm": 2.9341182708740234,
"learning_rate": 1.7171945701357467e-05,
"loss": 0.3547,
"step": 1500
},
{
"epoch": 0.4270361990950226,
"grad_norm": 2.2353224754333496,
"learning_rate": 1.7153092006033184e-05,
"loss": 0.3652,
"step": 1510
},
{
"epoch": 0.4298642533936652,
"grad_norm": 4.16320276260376,
"learning_rate": 1.71342383107089e-05,
"loss": 0.3733,
"step": 1520
},
{
"epoch": 0.4326923076923077,
"grad_norm": 4.933135986328125,
"learning_rate": 1.7115384615384617e-05,
"loss": 0.4125,
"step": 1530
},
{
"epoch": 0.43552036199095023,
"grad_norm": 5.511205673217773,
"learning_rate": 1.7096530920060333e-05,
"loss": 0.3102,
"step": 1540
},
{
"epoch": 0.43834841628959276,
"grad_norm": 4.415884494781494,
"learning_rate": 1.707767722473605e-05,
"loss": 0.3348,
"step": 1550
},
{
"epoch": 0.4411764705882353,
"grad_norm": 3.8917481899261475,
"learning_rate": 1.7058823529411767e-05,
"loss": 0.3866,
"step": 1560
},
{
"epoch": 0.4440045248868778,
"grad_norm": 2.751532793045044,
"learning_rate": 1.7039969834087483e-05,
"loss": 0.3644,
"step": 1570
},
{
"epoch": 0.4468325791855204,
"grad_norm": 5.6193413734436035,
"learning_rate": 1.70211161387632e-05,
"loss": 0.3566,
"step": 1580
},
{
"epoch": 0.4496606334841629,
"grad_norm": 3.058835744857788,
"learning_rate": 1.7002262443438916e-05,
"loss": 0.3681,
"step": 1590
},
{
"epoch": 0.45248868778280543,
"grad_norm": 3.9540457725524902,
"learning_rate": 1.6983408748114633e-05,
"loss": 0.3132,
"step": 1600
},
{
"epoch": 0.45531674208144796,
"grad_norm": 3.8163225650787354,
"learning_rate": 1.6964555052790346e-05,
"loss": 0.3654,
"step": 1610
},
{
"epoch": 0.4581447963800905,
"grad_norm": 4.724973201751709,
"learning_rate": 1.6945701357466063e-05,
"loss": 0.3434,
"step": 1620
},
{
"epoch": 0.46097285067873306,
"grad_norm": 3.3608546257019043,
"learning_rate": 1.692684766214178e-05,
"loss": 0.3593,
"step": 1630
},
{
"epoch": 0.4638009049773756,
"grad_norm": 4.132437705993652,
"learning_rate": 1.6907993966817496e-05,
"loss": 0.3552,
"step": 1640
},
{
"epoch": 0.4666289592760181,
"grad_norm": 4.544163227081299,
"learning_rate": 1.6889140271493213e-05,
"loss": 0.3678,
"step": 1650
},
{
"epoch": 0.46945701357466063,
"grad_norm": 4.244106769561768,
"learning_rate": 1.687028657616893e-05,
"loss": 0.3432,
"step": 1660
},
{
"epoch": 0.47228506787330315,
"grad_norm": 3.3168179988861084,
"learning_rate": 1.6851432880844646e-05,
"loss": 0.313,
"step": 1670
},
{
"epoch": 0.4751131221719457,
"grad_norm": 4.040717601776123,
"learning_rate": 1.6832579185520363e-05,
"loss": 0.3334,
"step": 1680
},
{
"epoch": 0.47794117647058826,
"grad_norm": 4.582857608795166,
"learning_rate": 1.681372549019608e-05,
"loss": 0.3004,
"step": 1690
},
{
"epoch": 0.4807692307692308,
"grad_norm": 6.330207347869873,
"learning_rate": 1.6794871794871796e-05,
"loss": 0.3486,
"step": 1700
},
{
"epoch": 0.4835972850678733,
"grad_norm": 3.564183473587036,
"learning_rate": 1.6776018099547512e-05,
"loss": 0.3312,
"step": 1710
},
{
"epoch": 0.48642533936651583,
"grad_norm": 5.753744125366211,
"learning_rate": 1.675716440422323e-05,
"loss": 0.4188,
"step": 1720
},
{
"epoch": 0.48925339366515835,
"grad_norm": 2.692269802093506,
"learning_rate": 1.6738310708898946e-05,
"loss": 0.3149,
"step": 1730
},
{
"epoch": 0.4920814479638009,
"grad_norm": 3.748378038406372,
"learning_rate": 1.6719457013574662e-05,
"loss": 0.374,
"step": 1740
},
{
"epoch": 0.49490950226244346,
"grad_norm": 7.150949478149414,
"learning_rate": 1.670060331825038e-05,
"loss": 0.4427,
"step": 1750
},
{
"epoch": 0.497737556561086,
"grad_norm": 4.332088470458984,
"learning_rate": 1.6681749622926096e-05,
"loss": 0.296,
"step": 1760
},
{
"epoch": 0.5005656108597285,
"grad_norm": 3.9501969814300537,
"learning_rate": 1.6662895927601812e-05,
"loss": 0.3236,
"step": 1770
},
{
"epoch": 0.503393665158371,
"grad_norm": 4.039945602416992,
"learning_rate": 1.664404223227753e-05,
"loss": 0.3654,
"step": 1780
},
{
"epoch": 0.5062217194570136,
"grad_norm": 4.735800743103027,
"learning_rate": 1.6625188536953245e-05,
"loss": 0.3177,
"step": 1790
},
{
"epoch": 0.5090497737556561,
"grad_norm": 3.796029806137085,
"learning_rate": 1.6606334841628962e-05,
"loss": 0.3302,
"step": 1800
},
{
"epoch": 0.5118778280542986,
"grad_norm": 2.808561086654663,
"learning_rate": 1.658748114630468e-05,
"loss": 0.3533,
"step": 1810
},
{
"epoch": 0.5147058823529411,
"grad_norm": 3.9006407260894775,
"learning_rate": 1.6568627450980395e-05,
"loss": 0.3862,
"step": 1820
},
{
"epoch": 0.5175339366515838,
"grad_norm": 6.6023850440979,
"learning_rate": 1.654977375565611e-05,
"loss": 0.4115,
"step": 1830
},
{
"epoch": 0.5203619909502263,
"grad_norm": 3.3932111263275146,
"learning_rate": 1.6530920060331825e-05,
"loss": 0.2936,
"step": 1840
},
{
"epoch": 0.5231900452488688,
"grad_norm": 4.266836166381836,
"learning_rate": 1.651206636500754e-05,
"loss": 0.2848,
"step": 1850
},
{
"epoch": 0.5260180995475113,
"grad_norm": 4.283823490142822,
"learning_rate": 1.6493212669683258e-05,
"loss": 0.4285,
"step": 1860
},
{
"epoch": 0.5288461538461539,
"grad_norm": 3.3755383491516113,
"learning_rate": 1.6474358974358975e-05,
"loss": 0.3579,
"step": 1870
},
{
"epoch": 0.5316742081447964,
"grad_norm": 5.754073143005371,
"learning_rate": 1.645550527903469e-05,
"loss": 0.3325,
"step": 1880
},
{
"epoch": 0.5345022624434389,
"grad_norm": 2.890216588973999,
"learning_rate": 1.6436651583710408e-05,
"loss": 0.3866,
"step": 1890
},
{
"epoch": 0.5373303167420814,
"grad_norm": 4.1960978507995605,
"learning_rate": 1.6417797888386125e-05,
"loss": 0.4097,
"step": 1900
},
{
"epoch": 0.540158371040724,
"grad_norm": 4.490061283111572,
"learning_rate": 1.639894419306184e-05,
"loss": 0.3541,
"step": 1910
},
{
"epoch": 0.5429864253393665,
"grad_norm": 2.911954879760742,
"learning_rate": 1.6380090497737558e-05,
"loss": 0.3435,
"step": 1920
},
{
"epoch": 0.545814479638009,
"grad_norm": 2.816277027130127,
"learning_rate": 1.6361236802413275e-05,
"loss": 0.3168,
"step": 1930
},
{
"epoch": 0.5486425339366516,
"grad_norm": 5.4081807136535645,
"learning_rate": 1.634238310708899e-05,
"loss": 0.404,
"step": 1940
},
{
"epoch": 0.5514705882352942,
"grad_norm": 6.02499532699585,
"learning_rate": 1.6323529411764708e-05,
"loss": 0.4293,
"step": 1950
},
{
"epoch": 0.5542986425339367,
"grad_norm": 5.138996124267578,
"learning_rate": 1.6304675716440424e-05,
"loss": 0.4194,
"step": 1960
},
{
"epoch": 0.5571266968325792,
"grad_norm": 4.069638252258301,
"learning_rate": 1.628582202111614e-05,
"loss": 0.4188,
"step": 1970
},
{
"epoch": 0.5599547511312217,
"grad_norm": 4.273077487945557,
"learning_rate": 1.6266968325791858e-05,
"loss": 0.3714,
"step": 1980
},
{
"epoch": 0.5627828054298643,
"grad_norm": 3.559727430343628,
"learning_rate": 1.6248114630467574e-05,
"loss": 0.2804,
"step": 1990
},
{
"epoch": 0.5656108597285068,
"grad_norm": 3.5052013397216797,
"learning_rate": 1.622926093514329e-05,
"loss": 0.375,
"step": 2000
},
{
"epoch": 0.5656108597285068,
"eval_accuracy": 0.8721762647152402,
"eval_loss": 0.32142505049705505,
"eval_runtime": 126.2811,
"eval_samples_per_second": 99.556,
"eval_steps_per_second": 3.112,
"step": 2000
},
{
"epoch": 0.5684389140271493,
"grad_norm": 3.033839464187622,
"learning_rate": 1.6210407239819008e-05,
"loss": 0.2655,
"step": 2010
},
{
"epoch": 0.5712669683257918,
"grad_norm": 8.15062427520752,
"learning_rate": 1.6191553544494724e-05,
"loss": 0.282,
"step": 2020
},
{
"epoch": 0.5740950226244343,
"grad_norm": 4.665267467498779,
"learning_rate": 1.6172699849170437e-05,
"loss": 0.3432,
"step": 2030
},
{
"epoch": 0.5769230769230769,
"grad_norm": 5.122295379638672,
"learning_rate": 1.6153846153846154e-05,
"loss": 0.4143,
"step": 2040
},
{
"epoch": 0.5797511312217195,
"grad_norm": 5.127368450164795,
"learning_rate": 1.613499245852187e-05,
"loss": 0.3529,
"step": 2050
},
{
"epoch": 0.582579185520362,
"grad_norm": 4.725905418395996,
"learning_rate": 1.6116138763197587e-05,
"loss": 0.3102,
"step": 2060
},
{
"epoch": 0.5854072398190046,
"grad_norm": 2.358879566192627,
"learning_rate": 1.6097285067873304e-05,
"loss": 0.4083,
"step": 2070
},
{
"epoch": 0.5882352941176471,
"grad_norm": 4.624474048614502,
"learning_rate": 1.607843137254902e-05,
"loss": 0.3742,
"step": 2080
},
{
"epoch": 0.5910633484162896,
"grad_norm": 3.6771047115325928,
"learning_rate": 1.6059577677224737e-05,
"loss": 0.3705,
"step": 2090
},
{
"epoch": 0.5938914027149321,
"grad_norm": 3.136711359024048,
"learning_rate": 1.6040723981900454e-05,
"loss": 0.3365,
"step": 2100
},
{
"epoch": 0.5967194570135747,
"grad_norm": 4.1188883781433105,
"learning_rate": 1.602187028657617e-05,
"loss": 0.3138,
"step": 2110
},
{
"epoch": 0.5995475113122172,
"grad_norm": 2.472294569015503,
"learning_rate": 1.6003016591251887e-05,
"loss": 0.2888,
"step": 2120
},
{
"epoch": 0.6023755656108597,
"grad_norm": 3.7209103107452393,
"learning_rate": 1.5984162895927604e-05,
"loss": 0.3057,
"step": 2130
},
{
"epoch": 0.6052036199095022,
"grad_norm": 3.5798637866973877,
"learning_rate": 1.596530920060332e-05,
"loss": 0.3481,
"step": 2140
},
{
"epoch": 0.6080316742081447,
"grad_norm": 3.1317641735076904,
"learning_rate": 1.5946455505279037e-05,
"loss": 0.2694,
"step": 2150
},
{
"epoch": 0.6108597285067874,
"grad_norm": 3.438688278198242,
"learning_rate": 1.592760180995475e-05,
"loss": 0.338,
"step": 2160
},
{
"epoch": 0.6136877828054299,
"grad_norm": 2.2631101608276367,
"learning_rate": 1.5908748114630467e-05,
"loss": 0.4032,
"step": 2170
},
{
"epoch": 0.6165158371040724,
"grad_norm": 3.2705330848693848,
"learning_rate": 1.5889894419306183e-05,
"loss": 0.3878,
"step": 2180
},
{
"epoch": 0.619343891402715,
"grad_norm": 5.617705821990967,
"learning_rate": 1.58710407239819e-05,
"loss": 0.3213,
"step": 2190
},
{
"epoch": 0.6221719457013575,
"grad_norm": 5.0493550300598145,
"learning_rate": 1.5852187028657616e-05,
"loss": 0.3606,
"step": 2200
},
{
"epoch": 0.625,
"grad_norm": 2.885690689086914,
"learning_rate": 1.5833333333333333e-05,
"loss": 0.3232,
"step": 2210
},
{
"epoch": 0.6278280542986425,
"grad_norm": 2.4986419677734375,
"learning_rate": 1.581447963800905e-05,
"loss": 0.3318,
"step": 2220
},
{
"epoch": 0.630656108597285,
"grad_norm": 3.8310494422912598,
"learning_rate": 1.5795625942684766e-05,
"loss": 0.3241,
"step": 2230
},
{
"epoch": 0.6334841628959276,
"grad_norm": 4.589399337768555,
"learning_rate": 1.5776772247360483e-05,
"loss": 0.3537,
"step": 2240
},
{
"epoch": 0.6363122171945701,
"grad_norm": 3.939833164215088,
"learning_rate": 1.57579185520362e-05,
"loss": 0.3665,
"step": 2250
},
{
"epoch": 0.6391402714932126,
"grad_norm": 3.5939204692840576,
"learning_rate": 1.5739064856711916e-05,
"loss": 0.3462,
"step": 2260
},
{
"epoch": 0.6419683257918553,
"grad_norm": 4.346156597137451,
"learning_rate": 1.5720211161387633e-05,
"loss": 0.3887,
"step": 2270
},
{
"epoch": 0.6447963800904978,
"grad_norm": 4.5238165855407715,
"learning_rate": 1.570135746606335e-05,
"loss": 0.269,
"step": 2280
},
{
"epoch": 0.6476244343891403,
"grad_norm": 4.225012302398682,
"learning_rate": 1.5682503770739066e-05,
"loss": 0.3346,
"step": 2290
},
{
"epoch": 0.6504524886877828,
"grad_norm": 5.076806545257568,
"learning_rate": 1.5663650075414783e-05,
"loss": 0.4031,
"step": 2300
},
{
"epoch": 0.6532805429864253,
"grad_norm": 5.921730041503906,
"learning_rate": 1.56447963800905e-05,
"loss": 0.3348,
"step": 2310
},
{
"epoch": 0.6561085972850679,
"grad_norm": 5.128915309906006,
"learning_rate": 1.5625942684766216e-05,
"loss": 0.3626,
"step": 2320
},
{
"epoch": 0.6589366515837104,
"grad_norm": 3.5405006408691406,
"learning_rate": 1.5607088989441932e-05,
"loss": 0.3396,
"step": 2330
},
{
"epoch": 0.6617647058823529,
"grad_norm": 5.166226863861084,
"learning_rate": 1.558823529411765e-05,
"loss": 0.3281,
"step": 2340
},
{
"epoch": 0.6645927601809954,
"grad_norm": 3.0114974975585938,
"learning_rate": 1.5569381598793366e-05,
"loss": 0.3495,
"step": 2350
},
{
"epoch": 0.667420814479638,
"grad_norm": 4.730240345001221,
"learning_rate": 1.5550527903469082e-05,
"loss": 0.3775,
"step": 2360
},
{
"epoch": 0.6702488687782805,
"grad_norm": 6.134402275085449,
"learning_rate": 1.55316742081448e-05,
"loss": 0.408,
"step": 2370
},
{
"epoch": 0.6730769230769231,
"grad_norm": 8.204373359680176,
"learning_rate": 1.5512820512820516e-05,
"loss": 0.3215,
"step": 2380
},
{
"epoch": 0.6759049773755657,
"grad_norm": 5.2875895500183105,
"learning_rate": 1.5493966817496232e-05,
"loss": 0.3004,
"step": 2390
},
{
"epoch": 0.6787330316742082,
"grad_norm": 4.722002029418945,
"learning_rate": 1.547511312217195e-05,
"loss": 0.3495,
"step": 2400
},
{
"epoch": 0.6815610859728507,
"grad_norm": 3.777385711669922,
"learning_rate": 1.5456259426847665e-05,
"loss": 0.314,
"step": 2410
},
{
"epoch": 0.6843891402714932,
"grad_norm": 4.804584503173828,
"learning_rate": 1.5437405731523382e-05,
"loss": 0.302,
"step": 2420
},
{
"epoch": 0.6872171945701357,
"grad_norm": 1.9814542531967163,
"learning_rate": 1.54185520361991e-05,
"loss": 0.32,
"step": 2430
},
{
"epoch": 0.6900452488687783,
"grad_norm": 4.671655178070068,
"learning_rate": 1.5399698340874812e-05,
"loss": 0.3471,
"step": 2440
},
{
"epoch": 0.6928733031674208,
"grad_norm": 4.3465776443481445,
"learning_rate": 1.538084464555053e-05,
"loss": 0.4079,
"step": 2450
},
{
"epoch": 0.6957013574660633,
"grad_norm": 5.087115287780762,
"learning_rate": 1.5361990950226245e-05,
"loss": 0.2957,
"step": 2460
},
{
"epoch": 0.6985294117647058,
"grad_norm": 4.124098777770996,
"learning_rate": 1.5343137254901962e-05,
"loss": 0.3573,
"step": 2470
},
{
"epoch": 0.7013574660633484,
"grad_norm": 4.266404628753662,
"learning_rate": 1.532428355957768e-05,
"loss": 0.3054,
"step": 2480
},
{
"epoch": 0.704185520361991,
"grad_norm": 3.325258731842041,
"learning_rate": 1.5305429864253395e-05,
"loss": 0.3675,
"step": 2490
},
{
"epoch": 0.7070135746606335,
"grad_norm": 2.9218814373016357,
"learning_rate": 1.528657616892911e-05,
"loss": 0.3175,
"step": 2500
},
{
"epoch": 0.709841628959276,
"grad_norm": 4.399160385131836,
"learning_rate": 1.5267722473604828e-05,
"loss": 0.3239,
"step": 2510
},
{
"epoch": 0.7126696832579186,
"grad_norm": 4.460221290588379,
"learning_rate": 1.5248868778280543e-05,
"loss": 0.3827,
"step": 2520
},
{
"epoch": 0.7154977375565611,
"grad_norm": 3.0739834308624268,
"learning_rate": 1.523001508295626e-05,
"loss": 0.3725,
"step": 2530
},
{
"epoch": 0.7183257918552036,
"grad_norm": 2.8812670707702637,
"learning_rate": 1.5211161387631976e-05,
"loss": 0.2495,
"step": 2540
},
{
"epoch": 0.7211538461538461,
"grad_norm": 6.949345588684082,
"learning_rate": 1.5192307692307693e-05,
"loss": 0.3632,
"step": 2550
},
{
"epoch": 0.7239819004524887,
"grad_norm": 3.124908685684204,
"learning_rate": 1.517345399698341e-05,
"loss": 0.3681,
"step": 2560
},
{
"epoch": 0.7268099547511312,
"grad_norm": 4.435882091522217,
"learning_rate": 1.5154600301659126e-05,
"loss": 0.3279,
"step": 2570
},
{
"epoch": 0.7296380090497737,
"grad_norm": 3.6505391597747803,
"learning_rate": 1.5135746606334843e-05,
"loss": 0.3487,
"step": 2580
},
{
"epoch": 0.7324660633484162,
"grad_norm": 3.057103395462036,
"learning_rate": 1.511689291101056e-05,
"loss": 0.3086,
"step": 2590
},
{
"epoch": 0.7352941176470589,
"grad_norm": 2.988297462463379,
"learning_rate": 1.5098039215686276e-05,
"loss": 0.2901,
"step": 2600
},
{
"epoch": 0.7381221719457014,
"grad_norm": 8.121850967407227,
"learning_rate": 1.5079185520361993e-05,
"loss": 0.3656,
"step": 2610
},
{
"epoch": 0.7409502262443439,
"grad_norm": 3.4862985610961914,
"learning_rate": 1.506033182503771e-05,
"loss": 0.3144,
"step": 2620
},
{
"epoch": 0.7437782805429864,
"grad_norm": 2.3046765327453613,
"learning_rate": 1.5041478129713424e-05,
"loss": 0.2498,
"step": 2630
},
{
"epoch": 0.746606334841629,
"grad_norm": 3.606008529663086,
"learning_rate": 1.502262443438914e-05,
"loss": 0.3449,
"step": 2640
},
{
"epoch": 0.7494343891402715,
"grad_norm": 3.494842767715454,
"learning_rate": 1.5003770739064857e-05,
"loss": 0.2784,
"step": 2650
},
{
"epoch": 0.752262443438914,
"grad_norm": 5.306181907653809,
"learning_rate": 1.4984917043740574e-05,
"loss": 0.2783,
"step": 2660
},
{
"epoch": 0.7550904977375565,
"grad_norm": 3.1774604320526123,
"learning_rate": 1.496606334841629e-05,
"loss": 0.2216,
"step": 2670
},
{
"epoch": 0.7579185520361991,
"grad_norm": 5.226140022277832,
"learning_rate": 1.4947209653092007e-05,
"loss": 0.386,
"step": 2680
},
{
"epoch": 0.7607466063348416,
"grad_norm": 3.7945973873138428,
"learning_rate": 1.4928355957767724e-05,
"loss": 0.3213,
"step": 2690
},
{
"epoch": 0.7635746606334841,
"grad_norm": 3.4387052059173584,
"learning_rate": 1.490950226244344e-05,
"loss": 0.3386,
"step": 2700
},
{
"epoch": 0.7664027149321267,
"grad_norm": 3.023867607116699,
"learning_rate": 1.4890648567119157e-05,
"loss": 0.3507,
"step": 2710
},
{
"epoch": 0.7692307692307693,
"grad_norm": 5.2512640953063965,
"learning_rate": 1.4871794871794874e-05,
"loss": 0.3141,
"step": 2720
},
{
"epoch": 0.7720588235294118,
"grad_norm": 3.79915452003479,
"learning_rate": 1.485294117647059e-05,
"loss": 0.3441,
"step": 2730
},
{
"epoch": 0.7748868778280543,
"grad_norm": 1.8601824045181274,
"learning_rate": 1.4834087481146307e-05,
"loss": 0.3662,
"step": 2740
},
{
"epoch": 0.7777149321266968,
"grad_norm": 2.1231563091278076,
"learning_rate": 1.4815233785822024e-05,
"loss": 0.3365,
"step": 2750
},
{
"epoch": 0.7805429864253394,
"grad_norm": 5.087540149688721,
"learning_rate": 1.479638009049774e-05,
"loss": 0.3339,
"step": 2760
},
{
"epoch": 0.7833710407239819,
"grad_norm": 6.3354668617248535,
"learning_rate": 1.4777526395173457e-05,
"loss": 0.317,
"step": 2770
},
{
"epoch": 0.7861990950226244,
"grad_norm": 3.519740581512451,
"learning_rate": 1.4758672699849172e-05,
"loss": 0.3237,
"step": 2780
},
{
"epoch": 0.7890271493212669,
"grad_norm": 2.299184560775757,
"learning_rate": 1.4739819004524888e-05,
"loss": 0.289,
"step": 2790
},
{
"epoch": 0.7918552036199095,
"grad_norm": 6.4508490562438965,
"learning_rate": 1.4720965309200605e-05,
"loss": 0.3214,
"step": 2800
},
{
"epoch": 0.794683257918552,
"grad_norm": 1.989512324333191,
"learning_rate": 1.4702111613876322e-05,
"loss": 0.2949,
"step": 2810
},
{
"epoch": 0.7975113122171946,
"grad_norm": 5.373081684112549,
"learning_rate": 1.4683257918552036e-05,
"loss": 0.3365,
"step": 2820
},
{
"epoch": 0.8003393665158371,
"grad_norm": 2.989363193511963,
"learning_rate": 1.4664404223227753e-05,
"loss": 0.2495,
"step": 2830
},
{
"epoch": 0.8031674208144797,
"grad_norm": 4.9633660316467285,
"learning_rate": 1.464555052790347e-05,
"loss": 0.2815,
"step": 2840
},
{
"epoch": 0.8059954751131222,
"grad_norm": 6.031944274902344,
"learning_rate": 1.4626696832579186e-05,
"loss": 0.2632,
"step": 2850
},
{
"epoch": 0.8088235294117647,
"grad_norm": 3.689105987548828,
"learning_rate": 1.4607843137254903e-05,
"loss": 0.3192,
"step": 2860
},
{
"epoch": 0.8116515837104072,
"grad_norm": 3.926541805267334,
"learning_rate": 1.458898944193062e-05,
"loss": 0.3569,
"step": 2870
},
{
"epoch": 0.8144796380090498,
"grad_norm": 4.753978252410889,
"learning_rate": 1.4570135746606336e-05,
"loss": 0.3957,
"step": 2880
},
{
"epoch": 0.8173076923076923,
"grad_norm": 5.156829833984375,
"learning_rate": 1.4551282051282051e-05,
"loss": 0.3271,
"step": 2890
},
{
"epoch": 0.8201357466063348,
"grad_norm": 3.1717522144317627,
"learning_rate": 1.4532428355957768e-05,
"loss": 0.3337,
"step": 2900
},
{
"epoch": 0.8229638009049773,
"grad_norm": 2.9101099967956543,
"learning_rate": 1.4513574660633484e-05,
"loss": 0.3331,
"step": 2910
},
{
"epoch": 0.8257918552036199,
"grad_norm": 3.014803171157837,
"learning_rate": 1.4494720965309201e-05,
"loss": 0.303,
"step": 2920
},
{
"epoch": 0.8286199095022625,
"grad_norm": 6.68400764465332,
"learning_rate": 1.4475867269984918e-05,
"loss": 0.3414,
"step": 2930
},
{
"epoch": 0.831447963800905,
"grad_norm": 4.039949417114258,
"learning_rate": 1.4457013574660634e-05,
"loss": 0.3107,
"step": 2940
},
{
"epoch": 0.8342760180995475,
"grad_norm": 5.4491286277771,
"learning_rate": 1.443815987933635e-05,
"loss": 0.3094,
"step": 2950
},
{
"epoch": 0.8371040723981901,
"grad_norm": 4.456144332885742,
"learning_rate": 1.4419306184012067e-05,
"loss": 0.3137,
"step": 2960
},
{
"epoch": 0.8399321266968326,
"grad_norm": 3.682917594909668,
"learning_rate": 1.4400452488687784e-05,
"loss": 0.3221,
"step": 2970
},
{
"epoch": 0.8427601809954751,
"grad_norm": 4.826881408691406,
"learning_rate": 1.43815987933635e-05,
"loss": 0.3828,
"step": 2980
},
{
"epoch": 0.8455882352941176,
"grad_norm": 4.945711135864258,
"learning_rate": 1.4362745098039217e-05,
"loss": 0.2887,
"step": 2990
},
{
"epoch": 0.8484162895927602,
"grad_norm": 6.562948226928711,
"learning_rate": 1.4343891402714934e-05,
"loss": 0.3222,
"step": 3000
},
{
"epoch": 0.8484162895927602,
"eval_accuracy": 0.8736875596563792,
"eval_loss": 0.32356539368629456,
"eval_runtime": 126.3914,
"eval_samples_per_second": 99.469,
"eval_steps_per_second": 3.109,
"step": 3000
},
{
"epoch": 0.8512443438914027,
"grad_norm": 5.110568523406982,
"learning_rate": 1.432503770739065e-05,
"loss": 0.3145,
"step": 3010
},
{
"epoch": 0.8540723981900452,
"grad_norm": 4.4614081382751465,
"learning_rate": 1.4306184012066367e-05,
"loss": 0.3098,
"step": 3020
},
{
"epoch": 0.8569004524886877,
"grad_norm": 3.0560202598571777,
"learning_rate": 1.4287330316742084e-05,
"loss": 0.3271,
"step": 3030
},
{
"epoch": 0.8597285067873304,
"grad_norm": 5.284294605255127,
"learning_rate": 1.4268476621417799e-05,
"loss": 0.3461,
"step": 3040
},
{
"epoch": 0.8625565610859729,
"grad_norm": 3.1206512451171875,
"learning_rate": 1.4249622926093515e-05,
"loss": 0.2705,
"step": 3050
},
{
"epoch": 0.8653846153846154,
"grad_norm": 4.308442115783691,
"learning_rate": 1.4230769230769232e-05,
"loss": 0.3691,
"step": 3060
},
{
"epoch": 0.8682126696832579,
"grad_norm": 2.637321710586548,
"learning_rate": 1.4211915535444948e-05,
"loss": 0.3388,
"step": 3070
},
{
"epoch": 0.8710407239819005,
"grad_norm": 4.938368797302246,
"learning_rate": 1.4193061840120665e-05,
"loss": 0.288,
"step": 3080
},
{
"epoch": 0.873868778280543,
"grad_norm": 4.269132137298584,
"learning_rate": 1.4174208144796382e-05,
"loss": 0.3347,
"step": 3090
},
{
"epoch": 0.8766968325791855,
"grad_norm": 4.967940807342529,
"learning_rate": 1.4155354449472098e-05,
"loss": 0.318,
"step": 3100
},
{
"epoch": 0.879524886877828,
"grad_norm": 4.122420787811279,
"learning_rate": 1.4136500754147815e-05,
"loss": 0.3139,
"step": 3110
},
{
"epoch": 0.8823529411764706,
"grad_norm": 3.85040545463562,
"learning_rate": 1.4117647058823532e-05,
"loss": 0.3544,
"step": 3120
},
{
"epoch": 0.8851809954751131,
"grad_norm": 5.834123134613037,
"learning_rate": 1.4098793363499248e-05,
"loss": 0.3717,
"step": 3130
},
{
"epoch": 0.8880090497737556,
"grad_norm": 3.2261178493499756,
"learning_rate": 1.4079939668174965e-05,
"loss": 0.3071,
"step": 3140
},
{
"epoch": 0.8908371040723982,
"grad_norm": 5.519437789916992,
"learning_rate": 1.4061085972850678e-05,
"loss": 0.2519,
"step": 3150
},
{
"epoch": 0.8936651583710408,
"grad_norm": 2.5292046070098877,
"learning_rate": 1.4042232277526395e-05,
"loss": 0.3543,
"step": 3160
},
{
"epoch": 0.8964932126696833,
"grad_norm": 3.737870454788208,
"learning_rate": 1.4023378582202111e-05,
"loss": 0.3036,
"step": 3170
},
{
"epoch": 0.8993212669683258,
"grad_norm": 4.248650550842285,
"learning_rate": 1.4004524886877828e-05,
"loss": 0.321,
"step": 3180
},
{
"epoch": 0.9021493212669683,
"grad_norm": 3.5133938789367676,
"learning_rate": 1.3985671191553544e-05,
"loss": 0.354,
"step": 3190
},
{
"epoch": 0.9049773755656109,
"grad_norm": 2.819633722305298,
"learning_rate": 1.3966817496229261e-05,
"loss": 0.2789,
"step": 3200
},
{
"epoch": 0.9078054298642534,
"grad_norm": 3.6485252380371094,
"learning_rate": 1.3947963800904978e-05,
"loss": 0.3348,
"step": 3210
},
{
"epoch": 0.9106334841628959,
"grad_norm": 4.469762325286865,
"learning_rate": 1.3929110105580694e-05,
"loss": 0.2611,
"step": 3220
},
{
"epoch": 0.9134615384615384,
"grad_norm": 5.00715970993042,
"learning_rate": 1.3910256410256411e-05,
"loss": 0.3701,
"step": 3230
},
{
"epoch": 0.916289592760181,
"grad_norm": 3.802788734436035,
"learning_rate": 1.3891402714932128e-05,
"loss": 0.4052,
"step": 3240
},
{
"epoch": 0.9191176470588235,
"grad_norm": 3.6908090114593506,
"learning_rate": 1.3872549019607844e-05,
"loss": 0.3163,
"step": 3250
},
{
"epoch": 0.9219457013574661,
"grad_norm": 4.198665142059326,
"learning_rate": 1.385369532428356e-05,
"loss": 0.3004,
"step": 3260
},
{
"epoch": 0.9247737556561086,
"grad_norm": 5.080460071563721,
"learning_rate": 1.3834841628959277e-05,
"loss": 0.3444,
"step": 3270
},
{
"epoch": 0.9276018099547512,
"grad_norm": 5.11644983291626,
"learning_rate": 1.3815987933634994e-05,
"loss": 0.3238,
"step": 3280
},
{
"epoch": 0.9304298642533937,
"grad_norm": 7.753527641296387,
"learning_rate": 1.379713423831071e-05,
"loss": 0.3969,
"step": 3290
},
{
"epoch": 0.9332579185520362,
"grad_norm": 3.2283082008361816,
"learning_rate": 1.3778280542986426e-05,
"loss": 0.3186,
"step": 3300
},
{
"epoch": 0.9360859728506787,
"grad_norm": 5.4364094734191895,
"learning_rate": 1.3759426847662142e-05,
"loss": 0.2786,
"step": 3310
},
{
"epoch": 0.9389140271493213,
"grad_norm": 4.061675071716309,
"learning_rate": 1.3740573152337859e-05,
"loss": 0.2909,
"step": 3320
},
{
"epoch": 0.9417420814479638,
"grad_norm": 2.8919031620025635,
"learning_rate": 1.3721719457013575e-05,
"loss": 0.3572,
"step": 3330
},
{
"epoch": 0.9445701357466063,
"grad_norm": 2.643793821334839,
"learning_rate": 1.3702865761689292e-05,
"loss": 0.2582,
"step": 3340
},
{
"epoch": 0.9473981900452488,
"grad_norm": 2.6080071926116943,
"learning_rate": 1.3684012066365009e-05,
"loss": 0.3299,
"step": 3350
},
{
"epoch": 0.9502262443438914,
"grad_norm": 3.8307015895843506,
"learning_rate": 1.3665158371040725e-05,
"loss": 0.3193,
"step": 3360
},
{
"epoch": 0.9530542986425339,
"grad_norm": 5.132751941680908,
"learning_rate": 1.3646304675716442e-05,
"loss": 0.3029,
"step": 3370
},
{
"epoch": 0.9558823529411765,
"grad_norm": 2.5157196521759033,
"learning_rate": 1.3627450980392158e-05,
"loss": 0.2851,
"step": 3380
},
{
"epoch": 0.958710407239819,
"grad_norm": 2.9101061820983887,
"learning_rate": 1.3608597285067875e-05,
"loss": 0.2542,
"step": 3390
},
{
"epoch": 0.9615384615384616,
"grad_norm": 4.939927101135254,
"learning_rate": 1.3589743589743592e-05,
"loss": 0.2904,
"step": 3400
},
{
"epoch": 0.9643665158371041,
"grad_norm": 3.6113576889038086,
"learning_rate": 1.3570889894419308e-05,
"loss": 0.2654,
"step": 3410
},
{
"epoch": 0.9671945701357466,
"grad_norm": 7.237005710601807,
"learning_rate": 1.3552036199095025e-05,
"loss": 0.2636,
"step": 3420
},
{
"epoch": 0.9700226244343891,
"grad_norm": 4.309847354888916,
"learning_rate": 1.3533182503770742e-05,
"loss": 0.3095,
"step": 3430
},
{
"epoch": 0.9728506787330317,
"grad_norm": 3.404597520828247,
"learning_rate": 1.3514328808446458e-05,
"loss": 0.3147,
"step": 3440
},
{
"epoch": 0.9756787330316742,
"grad_norm": 2.2847061157226562,
"learning_rate": 1.3495475113122173e-05,
"loss": 0.2536,
"step": 3450
},
{
"epoch": 0.9785067873303167,
"grad_norm": 3.670473337173462,
"learning_rate": 1.347662141779789e-05,
"loss": 0.4246,
"step": 3460
},
{
"epoch": 0.9813348416289592,
"grad_norm": 3.955064296722412,
"learning_rate": 1.3457767722473606e-05,
"loss": 0.3578,
"step": 3470
},
{
"epoch": 0.9841628959276018,
"grad_norm": 4.502097129821777,
"learning_rate": 1.3438914027149323e-05,
"loss": 0.2467,
"step": 3480
},
{
"epoch": 0.9869909502262444,
"grad_norm": 2.4463083744049072,
"learning_rate": 1.3420060331825038e-05,
"loss": 0.2968,
"step": 3490
},
{
"epoch": 0.9898190045248869,
"grad_norm": 4.400903224945068,
"learning_rate": 1.3401206636500754e-05,
"loss": 0.2767,
"step": 3500
},
{
"epoch": 0.9926470588235294,
"grad_norm": 3.8190836906433105,
"learning_rate": 1.3382352941176471e-05,
"loss": 0.2868,
"step": 3510
},
{
"epoch": 0.995475113122172,
"grad_norm": 6.496269702911377,
"learning_rate": 1.3363499245852188e-05,
"loss": 0.2562,
"step": 3520
},
{
"epoch": 0.9983031674208145,
"grad_norm": 6.07765531539917,
"learning_rate": 1.3344645550527904e-05,
"loss": 0.3454,
"step": 3530
},
{
"epoch": 1.001131221719457,
"grad_norm": 2.3242061138153076,
"learning_rate": 1.3325791855203621e-05,
"loss": 0.2916,
"step": 3540
},
{
"epoch": 1.0039592760180995,
"grad_norm": 4.1568922996521,
"learning_rate": 1.3306938159879338e-05,
"loss": 0.2674,
"step": 3550
},
{
"epoch": 1.006787330316742,
"grad_norm": 4.240556240081787,
"learning_rate": 1.3288084464555052e-05,
"loss": 0.2308,
"step": 3560
},
{
"epoch": 1.0096153846153846,
"grad_norm": 3.1726534366607666,
"learning_rate": 1.3269230769230769e-05,
"loss": 0.2158,
"step": 3570
},
{
"epoch": 1.012443438914027,
"grad_norm": 2.371945381164551,
"learning_rate": 1.3250377073906486e-05,
"loss": 0.2726,
"step": 3580
},
{
"epoch": 1.0152714932126696,
"grad_norm": 4.0892744064331055,
"learning_rate": 1.3231523378582202e-05,
"loss": 0.2854,
"step": 3590
},
{
"epoch": 1.0180995475113122,
"grad_norm": 4.087936878204346,
"learning_rate": 1.3212669683257919e-05,
"loss": 0.2819,
"step": 3600
},
{
"epoch": 1.0209276018099547,
"grad_norm": 2.393385171890259,
"learning_rate": 1.3193815987933636e-05,
"loss": 0.2647,
"step": 3610
},
{
"epoch": 1.0237556561085972,
"grad_norm": 5.088064193725586,
"learning_rate": 1.3174962292609352e-05,
"loss": 0.3036,
"step": 3620
},
{
"epoch": 1.0265837104072397,
"grad_norm": 5.494114875793457,
"learning_rate": 1.3156108597285069e-05,
"loss": 0.2903,
"step": 3630
},
{
"epoch": 1.0294117647058822,
"grad_norm": 1.7145814895629883,
"learning_rate": 1.3137254901960785e-05,
"loss": 0.2035,
"step": 3640
},
{
"epoch": 1.032239819004525,
"grad_norm": 5.571091651916504,
"learning_rate": 1.3118401206636502e-05,
"loss": 0.2725,
"step": 3650
},
{
"epoch": 1.0350678733031675,
"grad_norm": 2.1600940227508545,
"learning_rate": 1.3099547511312219e-05,
"loss": 0.2664,
"step": 3660
},
{
"epoch": 1.03789592760181,
"grad_norm": 5.715987205505371,
"learning_rate": 1.3080693815987935e-05,
"loss": 0.288,
"step": 3670
},
{
"epoch": 1.0407239819004526,
"grad_norm": 3.7473366260528564,
"learning_rate": 1.3061840120663652e-05,
"loss": 0.3043,
"step": 3680
},
{
"epoch": 1.043552036199095,
"grad_norm": 6.1543121337890625,
"learning_rate": 1.3042986425339369e-05,
"loss": 0.3095,
"step": 3690
},
{
"epoch": 1.0463800904977376,
"grad_norm": 2.4978766441345215,
"learning_rate": 1.3024132730015085e-05,
"loss": 0.2721,
"step": 3700
},
{
"epoch": 1.0492081447963801,
"grad_norm": 6.851878643035889,
"learning_rate": 1.3005279034690802e-05,
"loss": 0.2912,
"step": 3710
},
{
"epoch": 1.0520361990950227,
"grad_norm": 3.8570425510406494,
"learning_rate": 1.2986425339366517e-05,
"loss": 0.3127,
"step": 3720
},
{
"epoch": 1.0548642533936652,
"grad_norm": 7.417280197143555,
"learning_rate": 1.2967571644042233e-05,
"loss": 0.2707,
"step": 3730
},
{
"epoch": 1.0576923076923077,
"grad_norm": 4.451798915863037,
"learning_rate": 1.294871794871795e-05,
"loss": 0.2387,
"step": 3740
},
{
"epoch": 1.0605203619909502,
"grad_norm": 3.9390320777893066,
"learning_rate": 1.2929864253393667e-05,
"loss": 0.2626,
"step": 3750
},
{
"epoch": 1.0633484162895928,
"grad_norm": 1.990342617034912,
"learning_rate": 1.2911010558069383e-05,
"loss": 0.2694,
"step": 3760
},
{
"epoch": 1.0661764705882353,
"grad_norm": 3.4424543380737305,
"learning_rate": 1.28921568627451e-05,
"loss": 0.3381,
"step": 3770
},
{
"epoch": 1.0690045248868778,
"grad_norm": 1.7783031463623047,
"learning_rate": 1.2873303167420816e-05,
"loss": 0.1981,
"step": 3780
},
{
"epoch": 1.0718325791855203,
"grad_norm": 3.8346874713897705,
"learning_rate": 1.2854449472096533e-05,
"loss": 0.2624,
"step": 3790
},
{
"epoch": 1.0746606334841629,
"grad_norm": 5.832867622375488,
"learning_rate": 1.283559577677225e-05,
"loss": 0.2581,
"step": 3800
},
{
"epoch": 1.0774886877828054,
"grad_norm": 4.65895414352417,
"learning_rate": 1.2816742081447966e-05,
"loss": 0.3619,
"step": 3810
},
{
"epoch": 1.080316742081448,
"grad_norm": 3.93692946434021,
"learning_rate": 1.279788838612368e-05,
"loss": 0.3296,
"step": 3820
},
{
"epoch": 1.0831447963800904,
"grad_norm": 2.544408082962036,
"learning_rate": 1.2779034690799396e-05,
"loss": 0.2733,
"step": 3830
},
{
"epoch": 1.085972850678733,
"grad_norm": 4.070341110229492,
"learning_rate": 1.2760180995475113e-05,
"loss": 0.2303,
"step": 3840
},
{
"epoch": 1.0888009049773755,
"grad_norm": 3.7344400882720947,
"learning_rate": 1.274132730015083e-05,
"loss": 0.2809,
"step": 3850
},
{
"epoch": 1.091628959276018,
"grad_norm": 5.270275592803955,
"learning_rate": 1.2722473604826546e-05,
"loss": 0.2894,
"step": 3860
},
{
"epoch": 1.0944570135746607,
"grad_norm": 4.697700500488281,
"learning_rate": 1.2703619909502263e-05,
"loss": 0.2382,
"step": 3870
},
{
"epoch": 1.0972850678733033,
"grad_norm": 3.1016902923583984,
"learning_rate": 1.2684766214177979e-05,
"loss": 0.2545,
"step": 3880
},
{
"epoch": 1.1001131221719458,
"grad_norm": 3.6058175563812256,
"learning_rate": 1.2665912518853696e-05,
"loss": 0.2747,
"step": 3890
},
{
"epoch": 1.1029411764705883,
"grad_norm": 6.918750286102295,
"learning_rate": 1.2647058823529412e-05,
"loss": 0.2541,
"step": 3900
},
{
"epoch": 1.1057692307692308,
"grad_norm": 4.158249855041504,
"learning_rate": 1.2628205128205129e-05,
"loss": 0.2514,
"step": 3910
},
{
"epoch": 1.1085972850678734,
"grad_norm": 5.783833980560303,
"learning_rate": 1.2609351432880846e-05,
"loss": 0.2547,
"step": 3920
},
{
"epoch": 1.1114253393665159,
"grad_norm": 3.524967670440674,
"learning_rate": 1.2590497737556562e-05,
"loss": 0.2767,
"step": 3930
},
{
"epoch": 1.1142533936651584,
"grad_norm": 2.39933705329895,
"learning_rate": 1.2571644042232279e-05,
"loss": 0.2359,
"step": 3940
},
{
"epoch": 1.117081447963801,
"grad_norm": 4.107085704803467,
"learning_rate": 1.2552790346907995e-05,
"loss": 0.244,
"step": 3950
},
{
"epoch": 1.1199095022624435,
"grad_norm": 6.81174898147583,
"learning_rate": 1.2533936651583712e-05,
"loss": 0.2579,
"step": 3960
},
{
"epoch": 1.122737556561086,
"grad_norm": 1.8872418403625488,
"learning_rate": 1.2515082956259429e-05,
"loss": 0.2101,
"step": 3970
},
{
"epoch": 1.1255656108597285,
"grad_norm": 3.872263193130493,
"learning_rate": 1.2496229260935144e-05,
"loss": 0.3009,
"step": 3980
},
{
"epoch": 1.128393665158371,
"grad_norm": 2.7092275619506836,
"learning_rate": 1.247737556561086e-05,
"loss": 0.3053,
"step": 3990
},
{
"epoch": 1.1312217194570136,
"grad_norm": 6.832910537719727,
"learning_rate": 1.2458521870286577e-05,
"loss": 0.2339,
"step": 4000
},
{
"epoch": 1.1312217194570136,
"eval_accuracy": 0.8789373210308622,
"eval_loss": 0.31826069951057434,
"eval_runtime": 126.5036,
"eval_samples_per_second": 99.381,
"eval_steps_per_second": 3.107,
"step": 4000
},
{
"epoch": 1.134049773755656,
"grad_norm": 4.47673225402832,
"learning_rate": 1.2439668174962293e-05,
"loss": 0.2778,
"step": 4010
},
{
"epoch": 1.1368778280542986,
"grad_norm": 5.049123287200928,
"learning_rate": 1.242081447963801e-05,
"loss": 0.3089,
"step": 4020
},
{
"epoch": 1.1397058823529411,
"grad_norm": 3.6429476737976074,
"learning_rate": 1.2401960784313727e-05,
"loss": 0.2583,
"step": 4030
},
{
"epoch": 1.1425339366515836,
"grad_norm": 4.532712936401367,
"learning_rate": 1.2383107088989443e-05,
"loss": 0.276,
"step": 4040
},
{
"epoch": 1.1453619909502262,
"grad_norm": 2.8139843940734863,
"learning_rate": 1.236425339366516e-05,
"loss": 0.2866,
"step": 4050
},
{
"epoch": 1.1481900452488687,
"grad_norm": 3.5737717151641846,
"learning_rate": 1.2345399698340877e-05,
"loss": 0.2609,
"step": 4060
},
{
"epoch": 1.1510180995475112,
"grad_norm": 3.4656126499176025,
"learning_rate": 1.2326546003016593e-05,
"loss": 0.2063,
"step": 4070
},
{
"epoch": 1.1538461538461537,
"grad_norm": 7.43180513381958,
"learning_rate": 1.230769230769231e-05,
"loss": 0.3614,
"step": 4080
},
{
"epoch": 1.1566742081447963,
"grad_norm": 2.853827476501465,
"learning_rate": 1.2288838612368026e-05,
"loss": 0.3006,
"step": 4090
},
{
"epoch": 1.1595022624434388,
"grad_norm": 4.522756099700928,
"learning_rate": 1.2269984917043743e-05,
"loss": 0.3122,
"step": 4100
},
{
"epoch": 1.1623303167420815,
"grad_norm": 3.891043186187744,
"learning_rate": 1.225113122171946e-05,
"loss": 0.3056,
"step": 4110
},
{
"epoch": 1.165158371040724,
"grad_norm": 2.7950618267059326,
"learning_rate": 1.2232277526395176e-05,
"loss": 0.3195,
"step": 4120
},
{
"epoch": 1.1679864253393666,
"grad_norm": 3.972943067550659,
"learning_rate": 1.2213423831070891e-05,
"loss": 0.3315,
"step": 4130
},
{
"epoch": 1.170814479638009,
"grad_norm": 10.891520500183105,
"learning_rate": 1.2194570135746608e-05,
"loss": 0.276,
"step": 4140
},
{
"epoch": 1.1736425339366516,
"grad_norm": 4.51856803894043,
"learning_rate": 1.2175716440422323e-05,
"loss": 0.2355,
"step": 4150
},
{
"epoch": 1.1764705882352942,
"grad_norm": 2.575591564178467,
"learning_rate": 1.215686274509804e-05,
"loss": 0.195,
"step": 4160
},
{
"epoch": 1.1792986425339367,
"grad_norm": 3.9654879570007324,
"learning_rate": 1.2138009049773756e-05,
"loss": 0.2505,
"step": 4170
},
{
"epoch": 1.1821266968325792,
"grad_norm": 5.328989028930664,
"learning_rate": 1.2119155354449473e-05,
"loss": 0.2326,
"step": 4180
},
{
"epoch": 1.1849547511312217,
"grad_norm": 3.573969841003418,
"learning_rate": 1.2100301659125189e-05,
"loss": 0.2302,
"step": 4190
},
{
"epoch": 1.1877828054298643,
"grad_norm": 4.914673328399658,
"learning_rate": 1.2081447963800906e-05,
"loss": 0.3016,
"step": 4200
},
{
"epoch": 1.1906108597285068,
"grad_norm": 3.4880571365356445,
"learning_rate": 1.2062594268476622e-05,
"loss": 0.2811,
"step": 4210
},
{
"epoch": 1.1934389140271493,
"grad_norm": 3.827131509780884,
"learning_rate": 1.2043740573152339e-05,
"loss": 0.2431,
"step": 4220
},
{
"epoch": 1.1962669683257918,
"grad_norm": 4.236039161682129,
"learning_rate": 1.2024886877828056e-05,
"loss": 0.2951,
"step": 4230
},
{
"epoch": 1.1990950226244343,
"grad_norm": 4.934144973754883,
"learning_rate": 1.200603318250377e-05,
"loss": 0.2739,
"step": 4240
},
{
"epoch": 1.2019230769230769,
"grad_norm": 6.265100479125977,
"learning_rate": 1.1987179487179487e-05,
"loss": 0.2298,
"step": 4250
},
{
"epoch": 1.2047511312217194,
"grad_norm": 2.0881166458129883,
"learning_rate": 1.1968325791855204e-05,
"loss": 0.245,
"step": 4260
},
{
"epoch": 1.207579185520362,
"grad_norm": 8.104879379272461,
"learning_rate": 1.194947209653092e-05,
"loss": 0.2787,
"step": 4270
},
{
"epoch": 1.2104072398190044,
"grad_norm": 4.176904201507568,
"learning_rate": 1.1930618401206637e-05,
"loss": 0.2805,
"step": 4280
},
{
"epoch": 1.213235294117647,
"grad_norm": 4.049289226531982,
"learning_rate": 1.1911764705882354e-05,
"loss": 0.3075,
"step": 4290
},
{
"epoch": 1.2160633484162897,
"grad_norm": 1.6341451406478882,
"learning_rate": 1.189291101055807e-05,
"loss": 0.2664,
"step": 4300
},
{
"epoch": 1.2188914027149322,
"grad_norm": 7.3927388191223145,
"learning_rate": 1.1874057315233787e-05,
"loss": 0.3117,
"step": 4310
},
{
"epoch": 1.2217194570135748,
"grad_norm": 1.8302087783813477,
"learning_rate": 1.1855203619909503e-05,
"loss": 0.333,
"step": 4320
},
{
"epoch": 1.2245475113122173,
"grad_norm": 2.7119219303131104,
"learning_rate": 1.183634992458522e-05,
"loss": 0.313,
"step": 4330
},
{
"epoch": 1.2273755656108598,
"grad_norm": 6.975501537322998,
"learning_rate": 1.1817496229260937e-05,
"loss": 0.2833,
"step": 4340
},
{
"epoch": 1.2302036199095023,
"grad_norm": 2.1703109741210938,
"learning_rate": 1.1798642533936653e-05,
"loss": 0.2671,
"step": 4350
},
{
"epoch": 1.2330316742081449,
"grad_norm": 3.932482957839966,
"learning_rate": 1.177978883861237e-05,
"loss": 0.2358,
"step": 4360
},
{
"epoch": 1.2358597285067874,
"grad_norm": 2.7635726928710938,
"learning_rate": 1.1760935143288087e-05,
"loss": 0.3196,
"step": 4370
},
{
"epoch": 1.23868778280543,
"grad_norm": 2.945617914199829,
"learning_rate": 1.1742081447963803e-05,
"loss": 0.286,
"step": 4380
},
{
"epoch": 1.2415158371040724,
"grad_norm": 4.623812675476074,
"learning_rate": 1.1723227752639518e-05,
"loss": 0.2605,
"step": 4390
},
{
"epoch": 1.244343891402715,
"grad_norm": 3.3469064235687256,
"learning_rate": 1.1704374057315235e-05,
"loss": 0.2515,
"step": 4400
},
{
"epoch": 1.2471719457013575,
"grad_norm": 6.414296627044678,
"learning_rate": 1.1685520361990951e-05,
"loss": 0.3239,
"step": 4410
},
{
"epoch": 1.25,
"grad_norm": 5.76809549331665,
"learning_rate": 1.1666666666666668e-05,
"loss": 0.2534,
"step": 4420
},
{
"epoch": 1.2528280542986425,
"grad_norm": 5.739138603210449,
"learning_rate": 1.1647812971342385e-05,
"loss": 0.2617,
"step": 4430
},
{
"epoch": 1.255656108597285,
"grad_norm": 3.76336407661438,
"learning_rate": 1.1628959276018101e-05,
"loss": 0.2809,
"step": 4440
},
{
"epoch": 1.2584841628959276,
"grad_norm": 3.3274784088134766,
"learning_rate": 1.1610105580693818e-05,
"loss": 0.2785,
"step": 4450
},
{
"epoch": 1.26131221719457,
"grad_norm": 3.9663026332855225,
"learning_rate": 1.1591251885369534e-05,
"loss": 0.2404,
"step": 4460
},
{
"epoch": 1.2641402714932126,
"grad_norm": 3.5841290950775146,
"learning_rate": 1.1572398190045251e-05,
"loss": 0.2417,
"step": 4470
},
{
"epoch": 1.2669683257918551,
"grad_norm": 3.4801056385040283,
"learning_rate": 1.1553544494720966e-05,
"loss": 0.372,
"step": 4480
},
{
"epoch": 1.2697963800904977,
"grad_norm": 4.876957416534424,
"learning_rate": 1.1534690799396683e-05,
"loss": 0.2947,
"step": 4490
},
{
"epoch": 1.2726244343891402,
"grad_norm": 5.119454860687256,
"learning_rate": 1.1515837104072397e-05,
"loss": 0.3047,
"step": 4500
},
{
"epoch": 1.2754524886877827,
"grad_norm": 4.234288215637207,
"learning_rate": 1.1496983408748114e-05,
"loss": 0.3204,
"step": 4510
},
{
"epoch": 1.2782805429864252,
"grad_norm": 2.9957668781280518,
"learning_rate": 1.147812971342383e-05,
"loss": 0.2602,
"step": 4520
},
{
"epoch": 1.2811085972850678,
"grad_norm": 2.333770990371704,
"learning_rate": 1.1459276018099547e-05,
"loss": 0.1949,
"step": 4530
},
{
"epoch": 1.2839366515837103,
"grad_norm": 4.577385425567627,
"learning_rate": 1.1440422322775264e-05,
"loss": 0.3381,
"step": 4540
},
{
"epoch": 1.2867647058823528,
"grad_norm": 4.607064723968506,
"learning_rate": 1.142156862745098e-05,
"loss": 0.268,
"step": 4550
},
{
"epoch": 1.2895927601809956,
"grad_norm": 4.690824031829834,
"learning_rate": 1.1402714932126697e-05,
"loss": 0.2716,
"step": 4560
},
{
"epoch": 1.292420814479638,
"grad_norm": 4.504805564880371,
"learning_rate": 1.1383861236802414e-05,
"loss": 0.2324,
"step": 4570
},
{
"epoch": 1.2952488687782806,
"grad_norm": 5.126098155975342,
"learning_rate": 1.136500754147813e-05,
"loss": 0.3124,
"step": 4580
},
{
"epoch": 1.2980769230769231,
"grad_norm": 4.265847206115723,
"learning_rate": 1.1346153846153847e-05,
"loss": 0.2522,
"step": 4590
},
{
"epoch": 1.3009049773755657,
"grad_norm": 6.03093957901001,
"learning_rate": 1.1327300150829564e-05,
"loss": 0.1997,
"step": 4600
},
{
"epoch": 1.3037330316742082,
"grad_norm": 5.404048442840576,
"learning_rate": 1.130844645550528e-05,
"loss": 0.2789,
"step": 4610
},
{
"epoch": 1.3065610859728507,
"grad_norm": 3.481680393218994,
"learning_rate": 1.1289592760180997e-05,
"loss": 0.3238,
"step": 4620
},
{
"epoch": 1.3093891402714932,
"grad_norm": 5.523119926452637,
"learning_rate": 1.1270739064856713e-05,
"loss": 0.2349,
"step": 4630
},
{
"epoch": 1.3122171945701357,
"grad_norm": 8.244139671325684,
"learning_rate": 1.125188536953243e-05,
"loss": 0.2466,
"step": 4640
},
{
"epoch": 1.3150452488687783,
"grad_norm": 4.985035419464111,
"learning_rate": 1.1233031674208145e-05,
"loss": 0.2634,
"step": 4650
},
{
"epoch": 1.3178733031674208,
"grad_norm": 5.409512996673584,
"learning_rate": 1.1214177978883862e-05,
"loss": 0.2667,
"step": 4660
},
{
"epoch": 1.3207013574660633,
"grad_norm": 3.6168251037597656,
"learning_rate": 1.1195324283559578e-05,
"loss": 0.2507,
"step": 4670
},
{
"epoch": 1.3235294117647058,
"grad_norm": 4.121711730957031,
"learning_rate": 1.1176470588235295e-05,
"loss": 0.3009,
"step": 4680
},
{
"epoch": 1.3263574660633484,
"grad_norm": 5.204695224761963,
"learning_rate": 1.1157616892911011e-05,
"loss": 0.3055,
"step": 4690
},
{
"epoch": 1.329185520361991,
"grad_norm": 3.2947821617126465,
"learning_rate": 1.1138763197586728e-05,
"loss": 0.2476,
"step": 4700
},
{
"epoch": 1.3320135746606334,
"grad_norm": 4.02095365524292,
"learning_rate": 1.1119909502262445e-05,
"loss": 0.2667,
"step": 4710
},
{
"epoch": 1.334841628959276,
"grad_norm": 4.2972025871276855,
"learning_rate": 1.1101055806938161e-05,
"loss": 0.2281,
"step": 4720
},
{
"epoch": 1.3376696832579187,
"grad_norm": 3.918163537979126,
"learning_rate": 1.1082202111613878e-05,
"loss": 0.2363,
"step": 4730
},
{
"epoch": 1.3404977375565612,
"grad_norm": 4.1806206703186035,
"learning_rate": 1.1063348416289595e-05,
"loss": 0.2639,
"step": 4740
},
{
"epoch": 1.3433257918552037,
"grad_norm": 2.949676990509033,
"learning_rate": 1.1044494720965311e-05,
"loss": 0.2625,
"step": 4750
},
{
"epoch": 1.3461538461538463,
"grad_norm": 5.957220554351807,
"learning_rate": 1.1025641025641028e-05,
"loss": 0.2416,
"step": 4760
},
{
"epoch": 1.3489819004524888,
"grad_norm": 12.7191162109375,
"learning_rate": 1.1006787330316744e-05,
"loss": 0.2752,
"step": 4770
},
{
"epoch": 1.3518099547511313,
"grad_norm": 4.849847793579102,
"learning_rate": 1.0987933634992461e-05,
"loss": 0.295,
"step": 4780
},
{
"epoch": 1.3546380090497738,
"grad_norm": 3.8798282146453857,
"learning_rate": 1.0969079939668178e-05,
"loss": 0.2744,
"step": 4790
},
{
"epoch": 1.3574660633484164,
"grad_norm": 3.093064546585083,
"learning_rate": 1.0950226244343893e-05,
"loss": 0.332,
"step": 4800
},
{
"epoch": 1.3602941176470589,
"grad_norm": 5.489840507507324,
"learning_rate": 1.0931372549019607e-05,
"loss": 0.3402,
"step": 4810
},
{
"epoch": 1.3631221719457014,
"grad_norm": 3.8440115451812744,
"learning_rate": 1.0912518853695324e-05,
"loss": 0.2283,
"step": 4820
},
{
"epoch": 1.365950226244344,
"grad_norm": 6.518070220947266,
"learning_rate": 1.089366515837104e-05,
"loss": 0.2859,
"step": 4830
},
{
"epoch": 1.3687782805429864,
"grad_norm": 1.7918236255645752,
"learning_rate": 1.0874811463046757e-05,
"loss": 0.2879,
"step": 4840
},
{
"epoch": 1.371606334841629,
"grad_norm": 5.9217424392700195,
"learning_rate": 1.0855957767722474e-05,
"loss": 0.3221,
"step": 4850
},
{
"epoch": 1.3744343891402715,
"grad_norm": 4.664400100708008,
"learning_rate": 1.083710407239819e-05,
"loss": 0.3259,
"step": 4860
},
{
"epoch": 1.377262443438914,
"grad_norm": 8.561564445495605,
"learning_rate": 1.0818250377073907e-05,
"loss": 0.2629,
"step": 4870
},
{
"epoch": 1.3800904977375565,
"grad_norm": 5.151259422302246,
"learning_rate": 1.0799396681749624e-05,
"loss": 0.3046,
"step": 4880
},
{
"epoch": 1.382918552036199,
"grad_norm": 5.32489538192749,
"learning_rate": 1.078054298642534e-05,
"loss": 0.2654,
"step": 4890
},
{
"epoch": 1.3857466063348416,
"grad_norm": 4.306127071380615,
"learning_rate": 1.0761689291101057e-05,
"loss": 0.2518,
"step": 4900
},
{
"epoch": 1.3885746606334841,
"grad_norm": 2.7166082859039307,
"learning_rate": 1.0742835595776772e-05,
"loss": 0.2278,
"step": 4910
},
{
"epoch": 1.3914027149321266,
"grad_norm": 5.469938278198242,
"learning_rate": 1.0723981900452489e-05,
"loss": 0.2938,
"step": 4920
},
{
"epoch": 1.3942307692307692,
"grad_norm": 5.4974260330200195,
"learning_rate": 1.0705128205128205e-05,
"loss": 0.2261,
"step": 4930
},
{
"epoch": 1.3970588235294117,
"grad_norm": 2.7733094692230225,
"learning_rate": 1.0686274509803922e-05,
"loss": 0.3328,
"step": 4940
},
{
"epoch": 1.3998868778280542,
"grad_norm": 3.532456398010254,
"learning_rate": 1.0667420814479638e-05,
"loss": 0.323,
"step": 4950
},
{
"epoch": 1.4027149321266967,
"grad_norm": 5.4216227531433105,
"learning_rate": 1.0648567119155355e-05,
"loss": 0.3178,
"step": 4960
},
{
"epoch": 1.4055429864253393,
"grad_norm": 5.761581897735596,
"learning_rate": 1.0629713423831072e-05,
"loss": 0.2391,
"step": 4970
},
{
"epoch": 1.4083710407239818,
"grad_norm": 7.104434013366699,
"learning_rate": 1.0610859728506788e-05,
"loss": 0.2565,
"step": 4980
},
{
"epoch": 1.4111990950226243,
"grad_norm": 5.054209232330322,
"learning_rate": 1.0592006033182505e-05,
"loss": 0.2805,
"step": 4990
},
{
"epoch": 1.4140271493212668,
"grad_norm": 7.0140228271484375,
"learning_rate": 1.0573152337858221e-05,
"loss": 0.3011,
"step": 5000
},
{
"epoch": 1.4140271493212668,
"eval_accuracy": 0.881880369074133,
"eval_loss": 0.3024204969406128,
"eval_runtime": 126.4534,
"eval_samples_per_second": 99.42,
"eval_steps_per_second": 3.108,
"step": 5000
},
{
"epoch": 1.4168552036199096,
"grad_norm": 5.970915794372559,
"learning_rate": 1.0554298642533938e-05,
"loss": 0.2841,
"step": 5010
},
{
"epoch": 1.419683257918552,
"grad_norm": 4.6934943199157715,
"learning_rate": 1.0535444947209655e-05,
"loss": 0.2451,
"step": 5020
},
{
"epoch": 1.4225113122171946,
"grad_norm": 5.1019978523254395,
"learning_rate": 1.0516591251885371e-05,
"loss": 0.2622,
"step": 5030
},
{
"epoch": 1.4253393665158371,
"grad_norm": 3.4515976905822754,
"learning_rate": 1.0497737556561088e-05,
"loss": 0.3172,
"step": 5040
},
{
"epoch": 1.4281674208144797,
"grad_norm": 4.001848220825195,
"learning_rate": 1.0478883861236805e-05,
"loss": 0.2949,
"step": 5050
},
{
"epoch": 1.4309954751131222,
"grad_norm": 3.414452075958252,
"learning_rate": 1.046003016591252e-05,
"loss": 0.2345,
"step": 5060
},
{
"epoch": 1.4338235294117647,
"grad_norm": 6.0561747550964355,
"learning_rate": 1.0441176470588236e-05,
"loss": 0.3239,
"step": 5070
},
{
"epoch": 1.4366515837104072,
"grad_norm": 2.448591470718384,
"learning_rate": 1.0422322775263953e-05,
"loss": 0.2031,
"step": 5080
},
{
"epoch": 1.4394796380090498,
"grad_norm": 5.490105152130127,
"learning_rate": 1.040346907993967e-05,
"loss": 0.2607,
"step": 5090
},
{
"epoch": 1.4423076923076923,
"grad_norm": 2.7472801208496094,
"learning_rate": 1.0384615384615386e-05,
"loss": 0.2412,
"step": 5100
},
{
"epoch": 1.4451357466063348,
"grad_norm": 4.4468770027160645,
"learning_rate": 1.0365761689291103e-05,
"loss": 0.288,
"step": 5110
},
{
"epoch": 1.4479638009049773,
"grad_norm": 1.942518949508667,
"learning_rate": 1.0346907993966819e-05,
"loss": 0.2592,
"step": 5120
},
{
"epoch": 1.4507918552036199,
"grad_norm": 4.880716800689697,
"learning_rate": 1.0328054298642536e-05,
"loss": 0.2454,
"step": 5130
},
{
"epoch": 1.4536199095022624,
"grad_norm": 3.7106387615203857,
"learning_rate": 1.030920060331825e-05,
"loss": 0.2863,
"step": 5140
},
{
"epoch": 1.456447963800905,
"grad_norm": 5.332839488983154,
"learning_rate": 1.0290346907993967e-05,
"loss": 0.3325,
"step": 5150
},
{
"epoch": 1.4592760180995474,
"grad_norm": 4.884565353393555,
"learning_rate": 1.0271493212669684e-05,
"loss": 0.2284,
"step": 5160
},
{
"epoch": 1.4621040723981902,
"grad_norm": 4.775869846343994,
"learning_rate": 1.0252639517345399e-05,
"loss": 0.1897,
"step": 5170
},
{
"epoch": 1.4649321266968327,
"grad_norm": 2.5493810176849365,
"learning_rate": 1.0233785822021115e-05,
"loss": 0.2919,
"step": 5180
},
{
"epoch": 1.4677601809954752,
"grad_norm": 3.7652482986450195,
"learning_rate": 1.0214932126696832e-05,
"loss": 0.2795,
"step": 5190
},
{
"epoch": 1.4705882352941178,
"grad_norm": 4.398680686950684,
"learning_rate": 1.0196078431372549e-05,
"loss": 0.2685,
"step": 5200
},
{
"epoch": 1.4734162895927603,
"grad_norm": 2.400367498397827,
"learning_rate": 1.0177224736048265e-05,
"loss": 0.251,
"step": 5210
},
{
"epoch": 1.4762443438914028,
"grad_norm": 3.4146950244903564,
"learning_rate": 1.0158371040723982e-05,
"loss": 0.2115,
"step": 5220
},
{
"epoch": 1.4790723981900453,
"grad_norm": 4.488588809967041,
"learning_rate": 1.0139517345399699e-05,
"loss": 0.2405,
"step": 5230
},
{
"epoch": 1.4819004524886878,
"grad_norm": 6.304666996002197,
"learning_rate": 1.0120663650075415e-05,
"loss": 0.3394,
"step": 5240
},
{
"epoch": 1.4847285067873304,
"grad_norm": 2.8380801677703857,
"learning_rate": 1.0101809954751132e-05,
"loss": 0.2637,
"step": 5250
},
{
"epoch": 1.487556561085973,
"grad_norm": 4.873356819152832,
"learning_rate": 1.0082956259426848e-05,
"loss": 0.2652,
"step": 5260
},
{
"epoch": 1.4903846153846154,
"grad_norm": 5.6608123779296875,
"learning_rate": 1.0064102564102565e-05,
"loss": 0.2961,
"step": 5270
},
{
"epoch": 1.493212669683258,
"grad_norm": 4.332230567932129,
"learning_rate": 1.0045248868778282e-05,
"loss": 0.2705,
"step": 5280
},
{
"epoch": 1.4960407239819005,
"grad_norm": 5.802159309387207,
"learning_rate": 1.0026395173453998e-05,
"loss": 0.2599,
"step": 5290
},
{
"epoch": 1.498868778280543,
"grad_norm": 3.019793748855591,
"learning_rate": 1.0007541478129715e-05,
"loss": 0.2623,
"step": 5300
},
{
"epoch": 1.5016968325791855,
"grad_norm": 4.762251377105713,
"learning_rate": 9.988687782805431e-06,
"loss": 0.2585,
"step": 5310
},
{
"epoch": 1.504524886877828,
"grad_norm": 6.202815055847168,
"learning_rate": 9.969834087481146e-06,
"loss": 0.2778,
"step": 5320
},
{
"epoch": 1.5073529411764706,
"grad_norm": 3.872309684753418,
"learning_rate": 9.950980392156863e-06,
"loss": 0.3034,
"step": 5330
},
{
"epoch": 1.510180995475113,
"grad_norm": 4.060298919677734,
"learning_rate": 9.93212669683258e-06,
"loss": 0.2884,
"step": 5340
},
{
"epoch": 1.5130090497737556,
"grad_norm": 2.0391085147857666,
"learning_rate": 9.913273001508296e-06,
"loss": 0.2867,
"step": 5350
},
{
"epoch": 1.5158371040723981,
"grad_norm": 4.735014915466309,
"learning_rate": 9.894419306184013e-06,
"loss": 0.2797,
"step": 5360
},
{
"epoch": 1.5186651583710407,
"grad_norm": 4.086658000946045,
"learning_rate": 9.87556561085973e-06,
"loss": 0.2841,
"step": 5370
},
{
"epoch": 1.5214932126696832,
"grad_norm": 4.3362040519714355,
"learning_rate": 9.856711915535446e-06,
"loss": 0.2566,
"step": 5380
},
{
"epoch": 1.5243212669683257,
"grad_norm": 3.9439034461975098,
"learning_rate": 9.837858220211161e-06,
"loss": 0.2506,
"step": 5390
},
{
"epoch": 1.5271493212669682,
"grad_norm": 4.754290580749512,
"learning_rate": 9.819004524886878e-06,
"loss": 0.2271,
"step": 5400
},
{
"epoch": 1.5299773755656108,
"grad_norm": 4.914488792419434,
"learning_rate": 9.800150829562594e-06,
"loss": 0.3032,
"step": 5410
},
{
"epoch": 1.5328054298642533,
"grad_norm": 3.0046920776367188,
"learning_rate": 9.781297134238311e-06,
"loss": 0.2123,
"step": 5420
},
{
"epoch": 1.5356334841628958,
"grad_norm": 4.0427985191345215,
"learning_rate": 9.762443438914027e-06,
"loss": 0.2621,
"step": 5430
},
{
"epoch": 1.5384615384615383,
"grad_norm": 6.442467212677002,
"learning_rate": 9.743589743589744e-06,
"loss": 0.247,
"step": 5440
},
{
"epoch": 1.5412895927601808,
"grad_norm": 3.7217085361480713,
"learning_rate": 9.72473604826546e-06,
"loss": 0.3037,
"step": 5450
},
{
"epoch": 1.5441176470588234,
"grad_norm": 7.558680534362793,
"learning_rate": 9.705882352941177e-06,
"loss": 0.3076,
"step": 5460
},
{
"epoch": 1.5469457013574661,
"grad_norm": 3.152740240097046,
"learning_rate": 9.687028657616894e-06,
"loss": 0.2778,
"step": 5470
},
{
"epoch": 1.5497737556561086,
"grad_norm": 3.996135711669922,
"learning_rate": 9.66817496229261e-06,
"loss": 0.3243,
"step": 5480
},
{
"epoch": 1.5526018099547512,
"grad_norm": 3.837599039077759,
"learning_rate": 9.649321266968327e-06,
"loss": 0.2284,
"step": 5490
},
{
"epoch": 1.5554298642533937,
"grad_norm": 4.957329750061035,
"learning_rate": 9.630467571644044e-06,
"loss": 0.2585,
"step": 5500
},
{
"epoch": 1.5582579185520362,
"grad_norm": 4.0857133865356445,
"learning_rate": 9.61161387631976e-06,
"loss": 0.2947,
"step": 5510
},
{
"epoch": 1.5610859728506787,
"grad_norm": 5.3217902183532715,
"learning_rate": 9.592760180995477e-06,
"loss": 0.3083,
"step": 5520
},
{
"epoch": 1.5639140271493213,
"grad_norm": 6.3014326095581055,
"learning_rate": 9.573906485671192e-06,
"loss": 0.2514,
"step": 5530
},
{
"epoch": 1.5667420814479638,
"grad_norm": 5.7632670402526855,
"learning_rate": 9.555052790346909e-06,
"loss": 0.2889,
"step": 5540
},
{
"epoch": 1.5695701357466063,
"grad_norm": 3.6774861812591553,
"learning_rate": 9.536199095022625e-06,
"loss": 0.2933,
"step": 5550
},
{
"epoch": 1.5723981900452488,
"grad_norm": 2.207911968231201,
"learning_rate": 9.517345399698342e-06,
"loss": 0.2594,
"step": 5560
},
{
"epoch": 1.5752262443438914,
"grad_norm": 4.789866924285889,
"learning_rate": 9.498491704374058e-06,
"loss": 0.3103,
"step": 5570
},
{
"epoch": 1.5780542986425339,
"grad_norm": 5.097392559051514,
"learning_rate": 9.479638009049773e-06,
"loss": 0.2757,
"step": 5580
},
{
"epoch": 1.5808823529411766,
"grad_norm": 4.389581203460693,
"learning_rate": 9.46078431372549e-06,
"loss": 0.3006,
"step": 5590
},
{
"epoch": 1.5837104072398192,
"grad_norm": 6.803945541381836,
"learning_rate": 9.441930618401207e-06,
"loss": 0.2912,
"step": 5600
},
{
"epoch": 1.5865384615384617,
"grad_norm": 2.0034751892089844,
"learning_rate": 9.423076923076923e-06,
"loss": 0.2173,
"step": 5610
},
{
"epoch": 1.5893665158371042,
"grad_norm": 3.0462636947631836,
"learning_rate": 9.40422322775264e-06,
"loss": 0.3155,
"step": 5620
},
{
"epoch": 1.5921945701357467,
"grad_norm": 6.887737274169922,
"learning_rate": 9.385369532428356e-06,
"loss": 0.238,
"step": 5630
},
{
"epoch": 1.5950226244343892,
"grad_norm": 7.331830978393555,
"learning_rate": 9.366515837104073e-06,
"loss": 0.3028,
"step": 5640
},
{
"epoch": 1.5978506787330318,
"grad_norm": 3.274845600128174,
"learning_rate": 9.34766214177979e-06,
"loss": 0.2585,
"step": 5650
},
{
"epoch": 1.6006787330316743,
"grad_norm": 6.801854133605957,
"learning_rate": 9.328808446455506e-06,
"loss": 0.256,
"step": 5660
},
{
"epoch": 1.6035067873303168,
"grad_norm": 7.7837982177734375,
"learning_rate": 9.309954751131223e-06,
"loss": 0.2289,
"step": 5670
},
{
"epoch": 1.6063348416289593,
"grad_norm": 8.501007080078125,
"learning_rate": 9.29110105580694e-06,
"loss": 0.2766,
"step": 5680
},
{
"epoch": 1.6091628959276019,
"grad_norm": 4.016129493713379,
"learning_rate": 9.272247360482656e-06,
"loss": 0.2611,
"step": 5690
},
{
"epoch": 1.6119909502262444,
"grad_norm": 5.062587738037109,
"learning_rate": 9.253393665158373e-06,
"loss": 0.2142,
"step": 5700
},
{
"epoch": 1.614819004524887,
"grad_norm": 2.5895862579345703,
"learning_rate": 9.23453996983409e-06,
"loss": 0.2681,
"step": 5710
},
{
"epoch": 1.6176470588235294,
"grad_norm": 5.066253662109375,
"learning_rate": 9.215686274509804e-06,
"loss": 0.2604,
"step": 5720
},
{
"epoch": 1.620475113122172,
"grad_norm": 5.256166934967041,
"learning_rate": 9.196832579185521e-06,
"loss": 0.3009,
"step": 5730
},
{
"epoch": 1.6233031674208145,
"grad_norm": 4.829041004180908,
"learning_rate": 9.177978883861237e-06,
"loss": 0.2614,
"step": 5740
},
{
"epoch": 1.626131221719457,
"grad_norm": 4.902761459350586,
"learning_rate": 9.159125188536954e-06,
"loss": 0.2348,
"step": 5750
},
{
"epoch": 1.6289592760180995,
"grad_norm": 5.516357421875,
"learning_rate": 9.14027149321267e-06,
"loss": 0.328,
"step": 5760
},
{
"epoch": 1.631787330316742,
"grad_norm": 3.2983596324920654,
"learning_rate": 9.121417797888387e-06,
"loss": 0.1956,
"step": 5770
},
{
"epoch": 1.6346153846153846,
"grad_norm": 7.548886775970459,
"learning_rate": 9.102564102564104e-06,
"loss": 0.2712,
"step": 5780
},
{
"epoch": 1.637443438914027,
"grad_norm": 4.081298828125,
"learning_rate": 9.083710407239819e-06,
"loss": 0.2726,
"step": 5790
},
{
"epoch": 1.6402714932126696,
"grad_norm": 6.161011695861816,
"learning_rate": 9.064856711915535e-06,
"loss": 0.2995,
"step": 5800
},
{
"epoch": 1.6430995475113122,
"grad_norm": 4.223090171813965,
"learning_rate": 9.046003016591252e-06,
"loss": 0.3111,
"step": 5810
},
{
"epoch": 1.6459276018099547,
"grad_norm": 7.8988728523254395,
"learning_rate": 9.027149321266969e-06,
"loss": 0.2875,
"step": 5820
},
{
"epoch": 1.6487556561085972,
"grad_norm": 2.9701428413391113,
"learning_rate": 9.008295625942685e-06,
"loss": 0.2549,
"step": 5830
},
{
"epoch": 1.6515837104072397,
"grad_norm": 6.37022066116333,
"learning_rate": 8.989441930618402e-06,
"loss": 0.3591,
"step": 5840
},
{
"epoch": 1.6544117647058822,
"grad_norm": 4.708193302154541,
"learning_rate": 8.970588235294119e-06,
"loss": 0.2561,
"step": 5850
},
{
"epoch": 1.6572398190045248,
"grad_norm": 5.106235027313232,
"learning_rate": 8.951734539969835e-06,
"loss": 0.2546,
"step": 5860
},
{
"epoch": 1.6600678733031673,
"grad_norm": 4.135291576385498,
"learning_rate": 8.932880844645552e-06,
"loss": 0.2841,
"step": 5870
},
{
"epoch": 1.6628959276018098,
"grad_norm": 5.418251991271973,
"learning_rate": 8.914027149321268e-06,
"loss": 0.2606,
"step": 5880
},
{
"epoch": 1.6657239819004523,
"grad_norm": 7.133711338043213,
"learning_rate": 8.895173453996983e-06,
"loss": 0.3058,
"step": 5890
},
{
"epoch": 1.6685520361990949,
"grad_norm": 3.556772470474243,
"learning_rate": 8.8763197586727e-06,
"loss": 0.271,
"step": 5900
},
{
"epoch": 1.6713800904977374,
"grad_norm": 4.334698677062988,
"learning_rate": 8.857466063348417e-06,
"loss": 0.2695,
"step": 5910
},
{
"epoch": 1.6742081447963801,
"grad_norm": 5.072098731994629,
"learning_rate": 8.838612368024133e-06,
"loss": 0.2845,
"step": 5920
},
{
"epoch": 1.6770361990950227,
"grad_norm": 5.321040630340576,
"learning_rate": 8.81975867269985e-06,
"loss": 0.3065,
"step": 5930
},
{
"epoch": 1.6798642533936652,
"grad_norm": 3.292698860168457,
"learning_rate": 8.800904977375566e-06,
"loss": 0.2547,
"step": 5940
},
{
"epoch": 1.6826923076923077,
"grad_norm": 8.568231582641602,
"learning_rate": 8.782051282051283e-06,
"loss": 0.2547,
"step": 5950
},
{
"epoch": 1.6855203619909502,
"grad_norm": 5.787846088409424,
"learning_rate": 8.763197586727e-06,
"loss": 0.3768,
"step": 5960
},
{
"epoch": 1.6883484162895928,
"grad_norm": 4.789765357971191,
"learning_rate": 8.744343891402716e-06,
"loss": 0.2252,
"step": 5970
},
{
"epoch": 1.6911764705882353,
"grad_norm": 6.947218418121338,
"learning_rate": 8.725490196078433e-06,
"loss": 0.2309,
"step": 5980
},
{
"epoch": 1.6940045248868778,
"grad_norm": 3.733675956726074,
"learning_rate": 8.70663650075415e-06,
"loss": 0.2422,
"step": 5990
},
{
"epoch": 1.6968325791855203,
"grad_norm": 4.800724506378174,
"learning_rate": 8.687782805429864e-06,
"loss": 0.2322,
"step": 6000
},
{
"epoch": 1.6968325791855203,
"eval_accuracy": 0.8820394527521477,
"eval_loss": 0.29341939091682434,
"eval_runtime": 126.3457,
"eval_samples_per_second": 99.505,
"eval_steps_per_second": 3.111,
"step": 6000
},
{
"epoch": 1.6996606334841629,
"grad_norm": 2.899115800857544,
"learning_rate": 8.668929110105581e-06,
"loss": 0.2858,
"step": 6010
},
{
"epoch": 1.7024886877828054,
"grad_norm": 5.119002819061279,
"learning_rate": 8.650075414781298e-06,
"loss": 0.2373,
"step": 6020
},
{
"epoch": 1.7053167420814481,
"grad_norm": 4.328557968139648,
"learning_rate": 8.631221719457014e-06,
"loss": 0.2857,
"step": 6030
},
{
"epoch": 1.7081447963800906,
"grad_norm": 6.154530048370361,
"learning_rate": 8.612368024132731e-06,
"loss": 0.2563,
"step": 6040
},
{
"epoch": 1.7109728506787332,
"grad_norm": 2.4150142669677734,
"learning_rate": 8.593514328808446e-06,
"loss": 0.2766,
"step": 6050
},
{
"epoch": 1.7138009049773757,
"grad_norm": 5.834397315979004,
"learning_rate": 8.574660633484162e-06,
"loss": 0.2804,
"step": 6060
},
{
"epoch": 1.7166289592760182,
"grad_norm": 5.142675876617432,
"learning_rate": 8.555806938159879e-06,
"loss": 0.2573,
"step": 6070
},
{
"epoch": 1.7194570135746607,
"grad_norm": 4.238577842712402,
"learning_rate": 8.536953242835596e-06,
"loss": 0.235,
"step": 6080
},
{
"epoch": 1.7222850678733033,
"grad_norm": 4.491209506988525,
"learning_rate": 8.518099547511312e-06,
"loss": 0.2605,
"step": 6090
},
{
"epoch": 1.7251131221719458,
"grad_norm": 5.393953323364258,
"learning_rate": 8.499245852187029e-06,
"loss": 0.2804,
"step": 6100
},
{
"epoch": 1.7279411764705883,
"grad_norm": 4.455014228820801,
"learning_rate": 8.480392156862745e-06,
"loss": 0.2453,
"step": 6110
},
{
"epoch": 1.7307692307692308,
"grad_norm": 4.781386375427246,
"learning_rate": 8.461538461538462e-06,
"loss": 0.2194,
"step": 6120
},
{
"epoch": 1.7335972850678734,
"grad_norm": 5.215591907501221,
"learning_rate": 8.442684766214179e-06,
"loss": 0.2602,
"step": 6130
},
{
"epoch": 1.7364253393665159,
"grad_norm": 5.542301654815674,
"learning_rate": 8.423831070889895e-06,
"loss": 0.3245,
"step": 6140
},
{
"epoch": 1.7392533936651584,
"grad_norm": 2.144392967224121,
"learning_rate": 8.404977375565612e-06,
"loss": 0.2445,
"step": 6150
},
{
"epoch": 1.742081447963801,
"grad_norm": 3.160285711288452,
"learning_rate": 8.386123680241329e-06,
"loss": 0.2702,
"step": 6160
},
{
"epoch": 1.7449095022624435,
"grad_norm": 4.129340171813965,
"learning_rate": 8.367269984917045e-06,
"loss": 0.2924,
"step": 6170
},
{
"epoch": 1.747737556561086,
"grad_norm": 4.408333778381348,
"learning_rate": 8.348416289592762e-06,
"loss": 0.2364,
"step": 6180
},
{
"epoch": 1.7505656108597285,
"grad_norm": 5.696101188659668,
"learning_rate": 8.329562594268478e-06,
"loss": 0.2445,
"step": 6190
},
{
"epoch": 1.753393665158371,
"grad_norm": 4.723424434661865,
"learning_rate": 8.310708898944195e-06,
"loss": 0.2284,
"step": 6200
},
{
"epoch": 1.7562217194570136,
"grad_norm": 4.272291660308838,
"learning_rate": 8.29185520361991e-06,
"loss": 0.3189,
"step": 6210
},
{
"epoch": 1.759049773755656,
"grad_norm": 4.042122840881348,
"learning_rate": 8.273001508295627e-06,
"loss": 0.2649,
"step": 6220
},
{
"epoch": 1.7618778280542986,
"grad_norm": 1.9126514196395874,
"learning_rate": 8.254147812971343e-06,
"loss": 0.255,
"step": 6230
},
{
"epoch": 1.7647058823529411,
"grad_norm": 11.250100135803223,
"learning_rate": 8.23529411764706e-06,
"loss": 0.302,
"step": 6240
},
{
"epoch": 1.7675339366515836,
"grad_norm": 4.978902816772461,
"learning_rate": 8.216440422322776e-06,
"loss": 0.3068,
"step": 6250
},
{
"epoch": 1.7703619909502262,
"grad_norm": 4.657087802886963,
"learning_rate": 8.197586726998491e-06,
"loss": 0.2641,
"step": 6260
},
{
"epoch": 1.7731900452488687,
"grad_norm": 4.440770626068115,
"learning_rate": 8.178733031674208e-06,
"loss": 0.2831,
"step": 6270
},
{
"epoch": 1.7760180995475112,
"grad_norm": 2.723531484603882,
"learning_rate": 8.159879336349925e-06,
"loss": 0.2247,
"step": 6280
},
{
"epoch": 1.7788461538461537,
"grad_norm": 4.28981351852417,
"learning_rate": 8.141025641025641e-06,
"loss": 0.2403,
"step": 6290
},
{
"epoch": 1.7816742081447963,
"grad_norm": 4.748565673828125,
"learning_rate": 8.122171945701358e-06,
"loss": 0.2126,
"step": 6300
},
{
"epoch": 1.7845022624434388,
"grad_norm": 5.226318359375,
"learning_rate": 8.103318250377074e-06,
"loss": 0.3272,
"step": 6310
},
{
"epoch": 1.7873303167420813,
"grad_norm": 2.937812089920044,
"learning_rate": 8.084464555052791e-06,
"loss": 0.2276,
"step": 6320
},
{
"epoch": 1.7901583710407238,
"grad_norm": 3.215853452682495,
"learning_rate": 8.065610859728508e-06,
"loss": 0.2406,
"step": 6330
},
{
"epoch": 1.7929864253393664,
"grad_norm": 6.499160289764404,
"learning_rate": 8.046757164404224e-06,
"loss": 0.2915,
"step": 6340
},
{
"epoch": 1.7958144796380089,
"grad_norm": 3.940803289413452,
"learning_rate": 8.027903469079941e-06,
"loss": 0.276,
"step": 6350
},
{
"epoch": 1.7986425339366516,
"grad_norm": 2.177950859069824,
"learning_rate": 8.009049773755657e-06,
"loss": 0.2558,
"step": 6360
},
{
"epoch": 1.8014705882352942,
"grad_norm": 7.705915451049805,
"learning_rate": 7.990196078431374e-06,
"loss": 0.2799,
"step": 6370
},
{
"epoch": 1.8042986425339367,
"grad_norm": 5.586729526519775,
"learning_rate": 7.97134238310709e-06,
"loss": 0.224,
"step": 6380
},
{
"epoch": 1.8071266968325792,
"grad_norm": 2.9311821460723877,
"learning_rate": 7.952488687782806e-06,
"loss": 0.2316,
"step": 6390
},
{
"epoch": 1.8099547511312217,
"grad_norm": 3.5633130073547363,
"learning_rate": 7.933634992458522e-06,
"loss": 0.2298,
"step": 6400
},
{
"epoch": 1.8127828054298643,
"grad_norm": 3.4238994121551514,
"learning_rate": 7.914781297134239e-06,
"loss": 0.2647,
"step": 6410
},
{
"epoch": 1.8156108597285068,
"grad_norm": 9.544416427612305,
"learning_rate": 7.895927601809955e-06,
"loss": 0.3275,
"step": 6420
},
{
"epoch": 1.8184389140271493,
"grad_norm": 2.8148701190948486,
"learning_rate": 7.877073906485672e-06,
"loss": 0.2131,
"step": 6430
},
{
"epoch": 1.8212669683257918,
"grad_norm": 5.6752777099609375,
"learning_rate": 7.858220211161389e-06,
"loss": 0.3065,
"step": 6440
},
{
"epoch": 1.8240950226244343,
"grad_norm": 6.207758903503418,
"learning_rate": 7.839366515837105e-06,
"loss": 0.3369,
"step": 6450
},
{
"epoch": 1.8269230769230769,
"grad_norm": 2.1755306720733643,
"learning_rate": 7.820512820512822e-06,
"loss": 0.24,
"step": 6460
},
{
"epoch": 1.8297511312217196,
"grad_norm": 4.380761623382568,
"learning_rate": 7.801659125188537e-06,
"loss": 0.2621,
"step": 6470
},
{
"epoch": 1.8325791855203621,
"grad_norm": 7.944891452789307,
"learning_rate": 7.782805429864253e-06,
"loss": 0.2421,
"step": 6480
},
{
"epoch": 1.8354072398190047,
"grad_norm": 6.696594715118408,
"learning_rate": 7.76395173453997e-06,
"loss": 0.2179,
"step": 6490
},
{
"epoch": 1.8382352941176472,
"grad_norm": 5.534007549285889,
"learning_rate": 7.745098039215687e-06,
"loss": 0.2465,
"step": 6500
},
{
"epoch": 1.8410633484162897,
"grad_norm": 4.6053290367126465,
"learning_rate": 7.726244343891403e-06,
"loss": 0.3311,
"step": 6510
},
{
"epoch": 1.8438914027149322,
"grad_norm": 3.2913260459899902,
"learning_rate": 7.70739064856712e-06,
"loss": 0.2535,
"step": 6520
},
{
"epoch": 1.8467194570135748,
"grad_norm": 5.70173454284668,
"learning_rate": 7.688536953242837e-06,
"loss": 0.2283,
"step": 6530
},
{
"epoch": 1.8495475113122173,
"grad_norm": 6.683012962341309,
"learning_rate": 7.669683257918553e-06,
"loss": 0.2293,
"step": 6540
},
{
"epoch": 1.8523755656108598,
"grad_norm": 4.2895612716674805,
"learning_rate": 7.650829562594268e-06,
"loss": 0.2013,
"step": 6550
},
{
"epoch": 1.8552036199095023,
"grad_norm": 2.8891239166259766,
"learning_rate": 7.631975867269985e-06,
"loss": 0.2482,
"step": 6560
},
{
"epoch": 1.8580316742081449,
"grad_norm": 5.462761402130127,
"learning_rate": 7.613122171945701e-06,
"loss": 0.3063,
"step": 6570
},
{
"epoch": 1.8608597285067874,
"grad_norm": 4.3543806076049805,
"learning_rate": 7.594268476621418e-06,
"loss": 0.2519,
"step": 6580
},
{
"epoch": 1.86368778280543,
"grad_norm": 5.1229681968688965,
"learning_rate": 7.5754147812971346e-06,
"loss": 0.2968,
"step": 6590
},
{
"epoch": 1.8665158371040724,
"grad_norm": 1.8585267066955566,
"learning_rate": 7.556561085972851e-06,
"loss": 0.2208,
"step": 6600
},
{
"epoch": 1.869343891402715,
"grad_norm": 4.255302429199219,
"learning_rate": 7.537707390648568e-06,
"loss": 0.2968,
"step": 6610
},
{
"epoch": 1.8721719457013575,
"grad_norm": 4.815881729125977,
"learning_rate": 7.518853695324284e-06,
"loss": 0.3433,
"step": 6620
},
{
"epoch": 1.875,
"grad_norm": 6.812479496002197,
"learning_rate": 7.500000000000001e-06,
"loss": 0.311,
"step": 6630
},
{
"epoch": 1.8778280542986425,
"grad_norm": 3.9199917316436768,
"learning_rate": 7.481146304675717e-06,
"loss": 0.2767,
"step": 6640
},
{
"epoch": 1.880656108597285,
"grad_norm": 4.117010593414307,
"learning_rate": 7.462292609351433e-06,
"loss": 0.2858,
"step": 6650
},
{
"epoch": 1.8834841628959276,
"grad_norm": 4.636374473571777,
"learning_rate": 7.44343891402715e-06,
"loss": 0.2043,
"step": 6660
},
{
"epoch": 1.88631221719457,
"grad_norm": 5.478713512420654,
"learning_rate": 7.424585218702867e-06,
"loss": 0.288,
"step": 6670
},
{
"epoch": 1.8891402714932126,
"grad_norm": 4.690084457397461,
"learning_rate": 7.405731523378583e-06,
"loss": 0.2651,
"step": 6680
},
{
"epoch": 1.8919683257918551,
"grad_norm": 2.4495575428009033,
"learning_rate": 7.3868778280543e-06,
"loss": 0.2651,
"step": 6690
},
{
"epoch": 1.8947963800904977,
"grad_norm": 5.4684672355651855,
"learning_rate": 7.3680241327300165e-06,
"loss": 0.2834,
"step": 6700
},
{
"epoch": 1.8976244343891402,
"grad_norm": 1.9919039011001587,
"learning_rate": 7.349170437405732e-06,
"loss": 0.2021,
"step": 6710
},
{
"epoch": 1.9004524886877827,
"grad_norm": 4.975834846496582,
"learning_rate": 7.330316742081448e-06,
"loss": 0.3194,
"step": 6720
},
{
"epoch": 1.9032805429864252,
"grad_norm": 4.014176368713379,
"learning_rate": 7.311463046757165e-06,
"loss": 0.2251,
"step": 6730
},
{
"epoch": 1.9061085972850678,
"grad_norm": 7.0189409255981445,
"learning_rate": 7.292609351432881e-06,
"loss": 0.3062,
"step": 6740
},
{
"epoch": 1.9089366515837103,
"grad_norm": 7.0651350021362305,
"learning_rate": 7.273755656108598e-06,
"loss": 0.2488,
"step": 6750
},
{
"epoch": 1.9117647058823528,
"grad_norm": 7.110829830169678,
"learning_rate": 7.2549019607843145e-06,
"loss": 0.2226,
"step": 6760
},
{
"epoch": 1.9145927601809953,
"grad_norm": 8.122304916381836,
"learning_rate": 7.23604826546003e-06,
"loss": 0.2236,
"step": 6770
},
{
"epoch": 1.9174208144796379,
"grad_norm": 4.817609786987305,
"learning_rate": 7.217194570135747e-06,
"loss": 0.2935,
"step": 6780
},
{
"epoch": 1.9202488687782804,
"grad_norm": 3.6452667713165283,
"learning_rate": 7.1983408748114635e-06,
"loss": 0.2711,
"step": 6790
},
{
"epoch": 1.9230769230769231,
"grad_norm": 5.04451847076416,
"learning_rate": 7.17948717948718e-06,
"loss": 0.3383,
"step": 6800
},
{
"epoch": 1.9259049773755657,
"grad_norm": 3.0769617557525635,
"learning_rate": 7.160633484162897e-06,
"loss": 0.2481,
"step": 6810
},
{
"epoch": 1.9287330316742082,
"grad_norm": 2.4666669368743896,
"learning_rate": 7.141779788838613e-06,
"loss": 0.2713,
"step": 6820
},
{
"epoch": 1.9315610859728507,
"grad_norm": 6.22195291519165,
"learning_rate": 7.12292609351433e-06,
"loss": 0.253,
"step": 6830
},
{
"epoch": 1.9343891402714932,
"grad_norm": 5.916505336761475,
"learning_rate": 7.104072398190046e-06,
"loss": 0.3023,
"step": 6840
},
{
"epoch": 1.9372171945701357,
"grad_norm": 3.696983575820923,
"learning_rate": 7.085218702865762e-06,
"loss": 0.3176,
"step": 6850
},
{
"epoch": 1.9400452488687783,
"grad_norm": 4.350560665130615,
"learning_rate": 7.066365007541479e-06,
"loss": 0.2488,
"step": 6860
},
{
"epoch": 1.9428733031674208,
"grad_norm": 4.9616498947143555,
"learning_rate": 7.047511312217196e-06,
"loss": 0.2901,
"step": 6870
},
{
"epoch": 1.9457013574660633,
"grad_norm": 2.2549595832824707,
"learning_rate": 7.028657616892911e-06,
"loss": 0.2526,
"step": 6880
},
{
"epoch": 1.9485294117647058,
"grad_norm": 3.205310821533203,
"learning_rate": 7.009803921568628e-06,
"loss": 0.2819,
"step": 6890
},
{
"epoch": 1.9513574660633484,
"grad_norm": 5.102742671966553,
"learning_rate": 6.990950226244344e-06,
"loss": 0.2573,
"step": 6900
},
{
"epoch": 1.9541855203619911,
"grad_norm": 2.78604793548584,
"learning_rate": 6.97209653092006e-06,
"loss": 0.1702,
"step": 6910
},
{
"epoch": 1.9570135746606336,
"grad_norm": 3.8111801147460938,
"learning_rate": 6.953242835595777e-06,
"loss": 0.2963,
"step": 6920
},
{
"epoch": 1.9598416289592762,
"grad_norm": 4.204692363739014,
"learning_rate": 6.934389140271494e-06,
"loss": 0.2989,
"step": 6930
},
{
"epoch": 1.9626696832579187,
"grad_norm": 3.3682045936584473,
"learning_rate": 6.91553544494721e-06,
"loss": 0.2744,
"step": 6940
},
{
"epoch": 1.9654977375565612,
"grad_norm": 5.661670207977295,
"learning_rate": 6.896681749622927e-06,
"loss": 0.27,
"step": 6950
},
{
"epoch": 1.9683257918552037,
"grad_norm": 3.925750494003296,
"learning_rate": 6.8778280542986434e-06,
"loss": 0.2711,
"step": 6960
},
{
"epoch": 1.9711538461538463,
"grad_norm": 5.467376232147217,
"learning_rate": 6.858974358974359e-06,
"loss": 0.3182,
"step": 6970
},
{
"epoch": 1.9739819004524888,
"grad_norm": 7.46327543258667,
"learning_rate": 6.840120663650076e-06,
"loss": 0.336,
"step": 6980
},
{
"epoch": 1.9768099547511313,
"grad_norm": 4.464349269866943,
"learning_rate": 6.8212669683257924e-06,
"loss": 0.333,
"step": 6990
},
{
"epoch": 1.9796380090497738,
"grad_norm": 5.0763421058654785,
"learning_rate": 6.802413273001509e-06,
"loss": 0.2332,
"step": 7000
},
{
"epoch": 1.9796380090497738,
"eval_accuracy": 0.8868119630925867,
"eval_loss": 0.2794936001300812,
"eval_runtime": 126.4211,
"eval_samples_per_second": 99.445,
"eval_steps_per_second": 3.109,
"step": 7000
},
{
"epoch": 1.9824660633484164,
"grad_norm": 4.514822483062744,
"learning_rate": 6.783559577677226e-06,
"loss": 0.3259,
"step": 7010
},
{
"epoch": 1.9852941176470589,
"grad_norm": 3.9309160709381104,
"learning_rate": 6.764705882352942e-06,
"loss": 0.2671,
"step": 7020
},
{
"epoch": 1.9881221719457014,
"grad_norm": 3.7512924671173096,
"learning_rate": 6.745852187028659e-06,
"loss": 0.3025,
"step": 7030
},
{
"epoch": 1.990950226244344,
"grad_norm": 5.162522792816162,
"learning_rate": 6.7269984917043755e-06,
"loss": 0.2556,
"step": 7040
},
{
"epoch": 1.9937782805429864,
"grad_norm": 5.968090534210205,
"learning_rate": 6.7081447963800904e-06,
"loss": 0.245,
"step": 7050
},
{
"epoch": 1.996606334841629,
"grad_norm": 7.264348983764648,
"learning_rate": 6.689291101055807e-06,
"loss": 0.274,
"step": 7060
},
{
"epoch": 1.9994343891402715,
"grad_norm": 4.840837478637695,
"learning_rate": 6.670437405731524e-06,
"loss": 0.2381,
"step": 7070
},
{
"epoch": 2.002262443438914,
"grad_norm": 3.3212857246398926,
"learning_rate": 6.65158371040724e-06,
"loss": 0.2576,
"step": 7080
},
{
"epoch": 2.0050904977375565,
"grad_norm": 6.3086419105529785,
"learning_rate": 6.632730015082957e-06,
"loss": 0.2471,
"step": 7090
},
{
"epoch": 2.007918552036199,
"grad_norm": 2.5110299587249756,
"learning_rate": 6.613876319758673e-06,
"loss": 0.2414,
"step": 7100
},
{
"epoch": 2.0107466063348416,
"grad_norm": 4.115811824798584,
"learning_rate": 6.595022624434389e-06,
"loss": 0.1715,
"step": 7110
},
{
"epoch": 2.013574660633484,
"grad_norm": 5.045820236206055,
"learning_rate": 6.576168929110106e-06,
"loss": 0.2494,
"step": 7120
},
{
"epoch": 2.0164027149321266,
"grad_norm": 4.6321845054626465,
"learning_rate": 6.5573152337858225e-06,
"loss": 0.222,
"step": 7130
},
{
"epoch": 2.019230769230769,
"grad_norm": 5.135430335998535,
"learning_rate": 6.538461538461539e-06,
"loss": 0.2206,
"step": 7140
},
{
"epoch": 2.0220588235294117,
"grad_norm": 4.786893367767334,
"learning_rate": 6.519607843137256e-06,
"loss": 0.229,
"step": 7150
},
{
"epoch": 2.024886877828054,
"grad_norm": 3.568856716156006,
"learning_rate": 6.500754147812972e-06,
"loss": 0.2235,
"step": 7160
},
{
"epoch": 2.0277149321266967,
"grad_norm": 6.938755989074707,
"learning_rate": 6.481900452488689e-06,
"loss": 0.23,
"step": 7170
},
{
"epoch": 2.0305429864253393,
"grad_norm": 4.014111042022705,
"learning_rate": 6.463046757164405e-06,
"loss": 0.2076,
"step": 7180
},
{
"epoch": 2.033371040723982,
"grad_norm": 5.143094062805176,
"learning_rate": 6.444193061840121e-06,
"loss": 0.3276,
"step": 7190
},
{
"epoch": 2.0361990950226243,
"grad_norm": 4.8052191734313965,
"learning_rate": 6.425339366515838e-06,
"loss": 0.2223,
"step": 7200
},
{
"epoch": 2.039027149321267,
"grad_norm": 6.07175874710083,
"learning_rate": 6.406485671191555e-06,
"loss": 0.2514,
"step": 7210
},
{
"epoch": 2.0418552036199094,
"grad_norm": 3.0855891704559326,
"learning_rate": 6.38763197586727e-06,
"loss": 0.2043,
"step": 7220
},
{
"epoch": 2.044683257918552,
"grad_norm": 5.760570049285889,
"learning_rate": 6.368778280542986e-06,
"loss": 0.2051,
"step": 7230
},
{
"epoch": 2.0475113122171944,
"grad_norm": 5.127667427062988,
"learning_rate": 6.349924585218703e-06,
"loss": 0.2141,
"step": 7240
},
{
"epoch": 2.050339366515837,
"grad_norm": 2.886842727661133,
"learning_rate": 6.331070889894419e-06,
"loss": 0.1705,
"step": 7250
},
{
"epoch": 2.0531674208144794,
"grad_norm": 5.108696937561035,
"learning_rate": 6.312217194570136e-06,
"loss": 0.2737,
"step": 7260
},
{
"epoch": 2.055995475113122,
"grad_norm": 7.453789234161377,
"learning_rate": 6.293363499245853e-06,
"loss": 0.288,
"step": 7270
},
{
"epoch": 2.0588235294117645,
"grad_norm": 3.700695514678955,
"learning_rate": 6.274509803921569e-06,
"loss": 0.2087,
"step": 7280
},
{
"epoch": 2.0616515837104075,
"grad_norm": 3.475170612335205,
"learning_rate": 6.255656108597286e-06,
"loss": 0.182,
"step": 7290
},
{
"epoch": 2.06447963800905,
"grad_norm": 3.636042833328247,
"learning_rate": 6.2368024132730024e-06,
"loss": 0.1856,
"step": 7300
},
{
"epoch": 2.0673076923076925,
"grad_norm": 4.326310157775879,
"learning_rate": 6.217948717948718e-06,
"loss": 0.2071,
"step": 7310
},
{
"epoch": 2.070135746606335,
"grad_norm": 4.5239105224609375,
"learning_rate": 6.199095022624435e-06,
"loss": 0.2045,
"step": 7320
},
{
"epoch": 2.0729638009049776,
"grad_norm": 5.962629318237305,
"learning_rate": 6.1802413273001514e-06,
"loss": 0.2236,
"step": 7330
},
{
"epoch": 2.07579185520362,
"grad_norm": 6.830577373504639,
"learning_rate": 6.161387631975868e-06,
"loss": 0.2435,
"step": 7340
},
{
"epoch": 2.0786199095022626,
"grad_norm": 6.650877952575684,
"learning_rate": 6.142533936651585e-06,
"loss": 0.2273,
"step": 7350
},
{
"epoch": 2.081447963800905,
"grad_norm": 9.387392044067383,
"learning_rate": 6.123680241327301e-06,
"loss": 0.2265,
"step": 7360
},
{
"epoch": 2.0842760180995477,
"grad_norm": 7.404173374176025,
"learning_rate": 6.104826546003018e-06,
"loss": 0.1513,
"step": 7370
},
{
"epoch": 2.08710407239819,
"grad_norm": 3.4944663047790527,
"learning_rate": 6.085972850678733e-06,
"loss": 0.2339,
"step": 7380
},
{
"epoch": 2.0899321266968327,
"grad_norm": 3.5213699340820312,
"learning_rate": 6.0671191553544494e-06,
"loss": 0.2839,
"step": 7390
},
{
"epoch": 2.0927601809954752,
"grad_norm": 4.182003974914551,
"learning_rate": 6.048265460030166e-06,
"loss": 0.2125,
"step": 7400
},
{
"epoch": 2.0955882352941178,
"grad_norm": 6.472683429718018,
"learning_rate": 6.029411764705883e-06,
"loss": 0.1934,
"step": 7410
},
{
"epoch": 2.0984162895927603,
"grad_norm": 3.89056658744812,
"learning_rate": 6.010558069381599e-06,
"loss": 0.1829,
"step": 7420
},
{
"epoch": 2.101244343891403,
"grad_norm": 6.370733261108398,
"learning_rate": 5.991704374057316e-06,
"loss": 0.1888,
"step": 7430
},
{
"epoch": 2.1040723981900453,
"grad_norm": 6.549925327301025,
"learning_rate": 5.972850678733032e-06,
"loss": 0.2399,
"step": 7440
},
{
"epoch": 2.106900452488688,
"grad_norm": 6.536769866943359,
"learning_rate": 5.953996983408748e-06,
"loss": 0.2937,
"step": 7450
},
{
"epoch": 2.1097285067873304,
"grad_norm": 5.718851566314697,
"learning_rate": 5.935143288084465e-06,
"loss": 0.1983,
"step": 7460
},
{
"epoch": 2.112556561085973,
"grad_norm": 6.838066577911377,
"learning_rate": 5.9162895927601815e-06,
"loss": 0.2941,
"step": 7470
},
{
"epoch": 2.1153846153846154,
"grad_norm": 3.4056811332702637,
"learning_rate": 5.897435897435898e-06,
"loss": 0.2191,
"step": 7480
},
{
"epoch": 2.118212669683258,
"grad_norm": 5.439931392669678,
"learning_rate": 5.878582202111615e-06,
"loss": 0.2095,
"step": 7490
},
{
"epoch": 2.1210407239819005,
"grad_norm": 6.081836700439453,
"learning_rate": 5.859728506787331e-06,
"loss": 0.1964,
"step": 7500
},
{
"epoch": 2.123868778280543,
"grad_norm": 2.3146896362304688,
"learning_rate": 5.840874811463048e-06,
"loss": 0.266,
"step": 7510
},
{
"epoch": 2.1266968325791855,
"grad_norm": 2.6987674236297607,
"learning_rate": 5.822021116138764e-06,
"loss": 0.2508,
"step": 7520
},
{
"epoch": 2.129524886877828,
"grad_norm": 4.278384208679199,
"learning_rate": 5.80316742081448e-06,
"loss": 0.1764,
"step": 7530
},
{
"epoch": 2.1323529411764706,
"grad_norm": 6.95686674118042,
"learning_rate": 5.784313725490197e-06,
"loss": 0.274,
"step": 7540
},
{
"epoch": 2.135180995475113,
"grad_norm": 3.3586158752441406,
"learning_rate": 5.765460030165913e-06,
"loss": 0.2624,
"step": 7550
},
{
"epoch": 2.1380090497737556,
"grad_norm": 3.704134702682495,
"learning_rate": 5.746606334841629e-06,
"loss": 0.2229,
"step": 7560
},
{
"epoch": 2.140837104072398,
"grad_norm": 6.012093544006348,
"learning_rate": 5.727752639517345e-06,
"loss": 0.2215,
"step": 7570
},
{
"epoch": 2.1436651583710407,
"grad_norm": 4.300053596496582,
"learning_rate": 5.708898944193062e-06,
"loss": 0.199,
"step": 7580
},
{
"epoch": 2.146493212669683,
"grad_norm": 7.028651714324951,
"learning_rate": 5.690045248868778e-06,
"loss": 0.22,
"step": 7590
},
{
"epoch": 2.1493212669683257,
"grad_norm": 5.363503456115723,
"learning_rate": 5.671191553544495e-06,
"loss": 0.1895,
"step": 7600
},
{
"epoch": 2.1521493212669682,
"grad_norm": 4.580994129180908,
"learning_rate": 5.652337858220212e-06,
"loss": 0.1713,
"step": 7610
},
{
"epoch": 2.1549773755656108,
"grad_norm": 7.074058532714844,
"learning_rate": 5.633484162895928e-06,
"loss": 0.2861,
"step": 7620
},
{
"epoch": 2.1578054298642533,
"grad_norm": 6.180254936218262,
"learning_rate": 5.614630467571645e-06,
"loss": 0.2316,
"step": 7630
},
{
"epoch": 2.160633484162896,
"grad_norm": 9.370762825012207,
"learning_rate": 5.5957767722473614e-06,
"loss": 0.2717,
"step": 7640
},
{
"epoch": 2.1634615384615383,
"grad_norm": 4.996572017669678,
"learning_rate": 5.576923076923077e-06,
"loss": 0.2513,
"step": 7650
},
{
"epoch": 2.166289592760181,
"grad_norm": 6.018435478210449,
"learning_rate": 5.558069381598794e-06,
"loss": 0.2279,
"step": 7660
},
{
"epoch": 2.1691176470588234,
"grad_norm": 4.290647983551025,
"learning_rate": 5.5392156862745104e-06,
"loss": 0.2459,
"step": 7670
},
{
"epoch": 2.171945701357466,
"grad_norm": 3.902825117111206,
"learning_rate": 5.520361990950227e-06,
"loss": 0.2181,
"step": 7680
},
{
"epoch": 2.1747737556561084,
"grad_norm": 2.4550859928131104,
"learning_rate": 5.501508295625944e-06,
"loss": 0.2309,
"step": 7690
},
{
"epoch": 2.177601809954751,
"grad_norm": 3.8267788887023926,
"learning_rate": 5.48265460030166e-06,
"loss": 0.2444,
"step": 7700
},
{
"epoch": 2.1804298642533935,
"grad_norm": 2.1368167400360107,
"learning_rate": 5.463800904977375e-06,
"loss": 0.2044,
"step": 7710
},
{
"epoch": 2.183257918552036,
"grad_norm": 4.121007919311523,
"learning_rate": 5.444947209653092e-06,
"loss": 0.193,
"step": 7720
},
{
"epoch": 2.1860859728506785,
"grad_norm": 1.0247951745986938,
"learning_rate": 5.4260935143288084e-06,
"loss": 0.2452,
"step": 7730
},
{
"epoch": 2.1889140271493215,
"grad_norm": 6.7461323738098145,
"learning_rate": 5.407239819004525e-06,
"loss": 0.2341,
"step": 7740
},
{
"epoch": 2.191742081447964,
"grad_norm": 3.962465286254883,
"learning_rate": 5.388386123680242e-06,
"loss": 0.1699,
"step": 7750
},
{
"epoch": 2.1945701357466065,
"grad_norm": 3.7287843227386475,
"learning_rate": 5.369532428355958e-06,
"loss": 0.1768,
"step": 7760
},
{
"epoch": 2.197398190045249,
"grad_norm": 3.93239426612854,
"learning_rate": 5.350678733031675e-06,
"loss": 0.2383,
"step": 7770
},
{
"epoch": 2.2002262443438916,
"grad_norm": 5.207613468170166,
"learning_rate": 5.331825037707391e-06,
"loss": 0.2282,
"step": 7780
},
{
"epoch": 2.203054298642534,
"grad_norm": 3.9662837982177734,
"learning_rate": 5.312971342383107e-06,
"loss": 0.1616,
"step": 7790
},
{
"epoch": 2.2058823529411766,
"grad_norm": 4.898771286010742,
"learning_rate": 5.294117647058824e-06,
"loss": 0.2013,
"step": 7800
},
{
"epoch": 2.208710407239819,
"grad_norm": 7.645010948181152,
"learning_rate": 5.2752639517345405e-06,
"loss": 0.2478,
"step": 7810
},
{
"epoch": 2.2115384615384617,
"grad_norm": 2.4150936603546143,
"learning_rate": 5.256410256410257e-06,
"loss": 0.1754,
"step": 7820
},
{
"epoch": 2.214366515837104,
"grad_norm": 1.881043791770935,
"learning_rate": 5.237556561085974e-06,
"loss": 0.264,
"step": 7830
},
{
"epoch": 2.2171945701357467,
"grad_norm": 6.877952575683594,
"learning_rate": 5.21870286576169e-06,
"loss": 0.2879,
"step": 7840
},
{
"epoch": 2.2200226244343892,
"grad_norm": 3.3370893001556396,
"learning_rate": 5.199849170437406e-06,
"loss": 0.2312,
"step": 7850
},
{
"epoch": 2.2228506787330318,
"grad_norm": 4.1501545906066895,
"learning_rate": 5.180995475113123e-06,
"loss": 0.2129,
"step": 7860
},
{
"epoch": 2.2256787330316743,
"grad_norm": 4.085570335388184,
"learning_rate": 5.162141779788839e-06,
"loss": 0.1647,
"step": 7870
},
{
"epoch": 2.228506787330317,
"grad_norm": 4.05198335647583,
"learning_rate": 5.143288084464555e-06,
"loss": 0.2338,
"step": 7880
},
{
"epoch": 2.2313348416289593,
"grad_norm": 3.9560508728027344,
"learning_rate": 5.124434389140272e-06,
"loss": 0.3062,
"step": 7890
},
{
"epoch": 2.234162895927602,
"grad_norm": 2.1549770832061768,
"learning_rate": 5.105580693815988e-06,
"loss": 0.2259,
"step": 7900
},
{
"epoch": 2.2369909502262444,
"grad_norm": 2.7982289791107178,
"learning_rate": 5.086726998491704e-06,
"loss": 0.1782,
"step": 7910
},
{
"epoch": 2.239819004524887,
"grad_norm": 4.951447010040283,
"learning_rate": 5.067873303167421e-06,
"loss": 0.2604,
"step": 7920
},
{
"epoch": 2.2426470588235294,
"grad_norm": 5.907583713531494,
"learning_rate": 5.049019607843137e-06,
"loss": 0.2447,
"step": 7930
},
{
"epoch": 2.245475113122172,
"grad_norm": 5.986253261566162,
"learning_rate": 5.030165912518854e-06,
"loss": 0.2829,
"step": 7940
},
{
"epoch": 2.2483031674208145,
"grad_norm": 4.330525875091553,
"learning_rate": 5.011312217194571e-06,
"loss": 0.1908,
"step": 7950
},
{
"epoch": 2.251131221719457,
"grad_norm": 5.337680816650391,
"learning_rate": 4.992458521870287e-06,
"loss": 0.2539,
"step": 7960
},
{
"epoch": 2.2539592760180995,
"grad_norm": 7.187500476837158,
"learning_rate": 4.973604826546004e-06,
"loss": 0.2405,
"step": 7970
},
{
"epoch": 2.256787330316742,
"grad_norm": 5.105306625366211,
"learning_rate": 4.95475113122172e-06,
"loss": 0.2616,
"step": 7980
},
{
"epoch": 2.2596153846153846,
"grad_norm": 4.068017482757568,
"learning_rate": 4.935897435897436e-06,
"loss": 0.2233,
"step": 7990
},
{
"epoch": 2.262443438914027,
"grad_norm": 2.9654664993286133,
"learning_rate": 4.917043740573153e-06,
"loss": 0.2187,
"step": 8000
},
{
"epoch": 2.262443438914027,
"eval_accuracy": 0.8858574610244989,
"eval_loss": 0.29285645484924316,
"eval_runtime": 126.4151,
"eval_samples_per_second": 99.45,
"eval_steps_per_second": 3.109,
"step": 8000
},
{
"epoch": 2.2652714932126696,
"grad_norm": 4.892025470733643,
"learning_rate": 4.898190045248869e-06,
"loss": 0.223,
"step": 8010
},
{
"epoch": 2.268099547511312,
"grad_norm": 6.540407657623291,
"learning_rate": 4.879336349924585e-06,
"loss": 0.2356,
"step": 8020
},
{
"epoch": 2.2709276018099547,
"grad_norm": 4.254669666290283,
"learning_rate": 4.860482654600302e-06,
"loss": 0.2295,
"step": 8030
},
{
"epoch": 2.273755656108597,
"grad_norm": 2.9539434909820557,
"learning_rate": 4.8416289592760185e-06,
"loss": 0.2617,
"step": 8040
},
{
"epoch": 2.2765837104072397,
"grad_norm": 6.981826305389404,
"learning_rate": 4.822775263951735e-06,
"loss": 0.2911,
"step": 8050
},
{
"epoch": 2.2794117647058822,
"grad_norm": 4.400992393493652,
"learning_rate": 4.803921568627452e-06,
"loss": 0.2384,
"step": 8060
},
{
"epoch": 2.2822398190045248,
"grad_norm": 6.687214374542236,
"learning_rate": 4.785067873303168e-06,
"loss": 0.2139,
"step": 8070
},
{
"epoch": 2.2850678733031673,
"grad_norm": 2.111176013946533,
"learning_rate": 4.766214177978885e-06,
"loss": 0.2223,
"step": 8080
},
{
"epoch": 2.28789592760181,
"grad_norm": 7.312646389007568,
"learning_rate": 4.747360482654601e-06,
"loss": 0.2631,
"step": 8090
},
{
"epoch": 2.2907239819004523,
"grad_norm": 5.643038749694824,
"learning_rate": 4.728506787330317e-06,
"loss": 0.179,
"step": 8100
},
{
"epoch": 2.293552036199095,
"grad_norm": 8.725652694702148,
"learning_rate": 4.709653092006033e-06,
"loss": 0.2362,
"step": 8110
},
{
"epoch": 2.2963800904977374,
"grad_norm": 6.781122207641602,
"learning_rate": 4.69079939668175e-06,
"loss": 0.205,
"step": 8120
},
{
"epoch": 2.29920814479638,
"grad_norm": 0.9392467141151428,
"learning_rate": 4.671945701357466e-06,
"loss": 0.2181,
"step": 8130
},
{
"epoch": 2.3020361990950224,
"grad_norm": 1.8741260766983032,
"learning_rate": 4.653092006033183e-06,
"loss": 0.1588,
"step": 8140
},
{
"epoch": 2.3048642533936654,
"grad_norm": 5.825664520263672,
"learning_rate": 4.6342383107088995e-06,
"loss": 0.2214,
"step": 8150
},
{
"epoch": 2.3076923076923075,
"grad_norm": 4.3385701179504395,
"learning_rate": 4.615384615384616e-06,
"loss": 0.2024,
"step": 8160
},
{
"epoch": 2.3105203619909505,
"grad_norm": 5.437368869781494,
"learning_rate": 4.596530920060332e-06,
"loss": 0.2341,
"step": 8170
},
{
"epoch": 2.3133484162895925,
"grad_norm": 5.2032270431518555,
"learning_rate": 4.5776772247360485e-06,
"loss": 0.2639,
"step": 8180
},
{
"epoch": 2.3161764705882355,
"grad_norm": 4.702691555023193,
"learning_rate": 4.558823529411765e-06,
"loss": 0.2153,
"step": 8190
},
{
"epoch": 2.3190045248868776,
"grad_norm": 3.5364975929260254,
"learning_rate": 4.539969834087482e-06,
"loss": 0.1909,
"step": 8200
},
{
"epoch": 2.3218325791855206,
"grad_norm": 2.7947473526000977,
"learning_rate": 4.521116138763198e-06,
"loss": 0.216,
"step": 8210
},
{
"epoch": 2.324660633484163,
"grad_norm": 8.211967468261719,
"learning_rate": 4.502262443438914e-06,
"loss": 0.2122,
"step": 8220
},
{
"epoch": 2.3274886877828056,
"grad_norm": 3.7828614711761475,
"learning_rate": 4.483408748114631e-06,
"loss": 0.2741,
"step": 8230
},
{
"epoch": 2.330316742081448,
"grad_norm": 5.757340908050537,
"learning_rate": 4.464555052790347e-06,
"loss": 0.2854,
"step": 8240
},
{
"epoch": 2.3331447963800906,
"grad_norm": 4.723744869232178,
"learning_rate": 4.445701357466063e-06,
"loss": 0.2508,
"step": 8250
},
{
"epoch": 2.335972850678733,
"grad_norm": 4.520774841308594,
"learning_rate": 4.42684766214178e-06,
"loss": 0.2414,
"step": 8260
},
{
"epoch": 2.3388009049773757,
"grad_norm": 4.983455181121826,
"learning_rate": 4.407993966817496e-06,
"loss": 0.2414,
"step": 8270
},
{
"epoch": 2.341628959276018,
"grad_norm": 6.122417449951172,
"learning_rate": 4.389140271493213e-06,
"loss": 0.2177,
"step": 8280
},
{
"epoch": 2.3444570135746607,
"grad_norm": 2.776017189025879,
"learning_rate": 4.37028657616893e-06,
"loss": 0.2133,
"step": 8290
},
{
"epoch": 2.3472850678733033,
"grad_norm": 7.429429054260254,
"learning_rate": 4.351432880844646e-06,
"loss": 0.1915,
"step": 8300
},
{
"epoch": 2.350113122171946,
"grad_norm": 7.583387851715088,
"learning_rate": 4.332579185520363e-06,
"loss": 0.2396,
"step": 8310
},
{
"epoch": 2.3529411764705883,
"grad_norm": 8.560108184814453,
"learning_rate": 4.313725490196079e-06,
"loss": 0.2364,
"step": 8320
},
{
"epoch": 2.355769230769231,
"grad_norm": 2.898757219314575,
"learning_rate": 4.294871794871795e-06,
"loss": 0.2685,
"step": 8330
},
{
"epoch": 2.3585972850678734,
"grad_norm": 5.2947564125061035,
"learning_rate": 4.276018099547512e-06,
"loss": 0.2222,
"step": 8340
},
{
"epoch": 2.361425339366516,
"grad_norm": 2.573645830154419,
"learning_rate": 4.257164404223228e-06,
"loss": 0.2335,
"step": 8350
},
{
"epoch": 2.3642533936651584,
"grad_norm": 6.62631368637085,
"learning_rate": 4.238310708898944e-06,
"loss": 0.2325,
"step": 8360
},
{
"epoch": 2.367081447963801,
"grad_norm": 5.814454555511475,
"learning_rate": 4.219457013574661e-06,
"loss": 0.2538,
"step": 8370
},
{
"epoch": 2.3699095022624435,
"grad_norm": 6.129361152648926,
"learning_rate": 4.2006033182503775e-06,
"loss": 0.2395,
"step": 8380
},
{
"epoch": 2.372737556561086,
"grad_norm": 5.893956184387207,
"learning_rate": 4.181749622926094e-06,
"loss": 0.2651,
"step": 8390
},
{
"epoch": 2.3755656108597285,
"grad_norm": 6.977567672729492,
"learning_rate": 4.162895927601811e-06,
"loss": 0.2575,
"step": 8400
},
{
"epoch": 2.378393665158371,
"grad_norm": 1.8976235389709473,
"learning_rate": 4.144042232277527e-06,
"loss": 0.199,
"step": 8410
},
{
"epoch": 2.3812217194570136,
"grad_norm": 1.1803913116455078,
"learning_rate": 4.125188536953243e-06,
"loss": 0.2826,
"step": 8420
},
{
"epoch": 2.384049773755656,
"grad_norm": 4.858994483947754,
"learning_rate": 4.10633484162896e-06,
"loss": 0.1937,
"step": 8430
},
{
"epoch": 2.3868778280542986,
"grad_norm": 3.6424715518951416,
"learning_rate": 4.087481146304676e-06,
"loss": 0.2383,
"step": 8440
},
{
"epoch": 2.389705882352941,
"grad_norm": 4.879428863525391,
"learning_rate": 4.068627450980392e-06,
"loss": 0.2187,
"step": 8450
},
{
"epoch": 2.3925339366515836,
"grad_norm": 4.588160991668701,
"learning_rate": 4.049773755656109e-06,
"loss": 0.2134,
"step": 8460
},
{
"epoch": 2.395361990950226,
"grad_norm": 3.9123332500457764,
"learning_rate": 4.030920060331825e-06,
"loss": 0.1968,
"step": 8470
},
{
"epoch": 2.3981900452488687,
"grad_norm": 6.140926361083984,
"learning_rate": 4.012066365007542e-06,
"loss": 0.2356,
"step": 8480
},
{
"epoch": 2.401018099547511,
"grad_norm": 2.6923718452453613,
"learning_rate": 3.9932126696832585e-06,
"loss": 0.2502,
"step": 8490
},
{
"epoch": 2.4038461538461537,
"grad_norm": 3.490473508834839,
"learning_rate": 3.974358974358974e-06,
"loss": 0.2253,
"step": 8500
},
{
"epoch": 2.4066742081447963,
"grad_norm": 3.2556686401367188,
"learning_rate": 3.955505279034691e-06,
"loss": 0.2228,
"step": 8510
},
{
"epoch": 2.409502262443439,
"grad_norm": 5.598496437072754,
"learning_rate": 3.9366515837104075e-06,
"loss": 0.234,
"step": 8520
},
{
"epoch": 2.4123303167420813,
"grad_norm": 4.937731742858887,
"learning_rate": 3.917797888386124e-06,
"loss": 0.2064,
"step": 8530
},
{
"epoch": 2.415158371040724,
"grad_norm": 2.0519907474517822,
"learning_rate": 3.898944193061841e-06,
"loss": 0.2148,
"step": 8540
},
{
"epoch": 2.4179864253393664,
"grad_norm": 4.925931453704834,
"learning_rate": 3.880090497737557e-06,
"loss": 0.2406,
"step": 8550
},
{
"epoch": 2.420814479638009,
"grad_norm": 3.878779172897339,
"learning_rate": 3.861236802413273e-06,
"loss": 0.2159,
"step": 8560
},
{
"epoch": 2.4236425339366514,
"grad_norm": 5.424575328826904,
"learning_rate": 3.84238310708899e-06,
"loss": 0.2202,
"step": 8570
},
{
"epoch": 2.426470588235294,
"grad_norm": 4.764692306518555,
"learning_rate": 3.8235294117647055e-06,
"loss": 0.2238,
"step": 8580
},
{
"epoch": 2.4292986425339365,
"grad_norm": 6.2886881828308105,
"learning_rate": 3.8046757164404226e-06,
"loss": 0.2258,
"step": 8590
},
{
"epoch": 2.4321266968325794,
"grad_norm": 5.105391502380371,
"learning_rate": 3.7858220211161388e-06,
"loss": 0.2291,
"step": 8600
},
{
"epoch": 2.4349547511312215,
"grad_norm": 3.7577686309814453,
"learning_rate": 3.7669683257918554e-06,
"loss": 0.2041,
"step": 8610
},
{
"epoch": 2.4377828054298645,
"grad_norm": 2.689021587371826,
"learning_rate": 3.748114630467572e-06,
"loss": 0.2576,
"step": 8620
},
{
"epoch": 2.4406108597285066,
"grad_norm": 3.162226438522339,
"learning_rate": 3.7292609351432886e-06,
"loss": 0.2342,
"step": 8630
},
{
"epoch": 2.4434389140271495,
"grad_norm": 4.014715671539307,
"learning_rate": 3.710407239819005e-06,
"loss": 0.236,
"step": 8640
},
{
"epoch": 2.446266968325792,
"grad_norm": 5.3587822914123535,
"learning_rate": 3.6915535444947214e-06,
"loss": 0.2328,
"step": 8650
},
{
"epoch": 2.4490950226244346,
"grad_norm": 7.895315647125244,
"learning_rate": 3.672699849170438e-06,
"loss": 0.2333,
"step": 8660
},
{
"epoch": 2.451923076923077,
"grad_norm": 8.392569541931152,
"learning_rate": 3.653846153846154e-06,
"loss": 0.2605,
"step": 8670
},
{
"epoch": 2.4547511312217196,
"grad_norm": 3.8333370685577393,
"learning_rate": 3.6349924585218704e-06,
"loss": 0.2289,
"step": 8680
},
{
"epoch": 2.457579185520362,
"grad_norm": 7.176278114318848,
"learning_rate": 3.616138763197587e-06,
"loss": 0.2119,
"step": 8690
},
{
"epoch": 2.4604072398190047,
"grad_norm": 8.778523445129395,
"learning_rate": 3.5972850678733032e-06,
"loss": 0.2363,
"step": 8700
},
{
"epoch": 2.463235294117647,
"grad_norm": 3.1572511196136475,
"learning_rate": 3.57843137254902e-06,
"loss": 0.2083,
"step": 8710
},
{
"epoch": 2.4660633484162897,
"grad_norm": 6.948089122772217,
"learning_rate": 3.5595776772247365e-06,
"loss": 0.2337,
"step": 8720
},
{
"epoch": 2.4688914027149322,
"grad_norm": 7.237654209136963,
"learning_rate": 3.540723981900453e-06,
"loss": 0.1903,
"step": 8730
},
{
"epoch": 2.4717194570135748,
"grad_norm": 3.5161070823669434,
"learning_rate": 3.5218702865761693e-06,
"loss": 0.2003,
"step": 8740
},
{
"epoch": 2.4745475113122173,
"grad_norm": 5.7288737297058105,
"learning_rate": 3.5030165912518855e-06,
"loss": 0.1979,
"step": 8750
},
{
"epoch": 2.47737556561086,
"grad_norm": 6.921863079071045,
"learning_rate": 3.484162895927602e-06,
"loss": 0.2681,
"step": 8760
},
{
"epoch": 2.4802036199095023,
"grad_norm": 1.5838019847869873,
"learning_rate": 3.4653092006033183e-06,
"loss": 0.186,
"step": 8770
},
{
"epoch": 2.483031674208145,
"grad_norm": 6.464385986328125,
"learning_rate": 3.446455505279035e-06,
"loss": 0.2791,
"step": 8780
},
{
"epoch": 2.4858597285067874,
"grad_norm": 4.105411529541016,
"learning_rate": 3.4276018099547515e-06,
"loss": 0.246,
"step": 8790
},
{
"epoch": 2.48868778280543,
"grad_norm": 5.3756632804870605,
"learning_rate": 3.408748114630468e-06,
"loss": 0.2344,
"step": 8800
},
{
"epoch": 2.4915158371040724,
"grad_norm": 3.4841089248657227,
"learning_rate": 3.3898944193061843e-06,
"loss": 0.1978,
"step": 8810
},
{
"epoch": 2.494343891402715,
"grad_norm": 7.188533782958984,
"learning_rate": 3.371040723981901e-06,
"loss": 0.2737,
"step": 8820
},
{
"epoch": 2.4971719457013575,
"grad_norm": 4.090082168579102,
"learning_rate": 3.3521870286576167e-06,
"loss": 0.2139,
"step": 8830
},
{
"epoch": 2.5,
"grad_norm": 7.417943000793457,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.2275,
"step": 8840
},
{
"epoch": 2.5028280542986425,
"grad_norm": 3.605393648147583,
"learning_rate": 3.31447963800905e-06,
"loss": 0.2446,
"step": 8850
},
{
"epoch": 2.505656108597285,
"grad_norm": 5.961788654327393,
"learning_rate": 3.2956259426847666e-06,
"loss": 0.2923,
"step": 8860
},
{
"epoch": 2.5084841628959276,
"grad_norm": 4.26703405380249,
"learning_rate": 3.2767722473604827e-06,
"loss": 0.1962,
"step": 8870
},
{
"epoch": 2.51131221719457,
"grad_norm": 4.207533359527588,
"learning_rate": 3.2579185520361994e-06,
"loss": 0.1995,
"step": 8880
},
{
"epoch": 2.5141402714932126,
"grad_norm": 4.4618682861328125,
"learning_rate": 3.239064856711916e-06,
"loss": 0.172,
"step": 8890
},
{
"epoch": 2.516968325791855,
"grad_norm": 5.302677631378174,
"learning_rate": 3.2202111613876326e-06,
"loss": 0.1653,
"step": 8900
},
{
"epoch": 2.5197963800904977,
"grad_norm": 3.299323558807373,
"learning_rate": 3.2013574660633484e-06,
"loss": 0.2407,
"step": 8910
},
{
"epoch": 2.52262443438914,
"grad_norm": 6.668271541595459,
"learning_rate": 3.182503770739065e-06,
"loss": 0.2283,
"step": 8920
},
{
"epoch": 2.5254524886877827,
"grad_norm": 7.668635368347168,
"learning_rate": 3.1636500754147816e-06,
"loss": 0.2752,
"step": 8930
},
{
"epoch": 2.5282805429864252,
"grad_norm": 1.711267113685608,
"learning_rate": 3.1447963800904978e-06,
"loss": 0.2136,
"step": 8940
},
{
"epoch": 2.5311085972850678,
"grad_norm": 8.963603019714355,
"learning_rate": 3.1259426847662144e-06,
"loss": 0.205,
"step": 8950
},
{
"epoch": 2.5339366515837103,
"grad_norm": 2.520670175552368,
"learning_rate": 3.107088989441931e-06,
"loss": 0.2131,
"step": 8960
},
{
"epoch": 2.536764705882353,
"grad_norm": 8.796506881713867,
"learning_rate": 3.0882352941176476e-06,
"loss": 0.2969,
"step": 8970
},
{
"epoch": 2.5395927601809953,
"grad_norm": 7.460408687591553,
"learning_rate": 3.069381598793364e-06,
"loss": 0.2432,
"step": 8980
},
{
"epoch": 2.542420814479638,
"grad_norm": 9.012686729431152,
"learning_rate": 3.0505279034690804e-06,
"loss": 0.2707,
"step": 8990
},
{
"epoch": 2.5452488687782804,
"grad_norm": 5.107896327972412,
"learning_rate": 3.0316742081447962e-06,
"loss": 0.2239,
"step": 9000
},
{
"epoch": 2.5452488687782804,
"eval_accuracy": 0.888880050906777,
"eval_loss": 0.288782000541687,
"eval_runtime": 126.5084,
"eval_samples_per_second": 99.377,
"eval_steps_per_second": 3.107,
"step": 9000
},
{
"epoch": 2.5480769230769234,
"grad_norm": 2.8435633182525635,
"learning_rate": 3.012820512820513e-06,
"loss": 0.2544,
"step": 9010
},
{
"epoch": 2.5509049773755654,
"grad_norm": 4.109634876251221,
"learning_rate": 2.9939668174962294e-06,
"loss": 0.2508,
"step": 9020
},
{
"epoch": 2.5537330316742084,
"grad_norm": 3.3078644275665283,
"learning_rate": 2.975113122171946e-06,
"loss": 0.2025,
"step": 9030
},
{
"epoch": 2.5565610859728505,
"grad_norm": 6.037450790405273,
"learning_rate": 2.9562594268476623e-06,
"loss": 0.2347,
"step": 9040
},
{
"epoch": 2.5593891402714934,
"grad_norm": 5.157569408416748,
"learning_rate": 2.937405731523379e-06,
"loss": 0.2684,
"step": 9050
},
{
"epoch": 2.5622171945701355,
"grad_norm": 2.070380210876465,
"learning_rate": 2.9185520361990955e-06,
"loss": 0.2217,
"step": 9060
},
{
"epoch": 2.5650452488687785,
"grad_norm": 2.0333659648895264,
"learning_rate": 2.899698340874812e-06,
"loss": 0.1901,
"step": 9070
},
{
"epoch": 2.5678733031674206,
"grad_norm": 2.8762121200561523,
"learning_rate": 2.880844645550528e-06,
"loss": 0.2175,
"step": 9080
},
{
"epoch": 2.5707013574660635,
"grad_norm": 3.8669402599334717,
"learning_rate": 2.8619909502262445e-06,
"loss": 0.2218,
"step": 9090
},
{
"epoch": 2.5735294117647056,
"grad_norm": 5.87692403793335,
"learning_rate": 2.843137254901961e-06,
"loss": 0.2058,
"step": 9100
},
{
"epoch": 2.5763574660633486,
"grad_norm": 3.9730098247528076,
"learning_rate": 2.8242835595776773e-06,
"loss": 0.2191,
"step": 9110
},
{
"epoch": 2.579185520361991,
"grad_norm": 3.073633909225464,
"learning_rate": 2.805429864253394e-06,
"loss": 0.2499,
"step": 9120
},
{
"epoch": 2.5820135746606336,
"grad_norm": 3.6937789916992188,
"learning_rate": 2.7865761689291105e-06,
"loss": 0.2499,
"step": 9130
},
{
"epoch": 2.584841628959276,
"grad_norm": 4.838074207305908,
"learning_rate": 2.767722473604827e-06,
"loss": 0.1883,
"step": 9140
},
{
"epoch": 2.5876696832579187,
"grad_norm": 6.562351226806641,
"learning_rate": 2.7488687782805433e-06,
"loss": 0.2019,
"step": 9150
},
{
"epoch": 2.590497737556561,
"grad_norm": 3.512963056564331,
"learning_rate": 2.7300150829562595e-06,
"loss": 0.1934,
"step": 9160
},
{
"epoch": 2.5933257918552037,
"grad_norm": 4.1841511726379395,
"learning_rate": 2.7111613876319757e-06,
"loss": 0.1807,
"step": 9170
},
{
"epoch": 2.5961538461538463,
"grad_norm": 4.239630222320557,
"learning_rate": 2.6923076923076923e-06,
"loss": 0.2101,
"step": 9180
},
{
"epoch": 2.598981900452489,
"grad_norm": 3.499694585800171,
"learning_rate": 2.673453996983409e-06,
"loss": 0.2152,
"step": 9190
},
{
"epoch": 2.6018099547511313,
"grad_norm": 3.0219247341156006,
"learning_rate": 2.6546003016591256e-06,
"loss": 0.2311,
"step": 9200
},
{
"epoch": 2.604638009049774,
"grad_norm": 4.168036937713623,
"learning_rate": 2.6357466063348418e-06,
"loss": 0.1943,
"step": 9210
},
{
"epoch": 2.6074660633484164,
"grad_norm": 1.4795814752578735,
"learning_rate": 2.6168929110105584e-06,
"loss": 0.1786,
"step": 9220
},
{
"epoch": 2.610294117647059,
"grad_norm": 1.5753957033157349,
"learning_rate": 2.598039215686275e-06,
"loss": 0.1892,
"step": 9230
},
{
"epoch": 2.6131221719457014,
"grad_norm": 3.37406325340271,
"learning_rate": 2.5791855203619916e-06,
"loss": 0.1632,
"step": 9240
},
{
"epoch": 2.615950226244344,
"grad_norm": 4.640278339385986,
"learning_rate": 2.5603318250377074e-06,
"loss": 0.239,
"step": 9250
},
{
"epoch": 2.6187782805429864,
"grad_norm": 5.864749431610107,
"learning_rate": 2.541478129713424e-06,
"loss": 0.2349,
"step": 9260
},
{
"epoch": 2.621606334841629,
"grad_norm": 4.219099521636963,
"learning_rate": 2.5226244343891406e-06,
"loss": 0.2298,
"step": 9270
},
{
"epoch": 2.6244343891402715,
"grad_norm": 6.88966703414917,
"learning_rate": 2.503770739064857e-06,
"loss": 0.2096,
"step": 9280
},
{
"epoch": 2.627262443438914,
"grad_norm": 3.7265114784240723,
"learning_rate": 2.4849170437405734e-06,
"loss": 0.1961,
"step": 9290
},
{
"epoch": 2.6300904977375565,
"grad_norm": 3.687527656555176,
"learning_rate": 2.46606334841629e-06,
"loss": 0.2054,
"step": 9300
},
{
"epoch": 2.632918552036199,
"grad_norm": 5.014760971069336,
"learning_rate": 2.4472096530920062e-06,
"loss": 0.2425,
"step": 9310
},
{
"epoch": 2.6357466063348416,
"grad_norm": 8.167291641235352,
"learning_rate": 2.428355957767723e-06,
"loss": 0.2079,
"step": 9320
},
{
"epoch": 2.638574660633484,
"grad_norm": 4.277304649353027,
"learning_rate": 2.409502262443439e-06,
"loss": 0.2205,
"step": 9330
},
{
"epoch": 2.6414027149321266,
"grad_norm": 5.0269975662231445,
"learning_rate": 2.3906485671191556e-06,
"loss": 0.2586,
"step": 9340
},
{
"epoch": 2.644230769230769,
"grad_norm": 4.617335796356201,
"learning_rate": 2.371794871794872e-06,
"loss": 0.2167,
"step": 9350
},
{
"epoch": 2.6470588235294117,
"grad_norm": 3.6927714347839355,
"learning_rate": 2.3529411764705885e-06,
"loss": 0.2195,
"step": 9360
},
{
"epoch": 2.649886877828054,
"grad_norm": 3.20468807220459,
"learning_rate": 2.334087481146305e-06,
"loss": 0.2495,
"step": 9370
},
{
"epoch": 2.6527149321266967,
"grad_norm": 4.111125946044922,
"learning_rate": 2.3152337858220213e-06,
"loss": 0.1675,
"step": 9380
},
{
"epoch": 2.6555429864253393,
"grad_norm": 3.872500419616699,
"learning_rate": 2.2963800904977375e-06,
"loss": 0.2614,
"step": 9390
},
{
"epoch": 2.658371040723982,
"grad_norm": 5.960339069366455,
"learning_rate": 2.277526395173454e-06,
"loss": 0.2031,
"step": 9400
},
{
"epoch": 2.6611990950226243,
"grad_norm": 7.735962390899658,
"learning_rate": 2.2586726998491707e-06,
"loss": 0.2164,
"step": 9410
},
{
"epoch": 2.664027149321267,
"grad_norm": 4.943899154663086,
"learning_rate": 2.2398190045248873e-06,
"loss": 0.2322,
"step": 9420
},
{
"epoch": 2.6668552036199094,
"grad_norm": 3.7775423526763916,
"learning_rate": 2.2209653092006035e-06,
"loss": 0.2238,
"step": 9430
},
{
"epoch": 2.669683257918552,
"grad_norm": 6.782299995422363,
"learning_rate": 2.2021116138763197e-06,
"loss": 0.2141,
"step": 9440
},
{
"epoch": 2.6725113122171944,
"grad_norm": 2.3152804374694824,
"learning_rate": 2.1832579185520363e-06,
"loss": 0.1729,
"step": 9450
},
{
"epoch": 2.6753393665158374,
"grad_norm": 5.257414817810059,
"learning_rate": 2.164404223227753e-06,
"loss": 0.1875,
"step": 9460
},
{
"epoch": 2.6781674208144794,
"grad_norm": 5.083720684051514,
"learning_rate": 2.145550527903469e-06,
"loss": 0.2721,
"step": 9470
},
{
"epoch": 2.6809954751131224,
"grad_norm": 3.5238163471221924,
"learning_rate": 2.1266968325791857e-06,
"loss": 0.1752,
"step": 9480
},
{
"epoch": 2.6838235294117645,
"grad_norm": 9.12520694732666,
"learning_rate": 2.1078431372549023e-06,
"loss": 0.2184,
"step": 9490
},
{
"epoch": 2.6866515837104075,
"grad_norm": 3.9677796363830566,
"learning_rate": 2.0889894419306185e-06,
"loss": 0.2685,
"step": 9500
},
{
"epoch": 2.6894796380090495,
"grad_norm": 8.702911376953125,
"learning_rate": 2.0701357466063347e-06,
"loss": 0.2143,
"step": 9510
},
{
"epoch": 2.6923076923076925,
"grad_norm": 5.3467841148376465,
"learning_rate": 2.0512820512820513e-06,
"loss": 0.1466,
"step": 9520
},
{
"epoch": 2.6951357466063346,
"grad_norm": 8.666280746459961,
"learning_rate": 2.032428355957768e-06,
"loss": 0.2221,
"step": 9530
},
{
"epoch": 2.6979638009049776,
"grad_norm": 4.463994979858398,
"learning_rate": 2.0135746606334846e-06,
"loss": 0.2115,
"step": 9540
},
{
"epoch": 2.7007918552036196,
"grad_norm": 8.998452186584473,
"learning_rate": 1.9947209653092008e-06,
"loss": 0.2286,
"step": 9550
},
{
"epoch": 2.7036199095022626,
"grad_norm": 2.3983922004699707,
"learning_rate": 1.975867269984917e-06,
"loss": 0.1753,
"step": 9560
},
{
"epoch": 2.706447963800905,
"grad_norm": 5.0769524574279785,
"learning_rate": 1.9570135746606336e-06,
"loss": 0.2029,
"step": 9570
},
{
"epoch": 2.7092760180995477,
"grad_norm": 3.6228933334350586,
"learning_rate": 1.93815987933635e-06,
"loss": 0.2282,
"step": 9580
},
{
"epoch": 2.71210407239819,
"grad_norm": 7.759435176849365,
"learning_rate": 1.919306184012067e-06,
"loss": 0.2395,
"step": 9590
},
{
"epoch": 2.7149321266968327,
"grad_norm": 7.777573585510254,
"learning_rate": 1.9004524886877828e-06,
"loss": 0.1992,
"step": 9600
},
{
"epoch": 2.7177601809954752,
"grad_norm": 4.795551300048828,
"learning_rate": 1.8815987933634994e-06,
"loss": 0.227,
"step": 9610
},
{
"epoch": 2.7205882352941178,
"grad_norm": 4.623630046844482,
"learning_rate": 1.8627450980392158e-06,
"loss": 0.2199,
"step": 9620
},
{
"epoch": 2.7234162895927603,
"grad_norm": 1.8060227632522583,
"learning_rate": 1.8438914027149324e-06,
"loss": 0.2662,
"step": 9630
},
{
"epoch": 2.726244343891403,
"grad_norm": 4.0437798500061035,
"learning_rate": 1.8250377073906486e-06,
"loss": 0.2106,
"step": 9640
},
{
"epoch": 2.7290723981900453,
"grad_norm": 2.629993200302124,
"learning_rate": 1.806184012066365e-06,
"loss": 0.2275,
"step": 9650
},
{
"epoch": 2.731900452488688,
"grad_norm": 4.662147045135498,
"learning_rate": 1.7873303167420816e-06,
"loss": 0.1597,
"step": 9660
},
{
"epoch": 2.7347285067873304,
"grad_norm": 7.3248066902160645,
"learning_rate": 1.768476621417798e-06,
"loss": 0.2161,
"step": 9670
},
{
"epoch": 2.737556561085973,
"grad_norm": 5.798586845397949,
"learning_rate": 1.7496229260935144e-06,
"loss": 0.2557,
"step": 9680
},
{
"epoch": 2.7403846153846154,
"grad_norm": 2.832303524017334,
"learning_rate": 1.7307692307692308e-06,
"loss": 0.2108,
"step": 9690
},
{
"epoch": 2.743212669683258,
"grad_norm": 1.038588047027588,
"learning_rate": 1.7119155354449475e-06,
"loss": 0.2141,
"step": 9700
},
{
"epoch": 2.7460407239819005,
"grad_norm": 6.463703155517578,
"learning_rate": 1.6930618401206639e-06,
"loss": 0.219,
"step": 9710
},
{
"epoch": 2.748868778280543,
"grad_norm": 6.210083484649658,
"learning_rate": 1.67420814479638e-06,
"loss": 0.1784,
"step": 9720
},
{
"epoch": 2.7516968325791855,
"grad_norm": 5.5614848136901855,
"learning_rate": 1.6553544494720967e-06,
"loss": 0.2848,
"step": 9730
},
{
"epoch": 2.754524886877828,
"grad_norm": 6.321543216705322,
"learning_rate": 1.636500754147813e-06,
"loss": 0.1904,
"step": 9740
},
{
"epoch": 2.7573529411764706,
"grad_norm": 2.9993443489074707,
"learning_rate": 1.6176470588235297e-06,
"loss": 0.2348,
"step": 9750
},
{
"epoch": 2.760180995475113,
"grad_norm": 2.8095312118530273,
"learning_rate": 1.5987933634992459e-06,
"loss": 0.2194,
"step": 9760
},
{
"epoch": 2.7630090497737556,
"grad_norm": 9.010799407958984,
"learning_rate": 1.5799396681749623e-06,
"loss": 0.274,
"step": 9770
},
{
"epoch": 2.765837104072398,
"grad_norm": 4.045629501342773,
"learning_rate": 1.561085972850679e-06,
"loss": 0.2011,
"step": 9780
},
{
"epoch": 2.7686651583710407,
"grad_norm": 7.133453845977783,
"learning_rate": 1.5422322775263953e-06,
"loss": 0.2241,
"step": 9790
},
{
"epoch": 2.771493212669683,
"grad_norm": 4.382336616516113,
"learning_rate": 1.5233785822021115e-06,
"loss": 0.2694,
"step": 9800
},
{
"epoch": 2.7743212669683257,
"grad_norm": 4.200496673583984,
"learning_rate": 1.5045248868778281e-06,
"loss": 0.2299,
"step": 9810
},
{
"epoch": 2.7771493212669682,
"grad_norm": 3.4665303230285645,
"learning_rate": 1.4856711915535445e-06,
"loss": 0.1934,
"step": 9820
},
{
"epoch": 2.7799773755656108,
"grad_norm": 5.625051498413086,
"learning_rate": 1.4668174962292611e-06,
"loss": 0.2565,
"step": 9830
},
{
"epoch": 2.7828054298642533,
"grad_norm": 0.8546460866928101,
"learning_rate": 1.4479638009049775e-06,
"loss": 0.159,
"step": 9840
},
{
"epoch": 2.785633484162896,
"grad_norm": 2.4043455123901367,
"learning_rate": 1.429110105580694e-06,
"loss": 0.1937,
"step": 9850
},
{
"epoch": 2.7884615384615383,
"grad_norm": 5.863745212554932,
"learning_rate": 1.4102564102564104e-06,
"loss": 0.2213,
"step": 9860
},
{
"epoch": 2.791289592760181,
"grad_norm": 4.6385722160339355,
"learning_rate": 1.391402714932127e-06,
"loss": 0.263,
"step": 9870
},
{
"epoch": 2.7941176470588234,
"grad_norm": 6.428844928741455,
"learning_rate": 1.3725490196078434e-06,
"loss": 0.1866,
"step": 9880
},
{
"epoch": 2.7969457013574663,
"grad_norm": 4.29943323135376,
"learning_rate": 1.3536953242835596e-06,
"loss": 0.2021,
"step": 9890
},
{
"epoch": 2.7997737556561084,
"grad_norm": 3.2437448501586914,
"learning_rate": 1.3348416289592762e-06,
"loss": 0.1571,
"step": 9900
},
{
"epoch": 2.8026018099547514,
"grad_norm": 3.756850481033325,
"learning_rate": 1.3159879336349926e-06,
"loss": 0.2301,
"step": 9910
},
{
"epoch": 2.8054298642533935,
"grad_norm": 4.855559825897217,
"learning_rate": 1.2971342383107092e-06,
"loss": 0.2538,
"step": 9920
},
{
"epoch": 2.8082579185520364,
"grad_norm": 4.502439498901367,
"learning_rate": 1.2782805429864254e-06,
"loss": 0.1862,
"step": 9930
},
{
"epoch": 2.8110859728506785,
"grad_norm": 6.242438793182373,
"learning_rate": 1.2594268476621418e-06,
"loss": 0.145,
"step": 9940
},
{
"epoch": 2.8139140271493215,
"grad_norm": 6.00593900680542,
"learning_rate": 1.2405731523378584e-06,
"loss": 0.2097,
"step": 9950
},
{
"epoch": 2.8167420814479636,
"grad_norm": 10.398560523986816,
"learning_rate": 1.2217194570135748e-06,
"loss": 0.2605,
"step": 9960
},
{
"epoch": 2.8195701357466065,
"grad_norm": 4.909145832061768,
"learning_rate": 1.2028657616892912e-06,
"loss": 0.2473,
"step": 9970
},
{
"epoch": 2.8223981900452486,
"grad_norm": 4.879443168640137,
"learning_rate": 1.1840120663650076e-06,
"loss": 0.2071,
"step": 9980
},
{
"epoch": 2.8252262443438916,
"grad_norm": 5.404385566711426,
"learning_rate": 1.165158371040724e-06,
"loss": 0.2138,
"step": 9990
},
{
"epoch": 2.8280542986425337,
"grad_norm": 3.4741604328155518,
"learning_rate": 1.1463046757164404e-06,
"loss": 0.2502,
"step": 10000
},
{
"epoch": 2.8280542986425337,
"eval_accuracy": 0.888880050906777,
"eval_loss": 0.2902699112892151,
"eval_runtime": 126.3428,
"eval_samples_per_second": 99.507,
"eval_steps_per_second": 3.111,
"step": 10000
},
{
"epoch": 2.8308823529411766,
"grad_norm": 4.25987434387207,
"learning_rate": 1.127450980392157e-06,
"loss": 0.2242,
"step": 10010
},
{
"epoch": 2.833710407239819,
"grad_norm": 7.45045280456543,
"learning_rate": 1.1085972850678732e-06,
"loss": 0.2791,
"step": 10020
},
{
"epoch": 2.8365384615384617,
"grad_norm": 4.844043254852295,
"learning_rate": 1.0897435897435899e-06,
"loss": 0.2357,
"step": 10030
},
{
"epoch": 2.839366515837104,
"grad_norm": 5.769428253173828,
"learning_rate": 1.0708898944193063e-06,
"loss": 0.2296,
"step": 10040
},
{
"epoch": 2.8421945701357467,
"grad_norm": 7.023671627044678,
"learning_rate": 1.0520361990950227e-06,
"loss": 0.2318,
"step": 10050
},
{
"epoch": 2.8450226244343892,
"grad_norm": 3.501164436340332,
"learning_rate": 1.033182503770739e-06,
"loss": 0.2456,
"step": 10060
},
{
"epoch": 2.8478506787330318,
"grad_norm": 9.939863204956055,
"learning_rate": 1.0143288084464557e-06,
"loss": 0.2474,
"step": 10070
},
{
"epoch": 2.8506787330316743,
"grad_norm": 5.502429962158203,
"learning_rate": 9.954751131221719e-07,
"loss": 0.2381,
"step": 10080
},
{
"epoch": 2.853506787330317,
"grad_norm": 5.186315536499023,
"learning_rate": 9.766214177978885e-07,
"loss": 0.2141,
"step": 10090
},
{
"epoch": 2.8563348416289593,
"grad_norm": 11.375553131103516,
"learning_rate": 9.57767722473605e-07,
"loss": 0.2459,
"step": 10100
},
{
"epoch": 2.859162895927602,
"grad_norm": 4.658810615539551,
"learning_rate": 9.389140271493213e-07,
"loss": 0.1952,
"step": 10110
},
{
"epoch": 2.8619909502262444,
"grad_norm": 2.7533957958221436,
"learning_rate": 9.200603318250378e-07,
"loss": 0.2113,
"step": 10120
},
{
"epoch": 2.864819004524887,
"grad_norm": 2.1169681549072266,
"learning_rate": 9.012066365007542e-07,
"loss": 0.198,
"step": 10130
},
{
"epoch": 2.8676470588235294,
"grad_norm": 5.239007472991943,
"learning_rate": 8.823529411764707e-07,
"loss": 0.1799,
"step": 10140
},
{
"epoch": 2.870475113122172,
"grad_norm": 9.836233139038086,
"learning_rate": 8.634992458521871e-07,
"loss": 0.2345,
"step": 10150
},
{
"epoch": 2.8733031674208145,
"grad_norm": 2.392709970474243,
"learning_rate": 8.446455505279036e-07,
"loss": 0.2205,
"step": 10160
},
{
"epoch": 2.876131221719457,
"grad_norm": 2.652374267578125,
"learning_rate": 8.257918552036199e-07,
"loss": 0.155,
"step": 10170
},
{
"epoch": 2.8789592760180995,
"grad_norm": 8.32434368133545,
"learning_rate": 8.069381598793364e-07,
"loss": 0.2262,
"step": 10180
},
{
"epoch": 2.881787330316742,
"grad_norm": 5.847408771514893,
"learning_rate": 7.880844645550528e-07,
"loss": 0.2315,
"step": 10190
},
{
"epoch": 2.8846153846153846,
"grad_norm": 2.832589864730835,
"learning_rate": 7.692307692307694e-07,
"loss": 0.2618,
"step": 10200
},
{
"epoch": 2.887443438914027,
"grad_norm": 4.295781135559082,
"learning_rate": 7.503770739064857e-07,
"loss": 0.2084,
"step": 10210
},
{
"epoch": 2.8902714932126696,
"grad_norm": 5.640412330627441,
"learning_rate": 7.315233785822022e-07,
"loss": 0.2891,
"step": 10220
},
{
"epoch": 2.893099547511312,
"grad_norm": 7.115928649902344,
"learning_rate": 7.126696832579186e-07,
"loss": 0.2689,
"step": 10230
},
{
"epoch": 2.8959276018099547,
"grad_norm": 4.832301139831543,
"learning_rate": 6.938159879336351e-07,
"loss": 0.1793,
"step": 10240
},
{
"epoch": 2.898755656108597,
"grad_norm": 5.678529262542725,
"learning_rate": 6.749622926093515e-07,
"loss": 0.1599,
"step": 10250
},
{
"epoch": 2.9015837104072397,
"grad_norm": 6.394534587860107,
"learning_rate": 6.56108597285068e-07,
"loss": 0.2382,
"step": 10260
},
{
"epoch": 2.9044117647058822,
"grad_norm": 5.185941219329834,
"learning_rate": 6.372549019607843e-07,
"loss": 0.1801,
"step": 10270
},
{
"epoch": 2.9072398190045248,
"grad_norm": 3.3339009284973145,
"learning_rate": 6.184012066365008e-07,
"loss": 0.2108,
"step": 10280
},
{
"epoch": 2.9100678733031673,
"grad_norm": 4.131908416748047,
"learning_rate": 5.995475113122173e-07,
"loss": 0.1994,
"step": 10290
},
{
"epoch": 2.91289592760181,
"grad_norm": 5.131499290466309,
"learning_rate": 5.806938159879337e-07,
"loss": 0.2334,
"step": 10300
},
{
"epoch": 2.9157239819004523,
"grad_norm": 6.0886712074279785,
"learning_rate": 5.618401206636501e-07,
"loss": 0.1895,
"step": 10310
},
{
"epoch": 2.918552036199095,
"grad_norm": 6.050991058349609,
"learning_rate": 5.429864253393665e-07,
"loss": 0.2218,
"step": 10320
},
{
"epoch": 2.9213800904977374,
"grad_norm": 5.902265548706055,
"learning_rate": 5.24132730015083e-07,
"loss": 0.2539,
"step": 10330
},
{
"epoch": 2.9242081447963804,
"grad_norm": 2.757305860519409,
"learning_rate": 5.052790346907994e-07,
"loss": 0.1548,
"step": 10340
},
{
"epoch": 2.9270361990950224,
"grad_norm": 2.187263011932373,
"learning_rate": 4.864253393665158e-07,
"loss": 0.164,
"step": 10350
},
{
"epoch": 2.9298642533936654,
"grad_norm": 6.808703899383545,
"learning_rate": 4.675716440422323e-07,
"loss": 0.2073,
"step": 10360
},
{
"epoch": 2.9326923076923075,
"grad_norm": 7.97061014175415,
"learning_rate": 4.4871794871794876e-07,
"loss": 0.2322,
"step": 10370
},
{
"epoch": 2.9355203619909505,
"grad_norm": 6.372758865356445,
"learning_rate": 4.298642533936652e-07,
"loss": 0.2257,
"step": 10380
},
{
"epoch": 2.9383484162895925,
"grad_norm": 6.094609260559082,
"learning_rate": 4.110105580693816e-07,
"loss": 0.2583,
"step": 10390
},
{
"epoch": 2.9411764705882355,
"grad_norm": 2.3653512001037598,
"learning_rate": 3.921568627450981e-07,
"loss": 0.2137,
"step": 10400
},
{
"epoch": 2.9440045248868776,
"grad_norm": 2.020627737045288,
"learning_rate": 3.733031674208145e-07,
"loss": 0.1286,
"step": 10410
},
{
"epoch": 2.9468325791855206,
"grad_norm": 8.650784492492676,
"learning_rate": 3.5444947209653094e-07,
"loss": 0.2233,
"step": 10420
},
{
"epoch": 2.9496606334841626,
"grad_norm": 4.553081512451172,
"learning_rate": 3.355957767722474e-07,
"loss": 0.1869,
"step": 10430
},
{
"epoch": 2.9524886877828056,
"grad_norm": 3.9334750175476074,
"learning_rate": 3.167420814479638e-07,
"loss": 0.2109,
"step": 10440
},
{
"epoch": 2.955316742081448,
"grad_norm": 10.762858390808105,
"learning_rate": 2.978883861236803e-07,
"loss": 0.2151,
"step": 10450
},
{
"epoch": 2.9581447963800906,
"grad_norm": 6.37054967880249,
"learning_rate": 2.790346907993967e-07,
"loss": 0.2099,
"step": 10460
},
{
"epoch": 2.960972850678733,
"grad_norm": 4.642254829406738,
"learning_rate": 2.6018099547511317e-07,
"loss": 0.1753,
"step": 10470
},
{
"epoch": 2.9638009049773757,
"grad_norm": 4.995135307312012,
"learning_rate": 2.4132730015082957e-07,
"loss": 0.1708,
"step": 10480
},
{
"epoch": 2.966628959276018,
"grad_norm": 5.875439643859863,
"learning_rate": 2.2247360482654603e-07,
"loss": 0.2445,
"step": 10490
},
{
"epoch": 2.9694570135746607,
"grad_norm": 0.8066132664680481,
"learning_rate": 2.0361990950226246e-07,
"loss": 0.1628,
"step": 10500
},
{
"epoch": 2.9722850678733033,
"grad_norm": 9.494747161865234,
"learning_rate": 1.847662141779789e-07,
"loss": 0.1322,
"step": 10510
},
{
"epoch": 2.975113122171946,
"grad_norm": 6.8470001220703125,
"learning_rate": 1.6591251885369535e-07,
"loss": 0.2345,
"step": 10520
},
{
"epoch": 2.9779411764705883,
"grad_norm": 5.916505813598633,
"learning_rate": 1.4705882352941178e-07,
"loss": 0.2001,
"step": 10530
},
{
"epoch": 2.980769230769231,
"grad_norm": 6.173225402832031,
"learning_rate": 1.282051282051282e-07,
"loss": 0.2196,
"step": 10540
},
{
"epoch": 2.9835972850678734,
"grad_norm": 4.780458927154541,
"learning_rate": 1.0935143288084465e-07,
"loss": 0.2821,
"step": 10550
},
{
"epoch": 2.986425339366516,
"grad_norm": 2.4266226291656494,
"learning_rate": 9.04977375565611e-08,
"loss": 0.2189,
"step": 10560
},
{
"epoch": 2.9892533936651584,
"grad_norm": 3.3179211616516113,
"learning_rate": 7.164404223227753e-08,
"loss": 0.2647,
"step": 10570
},
{
"epoch": 2.992081447963801,
"grad_norm": 4.998234272003174,
"learning_rate": 5.279034690799398e-08,
"loss": 0.1746,
"step": 10580
},
{
"epoch": 2.9949095022624435,
"grad_norm": 5.929104804992676,
"learning_rate": 3.393665158371041e-08,
"loss": 0.1786,
"step": 10590
},
{
"epoch": 2.997737556561086,
"grad_norm": 4.823156356811523,
"learning_rate": 1.5082956259426848e-08,
"loss": 0.217,
"step": 10600
}
],
"logging_steps": 10,
"max_steps": 10608,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.132054385068024e+16,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}