{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 1000,
  "global_step": 2955,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01692047377326565,
      "grad_norm": 1.2259714603424072,
      "learning_rate": 6.7567567567567575e-06,
      "loss": 1.5997,
      "step": 10
    },
    {
      "epoch": 0.0338409475465313,
      "grad_norm": 1.427184820175171,
      "learning_rate": 1.3513513513513515e-05,
      "loss": 1.5776,
      "step": 20
    },
    {
      "epoch": 0.050761421319796954,
      "grad_norm": 1.1350350379943848,
      "learning_rate": 2.0270270270270273e-05,
      "loss": 1.4915,
      "step": 30
    },
    {
      "epoch": 0.0676818950930626,
      "grad_norm": 0.7067303657531738,
      "learning_rate": 2.702702702702703e-05,
      "loss": 1.3124,
      "step": 40
    },
    {
      "epoch": 0.08460236886632826,
      "grad_norm": 0.6259365677833557,
      "learning_rate": 3.3783783783783784e-05,
      "loss": 1.199,
      "step": 50
    },
    {
      "epoch": 0.10152284263959391,
      "grad_norm": 0.4322277009487152,
      "learning_rate": 4.0540540540540545e-05,
      "loss": 1.0944,
      "step": 60
    },
    {
      "epoch": 0.11844331641285956,
      "grad_norm": 0.5546955466270447,
      "learning_rate": 4.72972972972973e-05,
      "loss": 1.1023,
      "step": 70
    },
    {
      "epoch": 0.1353637901861252,
      "grad_norm": 0.5008642673492432,
      "learning_rate": 5.405405405405406e-05,
      "loss": 1.1058,
      "step": 80
    },
    {
      "epoch": 0.15228426395939088,
      "grad_norm": 0.43932950496673584,
      "learning_rate": 6.0810810810810814e-05,
      "loss": 1.0681,
      "step": 90
    },
    {
      "epoch": 0.1692047377326565,
      "grad_norm": 0.48154979944229126,
      "learning_rate": 6.756756756756757e-05,
      "loss": 1.0741,
      "step": 100
    },
    {
      "epoch": 0.18612521150592218,
      "grad_norm": 0.6456084251403809,
      "learning_rate": 7.432432432432433e-05,
      "loss": 1.088,
      "step": 110
    },
    {
      "epoch": 0.20304568527918782,
      "grad_norm": 0.4756552577018738,
      "learning_rate": 8.108108108108109e-05,
      "loss": 1.0637,
      "step": 120
    },
    {
      "epoch": 0.21996615905245348,
      "grad_norm": 0.6161950826644897,
      "learning_rate": 8.783783783783784e-05,
      "loss": 1.0807,
      "step": 130
    },
    {
      "epoch": 0.23688663282571912,
      "grad_norm": 0.5582019686698914,
      "learning_rate": 9.45945945945946e-05,
      "loss": 1.0755,
      "step": 140
    },
    {
      "epoch": 0.25380710659898476,
      "grad_norm": 0.5398248434066772,
      "learning_rate": 0.00010135135135135136,
      "loss": 1.0935,
      "step": 150
    },
    {
      "epoch": 0.2707275803722504,
      "grad_norm": 0.4599798023700714,
      "learning_rate": 0.00010810810810810812,
      "loss": 1.0533,
      "step": 160
    },
    {
      "epoch": 0.2876480541455161,
      "grad_norm": 0.5725008249282837,
      "learning_rate": 0.00011486486486486487,
      "loss": 1.0489,
      "step": 170
    },
    {
      "epoch": 0.30456852791878175,
      "grad_norm": 0.6546281576156616,
      "learning_rate": 0.00012162162162162163,
      "loss": 1.0251,
      "step": 180
    },
    {
      "epoch": 0.32148900169204736,
      "grad_norm": 0.6009621024131775,
      "learning_rate": 0.0001283783783783784,
      "loss": 1.0235,
      "step": 190
    },
    {
      "epoch": 0.338409475465313,
      "grad_norm": 0.5415365695953369,
      "learning_rate": 0.00013513513513513514,
      "loss": 1.061,
      "step": 200
    },
    {
      "epoch": 0.3553299492385787,
      "grad_norm": 0.5579381585121155,
      "learning_rate": 0.00014189189189189188,
      "loss": 1.0309,
      "step": 210
    },
    {
      "epoch": 0.37225042301184436,
      "grad_norm": 0.4756716191768646,
      "learning_rate": 0.00014864864864864866,
      "loss": 1.0496,
      "step": 220
    },
    {
      "epoch": 0.38917089678510997,
      "grad_norm": 0.5466011166572571,
      "learning_rate": 0.0001554054054054054,
      "loss": 1.0235,
      "step": 230
    },
    {
      "epoch": 0.40609137055837563,
      "grad_norm": 0.49792057275772095,
      "learning_rate": 0.00016216216216216218,
      "loss": 1.044,
      "step": 240
    },
    {
      "epoch": 0.4230118443316413,
      "grad_norm": 0.5117172598838806,
      "learning_rate": 0.00016891891891891893,
      "loss": 1.0268,
      "step": 250
    },
    {
      "epoch": 0.43993231810490696,
      "grad_norm": 0.44799599051475525,
      "learning_rate": 0.00017567567567567568,
      "loss": 1.0259,
      "step": 260
    },
    {
      "epoch": 0.45685279187817257,
      "grad_norm": 0.6032038927078247,
      "learning_rate": 0.00018243243243243245,
      "loss": 1.0602,
      "step": 270
    },
    {
      "epoch": 0.47377326565143824,
      "grad_norm": 0.5627055764198303,
      "learning_rate": 0.0001891891891891892,
      "loss": 1.0632,
      "step": 280
    },
    {
      "epoch": 0.4906937394247039,
      "grad_norm": 0.5613240599632263,
      "learning_rate": 0.00019594594594594594,
      "loss": 1.0256,
      "step": 290
    },
    {
      "epoch": 0.5076142131979695,
      "grad_norm": 1.3261655569076538,
      "learning_rate": 0.00019999888325954442,
      "loss": 1.0541,
      "step": 300
    },
    {
      "epoch": 0.5245346869712352,
      "grad_norm": 0.5617411136627197,
      "learning_rate": 0.0001999863202158626,
      "loss": 1.0307,
      "step": 310
    },
    {
      "epoch": 0.5414551607445008,
      "grad_norm": 0.5320747494697571,
      "learning_rate": 0.00019995979996246553,
      "loss": 1.0644,
      "step": 320
    },
    {
      "epoch": 0.5583756345177665,
      "grad_norm": 0.5442279577255249,
      "learning_rate": 0.00019991932620134706,
      "loss": 1.0447,
      "step": 330
    },
    {
      "epoch": 0.5752961082910322,
      "grad_norm": 0.4834861159324646,
      "learning_rate": 0.00019986490458228775,
      "loss": 1.0272,
      "step": 340
    },
    {
      "epoch": 0.5922165820642978,
      "grad_norm": 0.48604169487953186,
      "learning_rate": 0.00019979654270206636,
      "loss": 1.0696,
      "step": 350
    },
    {
      "epoch": 0.6091370558375635,
      "grad_norm": 0.4888402223587036,
      "learning_rate": 0.00019971425010339923,
      "loss": 1.0821,
      "step": 360
    },
    {
      "epoch": 0.626057529610829,
      "grad_norm": 0.700931191444397,
      "learning_rate": 0.00019961803827360847,
      "loss": 1.0182,
      "step": 370
    },
    {
      "epoch": 0.6429780033840947,
      "grad_norm": 0.5287917256355286,
      "learning_rate": 0.00019950792064301812,
      "loss": 1.0469,
      "step": 380
    },
    {
      "epoch": 0.6598984771573604,
      "grad_norm": 0.4720386266708374,
      "learning_rate": 0.0001993839125830796,
      "loss": 1.0235,
      "step": 390
    },
    {
      "epoch": 0.676818950930626,
      "grad_norm": 0.5388404726982117,
      "learning_rate": 0.00019924603140422596,
      "loss": 1.0548,
      "step": 400
    },
    {
      "epoch": 0.6937394247038917,
      "grad_norm": 0.34490785002708435,
      "learning_rate": 0.0001990942963534554,
      "loss": 1.0723,
      "step": 410
    },
    {
      "epoch": 0.7106598984771574,
      "grad_norm": 0.4453361928462982,
      "learning_rate": 0.00019892872861164467,
      "loss": 1.0667,
      "step": 420
    },
    {
      "epoch": 0.727580372250423,
      "grad_norm": 0.4319419264793396,
      "learning_rate": 0.0001987493512905924,
      "loss": 1.0545,
      "step": 430
    },
    {
      "epoch": 0.7445008460236887,
      "grad_norm": 0.43239086866378784,
      "learning_rate": 0.00019855618942979272,
      "loss": 0.9922,
      "step": 440
    },
    {
      "epoch": 0.7614213197969543,
      "grad_norm": 0.5506799817085266,
      "learning_rate": 0.0001983492699929403,
      "loss": 1.0453,
      "step": 450
    },
    {
      "epoch": 0.7783417935702199,
      "grad_norm": 0.41813233494758606,
      "learning_rate": 0.0001981286218641662,
      "loss": 1.0118,
      "step": 460
    },
    {
      "epoch": 0.7952622673434856,
      "grad_norm": 0.4467175304889679,
      "learning_rate": 0.00019789427584400584,
      "loss": 0.9767,
      "step": 470
    },
    {
      "epoch": 0.8121827411167513,
      "grad_norm": 0.4833202660083771,
      "learning_rate": 0.00019764626464509978,
      "loss": 0.9934,
      "step": 480
    },
    {
      "epoch": 0.8291032148900169,
      "grad_norm": 0.6021937727928162,
      "learning_rate": 0.0001973846228876271,
      "loss": 1.0241,
      "step": 490
    },
    {
      "epoch": 0.8460236886632826,
      "grad_norm": 0.6700023412704468,
      "learning_rate": 0.00019710938709447288,
      "loss": 1.0248,
      "step": 500
    },
    {
      "epoch": 0.8629441624365483,
      "grad_norm": 0.6895177960395813,
      "learning_rate": 0.00019682059568612982,
      "loss": 1.0287,
      "step": 510
    },
    {
      "epoch": 0.8798646362098139,
      "grad_norm": 0.41973355412483215,
      "learning_rate": 0.0001965182889753351,
      "loss": 1.0231,
      "step": 520
    },
    {
      "epoch": 0.8967851099830795,
      "grad_norm": 0.4463100731372833,
      "learning_rate": 0.00019620250916144313,
      "loss": 1.0407,
      "step": 530
    },
    {
      "epoch": 0.9137055837563451,
      "grad_norm": 0.506348192691803,
      "learning_rate": 0.00019587330032453483,
      "loss": 1.0269,
      "step": 540
    },
    {
      "epoch": 0.9306260575296108,
      "grad_norm": 0.38442033529281616,
      "learning_rate": 0.00019553070841926443,
      "loss": 1.0483,
      "step": 550
    },
    {
      "epoch": 0.9475465313028765,
      "grad_norm": 0.4524074196815491,
      "learning_rate": 0.0001951747812684447,
      "loss": 1.0466,
      "step": 560
    },
    {
      "epoch": 0.9644670050761421,
      "grad_norm": 0.38016098737716675,
      "learning_rate": 0.00019480556855637127,
      "loss": 0.9867,
      "step": 570
    },
    {
      "epoch": 0.9813874788494078,
      "grad_norm": 0.47468554973602295,
      "learning_rate": 0.00019442312182188696,
      "loss": 1.0135,
      "step": 580
    },
    {
      "epoch": 0.9983079526226735,
      "grad_norm": 0.453453928232193,
      "learning_rate": 0.00019402749445118772,
      "loss": 1.029,
      "step": 590
    },
    {
      "epoch": 1.015228426395939,
      "grad_norm": 0.5637357831001282,
      "learning_rate": 0.0001936187416703702,
      "loss": 0.9363,
      "step": 600
    },
    {
      "epoch": 1.0321489001692048,
      "grad_norm": 0.45976507663726807,
      "learning_rate": 0.00019319692053772265,
      "loss": 0.9843,
      "step": 610
    },
    {
      "epoch": 1.0490693739424704,
      "grad_norm": 0.4763481914997101,
      "learning_rate": 0.0001927620899357602,
      "loss": 0.9561,
      "step": 620
    },
    {
      "epoch": 1.0659898477157361,
      "grad_norm": 0.5057137608528137,
      "learning_rate": 0.0001923143105630053,
      "loss": 0.9741,
      "step": 630
    },
    {
      "epoch": 1.0829103214890017,
      "grad_norm": 0.48306360840797424,
      "learning_rate": 0.0001918536449255147,
      "loss": 0.9392,
      "step": 640
    },
    {
      "epoch": 1.0998307952622675,
      "grad_norm": 0.4458140432834625,
      "learning_rate": 0.00019138015732815438,
      "loss": 0.9463,
      "step": 650
    },
    {
      "epoch": 1.116751269035533,
      "grad_norm": 0.6166822910308838,
      "learning_rate": 0.00019089391386562283,
      "loss": 0.9376,
      "step": 660
    },
    {
      "epoch": 1.1336717428087986,
      "grad_norm": 0.4895755350589752,
      "learning_rate": 0.00019039498241322505,
      "loss": 0.9752,
      "step": 670
    },
    {
      "epoch": 1.1505922165820643,
      "grad_norm": 0.4752449095249176,
      "learning_rate": 0.00018988343261739767,
      "loss": 0.9088,
      "step": 680
    },
    {
      "epoch": 1.16751269035533,
      "grad_norm": 0.5145062208175659,
      "learning_rate": 0.0001893593358859869,
      "loss": 0.9918,
      "step": 690
    },
    {
      "epoch": 1.1844331641285957,
      "grad_norm": 0.5580092668533325,
      "learning_rate": 0.00018882276537828072,
      "loss": 0.9644,
      "step": 700
    },
    {
      "epoch": 1.2013536379018612,
      "grad_norm": 0.46470576524734497,
      "learning_rate": 0.0001882737959947964,
      "loss": 0.9361,
      "step": 710
    },
    {
      "epoch": 1.218274111675127,
      "grad_norm": 0.4674146771430969,
      "learning_rate": 0.00018771250436682503,
      "loss": 0.9318,
      "step": 720
    },
    {
      "epoch": 1.2351945854483926,
      "grad_norm": 0.5492033958435059,
      "learning_rate": 0.00018713896884573457,
      "loss": 0.9439,
      "step": 730
    },
    {
      "epoch": 1.252115059221658,
      "grad_norm": 0.5646327137947083,
      "learning_rate": 0.00018655326949203259,
      "loss": 0.9292,
      "step": 740
    },
    {
      "epoch": 1.2690355329949239,
      "grad_norm": 0.5472354888916016,
      "learning_rate": 0.0001859554880641905,
      "loss": 0.9083,
      "step": 750
    },
    {
      "epoch": 1.2859560067681894,
      "grad_norm": 0.45085784792900085,
      "learning_rate": 0.000185345708007231,
      "loss": 0.98,
      "step": 760
    },
    {
      "epoch": 1.3028764805414552,
      "grad_norm": 0.6279247999191284,
      "learning_rate": 0.00018472401444107964,
      "loss": 0.9245,
      "step": 770
    },
    {
      "epoch": 1.3197969543147208,
      "grad_norm": 0.5216350555419922,
      "learning_rate": 0.00018409049414868297,
      "loss": 0.9901,
      "step": 780
    },
    {
      "epoch": 1.3367174280879865,
      "grad_norm": 0.5439789295196533,
      "learning_rate": 0.00018344523556389433,
      "loss": 0.9325,
      "step": 790
    },
    {
      "epoch": 1.353637901861252,
      "grad_norm": 0.691190242767334,
      "learning_rate": 0.0001827883287591293,
      "loss": 0.9836,
      "step": 800
    },
    {
      "epoch": 1.3705583756345177,
      "grad_norm": 0.6296900510787964,
      "learning_rate": 0.00018211986543279244,
      "loss": 0.9223,
      "step": 810
    },
    {
      "epoch": 1.3874788494077834,
      "grad_norm": 0.5122337341308594,
      "learning_rate": 0.00018143993889647688,
      "loss": 0.9759,
      "step": 820
    },
    {
      "epoch": 1.404399323181049,
      "grad_norm": 0.5923859477043152,
      "learning_rate": 0.0001807486440619389,
      "loss": 0.9678,
      "step": 830
    },
    {
      "epoch": 1.4213197969543148,
      "grad_norm": 0.5583952069282532,
      "learning_rate": 0.00018004607742784916,
      "loss": 0.9936,
      "step": 840
    },
    {
      "epoch": 1.4382402707275803,
      "grad_norm": 0.5816633105278015,
      "learning_rate": 0.0001793323370663222,
      "loss": 0.9496,
      "step": 850
    },
    {
      "epoch": 1.455160744500846,
      "grad_norm": 0.4728400707244873,
      "learning_rate": 0.00017860752260922652,
      "loss": 0.9335,
      "step": 860
    },
    {
      "epoch": 1.4720812182741116,
      "grad_norm": 0.6433700919151306,
      "learning_rate": 0.00017787173523427688,
      "loss": 0.9713,
      "step": 870
    },
    {
      "epoch": 1.4890016920473772,
      "grad_norm": 0.4833492934703827,
      "learning_rate": 0.0001771250776509106,
      "loss": 0.9575,
      "step": 880
    },
    {
      "epoch": 1.505922165820643,
      "grad_norm": 0.5448775887489319,
      "learning_rate": 0.00017636765408595055,
      "loss": 0.9599,
      "step": 890
    },
    {
      "epoch": 1.5228426395939088,
      "grad_norm": 0.6575478911399841,
      "learning_rate": 0.00017559957026905563,
      "loss": 0.963,
      "step": 900
    },
    {
      "epoch": 1.5397631133671743,
      "grad_norm": 0.49584710597991943,
      "learning_rate": 0.00017482093341796218,
      "loss": 0.936,
      "step": 910
    },
    {
      "epoch": 1.5566835871404399,
      "grad_norm": 0.538006603717804,
      "learning_rate": 0.00017403185222351704,
      "loss": 0.9529,
      "step": 920
    },
    {
      "epoch": 1.5736040609137056,
      "grad_norm": 0.5086967945098877,
      "learning_rate": 0.00017323243683450552,
      "loss": 0.9807,
      "step": 930
    },
    {
      "epoch": 1.5905245346869712,
      "grad_norm": 0.6087446808815002,
      "learning_rate": 0.00017242279884227535,
      "loss": 0.957,
      "step": 940
    },
    {
      "epoch": 1.6074450084602367,
      "grad_norm": 0.6025837063789368,
      "learning_rate": 0.00017160305126515972,
      "loss": 0.9622,
      "step": 950
    },
    {
      "epoch": 1.6243654822335025,
      "grad_norm": 0.49924030900001526,
      "learning_rate": 0.00017077330853270087,
      "loss": 0.9724,
      "step": 960
    },
    {
      "epoch": 1.6412859560067683,
      "grad_norm": 0.5102123022079468,
      "learning_rate": 0.00016993368646967658,
      "loss": 0.966,
      "step": 970
    },
    {
      "epoch": 1.6582064297800339,
      "grad_norm": 0.6529399752616882,
      "learning_rate": 0.00016908430227993227,
      "loss": 0.9161,
      "step": 980
    },
    {
      "epoch": 1.6751269035532994,
      "grad_norm": 0.5354600548744202,
      "learning_rate": 0.00016822527453002023,
      "loss": 0.972,
      "step": 990
    },
    {
      "epoch": 1.6920473773265652,
      "grad_norm": 0.6311420202255249,
      "learning_rate": 0.00016735672313264883,
      "loss": 0.9322,
      "step": 1000
    },
    {
      "epoch": 1.6920473773265652,
      "eval_loss": 1.0022894144058228,
      "eval_runtime": 9.3099,
      "eval_samples_per_second": 106.983,
      "eval_steps_per_second": 13.427,
      "step": 1000
    },
    {
      "epoch": 1.708967851099831,
      "grad_norm": 0.5729072690010071,
      "learning_rate": 0.00016647876932994373,
      "loss": 0.9525,
      "step": 1010
    },
    {
      "epoch": 1.7258883248730963,
      "grad_norm": 0.8473700284957886,
      "learning_rate": 0.00016559153567652363,
      "loss": 0.9479,
      "step": 1020
    },
    {
      "epoch": 1.742808798646362,
      "grad_norm": 0.6889095306396484,
      "learning_rate": 0.00016469514602239252,
      "loss": 0.9369,
      "step": 1030
    },
    {
      "epoch": 1.7597292724196278,
      "grad_norm": 0.4434736967086792,
      "learning_rate": 0.0001637897254956517,
      "loss": 0.947,
      "step": 1040
    },
    {
      "epoch": 1.7766497461928934,
      "grad_norm": 0.925630509853363,
      "learning_rate": 0.00016287540048503244,
      "loss": 0.9714,
      "step": 1050
    },
    {
      "epoch": 1.793570219966159,
      "grad_norm": 0.7930585741996765,
      "learning_rate": 0.00016195229862225378,
      "loss": 0.9665,
      "step": 1060
    },
    {
      "epoch": 1.8104906937394247,
      "grad_norm": 0.5998470187187195,
      "learning_rate": 0.00016102054876420592,
      "loss": 0.9476,
      "step": 1070
    },
    {
      "epoch": 1.8274111675126905,
      "grad_norm": 0.6296101212501526,
      "learning_rate": 0.00016008028097496308,
      "loss": 0.997,
      "step": 1080
    },
    {
      "epoch": 1.844331641285956,
      "grad_norm": 0.6930395364761353,
      "learning_rate": 0.0001591316265076276,
      "loss": 0.9438,
      "step": 1090
    },
    {
      "epoch": 1.8612521150592216,
      "grad_norm": 0.6003424525260925,
      "learning_rate": 0.0001581747177860082,
      "loss": 0.9711,
      "step": 1100
    },
    {
      "epoch": 1.8781725888324874,
      "grad_norm": 0.48814520239830017,
      "learning_rate": 0.00015720968838613497,
      "loss": 0.9653,
      "step": 1110
    },
    {
      "epoch": 1.895093062605753,
      "grad_norm": 0.5773620009422302,
      "learning_rate": 0.00015623667301761294,
      "loss": 0.9782,
      "step": 1120
    },
    {
      "epoch": 1.9120135363790185,
      "grad_norm": 0.6531692147254944,
      "learning_rate": 0.0001552558075048182,
      "loss": 0.9546,
      "step": 1130
    },
    {
      "epoch": 1.9289340101522843,
      "grad_norm": 0.5369117259979248,
      "learning_rate": 0.00015426722876793779,
      "loss": 0.9598,
      "step": 1140
    },
    {
      "epoch": 1.94585448392555,
      "grad_norm": 0.635637640953064,
      "learning_rate": 0.0001532710748038568,
      "loss": 0.9992,
      "step": 1150
    },
    {
      "epoch": 1.9627749576988156,
      "grad_norm": 0.7120236754417419,
      "learning_rate": 0.00015226748466689552,
      "loss": 0.98,
      "step": 1160
    },
    {
      "epoch": 1.9796954314720812,
      "grad_norm": 0.49258938431739807,
      "learning_rate": 0.00015125659844939833,
      "loss": 0.9703,
      "step": 1170
    },
    {
      "epoch": 1.996615905245347,
      "grad_norm": 0.6356224417686462,
      "learning_rate": 0.0001502385572621783,
      "loss": 1.0121,
      "step": 1180
    },
    {
      "epoch": 2.0135363790186127,
      "grad_norm": 0.5743930339813232,
      "learning_rate": 0.00014921350321481905,
      "loss": 0.8652,
      "step": 1190
    },
    {
      "epoch": 2.030456852791878,
      "grad_norm": 0.9692409634590149,
      "learning_rate": 0.00014818157939583803,
      "loss": 0.796,
      "step": 1200
    },
    {
      "epoch": 2.047377326565144,
      "grad_norm": 0.7314022779464722,
      "learning_rate": 0.00014714292985271206,
      "loss": 0.808,
      "step": 1210
    },
    {
      "epoch": 2.0642978003384096,
      "grad_norm": 0.7976405024528503,
      "learning_rate": 0.00014609769957176993,
      "loss": 0.8067,
      "step": 1220
    },
    {
      "epoch": 2.081218274111675,
      "grad_norm": 0.54390549659729,
      "learning_rate": 0.0001450460344579534,
      "loss": 0.8239,
      "step": 1230
    },
    {
      "epoch": 2.0981387478849407,
      "grad_norm": 0.7505801916122437,
      "learning_rate": 0.00014398808131445032,
      "loss": 0.8165,
      "step": 1240
    },
    {
      "epoch": 2.1150592216582065,
      "grad_norm": 0.6022424697875977,
      "learning_rate": 0.00014292398782220203,
      "loss": 0.8244,
      "step": 1250
    },
    {
      "epoch": 2.1319796954314723,
      "grad_norm": 0.6155591607093811,
      "learning_rate": 0.00014185390251928844,
      "loss": 0.8392,
      "step": 1260
    },
    {
      "epoch": 2.1489001692047376,
      "grad_norm": 0.7514758110046387,
      "learning_rate": 0.0001407779747801936,
      "loss": 0.8271,
      "step": 1270
    },
    {
      "epoch": 2.1658206429780034,
      "grad_norm": 0.6205560564994812,
      "learning_rate": 0.00013969635479495408,
      "loss": 0.8004,
      "step": 1280
    },
    {
      "epoch": 2.182741116751269,
      "grad_norm": 0.6635140180587769,
      "learning_rate": 0.0001386091935481939,
      "loss": 0.8035,
      "step": 1290
    },
    {
      "epoch": 2.199661590524535,
      "grad_norm": 0.5528069138526917,
      "learning_rate": 0.00013751664279804842,
      "loss": 0.8381,
      "step": 1300
    },
    {
      "epoch": 2.2165820642978002,
      "grad_norm": 0.5508107542991638,
      "learning_rate": 0.00013641885505498016,
      "loss": 0.8232,
      "step": 1310
    },
    {
      "epoch": 2.233502538071066,
      "grad_norm": 0.7910854816436768,
      "learning_rate": 0.0001353159835604898,
      "loss": 0.802,
      "step": 1320
    },
    {
      "epoch": 2.250423011844332,
      "grad_norm": 0.8323054909706116,
      "learning_rate": 0.0001342081822657248,
      "loss": 0.8026,
      "step": 1330
    },
    {
      "epoch": 2.267343485617597,
      "grad_norm": 0.622173547744751,
      "learning_rate": 0.00013309560580998956,
      "loss": 0.8385,
      "step": 1340
    },
    {
      "epoch": 2.284263959390863,
      "grad_norm": 0.8506642580032349,
      "learning_rate": 0.00013197840949915867,
      "loss": 0.8216,
      "step": 1350
    },
    {
      "epoch": 2.3011844331641287,
      "grad_norm": 0.9256905317306519,
      "learning_rate": 0.0001308567492839979,
      "loss": 0.814,
      "step": 1360
    },
    {
      "epoch": 2.3181049069373945,
      "grad_norm": 0.9968581795692444,
      "learning_rate": 0.00012973078173839477,
      "loss": 0.8147,
      "step": 1370
    },
    {
      "epoch": 2.33502538071066,
      "grad_norm": 0.6434370279312134,
      "learning_rate": 0.00012860066403750213,
      "loss": 0.7992,
      "step": 1380
    },
    {
      "epoch": 2.3519458544839256,
      "grad_norm": 1.0632697343826294,
      "learning_rate": 0.00012746655393579802,
      "loss": 0.8443,
      "step": 1390
    },
    {
      "epoch": 2.3688663282571913,
      "grad_norm": 0.6085894107818604,
      "learning_rate": 0.00012632860974506443,
      "loss": 0.8297,
      "step": 1400
    },
    {
      "epoch": 2.3857868020304567,
      "grad_norm": 0.7611764073371887,
      "learning_rate": 0.00012518699031228848,
      "loss": 0.8222,
      "step": 1410
    },
    {
      "epoch": 2.4027072758037225,
      "grad_norm": 1.16623854637146,
      "learning_rate": 0.00012404185499748858,
      "loss": 0.8006,
      "step": 1420
    },
    {
      "epoch": 2.4196277495769882,
      "grad_norm": 0.7282375693321228,
      "learning_rate": 0.00012289336365146943,
      "loss": 0.7958,
      "step": 1430
    },
    {
      "epoch": 2.436548223350254,
      "grad_norm": 0.9697285890579224,
      "learning_rate": 0.00012174167659350805,
      "loss": 0.858,
      "step": 1440
    },
    {
      "epoch": 2.4534686971235193,
      "grad_norm": 0.6591439247131348,
      "learning_rate": 0.0001205869545889748,
      "loss": 0.8013,
      "step": 1450
    },
    {
      "epoch": 2.470389170896785,
      "grad_norm": 0.7019118666648865,
      "learning_rate": 0.00011942935882689177,
      "loss": 0.8002,
      "step": 1460
    },
    {
      "epoch": 2.487309644670051,
      "grad_norm": 1.2326512336730957,
      "learning_rate": 0.00011826905089743228,
      "loss": 0.8113,
      "step": 1470
    },
    {
      "epoch": 2.504230118443316,
      "grad_norm": 0.9669883251190186,
      "learning_rate": 0.00011710619276936441,
      "loss": 0.8567,
      "step": 1480
    },
    {
      "epoch": 2.521150592216582,
      "grad_norm": 0.6777768135070801,
      "learning_rate": 0.0001159409467674414,
      "loss": 0.8158,
      "step": 1490
    },
    {
      "epoch": 2.5380710659898478,
      "grad_norm": 0.6380109190940857,
      "learning_rate": 0.00011477347554974278,
      "loss": 0.8402,
      "step": 1500
    },
    {
      "epoch": 2.5549915397631136,
      "grad_norm": 1.0947014093399048,
      "learning_rate": 0.0001136039420849685,
      "loss": 0.7928,
      "step": 1510
    },
    {
      "epoch": 2.571912013536379,
      "grad_norm": 0.6936770081520081,
      "learning_rate": 0.00011243250962969008,
      "loss": 0.8269,
      "step": 1520
    },
    {
      "epoch": 2.5888324873096447,
      "grad_norm": 0.9474750757217407,
      "learning_rate": 0.0001112593417055614,
      "loss": 0.8305,
      "step": 1530
    },
    {
      "epoch": 2.6057529610829104,
      "grad_norm": 0.710216224193573,
      "learning_rate": 0.00011008460207649242,
      "loss": 0.8548,
      "step": 1540
    },
    {
      "epoch": 2.6226734348561758,
      "grad_norm": 0.6556210517883301,
      "learning_rate": 0.00010890845472578947,
      "loss": 0.8315,
      "step": 1550
    },
    {
      "epoch": 2.6395939086294415,
      "grad_norm": 0.6288842558860779,
      "learning_rate": 0.00010773106383326417,
      "loss": 0.8224,
      "step": 1560
    },
    {
      "epoch": 2.6565143824027073,
      "grad_norm": 0.8483306169509888,
      "learning_rate": 0.00010655259375231583,
      "loss": 0.8345,
      "step": 1570
    },
    {
      "epoch": 2.673434856175973,
      "grad_norm": 0.7669579386711121,
      "learning_rate": 0.00010537320898698882,
      "loss": 0.8165,
      "step": 1580
    },
    {
      "epoch": 2.6903553299492384,
      "grad_norm": 0.7949408292770386,
      "learning_rate": 0.00010419307416900947,
      "loss": 0.7951,
      "step": 1590
    },
    {
      "epoch": 2.707275803722504,
      "grad_norm": 0.8271426558494568,
      "learning_rate": 0.00010301235403480487,
      "loss": 0.8385,
      "step": 1600
    },
    {
      "epoch": 2.72419627749577,
      "grad_norm": 0.6712045073509216,
      "learning_rate": 0.00010183121340250699,
      "loss": 0.832,
      "step": 1610
    },
    {
      "epoch": 2.7411167512690353,
      "grad_norm": 0.768624484539032,
      "learning_rate": 0.00010064981714894582,
      "loss": 0.8365,
      "step": 1620
    },
    {
      "epoch": 2.758037225042301,
      "grad_norm": 0.8181334733963013,
      "learning_rate": 9.946833018663359e-05,
      "loss": 0.8448,
      "step": 1630
    },
    {
      "epoch": 2.774957698815567,
      "grad_norm": 0.7336423993110657,
      "learning_rate": 9.828691744074483e-05,
      "loss": 0.8259,
      "step": 1640
    },
    {
      "epoch": 2.7918781725888326,
      "grad_norm": 0.5810146927833557,
      "learning_rate": 9.710574382609416e-05,
      "loss": 0.8443,
      "step": 1650
    },
    {
      "epoch": 2.808798646362098,
      "grad_norm": 0.7470970153808594,
      "learning_rate": 9.59249742241154e-05,
      "loss": 0.8428,
      "step": 1660
    },
    {
      "epoch": 2.8257191201353637,
      "grad_norm": 0.6533593535423279,
      "learning_rate": 9.474477345984592e-05,
      "loss": 0.8078,
      "step": 1670
    },
    {
      "epoch": 2.8426395939086295,
      "grad_norm": 0.8344042897224426,
      "learning_rate": 9.356530627891827e-05,
      "loss": 0.8132,
      "step": 1680
    },
    {
      "epoch": 2.859560067681895,
      "grad_norm": 0.6438416242599487,
      "learning_rate": 9.238673732456323e-05,
      "loss": 0.8553,
      "step": 1690
    },
    {
      "epoch": 2.8764805414551606,
      "grad_norm": 0.9393163323402405,
      "learning_rate": 9.120923111462715e-05,
      "loss": 0.8263,
      "step": 1700
    },
    {
      "epoch": 2.8934010152284264,
      "grad_norm": 0.8571431040763855,
      "learning_rate": 9.003295201860652e-05,
      "loss": 0.8161,
      "step": 1710
    },
    {
      "epoch": 2.910321489001692,
      "grad_norm": 0.7677854895591736,
      "learning_rate": 8.885806423470356e-05,
      "loss": 0.8384,
      "step": 1720
    },
    {
      "epoch": 2.927241962774958,
      "grad_norm": 0.7890878319740295,
      "learning_rate": 8.76847317669056e-05,
      "loss": 0.7891,
      "step": 1730
    },
    {
      "epoch": 2.9441624365482233,
      "grad_norm": 0.6829676628112793,
      "learning_rate": 8.651311840209145e-05,
      "loss": 0.7662,
      "step": 1740
    },
    {
      "epoch": 2.961082910321489,
      "grad_norm": 0.6441112756729126,
      "learning_rate": 8.534338768716845e-05,
      "loss": 0.8046,
      "step": 1750
    },
    {
      "epoch": 2.9780033840947544,
      "grad_norm": 0.7504361271858215,
      "learning_rate": 8.417570290624246e-05,
      "loss": 0.8523,
      "step": 1760
    },
    {
      "epoch": 2.99492385786802,
      "grad_norm": 0.7779954075813293,
      "learning_rate": 8.301022705782498e-05,
      "loss": 0.7861,
      "step": 1770
    },
    {
      "epoch": 3.011844331641286,
      "grad_norm": 0.6992425322532654,
      "learning_rate": 8.184712283208004e-05,
      "loss": 0.72,
      "step": 1780
    },
    {
      "epoch": 3.0287648054145517,
      "grad_norm": 0.8021610379219055,
      "learning_rate": 8.068655258811404e-05,
      "loss": 0.7146,
      "step": 1790
    },
    {
      "epoch": 3.045685279187817,
      "grad_norm": 0.763023316860199,
      "learning_rate": 7.952867833131176e-05,
      "loss": 0.6373,
      "step": 1800
    },
    {
      "epoch": 3.062605752961083,
      "grad_norm": 0.9050746560096741,
      "learning_rate": 7.837366169072202e-05,
      "loss": 0.6911,
      "step": 1810
    },
    {
      "epoch": 3.0795262267343486,
      "grad_norm": 0.9052395820617676,
      "learning_rate": 7.722166389649548e-05,
      "loss": 0.6528,
      "step": 1820
    },
    {
      "epoch": 3.0964467005076144,
      "grad_norm": 0.7168397903442383,
      "learning_rate": 7.607284575737848e-05,
      "loss": 0.691,
      "step": 1830
    },
    {
      "epoch": 3.1133671742808797,
      "grad_norm": 0.8127416372299194,
      "learning_rate": 7.492736763826553e-05,
      "loss": 0.6903,
      "step": 1840
    },
    {
      "epoch": 3.1302876480541455,
      "grad_norm": 0.9594517350196838,
      "learning_rate": 7.378538943781381e-05,
      "loss": 0.6682,
      "step": 1850
    },
    {
      "epoch": 3.1472081218274113,
      "grad_norm": 1.0214790105819702,
      "learning_rate": 7.264707056612252e-05,
      "loss": 0.6508,
      "step": 1860
    },
    {
      "epoch": 3.164128595600677,
      "grad_norm": 1.0813875198364258,
      "learning_rate": 7.151256992248097e-05,
      "loss": 0.6635,
      "step": 1870
    },
    {
      "epoch": 3.1810490693739424,
      "grad_norm": 0.8683750629425049,
      "learning_rate": 7.038204587318728e-05,
      "loss": 0.6959,
      "step": 1880
    },
    {
      "epoch": 3.197969543147208,
      "grad_norm": 0.8788993954658508,
      "learning_rate": 6.92556562294422e-05,
      "loss": 0.6487,
      "step": 1890
    },
    {
      "epoch": 3.214890016920474,
      "grad_norm": 0.8151355981826782,
      "learning_rate": 6.813355822531984e-05,
      "loss": 0.6615,
      "step": 1900
    },
    {
      "epoch": 3.2318104906937393,
      "grad_norm": 0.875066876411438,
      "learning_rate": 6.701590849581907e-05,
      "loss": 0.6716,
      "step": 1910
    },
    {
      "epoch": 3.248730964467005,
      "grad_norm": 0.6827503442764282,
      "learning_rate": 6.590286305499895e-05,
      "loss": 0.6533,
      "step": 1920
    },
    {
      "epoch": 3.265651438240271,
      "grad_norm": 1.032798171043396,
      "learning_rate": 6.479457727420038e-05,
      "loss": 0.6691,
      "step": 1930
    },
    {
      "epoch": 3.2825719120135366,
      "grad_norm": 0.7288352251052856,
      "learning_rate": 6.369120586035757e-05,
      "loss": 0.7198,
      "step": 1940
    },
    {
      "epoch": 3.299492385786802,
      "grad_norm": 1.0082377195358276,
      "learning_rate": 6.259290283440243e-05,
      "loss": 0.68,
      "step": 1950
    },
    {
      "epoch": 3.3164128595600677,
      "grad_norm": 0.7989615797996521,
      "learning_rate": 6.149982150976453e-05,
      "loss": 0.6623,
      "step": 1960
    },
    {
      "epoch": 3.3333333333333335,
      "grad_norm": 0.8861303925514221,
      "learning_rate": 6.0412114470969925e-05,
      "loss": 0.7331,
      "step": 1970
    },
    {
      "epoch": 3.350253807106599,
      "grad_norm": 0.9690698981285095,
      "learning_rate": 5.932993355234177e-05,
      "loss": 0.661,
      "step": 1980
    },
    {
      "epoch": 3.3671742808798646,
      "grad_norm": 0.9235168099403381,
      "learning_rate": 5.825342981680544e-05,
      "loss": 0.6648,
      "step": 1990
    },
    {
      "epoch": 3.3840947546531304,
      "grad_norm": 1.0237386226654053,
      "learning_rate": 5.718275353480155e-05,
      "loss": 0.7121,
      "step": 2000
    },
    {
      "epoch": 3.3840947546531304,
      "eval_loss": 1.1582704782485962,
      "eval_runtime": 9.4671,
      "eval_samples_per_second": 105.206,
      "eval_steps_per_second": 13.204,
      "step": 2000
    },
    {
      "epoch": 3.401015228426396,
      "grad_norm": 1.019852876663208,
      "learning_rate": 5.611805416330955e-05,
      "loss": 0.6862,
      "step": 2010
    },
    {
      "epoch": 3.4179357021996615,
      "grad_norm": 0.8144916296005249,
      "learning_rate": 5.505948032498481e-05,
      "loss": 0.6798,
      "step": 2020
    },
    {
      "epoch": 3.4348561759729273,
      "grad_norm": 0.7801826596260071,
      "learning_rate": 5.400717978741223e-05,
      "loss": 0.6704,
      "step": 2030
    },
    {
      "epoch": 3.451776649746193,
      "grad_norm": 0.8852076530456543,
      "learning_rate": 5.296129944247917e-05,
      "loss": 0.6604,
      "step": 2040
    },
    {
      "epoch": 3.4686971235194584,
      "grad_norm": 1.0277949571609497,
      "learning_rate": 5.1921985285870666e-05,
      "loss": 0.6787,
      "step": 2050
    },
    {
      "epoch": 3.485617597292724,
      "grad_norm": 0.8751595616340637,
      "learning_rate": 5.088938239668957e-05,
      "loss": 0.6816,
      "step": 2060
    },
    {
      "epoch": 3.50253807106599,
      "grad_norm": 0.8326672315597534,
      "learning_rate": 4.986363491720508e-05,
      "loss": 0.6683,
      "step": 2070
    },
    {
      "epoch": 3.5194585448392557,
      "grad_norm": 0.9706553220748901,
      "learning_rate": 4.884488603273153e-05,
      "loss": 0.6619,
      "step": 2080
    },
    {
      "epoch": 3.536379018612521,
      "grad_norm": 0.9801552891731262,
      "learning_rate": 4.78332779516409e-05,
      "loss": 0.6801,
      "step": 2090
    },
    {
      "epoch": 3.553299492385787,
      "grad_norm": 0.9298548102378845,
      "learning_rate": 4.682895188551205e-05,
      "loss": 0.6684,
      "step": 2100
    },
    {
      "epoch": 3.5702199661590526,
      "grad_norm": 0.9834477305412292,
      "learning_rate": 4.583204802941861e-05,
      "loss": 0.6934,
      "step": 2110
    },
    {
      "epoch": 3.587140439932318,
      "grad_norm": 0.8049277067184448,
      "learning_rate": 4.4842705542359164e-05,
      "loss": 0.7124,
      "step": 2120
    },
    {
      "epoch": 3.6040609137055837,
      "grad_norm": 0.7810288071632385,
      "learning_rate": 4.386106252783162e-05,
      "loss": 0.6825,
      "step": 2130
    },
    {
      "epoch": 3.6209813874788495,
      "grad_norm": 0.8989554047584534,
      "learning_rate": 4.288725601455543e-05,
      "loss": 0.6328,
      "step": 2140
    },
    {
      "epoch": 3.6379018612521152,
      "grad_norm": 1.3646354675292969,
      "learning_rate": 4.192142193734344e-05,
      "loss": 0.6594,
      "step": 2150
    },
    {
      "epoch": 3.6548223350253806,
      "grad_norm": 1.177277684211731,
      "learning_rate": 4.096369511812669e-05,
      "loss": 0.6728,
      "step": 2160
    },
    {
      "epoch": 3.6717428087986463,
      "grad_norm": 1.1561075448989868,
      "learning_rate": 4.001420924713435e-05,
      "loss": 0.6511,
      "step": 2170
    },
    {
      "epoch": 3.688663282571912,
      "grad_norm": 0.9678720831871033,
      "learning_rate": 3.9073096864231815e-05,
      "loss": 0.6789,
      "step": 2180
    },
    {
      "epoch": 3.7055837563451774,
      "grad_norm": 0.9476550817489624,
      "learning_rate": 3.814048934041934e-05,
      "loss": 0.6378,
      "step": 2190
    },
    {
      "epoch": 3.7225042301184432,
      "grad_norm": 0.9431784749031067,
      "learning_rate": 3.72165168594936e-05,
      "loss": 0.6823,
      "step": 2200
    },
    {
      "epoch": 3.739424703891709,
      "grad_norm": 0.92593914270401,
      "learning_rate": 3.630130839987553e-05,
      "loss": 0.6861,
      "step": 2210
    },
    {
      "epoch": 3.7563451776649748,
      "grad_norm": 0.9857144951820374,
      "learning_rate": 3.539499171660581e-05,
      "loss": 0.6585,
      "step": 2220
    },
    {
      "epoch": 3.77326565143824,
      "grad_norm": 0.8839899301528931,
      "learning_rate": 3.4497693323511326e-05,
      "loss": 0.6802,
      "step": 2230
    },
    {
      "epoch": 3.790186125211506,
      "grad_norm": 0.9294420480728149,
      "learning_rate": 3.3609538475545196e-05,
      "loss": 0.6914,
      "step": 2240
    },
    {
      "epoch": 3.8071065989847717,
      "grad_norm": 1.0007585287094116,
      "learning_rate": 3.273065115130223e-05,
      "loss": 0.6954,
      "step": 2250
    },
    {
      "epoch": 3.824027072758037,
      "grad_norm": 0.9174544215202332,
      "learning_rate": 3.186115403571245e-05,
      "loss": 0.6564,
      "step": 2260
    },
    {
      "epoch": 3.8409475465313028,
      "grad_norm": 0.9077910780906677,
      "learning_rate": 3.10011685029154e-05,
      "loss": 0.682,
      "step": 2270
    },
    {
      "epoch": 3.8578680203045685,
      "grad_norm": 1.0111888647079468,
      "learning_rate": 3.0150814599317556e-05,
      "loss": 0.7105,
      "step": 2280
    },
    {
      "epoch": 3.8747884940778343,
      "grad_norm": 1.0113632678985596,
      "learning_rate": 2.93102110268347e-05,
      "loss": 0.6659,
      "step": 2290
    },
    {
      "epoch": 3.8917089678511,
      "grad_norm": 0.9199107885360718,
      "learning_rate": 2.847947512632232e-05,
      "loss": 0.7125,
      "step": 2300
    },
    {
      "epoch": 3.9086294416243654,
      "grad_norm": 0.7901623845100403,
      "learning_rate": 2.765872286119575e-05,
      "loss": 0.6582,
      "step": 2310
    },
    {
      "epoch": 3.925549915397631,
      "grad_norm": 0.873816967010498,
      "learning_rate": 2.6848068801242797e-05,
      "loss": 0.71,
      "step": 2320
    },
    {
      "epoch": 3.9424703891708965,
      "grad_norm": 0.8795329332351685,
      "learning_rate": 2.6047626106630764e-05,
      "loss": 0.6864,
      "step": 2330
    },
    {
      "epoch": 3.9593908629441623,
      "grad_norm": 1.2641881704330444,
      "learning_rate": 2.5257506512110173e-05,
      "loss": 0.7093,
      "step": 2340
    },
    {
      "epoch": 3.976311336717428,
      "grad_norm": 0.9996365904808044,
      "learning_rate": 2.4477820311417866e-05,
      "loss": 0.6928,
      "step": 2350
    },
    {
      "epoch": 3.993231810490694,
      "grad_norm": 1.2021087408065796,
      "learning_rate": 2.3708676341880665e-05,
      "loss": 0.6513,
      "step": 2360
    },
    {
      "epoch": 4.01015228426396,
      "grad_norm": 0.7619489431381226,
      "learning_rate": 2.295018196922285e-05,
      "loss": 0.5933,
      "step": 2370
    },
    {
      "epoch": 4.027072758037225,
      "grad_norm": 0.9575318098068237,
      "learning_rate": 2.220244307257865e-05,
      "loss": 0.5914,
      "step": 2380
    },
    {
      "epoch": 4.04399323181049,
      "grad_norm": 1.0752573013305664,
      "learning_rate": 2.1465564029712704e-05,
      "loss": 0.6044,
      "step": 2390
    },
    {
      "epoch": 4.060913705583756,
      "grad_norm": 0.8898993730545044,
      "learning_rate": 2.073964770244967e-05,
      "loss": 0.5523,
      "step": 2400
    },
    {
      "epoch": 4.077834179357022,
      "grad_norm": 1.2780104875564575,
      "learning_rate": 2.002479542231558e-05,
      "loss": 0.5536,
      "step": 2410
    },
    {
      "epoch": 4.094754653130288,
      "grad_norm": 1.157820463180542,
      "learning_rate": 1.9321106976392998e-05,
      "loss": 0.6153,
      "step": 2420
    },
    {
      "epoch": 4.111675126903553,
      "grad_norm": 1.119541049003601,
      "learning_rate": 1.8628680593391556e-05,
      "loss": 0.5425,
      "step": 2430
    },
    {
      "epoch": 4.128595600676819,
      "grad_norm": 0.9344776272773743,
      "learning_rate": 1.7947612929936053e-05,
      "loss": 0.5635,
      "step": 2440
    },
    {
      "epoch": 4.145516074450085,
      "grad_norm": 0.9114211201667786,
      "learning_rate": 1.72779990570741e-05,
      "loss": 0.5909,
      "step": 2450
    },
    {
      "epoch": 4.16243654822335,
      "grad_norm": 0.8573546409606934,
      "learning_rate": 1.6619932447005003e-05,
      "loss": 0.597,
      "step": 2460
    },
    {
      "epoch": 4.179357021996616,
      "grad_norm": 0.9491050839424133,
      "learning_rate": 1.5973504960031936e-05,
      "loss": 0.5778,
      "step": 2470
    },
    {
      "epoch": 4.196277495769881,
      "grad_norm": 1.0331389904022217,
      "learning_rate": 1.533880683173885e-05,
      "loss": 0.6187,
      "step": 2480
    },
    {
      "epoch": 4.213197969543147,
      "grad_norm": 1.0236334800720215,
      "learning_rate": 1.4715926660394696e-05,
      "loss": 0.5816,
      "step": 2490
    },
    {
      "epoch": 4.230118443316413,
      "grad_norm": 0.8633939027786255,
      "learning_rate": 1.410495139458563e-05,
      "loss": 0.584,
      "step": 2500
    },
    {
      "epoch": 4.247038917089679,
      "grad_norm": 1.0404596328735352,
      "learning_rate": 1.3505966321077857e-05,
      "loss": 0.5848,
      "step": 2510
    },
    {
      "epoch": 4.2639593908629445,
      "grad_norm": 1.1281291246414185,
      "learning_rate": 1.2919055052912288e-05,
      "loss": 0.5899,
      "step": 2520
    },
    {
      "epoch": 4.280879864636209,
      "grad_norm": 0.9481174945831299,
      "learning_rate": 1.2344299517733048e-05,
      "loss": 0.5875,
      "step": 2530
    },
    {
      "epoch": 4.297800338409475,
      "grad_norm": 0.9576058387756348,
      "learning_rate": 1.1781779946350924e-05,
      "loss": 0.5772,
      "step": 2540
    },
    {
      "epoch": 4.314720812182741,
      "grad_norm": 1.1529393196105957,
      "learning_rate": 1.1231574861543892e-05,
      "loss": 0.5495,
      "step": 2550
    },
    {
      "epoch": 4.331641285956007,
      "grad_norm": 1.1568267345428467,
      "learning_rate": 1.069376106709612e-05,
      "loss": 0.5644,
      "step": 2560
    },
    {
      "epoch": 4.3485617597292725,
      "grad_norm": 1.0102249383926392,
      "learning_rate": 1.0168413637076735e-05,
      "loss": 0.5893,
      "step": 2570
    },
    {
      "epoch": 4.365482233502538,
      "grad_norm": 1.1391773223876953,
      "learning_rate": 9.65560590536021e-06,
      "loss": 0.6075,
      "step": 2580
    },
    {
      "epoch": 4.382402707275804,
      "grad_norm": 1.0958099365234375,
      "learning_rate": 9.155409455389553e-06,
      "loss": 0.6103,
      "step": 2590
    },
    {
      "epoch": 4.39932318104907,
      "grad_norm": 1.1859424114227295,
      "learning_rate": 8.667894110183895e-06,
      "loss": 0.5932,
      "step": 2600
    },
    {
      "epoch": 4.416243654822335,
      "grad_norm": 1.0985631942749023,
      "learning_rate": 8.19312792259187e-06,
      "loss": 0.6136,
      "step": 2610
    },
    {
      "epoch": 4.4331641285956005,
      "grad_norm": 1.019136667251587,
      "learning_rate": 7.731177165791948e-06,
      "loss": 0.6161,
      "step": 2620
    },
    {
      "epoch": 4.450084602368866,
      "grad_norm": 0.9153673052787781,
      "learning_rate": 7.282106324041349e-06,
      "loss": 0.6149,
      "step": 2630
    },
    {
      "epoch": 4.467005076142132,
      "grad_norm": 1.2232962846755981,
      "learning_rate": 6.845978083674587e-06,
      "loss": 0.5584,
      "step": 2640
    },
    {
      "epoch": 4.483925549915398,
      "grad_norm": 1.3178131580352783,
      "learning_rate": 6.4228533243530065e-06,
      "loss": 0.6045,
      "step": 2650
    },
    {
      "epoch": 4.500846023688664,
      "grad_norm": 1.2076665163040161,
      "learning_rate": 6.012791110566473e-06,
      "loss": 0.613,
      "step": 2660
    },
    {
      "epoch": 4.517766497461929,
      "grad_norm": 1.0967868566513062,
      "learning_rate": 5.615848683388636e-06,
      "loss": 0.6126,
      "step": 2670
    },
    {
      "epoch": 4.534686971235194,
      "grad_norm": 1.1182656288146973,
      "learning_rate": 5.232081452486437e-06,
      "loss": 0.5872,
      "step": 2680
    },
    {
      "epoch": 4.55160744500846,
      "grad_norm": 1.2936855554580688,
      "learning_rate": 4.861542988385393e-06,
      "loss": 0.5872,
      "step": 2690
    },
    {
      "epoch": 4.568527918781726,
      "grad_norm": 1.2152096033096313,
      "learning_rate": 4.504285014991761e-06,
      "loss": 0.5829,
      "step": 2700
    },
    {
      "epoch": 4.585448392554992,
      "grad_norm": 0.859682559967041,
      "learning_rate": 4.160357402372217e-06,
      "loss": 0.6164,
      "step": 2710
    },
    {
      "epoch": 4.602368866328257,
      "grad_norm": 1.1588406562805176,
      "learning_rate": 3.8298081597925025e-06,
      "loss": 0.5858,
      "step": 2720
    },
    {
      "epoch": 4.619289340101523,
      "grad_norm": 0.8907061815261841,
      "learning_rate": 3.5126834290157063e-06,
      "loss": 0.583,
      "step": 2730
    },
    {
      "epoch": 4.636209813874789,
      "grad_norm": 1.0281578302383423,
      "learning_rate": 3.209027477861293e-06,
      "loss": 0.6147,
      "step": 2740
    },
    {
      "epoch": 4.653130287648054,
      "grad_norm": 1.67826247215271,
      "learning_rate": 2.9188826940257373e-06,
      "loss": 0.5593,
      "step": 2750
    },
    {
      "epoch": 4.67005076142132,
      "grad_norm": 0.8971337676048279,
      "learning_rate": 2.6422895791655243e-06,
      "loss": 0.5717,
      "step": 2760
    },
    {
      "epoch": 4.686971235194585,
      "grad_norm": 0.9563831090927124,
      "learning_rate": 2.379286743243514e-06,
      "loss": 0.5616,
      "step": 2770
    },
    {
      "epoch": 4.703891708967851,
      "grad_norm": 0.8459508419036865,
      "learning_rate": 2.1299108991393314e-06,
      "loss": 0.6003,
      "step": 2780
    },
    {
      "epoch": 4.720812182741117,
      "grad_norm": 1.0900635719299316,
      "learning_rate": 1.8941968575245327e-06,
      "loss": 0.5673,
      "step": 2790
    },
    {
      "epoch": 4.737732656514383,
      "grad_norm": 0.8653163313865662,
      "learning_rate": 1.6721775220033598e-06,
      "loss": 0.5735,
      "step": 2800
    },
    {
      "epoch": 4.7546531302876485,
      "grad_norm": 1.0450564622879028,
      "learning_rate": 1.4638838845197344e-06,
      "loss": 0.5527,
      "step": 2810
    },
    {
      "epoch": 4.771573604060913,
      "grad_norm": 1.1020820140838623,
      "learning_rate": 1.2693450210309877e-06,
      "loss": 0.5623,
      "step": 2820
    },
    {
      "epoch": 4.788494077834179,
      "grad_norm": 0.9869380593299866,
      "learning_rate": 1.0885880874491273e-06,
      "loss": 0.5677,
      "step": 2830
    },
    {
      "epoch": 4.805414551607445,
      "grad_norm": 1.2315829992294312,
      "learning_rate": 9.216383158501596e-07,
      "loss": 0.5805,
      "step": 2840
    },
    {
      "epoch": 4.822335025380711,
      "grad_norm": 0.9307577610015869,
      "learning_rate": 7.685190109518514e-07,
      "loss": 0.5778,
      "step": 2850
    },
    {
      "epoch": 4.8392554991539765,
      "grad_norm": 1.1593735218048096,
      "learning_rate": 6.29251546860643e-07,
      "loss": 0.5772,
      "step": 2860
    },
    {
      "epoch": 4.856175972927242,
      "grad_norm": 1.1079590320587158,
      "learning_rate": 5.038553640879684e-07,
      "loss": 0.6036,
      "step": 2870
    },
    {
      "epoch": 4.873096446700508,
      "grad_norm": 1.0658416748046875,
      "learning_rate": 3.923479668365815e-07,
      "loss": 0.5779,
      "step": 2880
    },
    {
      "epoch": 4.890016920473773,
      "grad_norm": 0.9755645990371704,
      "learning_rate": 2.9474492055708845e-07,
      "loss": 0.6032,
      "step": 2890
    },
    {
      "epoch": 4.906937394247039,
      "grad_norm": 1.306435465812683,
      "learning_rate": 2.1105984977513038e-07,
      "loss": 0.6159,
      "step": 2900
    },
    {
      "epoch": 4.9238578680203045,
      "grad_norm": 1.0507274866104126,
      "learning_rate": 1.413044361895932e-07,
      "loss": 0.5738,
      "step": 2910
    },
    {
      "epoch": 4.94077834179357,
      "grad_norm": 0.9364911913871765,
      "learning_rate": 8.548841704185684e-08,
      "loss": 0.5774,
      "step": 2920
    },
    {
      "epoch": 4.957698815566836,
      "grad_norm": 0.8698614239692688,
      "learning_rate": 4.361958375662667e-08,
      "loss": 0.5491,
      "step": 2930
    },
    {
      "epoch": 4.974619289340102,
      "grad_norm": 1.1547551155090332,
      "learning_rate": 1.570378085428148e-08,
      "loss": 0.5937,
      "step": 2940
    },
    {
      "epoch": 4.991539763113368,
      "grad_norm": 1.1454553604125977,
      "learning_rate": 1.7449051350482137e-09,
      "loss": 0.5837,
      "step": 2950
    },
    {
      "epoch": 5.0,
      "step": 2955,
      "total_flos": 3.752202925842104e+17,
      "train_loss": 0.8236714023422267,
      "train_runtime": 1258.0596,
      "train_samples_per_second": 75.155,
      "train_steps_per_second": 2.349
    }
  ],
  "logging_steps": 10,
  "max_steps": 2955,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.752202925842104e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}